Browse Source

Fix deep recursion issues on videos with many comments

master
JustAnotherArchivist 4 years ago
parent
commit
1df5702c02
2 changed files with 37 additions and 38 deletions
  1. +1
    -2
      README.md
  2. +36
    -36
      comments.py

+ 1
- 2
README.md View File

@@ -5,7 +5,6 @@ A method to grab the comments from YouTube videos
* You can pass multiple video IDs at once as well: `comments VIDEOID1 VIDEOID2 ...`. They get executed sequentially.
* Comments are grabbed in all available sort orders (i.e. "top" and "new", though "top" is retrieved twice since YT returns two continuation tokens for it), including replies and nesting.
* The "top" sort order sometimes doesn't return all comments. The reason for this is unclear.
* On videos with many comments, the "top" sort order retrieval will fail rapidly due to the continuation token being too long and causing an HTTP 413 Request Entity Too Large error.
* Also on videos with many comments, the "new" sort order retrieval will fail a bit less rapidly due to deep recursion.
* On videos with many comments, the "top" sort order retrieval will fail rapidly due to the continuation token being too long and causing an HTTP 413 Request Entity Too Large error. (The "new" sort order should succeed.)
* Everything's written to a few files in the current directory called `youtube-comments-VIDEOID-DATE*`.
* After the retrieval has finished cleanly and you're satisfied with the results, you can delete the `.db` and `.log` files. The former is essentially useless, and the latter is contained in the `-meta.warc.gz` file.

+ 36
- 36
comments.py View File

@@ -1,3 +1,4 @@
import collections
import itertools
import os
import qwarc
@@ -43,40 +44,6 @@ class Comments(qwarc.Item):
itct = itct.decode('ascii')
return continuationToken, itct

async def continue_recursively(self, videoPageUrl, sessionToken, continuationToken, itct, nested = False, initial = False):
    '''
    Fetch the comments for a continuation token and everything reachable from it.

    videoPageUrl is sent as the X-SPF-Referer and X-SPF-Previous headers of every request.
    sessionToken is sent as the session_token POST field of every request.
    continuationToken and itct identify the first page to fetch.
    nested indicates whether the continuationToken is for nested comments (which require a different URL parameter), i.e. "View N replies" or "Show more replies".
    initial indicates that this is the first request; only for that response are the continuations listed in the sort menu (the other sort orders) followed as well.

    Despite its name, this method does not call itself: pending requests are kept on an explicit stack, so the call depth stays constant no matter how many continuation pages a video has. Continuations are still fetched depth-first, in the order in which they appear in each response, with the sort menu entries of the initial response coming last.
    A failed request is logged and skipped; requests that are already pending are still processed.
    '''
    # Stack of (continuationToken, itct, nested, initial); the last entry is fetched next.
    pending = [(continuationToken, itct, nested, initial)]
    while pending:
        continuationToken, itct, nested, initial = pending.pop()
        response, _ = await self.fetch(
            f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}',
            method = 'POST',
            data = {'session_token': sessionToken},
            headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
            responseHandler = responseHandler,
        )
        if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
            self.logger.error('Could not fetch initial comments')
            continue
        content = await response.read()
        # Yes, the response is JSON and I could parse that into an object, but where's the fun in that?

        # Follow-up requests found in this response, in the order they have to be fetched.
        discovered = []
        for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
            continuations = self.get_json_obj_from_pos(content, continuationsPos)
            subContinuationToken, subItct = self.get_continuation_parameters(continuations)
            # A "label" key inside the continuations object marks a replies continuation.
            discovered.append((subContinuationToken, subItct, b'"label":' in continuations, False))

        if initial:
            sortMenuPos = content.find(b'"sortMenu":')
            if sortMenuPos < 0:
                self.logger.error('Could not find sort menu')
            else:
                sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
                for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
                    continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
                    subContinuationToken, subItct = self.get_continuation_parameters(continuation)
                    discovered.append((subContinuationToken, subItct, False, False))

        # Push in reverse so that the first discovered request ends up on top of the stack and is popped first.
        pending.extend(reversed(discovered))

async def process(self):
videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
@@ -114,7 +81,40 @@ class Comments(qwarc.Item):
return
itct = itct.decode('ascii')

await self.continue_recursively(videoPageUrl, sessionToken, continuationToken, itct, initial = True)

queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that it's a comment's replies ("View N replies" or "Show more replies")
queue.append((continuationToken, itct, False))
first = True
while queue:
continuationToken, itct, nested = queue.popleft()
response, _ = await self.fetch(
f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}',
method = 'POST',
data = {'session_token': sessionToken},
headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
responseHandler = responseHandler,
)
if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
self.logger.error('Error fetching comments, skipping')
continue
content = await response.read()
# Yes, the response is JSON and I could parse that into an object, but where's the fun in that?

for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
continuations = self.get_json_obj_from_pos(content, continuationsPos)
subContinuationToken, subItct = self.get_continuation_parameters(continuations)
queue.append((subContinuationToken, subItct, b'"label":' in continuations))

if first:
sortMenuPos = content.find(b'"sortMenu":')
if sortMenuPos < 0:
self.logger.error('Could not find sort menu')
else:
sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
subContinuationToken, subItct = self.get_continuation_parameters(continuation)
queue.append((subContinuationToken, subItct, False))

first = False

specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))

Loading…
Cancel
Save