diff --git a/README.md b/README.md index 003a915..fb3b1a8 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,6 @@ A method to grab the comments from YouTube videos * You can pass multiple video IDs at once as well: `comments VIDEOID1 VIDEOID2 ...`. They get executed sequentially. * Comments are grabbed in all available sort orders (i.e. "top" and "new", though "top" is retrieved twice since YT returns two continuation tokens for it), including replies and nesting. * The "top" sort order sometimes doesn't return all comments but might be missing some. The reason for this is unclear. -* On videos with many comments, the "top" sort order retrieval will fail rapidly due to the continuation token being too long and causing an HTTP 413 Request Entity Too Large error. -* Also on videos with many comments, the "new" sort order retrieval will fail a bit less rapidly due to deep recursion. +* On videos with many comments, the "top" sort order retrieval will fail rapidly due to the continuation token being too long and causing an HTTP 413 Request Entity Too Large error. (The "new" sort order should succeed.) * Everything's written to a few files in the current directory called `youtube-comments-VIDEOID-DATE*`. * After the retrieval finished cleanly and you're satisfied with the results, you can delete the `.db` and `.log` files. The former is essentially useless, and the latter is contained in the `-meta.warc.gz` file. diff --git a/comments.py b/comments.py index da90c45..00e480d 100644 --- a/comments.py +++ b/comments.py @@ -1,3 +1,4 @@ +import collections import itertools import os import qwarc @@ -43,40 +44,6 @@ class Comments(qwarc.Item): itct = itct.decode('ascii') return continuationToken, itct - async def continue_recursively(self, videoPageUrl, sessionToken, continuationToken, itct, nested = False, initial = False): - ''' - Fetch the comments, recursively, including other sort orders. 
- nested indicates whether the continuationToken is for nested comments (which require a different URL parameter), i.e. "View N replies" or "Show more replies". - ''' - response, _ = await self.fetch( - f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}', - method = 'POST', - data = {'session_token': sessionToken}, - headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)], - responseHandler = responseHandler, - ) - if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'): - self.logger.error('Could not fetch initial comments') - return - content = await response.read() - # Yes, the response is JSON and I could parse that into an object, but where's the fun in that? - - for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'): - continuations = self.get_json_obj_from_pos(content, continuationsPos) - subContinuationToken, subItct = self.get_continuation_parameters(continuations) - await self.continue_recursively(videoPageUrl, sessionToken, subContinuationToken, subItct, nested = b'"label":' in continuations) - - if initial: - sortMenuPos = content.find(b'"sortMenu":') - if sortMenuPos < 0: - self.logger.error('Could not find sort menu') - return - sortMenu = self.get_json_obj_from_pos(content, sortMenuPos) - for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'): - continuation = self.get_json_obj_from_pos(sortMenu, continuationPos) - subContinuationToken, subItct = self.get_continuation_parameters(continuation) - await self.continue_recursively(videoPageUrl, sessionToken, subContinuationToken, subItct) - async def process(self): videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}' response, _ = await self.fetch(videoPageUrl, responseHandler = 
responseHandler) @@ -114,7 +81,40 @@ class Comments(qwarc.Item): return itct = itct.decode('ascii') - await self.continue_recursively(videoPageUrl, sessionToken, continuationToken, itct, initial = True) - + queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that the token is for a comment's replies ("View N replies" or "Show more replies") + queue.append((continuationToken, itct, False)) + first = True + while queue: + continuationToken, itct, nested = queue.popleft() + response, _ = await self.fetch( + f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}', + method = 'POST', + data = {'session_token': sessionToken}, + headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)], + responseHandler = responseHandler, + ) + if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'): + self.logger.error('Error fetching comments, skipping') + continue + content = await response.read() + # Yes, the response is JSON and I could parse that into an object, but where's the fun in that? 
+ + for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'): + continuations = self.get_json_obj_from_pos(content, continuationsPos) + subContinuationToken, subItct = self.get_continuation_parameters(continuations) + queue.append((subContinuationToken, subItct, b'"label":' in continuations)) + + if first: + sortMenuPos = content.find(b'"sortMenu":') + if sortMenuPos < 0: + self.logger.error('Could not find sort menu') + else: + sortMenu = self.get_json_obj_from_pos(content, sortMenuPos) + for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'): + continuation = self.get_json_obj_from_pos(sortMenu, continuationPos) + subContinuationToken, subItct = self.get_continuation_parameters(continuation) + queue.append((subContinuationToken, subItct, False)) + + first = False specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))