Fix deep recursion issues on videos with many comments

4 years ago · 1df5702c02
--- a/README.md
+++ b/README.md
@@ -5,7 +5,6 @@ A method to grab the comments from YouTube videos
 * You can pass multiple video IDs at once as well: `comments VIDEOID1 VIDEOID2 ...`. They get executed sequentially.
 * Comments are grabbed in all available sort orders (i.e. "top" and "new", though "top" is retrieved twice since YT returns two continuation tokens for it), including replies and nesting.
 * The "top" sort order sometimes doesn't return all comments; some may be missing. The reason for this is unclear.
 * On videos with many comments, the "top" sort order retrieval will fail rapidly due to the continuation token being too long and causing an HTTP 413 Request Entity Too Large error.
 * Also on videos with many comments, the "new" sort order retrieval will fail a bit less rapidly due to deep recursion.
 * On videos with many comments, the "top" sort order retrieval will fail rapidly due to the continuation token being too long and causing an HTTP 413 Request Entity Too Large error. (The "new" sort order should succeed.)
 * Everything's written to a few files in the current directory called `youtube-comments-VIDEOID-DATE*`.
 * After the retrieval has finished cleanly and you're satisfied with the results, you can delete the `.db` and `.log` files. The former is essentially useless, and the latter is contained in the `-meta.warc.gz` file.
--- a/comments.py
+++ b/comments.py
@@ -1,3 +1,4 @@
 import collections
 import itertools
 import os
 import qwarc
@@ -43,40 +44,6 @@ class Comments(qwarc.Item):
 		itct = itct.decode('ascii')
 		return continuationToken, itct

 	async def continue_recursively(self, videoPageUrl, sessionToken, continuationToken, itct, nested = False, initial = False):
 		'''
 		Fetch one page of comments and recursively follow every continuation found in it.

 		A POST request is sent to YouTube's comment_service_ajax endpoint; the response body is then scanned as raw bytes for continuation objects, and this method calls itself once per continuation.

 		videoPageUrl is sent as the X-SPF-Referer and X-SPF-Previous headers.
 		sessionToken is sent as the session_token form field.
 		continuationToken and itct are inserted into the request URL (the token as both ctoken and continuation).
 		nested indicates whether the continuationToken is for nested comments (which require a different URL parameter), i.e. "View N replies" or "Show more replies".
 		initial indicates that the response's sort menu should also be searched for continuations, so that the other sort orders get fetched as well.

 		Returns None. Failures are logged through self.logger and end this branch of the recursion without raising.

 		NOTE(review): every sub-continuation is awaited from inside the call that found it, so the call depth grows with the length of the continuation chain.
 		'''
 		# action_get_comments is used for regular pages, action_get_comment_replies for nested ones.
 		response, _ = await self.fetch(
 			f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}',
 			method = 'POST',
 			data = {'session_token': sessionToken},
 			headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
 			responseHandler = responseHandler,
 		  )
 		# Give up on this branch if there is no response, the status is not 200, or response.read(16) returns exactly b'{"reload":"now"}'.
 		# NOTE(review): the message below says "initial", but this check runs on every call, including the recursive ones.
 		if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
 			self.logger.error('Could not fetch initial comments')
 			return
 		content = await response.read()
 		# Yes, the response is JSON and I could parse that into an object, but where's the fun in that?

 		# Follow each "continuations" object in the body; one that contains a "label" key is treated as a nested (replies) continuation.
 		for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
 			continuations = self.get_json_obj_from_pos(content, continuationsPos)
 			subContinuationToken, subItct = self.get_continuation_parameters(continuations)
 			await self.continue_recursively(videoPageUrl, sessionToken, subContinuationToken, subItct, nested = b'"label":' in continuations)

 		# Only for the initial call: follow the continuations listed inside the "sortMenu" object as non-nested, non-initial fetches.
 		if initial:
 			sortMenuPos = content.find(b'"sortMenu":')
 			if sortMenuPos < 0:
 				self.logger.error('Could not find sort menu')
 				return
 			sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
 			for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
 				continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
 				subContinuationToken, subItct = self.get_continuation_parameters(continuation)
 				await self.continue_recursively(videoPageUrl, sessionToken, subContinuationToken, subItct)

 	async def process(self):
 		videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
 		response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
@@ -114,7 +81,40 @@ class Comments(qwarc.Item):
 			return
 		itct = itct.decode('ascii')

 		await self.continue_recursively(videoPageUrl, sessionToken, continuationToken, itct, initial = True)

 		queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that it's a comment's replies ("View N replies" or "Show more replies")
 		queue.append((continuationToken, itct, False))
 		first = True
 		while queue:
 			continuationToken, itct, nested = queue.popleft()
 			response, _ = await self.fetch(
 				f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}',
 				method = 'POST',
 				data = {'session_token': sessionToken},
 				headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
 				responseHandler = responseHandler,
 			  )
 			if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
 				self.logger.error('Error fetching comments, skipping')
 				continue
 			content = await response.read()
 			# Yes, the response is JSON and I could parse that into an object, but where's the fun in that?

 			for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
 				continuations = self.get_json_obj_from_pos(content, continuationsPos)
 				subContinuationToken, subItct = self.get_continuation_parameters(continuations)
 				queue.append((subContinuationToken, subItct, b'"label":' in continuations))

 			if first:
 				sortMenuPos = content.find(b'"sortMenu":')
 				if sortMenuPos < 0:
 					self.logger.error('Could not find sort menu')
 				else:
 					sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
 					for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
 						continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
 						subContinuationToken, subItct = self.get_continuation_parameters(continuation)
 						queue.append((subContinuationToken, subItct, False))

 			first = False

 # Evaluated when the module is loaded: os.environ[...] raises KeyError if YOUTUBE_VIDEOID is not set.
 # The ('videoId', <value>) pair is handed to qwarc as an extra spec dependency.
 # NOTE(review): presumably this records the video ID alongside the run's metadata; confirm against qwarc.
 specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))