Refactor pagination parsing to use the JSON objects instead of stupidly extracting all continuation tokens, and only retrieve the 'newest first' sort order

3 years ago · 2ddcec9fbb
--- a/comments.py
+++ b/comments.py
@@ -16,34 +16,6 @@ class Comments(qwarc.Item):
 	def generate(cls):
 		"""Yield the single item value for this spec: the video ID taken from the YOUTUBE_VIDEOID environment variable."""
 		videoId = os.environ['YOUTUBE_VIDEOID']
 		yield videoId

 	def get_json_obj_from_pos(self, content, startPos):
 		"""Extract the bracketed JSON value that follows startPos in content.

 		Scans forward from startPos, skipping anything before the first '{' or '[',
 		and stops at the bracket that brings the nesting depth back to zero.
 		Braces and brackets are counted together; string contents are not parsed,
 		so brackets inside JSON strings are counted as well.

 		content is a bytes object; startPos is the index to start scanning at.

 		Returns content[startPos:pos] where pos is the index of the final closing
 		bracket, i.e. the closing bracket itself is NOT part of the returned slice.

 		Raises ValueError if a closing bracket appears before any opening one, or
 		if content ends before the brackets are balanced (previously the former
 		crashed with a TypeError and the latter looped forever).
 		"""
 		depth = None # None until the first opening brace/bracket has been seen
 		# Bound the scan by the content length; slicing past the end yields b'' forever.
 		for pos in range(startPos, len(content)):
 			char = content[pos:pos+1]
 			if char in (b'{', b'['):
 				depth = 1 if depth is None else depth + 1
 			elif char in (b'}', b']'):
 				if depth is None:
 					raise ValueError(f'Closing bracket at position {pos} before any opening bracket')
 				depth -= 1
 				if depth == 0:
 					return content[startPos:pos]
 		raise ValueError(f'No balanced JSON object found from position {startPos}')

 	def get_continuation_parameters(self, content):
 		"""Pull the continuation token and the click tracking parameter (itct) out of content.

 		Each value is taken as the text between its JSON key prefix and the next
 		double quote. A value is accepted only if, after stripping the allowed
 		characters from its left side, nothing but an optional '=' padding suffix
 		(literal or percent-encoded) is left over.

 		Returns a (continuationToken, itct) tuple of str, or None after logging an
 		error if either value looks unexpected.
 		"""
 		allowedChars = b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-'
 		allowedRemainders = (b'', b'=', b'==', b'%3D', b'%3D%3D')
 		fields = (
 			(b'"continuation":"', 'continuation token'),
 			(b'"clickTrackingParams":"', 'itct'),
 		)
 		decoded = []
 		for prefix, description in fields:
 			raw = qwarc.utils.str_get_between(content, prefix, b'"')
 			remainder = raw.lstrip(allowedChars)
 			if remainder not in allowedRemainders:
 				self.logger.error(f'Unexpected {description} value: {raw!r}')
 				return
 			decoded.append(raw.decode('ascii'))
 		return tuple(decoded)

 	async def process(self):
 		videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
 		response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
@@ -81,7 +53,7 @@ class Comments(qwarc.Item):
 			return
 		itct = itct.decode('ascii')

 		queue = collections.deque() # of (continuationToken, itct, nested, initial) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
 		queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
 		queue.append((continuationToken, itct, False))
 		first = True
 		while queue:
@@ -96,25 +68,42 @@ class Comments(qwarc.Item):
 			if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
 				self.logger.error('Error fetching comments, skipping')
 				continue
 			content = await response.read()
 			# Yes, the response is JSON and I could parse that into an object, but where's the fun in that?

 			for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
 				continuations = self.get_json_obj_from_pos(content, continuationsPos)
 				subContinuationToken, subItct = self.get_continuation_parameters(continuations)
 				queue.append((subContinuationToken, subItct, b'"label":' in continuations))
 			obj = await response.json()

 			if first:
 				sortMenuPos = content.find(b'"sortMenu":')
 				if sortMenuPos < 0:
 					self.logger.error('Could not find sort menu')
 				sortMenu = obj['response']['continuationContents']['itemSectionContinuation']['header']['commentsHeaderRenderer']['sortMenu']
 				for subMenuItem in sortMenu['sortFilterSubMenuRenderer']['subMenuItems']:
 					if subMenuItem['title'] != 'Newest first':
 						continue
 					subContinuation = subMenuItem['continuation']['reloadContinuationData']
 					queue.append((subContinuation['continuation'], subContinuation['clickTrackingParams'], False))
 					break
 				else:
 					self.logger.error('Could not find newest first sort continuation')
 				first = False
 			else:
 				if not nested:
 					o = obj
 					continuationKey = 'itemSectionContinuation'
 				else:
 					sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
 					for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
 						continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
 						subContinuationToken, subItct = self.get_continuation_parameters(continuation)
 						queue.append((subContinuationToken, subItct, False))
 					# Of course the data format is different here...
 					for o in obj:
 						if 'response' in o:
 							break
 					continuationKey = 'commentRepliesContinuation'
 				if 'continuationContents' not in o['response']:
 					# Empty response
 					continue
 				for reply in o['response']['continuationContents'][continuationKey]['contents']:
 					if 'commentThreadRenderer' in reply and 'replies' in reply['commentThreadRenderer']:
 						# Nested continuations
 						continuations = reply['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']
 						assert len(continuations) == 1
 						queue.append((continuations[0]['nextContinuationData']['continuation'], continuations[0]['nextContinuationData']['clickTrackingParams'], True))
 				if 'continuations' in o['response']['continuationContents'][continuationKey]:
 					assert len(o['response']['continuationContents'][continuationKey]['continuations']) == 1
 					continuation = o['response']['continuationContents'][continuationKey]['continuations'][0]['nextContinuationData']
 					queue.append((continuation['continuation'], continuation['clickTrackingParams'], nested))

 			first = False

 # Pass the video ID (read from the YOUTUBE_VIDEOID environment variable when this module is executed) as an extra 'videoId' entry to the spec dependencies.
 # NOTE(review): presumably qwarc records these extras alongside the crawl so the output can be tied to the video; confirm against qwarc.utils.SpecDependencies.
 specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))