Browse Source

Refactor pagination parsing to use the JSON objects instead of stupidly extracting all continuation tokens, and only retrieve the 'newest first' sort order

master
JustAnotherArchivist 3 years ago
parent
commit
2ddcec9fbb
1 changed files with 34 additions and 45 deletions
  1. +34
    -45
      comments.py

+ 34
- 45
comments.py View File

@@ -16,34 +16,6 @@ class Comments(qwarc.Item):
# Yields a single item value: the video ID read from the YOUTUBE_VIDEOID environment variable.
# Indexing os.environ directly means a missing variable raises KeyError.
def generate(cls):
yield os.environ['YOUTUBE_VIDEOID']

def get_json_obj_from_pos(self, content, startPos):
    """Return the slice of content from startPos up to the bracket that balances the first opener.

    Scans forward from startPos, treating b'{' and b'[' as openers and b'}' and
    b']' as closers (the two bracket kinds are not distinguished from each other).
    Scanning stops at the closer that brings the nesting depth back to zero.

    The returned slice is content[startPos:pos], where pos is the index of that
    final closer, so the closing bracket itself is NOT included. Anything between
    startPos and the first opener (e.g. a leading '"key":') is included.

    Brackets inside JSON string literals are counted like any other bracket; this
    is a byte scanner, not a JSON parser.

    Parameters:
        content: bytes to scan.
        startPos: index in content at which to start scanning.

    Raises:
        ValueError: if a closer appears before any opener, or if content ends
            before the brackets balance (including when no bracket is found).
    """
    depth = None  # None until the first opening bracket has been seen
    # Bounded range instead of an unbounded counter: running off the end of content
    # must terminate rather than loop forever on empty slices.
    for pos in range(startPos, len(content)):
        char = content[pos:pos + 1]  # Slice rather than index so bytes yields bytes, not an int
        if char in (b'{', b'['):
            depth = 1 if depth is None else depth + 1
        elif char in (b'}', b']'):
            if depth is None:
                raise ValueError(f'Closing bracket at position {pos} precedes any opening bracket')
            depth -= 1
            if depth == 0:
                return content[startPos:pos]
    raise ValueError(f'No balanced bracket pair found in content starting at position {startPos}')

def get_continuation_parameters(self, content):
    """Pull the continuation token and click-tracking parameter (itct) out of content.

    Both values are located with qwarc.utils.str_get_between, using the
    b'"continuation":"' and b'"clickTrackingParams":"' markers respectively and
    a closing double quote as the end delimiter.

    A value is accepted only if, after stripping leading characters from the
    set [0-9a-zA-Z_-], what remains is empty or one of '=', '==', '%3D', '%3D%3D'.

    Returns:
        A (continuationToken, itct) tuple of str on success. If either value
        fails the check, an error is logged and None is returned, so callers
        that unpack the result must handle None.
    """
    tokenChars = b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-'
    acceptedTails = (b'', b'=', b'==', b'%3D', b'%3D%3D')

    def hasExpectedShape(value):
        # Whatever is left after the leading token characters must be a known padding suffix
        return value.lstrip(tokenChars) in acceptedTails

    continuationToken = qwarc.utils.str_get_between(content, b'"continuation":"', b'"')
    if not hasExpectedShape(continuationToken):
        self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
        return

    itct = qwarc.utils.str_get_between(content, b'"clickTrackingParams":"', b'"')
    if not hasExpectedShape(itct):
        self.logger.error(f'Unexpected itct value: {itct!r}')
        return

    # Both values passed the check above, so they consist solely of ASCII characters
    return continuationToken.decode('ascii'), itct.decode('ascii')

async def process(self):
videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
@@ -81,7 +53,7 @@ class Comments(qwarc.Item):
return
itct = itct.decode('ascii')

queue = collections.deque() # of (continuationToken, itct, nested, initial) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
queue = collections.deque() # of (continuationToken, itct, nested) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
queue.append((continuationToken, itct, False))
first = True
while queue:
@@ -96,25 +68,42 @@ class Comments(qwarc.Item):
if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
self.logger.error('Error fetching comments, skipping')
continue
content = await response.read()
# Yes, the response is JSON and I could parse that into an object, but where's the fun in that?

for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
continuations = self.get_json_obj_from_pos(content, continuationsPos)
subContinuationToken, subItct = self.get_continuation_parameters(continuations)
queue.append((subContinuationToken, subItct, b'"label":' in continuations))
obj = await response.json()

if first:
sortMenuPos = content.find(b'"sortMenu":')
if sortMenuPos < 0:
self.logger.error('Could not find sort menu')
sortMenu = obj['response']['continuationContents']['itemSectionContinuation']['header']['commentsHeaderRenderer']['sortMenu']
for subMenuItem in sortMenu['sortFilterSubMenuRenderer']['subMenuItems']:
if subMenuItem['title'] != 'Newest first':
continue
subContinuation = subMenuItem['continuation']['reloadContinuationData']
queue.append((subContinuation['continuation'], subContinuation['clickTrackingParams'], False))
break
else:
self.logger.error('Could not find newest first sort continuation')
first = False
else:
if not nested:
o = obj
continuationKey = 'itemSectionContinuation'
else:
sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
subContinuationToken, subItct = self.get_continuation_parameters(continuation)
queue.append((subContinuationToken, subItct, False))
# Of course the data format is different here...
for o in obj:
if 'response' in o:
break
continuationKey = 'commentRepliesContinuation'
if 'continuationContents' not in o['response']:
# Empty response
continue
for reply in o['response']['continuationContents'][continuationKey]['contents']:
if 'commentThreadRenderer' in reply and 'replies' in reply['commentThreadRenderer']:
# Nested continuations
continuations = reply['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']
assert len(continuations) == 1
queue.append((continuations[0]['nextContinuationData']['continuation'], continuations[0]['nextContinuationData']['clickTrackingParams'], True))
if 'continuations' in o['response']['continuationContents'][continuationKey]:
assert len(o['response']['continuationContents'][continuationKey]['continuations']) == 1
continuation = o['response']['continuationContents'][continuationKey]['continuations'][0]['nextContinuationData']
queue.append((continuation['continuation'], continuation['clickTrackingParams'], nested))

first = False

# Builds a qwarc.utils.SpecDependencies whose extra entries carry the ('videoId', <YOUTUBE_VIDEOID env value>) pair.
# NOTE(review): presumably this records the video ID alongside the run's metadata — confirm against qwarc.
specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))

Loading…
Cancel
Save