A method to grab the comments from YouTube videos
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

121 lines
5.4 KiB

  1. import collections
  2. import itertools
  3. import os
  4. import qwarc
  5. import qwarc.utils
# Response handler shared by the fetch calls in Comments.process.
# NOTE(review): presumably makes qwarc stop retrying a URL after 5 failed attempts — confirm against qwarc.utils.
responseHandler = qwarc.utils.handle_response_limit_error_retries(5)
  7. class Comments(qwarc.Item):
  8. itemType = 'comments'
  9. # itemValue = '{videoId}'
  10. @classmethod
  11. def generate(cls):
  12. yield os.environ['YOUTUBE_VIDEOID']
  13. def get_json_obj_from_pos(self, content, startPos):
  14. # Given a startPos in content, extracts the content until braces or brackets are matching (requiring at least one set of parentheses)
  15. openParens = None
  16. for pos in itertools.count(start = startPos):
  17. char = content[pos:pos+1]
  18. if char in (b'{', b'['):
  19. if openParens is None: # First {[ in the string
  20. openParens = 0
  21. openParens += 1
  22. elif char in (b'}', b']'):
  23. openParens -= 1
  24. if openParens == 0:
  25. break
  26. return content[startPos:pos]
  27. def get_continuation_parameters(self, content):
  28. continuationToken = qwarc.utils.str_get_between(content, b'"continuation":"', b'"')
  29. if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  30. self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
  31. return
  32. continuationToken = continuationToken.decode('ascii')
  33. itct = qwarc.utils.str_get_between(content, b'"clickTrackingParams":"', b'"')
  34. if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  35. self.logger.error(f'Unexpected itct value: {itct!r}')
  36. return
  37. itct = itct.decode('ascii')
  38. return continuationToken, itct
  39. async def process(self):
  40. videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
  41. response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
  42. if not response or response.status != 200:
  43. self.logger.error('Could not fetch video page')
  44. return
  45. content = await response.read()
  46. sessionToken = qwarc.utils.str_get_between(content, b'"XSRF_TOKEN":"', b'"')
  47. if not sessionToken:
  48. self.logger.error('Could not find session token')
  49. return
  50. if sessionToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  51. self.logger.error(f'Unexpected session token value: {sessionToken!r}')
  52. return
  53. sessionToken = sessionToken.decode('ascii')
  54. sectionIdentifierPos = content.find(b'"comment-item-section"')
  55. if sectionIdentifierPos < 0:
  56. self.logger.error('Could not find comment section identifier')
  57. return
  58. continuationStartPos = content.rfind(b'"continuation":', 0, sectionIdentifierPos)
  59. if continuationStartPos < 0:
  60. self.logger.error('Could not find continuation start position')
  61. return
  62. section = content[continuationStartPos:sectionIdentifierPos]
  63. continuationToken = qwarc.utils.str_get_between(section, b'"continuation":"', b'"')
  64. if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  65. self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
  66. return
  67. continuationToken = continuationToken.decode('ascii')
  68. itct = qwarc.utils.str_get_between(section, b'"clickTrackingParams":"', b'"')
  69. if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  70. self.logger.error(f'Unexpected itct value: {itct!r}')
  71. return
  72. itct = itct.decode('ascii')
  73. queue = collections.deque() # of (continuationToken, itct, nested, initial) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
  74. queue.append((continuationToken, itct, False))
  75. first = True
  76. while queue:
  77. continuationToken, itct, nested = queue.popleft()
  78. response, _ = await self.fetch(
  79. f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}',
  80. method = 'POST',
  81. data = {'session_token': sessionToken},
  82. headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
  83. responseHandler = responseHandler,
  84. )
  85. if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
  86. self.logger.error('Error fetching comments, skipping')
  87. continue
  88. content = await response.read()
  89. # Yes, the response is JSON and I could parse that into an object, but where's the fun in that?
  90. for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
  91. continuations = self.get_json_obj_from_pos(content, continuationsPos)
  92. subContinuationToken, subItct = self.get_continuation_parameters(continuations)
  93. queue.append((subContinuationToken, subItct, b'"label":' in continuations))
  94. if first:
  95. sortMenuPos = content.find(b'"sortMenu":')
  96. if sortMenuPos < 0:
  97. self.logger.error('Could not find sort menu')
  98. else:
  99. sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
  100. for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
  101. continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
  102. subContinuationToken, subItct = self.get_continuation_parameters(continuation)
  103. queue.append((subContinuationToken, subItct, False))
  104. first = False
# Spec dependency declaration with the target video ID added as an extra ('videoId', value) entry,
# read from the same YOUTUBE_VIDEOID environment variable that Comments.generate uses.
# NOTE(review): presumably qwarc records these so a run can be tied to its video — confirm against qwarc.utils.SpecDependencies.
specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))