A method to grab the comments from YouTube videos
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

121 lignes
5.4 KiB

  1. import collections
  2. import itertools
  3. import os
  4. import qwarc
  5. import qwarc.utils
# Response handler passed as the `responseHandler` argument to the fetch calls in Comments.process.
# NOTE(review): the argument 5 is presumably the maximum number of retries on errors — confirm against qwarc.utils.
responseHandler = qwarc.utils.handle_response_limit_error_retries(5)
  7. class Comments(qwarc.Item):
  8. itemType = 'comments'
  9. # itemValue = '{videoId}'
  10. @classmethod
  11. def generate(cls):
  12. yield os.environ['YOUTUBE_VIDEOID']
  13. def get_json_obj_from_pos(self, content, startPos):
  14. # Given a startPos in content, extracts the content until braces or brackets are matching (requiring at least one set of parentheses)
  15. openParens = None
  16. for pos in itertools.count(start = startPos):
  17. char = content[pos:pos+1]
  18. if char in (b'{', b'['):
  19. if openParens is None: # First {[ in the string
  20. openParens = 0
  21. openParens += 1
  22. elif char in (b'}', b']'):
  23. openParens -= 1
  24. if openParens == 0:
  25. break
  26. return content[startPos:pos]
  27. def get_continuation_parameters(self, content):
  28. continuationToken = qwarc.utils.str_get_between(content, b'"continuation":"', b'"')
  29. if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  30. self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
  31. return
  32. continuationToken = continuationToken.decode('ascii')
  33. itct = qwarc.utils.str_get_between(content, b'"clickTrackingParams":"', b'"')
  34. if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  35. self.logger.error(f'Unexpected itct value: {itct!r}')
  36. return
  37. itct = itct.decode('ascii')
  38. return continuationToken, itct
  39. async def process(self):
  40. videoPageUrl = f'https://www.youtube.com/watch?v={self.itemValue}'
  41. response, _ = await self.fetch(videoPageUrl, responseHandler = responseHandler)
  42. if not response or response.status != 200:
  43. self.logger.error('Could not fetch video page')
  44. return
  45. content = await response.read()
  46. sessionToken = qwarc.utils.str_get_between(content, b'"XSRF_TOKEN":"', b'"')
  47. if not sessionToken:
  48. self.logger.error('Could not find session token')
  49. return
  50. if sessionToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  51. self.logger.error(f'Unexpected session token value: {sessionToken!r}')
  52. return
  53. sessionToken = sessionToken.decode('ascii')
  54. sectionIdentifierPos = content.find(b'"comment-item-section"')
  55. if sectionIdentifierPos < 0:
  56. self.logger.error('Could not find comment section identifier')
  57. return
  58. continuationStartPos = content.rfind(b'"continuation":', 0, sectionIdentifierPos)
  59. if continuationStartPos < 0:
  60. self.logger.error('Could not find continuation start position')
  61. return
  62. section = content[continuationStartPos:sectionIdentifierPos]
  63. continuationToken = qwarc.utils.str_get_between(section, b'"continuation":"', b'"')
  64. if continuationToken.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  65. self.logger.error(f'Unexpected continuation token value: {continuationToken!r}')
  66. return
  67. continuationToken = continuationToken.decode('ascii')
  68. itct = qwarc.utils.str_get_between(section, b'"clickTrackingParams":"', b'"')
  69. if itct.lstrip(b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-') not in (b'', b'=', b'==', b'%3D', b'%3D%3D'):
  70. self.logger.error(f'Unexpected itct value: {itct!r}')
  71. return
  72. itct = itct.decode('ascii')
  73. queue = collections.deque() # of (continuationToken, itct, nested, initial) where nested indicates that it's a comment's replies ("View N replies" or "Show more comments")
  74. queue.append((continuationToken, itct, False))
  75. first = True
  76. while queue:
  77. continuationToken, itct, nested = queue.popleft()
  78. response, _ = await self.fetch(
  79. f'https://www.youtube.com/comment_service_ajax?action_get_{"comments" if not nested else "comment_replies"}=1&pbj=1&ctoken={continuationToken}&continuation={continuationToken}&itct={itct}',
  80. method = 'POST',
  81. data = {'session_token': sessionToken},
  82. headers = [('X-YouTube-Client-Name', '1'), ('X-YouTube-Client-Version', '2.20191212.06.02'), ('X-SPF-Referer', videoPageUrl), ('X-SPF-Previous', videoPageUrl)],
  83. responseHandler = responseHandler,
  84. )
  85. if not response or response.status != 200 or (await response.read(16) == b'{"reload":"now"}'):
  86. self.logger.error('Error fetching comments, skipping')
  87. continue
  88. content = await response.read()
  89. # Yes, the response is JSON and I could parse that into an object, but where's the fun in that?
  90. for continuationsPos in qwarc.utils.find_all(content, b'"continuations":'):
  91. continuations = self.get_json_obj_from_pos(content, continuationsPos)
  92. subContinuationToken, subItct = self.get_continuation_parameters(continuations)
  93. queue.append((subContinuationToken, subItct, b'"label":' in continuations))
  94. if first:
  95. sortMenuPos = content.find(b'"sortMenu":')
  96. if sortMenuPos < 0:
  97. self.logger.error('Could not find sort menu')
  98. else:
  99. sortMenu = self.get_json_obj_from_pos(content, sortMenuPos)
  100. for continuationPos in qwarc.utils.find_all(sortMenu, b'"continuation":{'):
  101. continuation = self.get_json_obj_from_pos(sortMenu, continuationPos)
  102. subContinuationToken, subItct = self.get_continuation_parameters(continuation)
  103. queue.append((subContinuationToken, subItct, False))
  104. first = False
# Registers the video ID (read from the YOUTUBE_VIDEOID environment variable) as an extra spec dependency named 'videoId'.
# NOTE(review): presumably qwarc uses this to distinguish runs for different videos — confirm against qwarc.utils.SpecDependencies.
specDependencies = qwarc.utils.SpecDependencies(extra = (('videoId', os.environ['YOUTUBE_VIDEOID']),))