Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 

322 linhas
13 KiB

  1. # This function adapted from https://github.com/cdown/srt/blob/11089f1e021f2e074d04c33fc7ffc4b7b52e7045/srt.py, lines 69 and 189 (MIT License)
  2. def timedelta_to_sbv_timestamp(timedelta_timestamp):
  3. r"""
  4. Convert a :py:class:`~datetime.timedelta` to an SRT timestamp.
  5. .. doctest::
  6. >>> import datetime
  7. >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
  8. >>> timedelta_to_sbv_timestamp(delta)
  9. '01:23:04,000'
  10. :param datetime.timedelta timedelta_timestamp: A datetime to convert to an
  11. SBV timestamp
  12. :returns: The timestamp in SBV format
  13. :rtype: str
  14. """
  15. SECONDS_IN_HOUR = 3600
  16. SECONDS_IN_MINUTE = 60
  17. HOURS_IN_DAY = 24
  18. MICROSECONDS_IN_MILLISECOND = 1000
  19. hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
  20. hrs += timedelta_timestamp.days * HOURS_IN_DAY
  21. mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
  22. msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
  23. return "%1d:%02d:%02d.%03d" % (hrs, mins, secs, msecs)
from datetime import timedelta
from json import dumps
from gc import collect
# import requests
from time import sleep
# https://docs.python.org/3/library/html.parser.html
from html.parser import HTMLParser

# Module-level state shared (via `global`) by all subprrun() calls:
# `backend` selects the HTTP transport ("requests" initially, switched to
# "http3" after repeated rate-limit/login failures) and `failcnt` counts
# those failures.
backend = "requests"
failcnt = 0

from switchable_request import get
  34. class MyHTMLParser(HTMLParser):
  35. def __init__(self):
  36. HTMLParser.__init__(self)
  37. self.captions = []
  38. self.title = ""
  39. self.description = ""
  40. self.inittitle = ""
  41. self.initdescription = ""
  42. def check_attr(self, attrs, attr, value):
  43. for item in attrs:
  44. if item[0] == attr and item[1] == value:
  45. return True
  46. return False
  47. def get_attr(self, attrs, attr):
  48. for item in attrs:
  49. if item[0] == attr:
  50. return item[1]
  51. return False
  52. def handle_starttag(self, tag, attrs):
  53. if tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-start-time"):
  54. self.captions.append({"startTime": int(self.get_attr(attrs, "data-start-ms")), "text": ""})
  55. elif tag == "input" and self.check_attr(attrs, "class", "yt-uix-form-input-text event-time-field event-end-time"):
  56. self.captions[len(self.captions)-1]["endTime"] = int(self.get_attr(attrs, "data-end-ms"))
  57. elif tag == "input" and self.check_attr(attrs, "id", "metadata-title"):
  58. self.title = self.get_attr(attrs, "value")
  59. elif tag == "textarea" and self.check_attr(attrs, "id", "metadata-description"):
  60. self.initdescription = self.get_attr(attrs, "data-original-description")
  61. def handle_data(self, data):
  62. if self.get_starttag_text() and self.get_starttag_text().startswith("<textarea "):
  63. if 'name="serve_text"' in self.get_starttag_text():
  64. self.captions[len(self.captions)-1]["text"] += data
  65. elif 'id="metadata-description"' in self.get_starttag_text():
  66. self.description += data
  67. elif self.get_starttag_text() and self.get_starttag_text().startswith('<div id="original-video-title"'):
  68. self.inittitle += data
  69. def subprrun(mysession, langcode, vid, mode, needforcemetadata, needforcecaptions, allheaders):
  70. global backend
  71. global failcnt
  72. if mode == "forceedit-metadata":
  73. while needforcemetadata[langcode] == None: #extra logic
  74. print("Awaiting forcemetadata")
  75. sleep(1)
  76. if needforcemetadata[langcode] == False:
  77. #print("forcemetadata not needed")
  78. return True #nothing needs to be done, otherwise, continue
  79. if mode == "forceedit-captions":
  80. while needforcecaptions[langcode] == None: #extra logic
  81. print("Awaiting forcecaptions")
  82. sleep(1)
  83. if needforcecaptions[langcode] == False:
  84. #print("forcecaptions not needed")
  85. return True #nothing needs to be done, otherwise, continue
  86. collect() #cleanup memory
  87. vid = vid.strip()
  88. print(langcode, vid)
  89. while True:
  90. try:
  91. if mode == "default":
  92. pparams = (
  93. ("v", vid),
  94. ("lang", langcode),
  95. ("action_mde_edit_form", 1),
  96. ("bl", "vmp"),
  97. ("ui", "hd"),
  98. ("tab", "captions"),
  99. ("o", "U")
  100. )
  101. page = get("https://www.youtube.com/timedtext_editor", params=pparams, mysession=mysession, backend=backend, http3headers=allheaders)
  102. elif mode == "forceedit-metadata":
  103. pparams = (
  104. ("v", vid),
  105. ("lang", langcode),
  106. ("action_mde_edit_form", 1),
  107. ('forceedit', 'metadata'),
  108. ('tab', 'metadata')
  109. )
  110. page = get("https://www.youtube.com/timedtext_editor", params=pparams, mysession=mysession, backend=backend, http3headers=allheaders)
  111. elif mode == "forceedit-captions":
  112. pparams = (
  113. ("v", vid),
  114. ("lang", langcode),
  115. ("action_mde_edit_form", 1),
  116. ("bl", "vmp"),
  117. ("ui", "hd"),
  118. ('forceedit', 'captions'),
  119. ("tab", "captions"),
  120. ("o", "U")
  121. )
  122. page = get("https://www.youtube.com/timedtext_editor", params=pparams, mysession=mysession, backend=backend, http3headers=allheaders)
  123. if not "accounts.google.com" in page.url and page.status_code != 429 and 'Subtitles/CC' in page.text and 'Title &amp; description' in page.text:
  124. break
  125. else:
  126. if backend == "requests" and failcnt > 30:
  127. backend = "http3"
  128. print("Rate limit or login failure, switching export to HTTP3/QUIC...")
  129. elif backend == "http3" and failcnt < 30:
  130. failcnt += 1
  131. print("Rate limit or login failure, waiting 30 seconds... ", 30-failcnt, "attempts left until switching export to HTTP3/QUIC.")
  132. sleep(30)
  133. else:
  134. print("[Retrying in 30 seconds for rate limit or login failure] Please supply authentication cookie information in config.json or environment variables. See README.md for more information.")
  135. sleep(30)
  136. except:
  137. print("Error in request, retrying in 5 seconds...")
  138. sleep(5)
  139. inttext = page.text
  140. try:
  141. initlang = page.text.split("'metadataLanguage': \"", 1)[1].split('"', 1)[0]
  142. except:
  143. initlang = ""
  144. del page
  145. filestring = "_community_draft"
  146. if '<li id="captions-editor-nav-captions" role="tab" data-state="published" class="published">' in inttext:
  147. filestring = "_community_published"
  148. if mode == "forceedit-captions":
  149. filestring = "_community_draft"
  150. if 'title="The video owner already provided subtitles/CC"' in inttext:
  151. filestring = "_uploader_provided"
  152. if not "forceedit" in mode:
  153. if '&amp;forceedit=metadata&amp;tab=metadata">See latest</a>' in inttext:
  154. print("Need forcemetadata")
  155. needforcemetadata[langcode] = True
  156. else:
  157. needforcemetadata[langcode] = False
  158. if '<li id="captions-editor-nav-captions" role="tab" data-state="published" class="published">' in inttext:
  159. print("Need forcecaptions")
  160. needforcecaptions[langcode] = True
  161. else:
  162. needforcecaptions[langcode] = False
  163. if 'id="reject-captions-button"' in inttext or 'id="reject-metadata-button"' in inttext or 'data-state="published"' in inttext or 'title="The video owner already provided subtitles/CC"' in inttext: #quick way of checking if this page is worth parsing
  164. parser = MyHTMLParser()
  165. parser.feed(inttext)
  166. captiontext = False
  167. for item in parser.captions:
  168. if item["text"][:-9]:
  169. captiontext = True
  170. if captiontext and (mode == "default" or mode == "forceedit-captions"):
  171. myfs = open("out/"+vid+"/"+vid+"_"+langcode+filestring+".sbv", "w", encoding="utf-8")
  172. captions = parser.captions
  173. captions.pop(0) #get rid of the fake one
  174. while captions:
  175. item = captions.pop(0)
  176. myfs.write(timedelta_to_sbv_timestamp(timedelta(milliseconds=item["startTime"])) + "," + timedelta_to_sbv_timestamp(timedelta(milliseconds=item["endTime"])) + "\n" + item["text"][:-9] + "\n")
  177. del item
  178. if captions:
  179. myfs.write("\n")
  180. del captions
  181. myfs.close()
  182. del myfs
  183. del captiontext
  184. if (parser.title or parser.description[:-16]) and (mode == "default" or mode == "forceedit-metadata"):
  185. metadata = {}
  186. metadata["title"] = parser.title
  187. if metadata["title"] == False:
  188. metadata["title"] = ""
  189. metadata["description"] = parser.description[:-16]
  190. filestring = "_community_draft"
  191. if '<li id="captions-editor-nav-metadata" role="tab" data-state="published" class="published">' in inttext:
  192. filestring = "_community_published"
  193. if mode == "forceedit-metadata":
  194. filestring = "_community_draft"
  195. open("out/"+vid+"/"+vid+"_"+langcode+filestring+".json", "w", encoding="utf-8").write(dumps(metadata))
  196. del metadata
  197. if (parser.inittitle[9:-17] or parser.initdescription) and (mode == "default" or mode == "forceedit-metadata" and initlang):
  198. metadata = {}
  199. metadata["title"] = parser.inittitle[9:-17]
  200. if metadata["title"] == False:
  201. metadata["title"] = ""
  202. metadata["description"] = parser.initdescription
  203. filestring = "_uploader_provided"
  204. open("out/"+vid+"/"+vid+"_"+initlang+filestring+".json", "w", encoding="utf-8").write(dumps(metadata))
  205. del metadata
  206. del inttext
  207. del langcode
  208. del vid
  209. del pparams
  210. return True
  211. # if __name__ == "__main__":
  212. # from os import environ, mkdir
  213. # from os.path import isfile
  214. # from json import loads
  215. # #HSID, SSID, SID cookies required
  216. # if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
  217. # cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
  218. # elif isfile("config.json"):
  219. # cookies = loads(open("config.json").read())
  220. # else:
  221. # print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  222. # assert False
  223. # if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
  224. # print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
  225. # assert False
  226. # mysession = requests.session()
  227. # mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  228. # del cookies
  229. # from sys import argv
  230. # from queue import Queue
  231. # from threading import Thread
  232. # langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  233. # 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  234. # 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  235. # 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  236. # 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  237. # 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  238. # 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  239. # 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  240. # 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  241. # 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  242. # 'xh', 'yi', 'yo', 'zu']
  243. # vidl = argv
  244. # vidl.pop(0)
  245. # try:
  246. # mkdir("out")
  247. # except:
  248. # pass
  249. # jobs = Queue()
  250. # for video in vidl:
  251. # try:
  252. # mkdir("out/"+video.strip())
  253. # except:
  254. # pass
  255. # for lang in langs:
  256. # jobs.put((lang, video, "default"))
  257. # subthreads = []
  258. # for r in range(50):
  259. # subrunthread = Thread(target=subprrun, args=(jobs,mysession))
  260. # subrunthread.start()
  261. # subthreads.append(subrunthread)
  262. # del subrunthread
  263. # for xa in subthreads:
  264. # xa.join() #bug (occurred once: the script ended before the last thread finished)
  265. # subthreads.remove(xa)
  266. # del xa