Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

339 lines
21 KiB

  1. # Let's remind people who still have this running to shut it down
  2. from os.path import isfile
  3. from json import loads
  4. from os import environ
  5. import requests
  6. from sys import exit
  # Shutdown notice: the project is finished, so this script only reports who
  # tried to start a worker and then exits before any of the code below runs.

  # Resolve the name reported to the admins: environment variable first, then
  # config.json, otherwise the placeholder "Unnamed".
  if "TRACKER_USERNAME" in environ:
      TRACKER_USERNAME = environ["TRACKER_USERNAME"]
  elif isfile("config.json"):
      try:
          # `with` closes the handle; the original left it open.
          with open("config.json") as configfile:
              TRACKER_USERNAME = loads(configfile.read())["TRACKER_USERNAME"]
      except (OSError, ValueError, KeyError, TypeError):
          # Unreadable file, invalid JSON, missing key, or JSON that is not an object.
          TRACKER_USERNAME = "Unnamed"
  else:
      TRACKER_USERNAME = "Unnamed"

  print("=============================")
  print("This project is now complete, and we are working on sorting and finalizing the data. Thank you to everyone who contributed!")
  print("=============================")
  print()
  print("Just a heads up, we will send your TRACKER_USERNAME to the script admins just so we can remind you to shut down your worker if you've forgotten.")

  # NOTE(review): this webhook URL embeds a credential in source control; consider
  # rotating it and loading it from configuration instead.
  try:
      # requests applies no timeout by default, so bound the call explicitly.
      requests.post(
          "https://discord.com/api/webhooks/771212810877141032/dj9WCWZ2oE5t_vzdyc_OEdTaGbAP92bJFe8CEfYXlRXKJfPewOHWYAgBrLwx596k0CJC",
          json={"content": str(TRACKER_USERNAME)+" just tried to start a worker."},
          timeout=30,
      )
  except requests.RequestException as e:
      # The notification is best-effort: a network failure must not turn the
      # clean shutdown below into a traceback.
      print("Could not notify the script admins:", e)
  exit(0)
  23. from threading import Thread
  24. import requests
  25. from time import sleep
  26. from os import mkdir, rmdir, listdir, system, environ
  27. from os.path import isdir, isfile, getsize
  28. from json import loads
  29. from youtube_channel import process_channel
  30. import signal
  31. import tracker
  32. from youtube_dl import YoutubeDL
  33. from shutil import rmtree, which
  34. from queue import Queue
  35. from gc import collect
  36. from discovery import getmetadata
  37. from export import subprrun
  #useful Queue example: https://stackoverflow.com/a/54658363
  jobs = Queue()

  # Create the working directories used by the worker: "out" and "directory".
  for workdir in ("out", "directory"):
      try:
          mkdir(workdir)
      except FileExistsError:
          # Already present (e.g. from an earlier run); nothing to do.
          pass
      except OSError as e:
          # Still best-effort, but report real failures (permissions, read-only
          # file system) instead of hiding them behind a bare except.
          print("Could not create directory", workdir, e)

  # A Procfile one level up is treated as the marker for a Heroku deployment.
  HEROKU = isfile("../Procfile")
  # Language codes iterated when queueing per-language subtitle tasks.
  # Kept as whitespace-separated rows and split into a list of strings.
  langs = (
      "ab aa af sq ase am ar arc hy as ay az bn ba eu be bh bi bs br "
      "bg yue yue-HK ca chr zh-CN zh-HK zh-Hans zh-SG zh-TW zh-Hant cho co hr cs da nl "
      "nl-BE nl-NL dz en en-CA en-IN en-IE en-GB en-US eo et fo fj fil fi fr fr-BE "
      "fr-CA fr-FR fr-CH ff gl ka de de-AT de-DE de-CH el kl gn gu ht hak hak-TW ha "
      "iw hi hi-Latn ho hu is ig id ia ie iu ik ga it ja jv kn ks kk km rw "
      "tlh ko ku ky lo la lv ln lt lb mk mg ms ml mt mni mi mr mas nan "
      "nan-TW lus mo mn my na nv ne no oc or om ps fa fa-AF fa-IR pl pt pt-BR "
      "pt-PT pa qu ro rm rn ru ru-Latn sm sg sa sc gd sr sr-Cyrl sr-Latn sh sdp sn "
      "scn sd si sk sl so st es es-419 es-MX es-ES es-US su sw ss sv tl tg ta "
      "tt te th bo ti tpi to ts tn tr tk tw uk ur uz vi vo vor cy fy wo "
      "xh yi yo zu"
  ).split()
  def _require(condition, message):
      """Raise AssertionError(message) unless condition is truthy.

      Unlike a bare `assert` statement this check is not stripped by `python -O`,
      while the exception type seen on failure stays the same.
      """
      if not condition:
          raise AssertionError(message)


  _require(which("zip") and which("rsync") and which("curl"), "Please ensure the zip, rsync, and curl commands are installed on your system.")

  #HSID, SSID, SID cookies required
  _COOKIE_NAMES = ("HSID", "SSID", "SID")
  _COOKIE_HELP = "HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables."
  if all(name in environ for name in _COOKIE_NAMES):
      cookies = {name: environ[name] for name in _COOKIE_NAMES}
  elif isfile("config.json"):
      # `with` closes the handle; the original left it open.
      with open("config.json") as configfile:
          cookies = loads(configfile.read())
  else:
      cookies = {}
  # A missing key and an empty value are both reported with the help message
  # (a missing key in config.json previously surfaced as a bare KeyError).
  if not (isinstance(cookies, dict) and all(cookies.get(name) for name in _COOKIE_NAMES)):
      print(_COOKIE_HELP)
      raise AssertionError(_COOKIE_HELP)

  mysession = requests.session()
  mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
  validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
  # Being redirected to the Google sign-in host means the cookies were not accepted.
  _require("accounts.google.com" not in validationtest.url, "Please ensure you have correctly specified account cookies.")

  # Language picker markup expected when the account language is English.
  # The comparison is done on whitespace-normalised text so it does not depend
  # on the exact line breaks and indentation inside the served HTML.
  _LANGUAGE_PICKER_HTML = (
      '<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon"'
      ' type="button" onclick=";return false;" id="yt-picker-language-button"'
      ' data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display"'
      ' data-picker-key="language" data-picker-position="footer" data-button-toggle="true">'
      '<span class="yt-uix-button-icon-wrapper">'
      '<span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span>'
      '<span class="yt-uix-button-content"> <span class="yt-picker-button-label">'
      ' Language: </span> English </span>'
      '<span class="yt-uix-button-arrow yt-sprite"></span></button>'
  )
  _require(
      " ".join(_LANGUAGE_PICKER_HTML.split()) in " ".join(validationtest.text.split()),
      "Please make sure your YouTube and Google account language is set to English (United States)",
  )
  del validationtest

  # Write the cookies for youtube-dl (it is given "cookiefile": "cookies.txt").
  # Fields are joined with explicit tabs: the Netscape cookie file format is
  # tab-separated with seven fields per line.
  # NOTE(review): the expiry timestamp is a fixed value carried over unchanged.
  _cookiefile_lines = ["# HTTP Cookie File"]
  for _name, _secure in (("SID", "FALSE"), ("HSID", "FALSE"), ("SSID", "TRUE")):
      _cookiefile_lines.append("\t".join((".youtube.com", "TRUE", "/", _secure, "1663793455", _name, cookies[_name])))
  with open("cookies.txt", "w") as cookiefile:
      cookiefile.write("\n".join(_cookiefile_lines))
  del cookies
  validationtimes = 0
  #Graceful Shutdown
  class GracefulKiller:
      """Holds a ``kill_now`` flag that is raised when SIGINT or SIGTERM arrives."""

      # Class-level default; the handler sets an instance attribute of the same name.
      kill_now = False

      def __init__(self):
          # Route both termination signals to the same handler.
          for handled in (signal.SIGINT, signal.SIGTERM):
              signal.signal(handled, self.exit_gracefully)

      def exit_gracefully(self, signum, frame):
          """Signal handler: announce the shutdown and raise the stop flag."""
          notice = "Graceful exit process initiated, no longer accepting new tasks but finishing existing ones..."
          print(notice)
          self.kill_now = True


  gkiller = GracefulKiller()
  #microtasks
  # Queue task name -> mode string handed to subprrun.
  _SUBTITLE_MODES = {
      "subtitles": "default",
      "subtitles-forceedit-captions": "forceedit-captions",
      "subtitles-forceedit-metadata": "forceedit-metadata",
  }


  def _run_command(argv):
      """Run an external command given as an argument list and return its exit status.

      An argument list (no shell) is used so that item ids and upload targets
      received from the tracker are never interpreted by a shell.
      A command that cannot be started is reported and counted as a failure.
      """
      import subprocess  # local import: the file-level import block is left untouched

      try:
          return subprocess.run(argv).returncode
      except OSError as e:
          print(e)
          return 1


  def _request_upload_target():
      """Ask the tracker for an upload target, retrying every 5 minutes until one is returned."""
      while True:
          targetloc = tracker.request_upload_target()
          if targetloc:
              return targetloc
          print("Waiting 5 minutes...")
          sleep(300)


  def _upload_archive(vid):
      """Send directory/<vid>/<vid>.zip to an upload target, retrying until a transfer succeeds."""
      targetloc = _request_upload_target()
      while True:
          if targetloc.startswith("rsync"):
              exitinfo = _run_command([
                  "rsync", "-rltv", "--timeout=300", "--contimeout=300", "--progress",
                  "--bwlimit", "0", "--recursive", "--partial", "--partial-dir", ".rsync-tmp",
                  "--min-size", "1", "--no-compress", "--compress-level", "0",
                  "directory/"+vid+"/", targetloc,
              ])
          elif targetloc.startswith("http"):
              exitinfo = _run_command(["curl", "-F", vid+".zip=@directory/"+vid+"/"+vid+".zip", targetloc])
          else:
              # Neither rsync nor http: previously `exitinfo` was left unbound here.
              # Wait, then ask the tracker for a different target.
              print("Unsupported upload target, requesting a new one:", targetloc)
              sleep(30)
              targetloc = _request_upload_target()
              continue
          if exitinfo == 0:
              return
          print("Error in sending data to target, waiting 30 seconds and trying again.")
          sleep(30)


  def _archive_video(vid):
      """Zip and upload out/<vid> if it holds any files; return the zip size in bytes (0 if nothing was uploaded)."""
      outdir = "out/"+vid
      if not isdir(outdir):
          return 0
      if not listdir(outdir):
          # Nothing was saved for this video: drop the empty directory.
          rmdir(outdir)
          return 0

      stagedir = "directory/"+vid
      zippath = stagedir+"/"+vid+".zip"
      if not isdir(stagedir):
          mkdir(stagedir)
      while not isfile(zippath):
          print("Attempting to zip item...")
          _run_command(["zip", "-9", "-r", "-j", zippath, outdir])

      _upload_archive(vid)
      size = getsize(zippath)

      #cleanup
      # The original cleanup began with `del langcnt[...]` on an undefined name and
      # called rmdir() after rmtree(); either error was swallowed and skipped the
      # remaining removals. Each tree is now removed independently.
      rmtree(stagedir, ignore_errors=True)
      rmtree(outdir, ignore_errors=True)
      return size


  def _discover_video(jobs, vid):
      """Fetch metadata for a video and queue its subtitle, completion and discovery tasks."""
      while True:
          try:
              info = getmetadata(mysession, str(vid).strip())
              break
          except Exception as e:
              print(e)
              print("Error in retrieving information, waiting 30 seconds and trying again")
              sleep(30)

      if info[0]:  # ccenabled
          if not isdir("out/"+str(vid).strip()):
              mkdir("out/"+str(vid).strip())
          # Same queueing order as before: all languages per mode, mode by mode.
          for mode in ("subtitles", "subtitles-forceedit-metadata", "subtitles-forceedit-captions"):
              for langcode in langs:
                  jobs.put((mode, vid, langcode))
      jobs.put(("complete", None, "video:"+vid))

      # info[1..4] are submitted to the tracker as videos, channels, mix playlists
      # and playlists respectively.
      discovered = (
          (tracker.ItemType.Video, info[1]),
          (tracker.ItemType.Channel, info[2]),
          (tracker.ItemType.MixPlaylist, info[3]),
          (tracker.ItemType.Playlist, info[4]),
      )
      for itemtype, found in discovered:
          for ident in found:
              jobs.put(("submitdiscovery", ident, itemtype))


  def _expand_channel(jobs, ydl, channelid):
      """Queue a channel's videos, playlists and related channels for submission."""
      url = "https://www.youtube.com/channel/"+channelid
      try:
          y = ydl.extract_info(url, download=False)
          for itemyv in y["entries"]:
              jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
          #channel created playlists
          y = process_channel(channelid)
          for itemyv in y["playlists"]:
              jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Playlist))
          for itemyv in y["channels"]:
              jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Channel))
          jobs.put(("complete", None, "channel:"+channelid))
      except Exception:
          print("YouTube-DL error, ignoring but not marking as complete...", url)


  def _expand_playlist(jobs, ydl, playlistid):
      """Queue every video of a playlist for submission."""
      url = "https://www.youtube.com/playlist?list="+playlistid
      try:
          y = ydl.extract_info(url, download=False)
          #TODO: extract owner channel in other projects
          #TODO: handle channels in other projects, not needed here because we will get it from the video
          for itemyvp in y["entries"]:
              jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video))
          jobs.put(("complete", None, "playlist:"+playlistid))
      except Exception:
          print("YouTube-DL error, ignoring but not marking as complete...", url)


  def _expand_mixplaylist(jobs, mixid):
      """Queue the videos listed in a mix playlist's watch page for submission."""
      url = "https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+mixid
      marker = 'window["ytInitialData"] = '
      try:
          wptext = mysession.get(url).text
          #channel handling not needed here because we will get it from the video
          for line in wptext.splitlines():
              if line.strip().startswith(marker):
                  # Drop the last character of the line before parsing the JSON payload.
                  initdata = loads(line.split(marker, 1)[1].strip()[:-1])
                  for itemyvp in initdata["contents"]["twoColumnWatchNextResults"]["playlist"]["playlist"]["contents"]:
                      jobs.put(("submitdiscovery", itemyvp["playlistPanelVideoRenderer"]["videoId"], tracker.ItemType.Video))
          jobs.put(("complete", None, "mixplaylist:"+mixid))
      except Exception:
          print("Mix Playlist error, ignoring but not marking as complete...", url)


  def threadrunner():
      """Worker loop: pull items from the tracker and run their microtasks.

      Each worker owns a private queue. While the queue holds tasks they are
      executed one at a time; when it is empty a new item is requested from the
      tracker, unless a graceful shutdown was requested, in which case the
      worker returns.

      NOTE(review): the nesting of the original body was reconstructed from a
      listing that had lost its indentation; confirm against the upstream file.
      """
      # Private per-thread queue; this local name shadows the module-level `jobs`.
      jobs = Queue()
      ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
      # Per-video state passed to subprrun; reset whenever a new video item arrives.
      needforcemetadata = None
      needforcecaptions = None

      while True:
          if jobs.empty():
              if gkiller.kill_now:
                  break
              # get a new task from tracker
              collect() #cleanup
              desit = tracker.request_item_from_tracker()
              print("New task:", desit)
              if not desit:
                  print("Ignoring item for now", desit)
                  continue
              itemtype, separator, itemid = desit.partition(":")
              if not separator:
                  # No "type:id" shape; the original would raise IndexError for a bare known type.
                  print("Ignoring item for now", desit)
              elif itemtype == "video":
                  # One None entry per language code, replacing two hand-written
                  # dict literals that duplicated `langs`.
                  needforcemetadata = dict.fromkeys(langs)
                  needforcecaptions = dict.fromkeys(langs)
                  jobs.put(("discovery", itemid, None))
              elif itemtype in ("channel", "playlist", "mixplaylist"):
                  jobs.put((itemtype, None, itemid))
              else:
                  print("Ignoring item for now", desit)
              continue

          task, vid, args = jobs.get()
          if task == "submitdiscovery":
              tracker.add_item_to_tracker(args, vid)
          elif task == "discovery":
              _discover_video(jobs, vid)
          elif task in _SUBTITLE_MODES:
              subprrun(mysession, args, vid, _SUBTITLE_MODES[task], needforcemetadata, needforcecaptions)
          elif task == "channel":
              _expand_channel(jobs, ydl, args)
          elif task == "playlist":
              _expand_playlist(jobs, ydl, args)
          elif task == "mixplaylist":
              _expand_mixplaylist(jobs, args)
          elif task == "complete":
              size = 0
              donetype, separator, doneid = args.partition(":")
              if separator and donetype == "video":
                  #check if dir is empty, make zip if needed
                  size = _archive_video(doneid)
              tracker.mark_item_as_done(args, size)
          jobs.task_done()
  # Start the worker threads and wait for every one of them to finish.
  threads = []
  # Fewer workers when running on Heroku.
  THREADCNT = 20 if HEROKU else 50
  #now create the rest of the threads
  for _ in range(THREADCNT):
      runthread = Thread(target=threadrunner)
      runthread.start()
      threads.append(runthread)
  #https://stackoverflow.com/a/11968881
  # Pop-and-join instead of removing items from the list while iterating it:
  # the old loop skipped every other thread, so "Exiting..." could be printed
  # while half of the workers had not been joined yet.
  while threads:
      threads.pop(0).join()
  print("Exiting...")