Archives community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.
 
 

321 rinda
11 KiB

  1. from threading import Thread
  2. import requests
  3. from time import sleep
  4. from os import mkdir, rmdir, listdir, system, environ
  5. from os.path import isdir, isfile, getsize
  6. from json import dumps, loads
  7. import signal
  8. from youtube_dl.utils import DownloadError
  9. import tracker
  10. from youtube_dl import YoutubeDL
  11. from shutil import make_archive, rmtree
  12. from queue import Queue
  13. from gc import collect
  14. from discovery import getmetadata
  15. from export import subprrun
  16. batchcontent = []
  17. def batchfunc():
  18. ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
  19. while jobs.qsize() < 501:
  20. desit = tracker.request_item_from_tracker()
  21. if desit:
  22. if desit.split(":", 1)[0] == "video":
  23. jobs.put(desit.split(":", 1)[1])
  24. elif desit.split(":", 1)[0] == "channel":
  25. y = ydl.extract_info("https://www.youtube.com/channel/"+desit.split(":", 1)[1], download=False)
  26. for itemyv in y["entries"]:
  27. tracker.add_item_to_tracker(tracker.ItemType.Video, itemyv["id"])
  28. elif desit.split(":", 1)[0] == "playlist":
  29. y = ydl.extract_info("https://www.youtube.com/playlist?list="+desit.split(":", 1)[1], download=False)
  30. for itemyvp in y["entries"]:
  31. tracker.add_item_to_tracker(tracker.ItemType.Video, itemyvp["id"])
  32. else:
  33. print("Ignoring item for now", desit)
  34. else:
  35. print("Ignoring item for now", desit)
  36. batchcontent.append(desit.split(":", 1)[1])
  37. def submitfunc(submitqueue):
  38. while not submitqueue.empty():
  39. itype, ival = submitqueue.get()
  40. tracker.add_item_to_tracker(itype, ival)
  41. langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
  42. 'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
  43. 'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
  44. 'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
  45. 'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
  46. 'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
  47. 'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
  48. 'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
  49. 'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
  50. 'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
  51. 'xh', 'yi', 'yo', 'zu']
  52. #useful Queue example: https://stackoverflow.com/a/54658363
  53. jobs = Queue()
  54. ccenabledl = []
  55. recvids = set()
  56. recchans = set()
  57. recmixes = set()
  58. recplayl = set()
# HSID, SSID, SID cookies required.
# Load the three youtube.com auth cookies, preferring environment variables
# over a local config.json of the form {"HSID": ..., "SSID": ..., "SID": ...}.
if "HSID" in environ.keys() and "SSID" in environ.keys() and "SID" in environ.keys():
    cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
elif isfile("config.json"):
    cookies = loads(open("config.json").read())
else:
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    # NOTE(review): assert is stripped under python -O; a raise would be safer.
    assert False
# Reject empty cookie values loaded from config.json.
if not (cookies["HSID"] and cookies["SSID"] and cookies["SID"]):
    print("HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables.")
    assert False
# Shared authenticated session; Accept-Language forces the English UI so
# the HTML string-matching below (and in export.subprrun) stays valid.
mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})
# Sanity-check the cookies against the caption editor page: a redirect to
# accounts.google.com means the session is not logged in.
validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
assert not "accounts.google.com" in validationtest.url, "Please ensure you have correctly specified account cookies."
# The footer language-picker button must read "English" — any other account
# UI language would break the scrapers' exact-string matching.
assert """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
Language:
</span>
English
</span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" in validationtest.text, "Please make sure your YouTube and Google account language is set to English (United States)"
del validationtest
# Write a Netscape-format cookies.txt for youtube-dl's cookiefile option.
# NOTE(review): the Netscape format is tab-delimited; the field separators
# below are reconstructed as tabs — confirm against the original file.
open("cookies.txt", "w").write("""# HTTP Cookie File
.youtube.com	TRUE	/	FALSE	1663793455	SID	[SID]
.youtube.com	TRUE	/	FALSE	1663793455	HSID	[HSID]
.youtube.com	TRUE	/	TRUE	1663793455	SSID	[SSID]""".replace("[SID]", cookies["SID"]).replace("[HSID]", cookies["HSID"]).replace("[SSID]", cookies["SSID"]))
del cookies
  85. #Graceful Shutdown
  86. class GracefulKiller:
  87. kill_now = False
  88. def __init__(self):
  89. signal.signal(signal.SIGINT, self.exit_gracefully)
  90. signal.signal(signal.SIGTERM, self.exit_gracefully)
  91. def exit_gracefully(self,signum, frame):
  92. self.kill_now = True
  93. gkiller = GracefulKiller()
  94. def prrun():
  95. while not jobs.empty():
  96. global recvids
  97. global recchans
  98. global recmixes
  99. global recplayl
  100. global ccenabledl
  101. item = jobs.get()
  102. print("Video ID:", str(item).strip())
  103. while True:
  104. try:
  105. info = getmetadata(str(item).strip())
  106. break
  107. except BaseException as e:
  108. print(e)
  109. print("Error in retrieving information, waiting 30 seconds")
  110. #raise
  111. sleep(30)
  112. # Add any discovered videos
  113. recvids.update(info[2])
  114. recchans.update(info[3])
  115. recmixes.update(info[4])
  116. recplayl.update(info[5])
  117. if info[0] or info[1]: # ccenabled or creditdata
  118. if not isdir("out/"+str(item).strip()):
  119. mkdir("out/"+str(item).strip())
  120. if info[1]: # creditdata
  121. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  122. if info[0]: #ccenabled
  123. ccenabledl.append(item)
  124. jobs.task_done()
  125. return True
  126. while not gkiller.kill_now:
  127. collect() #cleanup
  128. try:
  129. mkdir("out")
  130. except:
  131. pass
  132. try:
  133. mkdir("directory")
  134. except:
  135. pass
  136. batchcontent.clear()
  137. # Get a batch ID
  138. batchthreads = []
  139. for r in range(50):
  140. batchrunthread = Thread(target=batchfunc)
  141. batchrunthread.start()
  142. batchthreads.append(batchrunthread)
  143. del batchrunthread
  144. for xc in batchthreads:
  145. xc.join() #bug (occurred once: the script ended before the last thread finished)
  146. batchthreads.remove(xc)
  147. del xc
  148. #for ir in range(501):
  149. # batchcontent.append(tracker.request_item_from_tracker())
  150. threads = []
  151. for i in range(50):
  152. runthread = Thread(target=prrun)
  153. runthread.start()
  154. threads.append(runthread)
  155. del runthread
  156. for x in threads:
  157. x.join()
  158. threads.remove(x)
  159. del x
  160. print("Sending discoveries to tracker...")
  161. submitjobs = Queue()
  162. #IDK how to handle mixes so just send them for now
  163. print("Videos:", len(recvids))
  164. for itemvid in recvids:
  165. submitjobs.put((tracker.ItemType.Video, itemvid))
  166. print("Channels:", len(recchans))
  167. for itemchan in recchans:
  168. submitjobs.put((tracker.ItemType.Channel, itemchan))
  169. print("Mix Playlists:", len(recmixes))
  170. for itemmix in recmixes:
  171. submitjobs.put((tracker.ItemType.MixPlaylist, itemmix))
  172. print("Playlists:", len(recplayl))
  173. for itemplayl in recplayl:
  174. submitjobs.put((tracker.ItemType.Playlist, itemplayl))
  175. #open("out/discoveries.json", "w").write(dumps({"recvids": sorted(recvids), "recchans": sorted(recchans), "recmixes": sorted(recmixes), "recplayl": sorted(recplayl)}))
  176. #clear
  177. recvids.clear()
  178. recchans.clear()
  179. recmixes.clear()
  180. recplayl.clear()
  181. submitthreads = []
  182. for r in range(50):
  183. submitrunthread = Thread(target=submitfunc, args=(submitjobs,))
  184. submitrunthread.start()
  185. submitthreads.append(submitrunthread)
  186. del submitrunthread
  187. for xb in submitthreads:
  188. xb.join() #bug (occurred once: the script ended before the last thread finished)
  189. submitthreads.remove(xb)
  190. del xb
  191. sleep(1)
  192. subtjobs = Queue()
  193. while ccenabledl:
  194. langcontent = langs.copy()
  195. intvid = ccenabledl.pop(0)
  196. while langcontent:
  197. subtjobs.put((langcontent.pop(0), intvid, "default"))
  198. del intvid
  199. del langcontent
  200. subthreads = []
  201. for r in range(50):
  202. subrunthread = Thread(target=subprrun, args=(subtjobs,mysession))
  203. subrunthread.start()
  204. subthreads.append(subrunthread)
  205. del subrunthread
  206. for xa in subthreads:
  207. xa.join() #bug (occurred once: the script ended before the last thread finished)
  208. subthreads.remove(xa)
  209. del xa
  210. sleep(1) #wait a second to hopefully allow the other threads to finish
  211. for fol in listdir("out"): #remove extra folders
  212. try:
  213. if isdir("out/"+fol):
  214. rmdir("out/"+fol)
  215. except:
  216. pass
  217. #https://stackoverflow.com/a/11968881
  218. # TODO: put the data somewhere...
  219. # TODO: put the discoveries somewhere...
  220. for fol in listdir("out"):
  221. if isdir("out/"+fol):
  222. make_archive("directory/"+fol, "zip", "out/"+fol) #check this
  223. targetloc = None
  224. while not targetloc:
  225. targetloc = tracker.request_upload_target()
  226. if targetloc:
  227. break
  228. else:
  229. print("Waiting 5 minutes...")
  230. sleep(300)
  231. if targetloc.startswith("rsync"):
  232. system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 --files-from=- directory/ "+targetloc)
  233. elif targetloc.startswith("http"):
  234. for filzip in listdir("directory"):
  235. if filzip.endswith(".zip"):
  236. system("curl --data-binary @directory/"+filzip+" "+targetloc)
  237. # Report the batch as complete
  238. for itemb in batchcontent:
  239. if isfile("directory/"+itemb.split(":", 1)[1]+".zip"):
  240. size = getsize("directory/"+itemb.split(":", 1)[1]+".zip")
  241. else:
  242. size = 0
  243. tracker.mark_item_as_done(itemb, size)
  244. # clear the output directory
  245. rmtree("out")
  246. rmtree("directory")