Archiving community contributions on YouTube: unpublished captions, title and description translations, and caption credits.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

114 lines
3.3 KiB

  1. import requests
  2. from time import sleep
  3. from os import mkdir
  4. from json import dumps
  5. from discovery import getmetadata
  6. from export import getsubs
  7. WORKER_VERSION = 1
  8. SERVER_BASE_URL = "http://localhost:5000"
  9. # Get a worker ID
  10. while True:
  11. params = (
  12. ("worker_version", WORKER_VERSION),
  13. )
  14. idrequest = requests.get(SERVER_BASE_URL+"/worker/getID", params=params)
  15. if idrequest.status_code == 200:
  16. WORKER_ID = idrequest.text
  17. break
  18. else:
  19. print("Error in retrieving ID, will attempt again in 10 minutes")
  20. sleep(600)
  21. try:
  22. mkdir("out")
  23. except:
  24. pass
  25. while True:
  26. recvids = set()
  27. recchans = set()
  28. recmixes = set()
  29. recplayl = set()
  30. # Get a batch ID
  31. while True:
  32. params = (
  33. ("id", WORKER_ID),
  34. ("worker_version", WORKER_VERSION),
  35. )
  36. batchrequest = requests.get(SERVER_BASE_URL+"/worker/getBatch", params=params)
  37. if batchrequest.status_code == 200:
  38. batchinfo = batchrequest.json()
  39. break
  40. else:
  41. print("Error in retrieving batch assignment, will attempt again in 10 minutes")
  42. sleep(600)
  43. print("Received batch ID:", batchinfo["batchID"], "Content:", batchinfo["content"])
  44. # Process the batch
  45. batchcontent = requests.get(batchinfo["content"]).text.split("\n")
  46. for item in batchcontent:
  47. print("Video ID:", str(item).strip())
  48. while True:
  49. try:
  50. info = getmetadata(str(item).strip())
  51. break
  52. except BaseException as e:
  53. print(e)
  54. print("Error in retrieving information, waiting 10 minutes")
  55. sleep(600)
  56. # Add any discovered videos
  57. recvids.update(info[2])
  58. recchans.update(info[3])
  59. recmixes.update(info[4])
  60. recplayl.update(info[5])
  61. if info[0] or info[1]: # ccenabled or creditdata
  62. mkdir("out/"+str(item).strip())
  63. if info[1]: # creditdata
  64. open("out/"+str(item).strip()+"/"+str(item).strip()+"_published_credits.json", "w").write(dumps(info[1]))
  65. if info[0]: #ccenabled
  66. while True:
  67. gsres = False
  68. try:
  69. gsres = getsubs(str(item).strip())
  70. except BaseException as e:
  71. print(e)
  72. if gsres:
  73. break
  74. else:
  75. print("Error in retrieving subtitles, waiting 10 minutes")
  76. sleep(600)
  77. # TODO: put the data somewhere...
  78. # TODO: put the discoveries somewhere...
  79. # Report the batch as complete (I can't think of a fail condition except for a worker exiting...)
  80. # TODO: handle worker exit
  81. while True:
  82. params = (
  83. ("id", WORKER_ID),
  84. ("worker_version", WORKER_VERSION),
  85. ("batchID", batchinfo["batchID"]),
  86. ("randomKey", batchinfo["randomKey"]),
  87. ("status", "c"),
  88. )
  89. statusrequest = requests.get(SERVER_BASE_URL+"/worker/updateStatus", params=params)
  90. if statusrequest.status_code == 200 and statusrequest.text == "Success":
  91. break
  92. else:
  93. print("Error in reporting success, will attempt again in 10 minutes")
  94. sleep(600)
  95. # TODO: clear the output directory