archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

179 lines
5.2 KiB

  1. from typing import Optional, List
  2. from enum import Enum, auto
  3. import requests
  4. from requests.adapters import HTTPAdapter
  5. from requests.packages.urllib3.util.retry import Retry
  6. from os.path import isfile
  7. from json import loads
  8. # https://github.com/ArchiveTeam/tencent-weibo-grab/blob/9bae5f9747e014db9227821a9c11557267967023/pipeline.py
  9. VERSION = "20201017.01"
  10. TRACKER_ID = "ext-yt-communitycontribs"
  11. TRACKER_HOST = "trackerproxy.meo.ws"
  12. BACKFEED_HOST = "blackbird-amqp.meo.ws:23038"
  13. BACKFEED_ENDPOINT = f"http://{BACKFEED_HOST}/{TRACKER_ID}-kj57sxhhzcn2kqjp/"
  14. TRACKER_ENDPOINT = f"http://{TRACKER_HOST}/{TRACKER_ID}"
  15. from os import environ
  16. if "TRACKER_USERNAME" in environ.keys():
  17. TRACKER_USERNAME = environ["TRACKER_USERNAME"]
  18. elif isfile("config.json"):
  19. try:
  20. TRACKER_USERNAME = loads(open("config.json").read())["TRACKER_USERNAME"]
  21. except:
  22. TRACKER_USERNAME = "Unnamed"
  23. else:
  24. TRACKER_USERNAME = "Unnamed"
  25. # https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/
  26. retry_strategy = Retry(
  27. total=4,
  28. backoff_factor=2,
  29. status_forcelist=[x for x in range(500, 600)] + [429],
  30. method_whitelist=["GET", "POST"]
  31. )
  32. adapter = HTTPAdapter(max_retries=retry_strategy)
  33. tracker_session = requests.Session()
  34. tracker_session.mount("https://", adapter)
  35. tracker_session.mount("http://", adapter)
  36. class ItemType(Enum):
  37. Video = auto()
  38. Channel = auto()
  39. MixPlaylist = auto()
  40. Playlist = auto()
  41. def add_item_to_tracker(item_type: ItemType, item_id: str) -> bool:
  42. """Feed items into the tracker through backfeed (item names will be deduplicated):
  43. # curl -d 'ITEMNAME' -so/dev/null $amqp_endpoint
  44. # Response codes:
  45. # 200 - Item added to tracker
  46. # 409 - Item is already in tracker
  47. # 404 - Project backfeed channel not found
  48. # 400 - Item name has a bad format
  49. """
  50. type_name = item_type.name.lower()
  51. item_name = f"{type_name}:{item_id}"
  52. req = tracker_session.post(BACKFEED_ENDPOINT, data=item_name)
  53. code = req.status_code
  54. if code == 200:
  55. # print(f"[INFO] Item ID \'{item_name}\' added to tracker successfully")
  56. return True
  57. elif code == 409:
  58. # print(f"[INFO] Item ID \'{item_name}\' has already been added to tracker")
  59. return True
  60. elif code == 404:
  61. print(f"[ERROR] Unable to add item ID \'{item_name}\' to tracker. Project backfeed channel not found: {BACKFEED_ENDPOINT}")
  62. elif code == 400:
  63. print(f"[ERROR] Item ID \'{item_name}\' has a bad format")
  64. else:
  65. print(f"[ERROR] Unknown response code adding item \'{item_name}\' to tracker: {code}")
  66. return False
  67. def request_item_from_tracker() -> Optional[str]:
  68. data = {
  69. "downloader": TRACKER_USERNAME,
  70. "api_version": "2",
  71. "version": VERSION
  72. }
  73. req = tracker_session.post(f"{TRACKER_ENDPOINT}/request", json=data)
  74. code = req.status_code
  75. if code == 200:
  76. data = req.json()
  77. if "item_name" in data:
  78. item_name = data["item_name"]
  79. print(f"[INFO] Received an item from tracker: {item_name}")
  80. return item_name
  81. else:
  82. print(f"[ERROR] Received item is missing the \'item_name\' key: {data}")
  83. else:
  84. print(f"[ERROR] Unable to get an item from tracker. Status: {code}")
  85. def request_upload_target() -> Optional[str]:
  86. req = tracker_session.get(f"{TRACKER_ENDPOINT}/upload")
  87. code = req.status_code
  88. if code == 200:
  89. data = req.json()
  90. if "upload_target" in data:
  91. upload_target = data["upload_target"]
  92. print(f"[INFO] Received an upload target from tracker: {upload_target}")
  93. return upload_target
  94. else:
  95. print(f"[ERROR] Response is missing the \'upload_target\' key: {data}")
  96. else:
  97. print(f"[ERROR] Unable to get an upload target from tracker. Status: {code}")
  98. def request_all_upload_targets() -> Optional[List[str]]:
  99. req = tracker_session.get(f"{TRACKER_ENDPOINT}/upload_targets")
  100. code = req.status_code
  101. if code == 200:
  102. data = req.json()
  103. print(f"[INFO] Received all upload targets from tracker: {data}")
  104. return data
  105. else:
  106. print(f"[ERROR] Unable to get all upload targets from tracker. Status: {code}")
  107. # `item_name` includes type prefix (video:id, playlist:id, etc)
  108. def mark_item_as_done(item_name: str, item_size_bytes: int) -> bool:
  109. data = {
  110. "downloader": TRACKER_USERNAME,
  111. "version": VERSION,
  112. "item": item_name,
  113. "bytes": {
  114. "data": item_size_bytes
  115. }
  116. }
  117. req = tracker_session.post(f"{TRACKER_ENDPOINT}/done", json=data)
  118. code = req.status_code
  119. if code == 200:
  120. print(f"[INFO] Marked item \'{item_name}\' as done")
  121. return True
  122. elif code > 399 and code < 500:
  123. print(f"[ERROR] Unable to mark item as done. Status: {code}")
  124. else:
  125. print(f"[ERROR] Unknown response code while marking item \'{item_name}\' as done: {code}")
  126. return False
  127. # if __name__ == "__main__":
  128. # print(add_item_to_tracker(ItemType.Channel, "test10"))
  129. # print(request_item_from_tracker())
  130. # print(request_upload_target())
  131. # print(request_all_upload_targets())
  132. # print(mark_item_as_done("test4", 200))