archiving community contributions on YouTube: unpublished captions, title and description translations and caption credits
Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 

132 Zeilen
11 KiB

  1. from typing import Dict
  2. import requests
  3. from json import loads
  4. langcodes = {"Afar": "aa", "Abkhazian": "ab", "Afrikaans": "af", "Akan": "ak", "all": "all", "Amharic": "am", "Aragonese": "an", "Arabic": "ar", "Aramaic": "arc", "Algerian Arabic": "arq", "Assamese": "as", "American Sign Language": "ase", "Asturian": "ast", "Avaric": "av", "Aymara": "ay", "Azerbaijani": "az", "Bashkir": "ba", "Belarusian": "be", "Bulgarian": "bg", "Bihari": "bh", "Bislama": "bi", "Bangla": "bn", "Tibetan": "bo", "Breton": "br", "Bosnian": "bs", "Catalan": "ca", "Cebuano": "ceb", "Choctaw": "cho", "Cherokee": "chr", "Corsican": "co", "Czech": "cs", "Church Slavic": "cu", "Welsh": "cy", "Danish": "da", "Danish (Denmark)": "da-DK", "German": "de", "German (Austria)": "de-AT", "German (Switzerland)": "de-CH", "German (Germany)": "de-DE", "Divehi": "dv", "Dzongkha": "dz", "Ewe": "ee", "Greek": "el", "English": "en", "English (United Arab Emirates)": "en-AE", "English (Canada)": "en-CA", "English (United Kingdom)": "en-GB", "English (Ireland)": "en-IE", "English (India)": "en-IN", "English (United States)": "en-US", "Esperanto": "eo", "Spanish": "es", "Spanish (Latin America)": "es-419", "Spanish (Argentina)": "es-AR", "Spanish (Chile)": "es-CL", "Spanish (Colombia)": "es-CO", "Spanish (Costa Rica)": "es-CR", "Spanish (Spain)": "es-ES", "Spanish (Mexico)": "es-MX", "Spanish (Nicaragua)": "es-NI", "Spanish (United States)": "es-US", "Estonian": "et", "Basque": "eu", "Persian": "fa", "Persian (Afghanistan)": "fa-AF", "Persian (Iran)": "fa-IR", "Fulah": "ff", "Finnish": "fi", "Filipino": "fil", "Fijian": "fj", "Faroese": "fo", "French": "fr", "French (Belgium)": "fr-BE", "French (Canada)": "fr-CA", "French (Switzerland)": "fr-CH", "French (France)": "fr-FR", "Western Frisian": "fy", "Irish": "ga", "Scottish Gaelic": "gd", "Galician": "gl", "Guarani": "gn", "Swiss German": "gsw", "Gujarati": "gu", "Hausa": "ha", "Hakka Chinese": "hak", "Hakka Chinese (Taiwan)": "hak-TW", "Hindi": "hi-Latn", "Hmong": "hmn", "Croatian": "hr", "Haitian Creole": "ht", 
"Hungarian": "hu", "Armenian": "hy", "Interlingua": "ia", "Indonesian": "id", "Interlingue": "ie", "Igbo": "ig", "Sichuan Yi": "ii", "Inupiaq": "ik", "Icelandic": "is", "Italian": "it", "Italian (Italy)": "it-IT", "Inuktitut": "iu", "Hebrew": "iw", "Japanese": "ja", "Javanese": "jv", "Georgian": "ka", "Kazakh": "kk", "Kalaallisut": "kl", "Khmer": "km", "Kannada": "kn", "Korean": "ko", "Korean (South Korea)": "ko-KR", "Kanuri": "kr", "Kashmiri": "ks", "Kurdish": "ku", "Kyrgyz": "ky", "Latin": "la", "Luxembourgish": "lb", "Lingala": "ln", "Lao": "lo", "Lithuanian": "lt", "Mizo": "lus", "Latvian": "lv", "Masai": "mas", "Malagasy": "mg", "Maori": "mi", "Miscellaneous languages": "mis", "Macedonian": "mk", "Malayalam": "ml", "Mongolian": "mn", "Manipuri": "mni", "Moldavian": "mo", "Marathi": "mr", "Malay": "ms", "Maltese": "mt", "Burmese": "my", "Nauru": "na", "Min Nan Chinese": "nan", "Min Nan Chinese (Taiwan)": "nan-TW", "Nepali": "ne", "Dutch": "nl", "Dutch (Belgium)": "nl-BE", "Dutch (Netherlands)": "nl-NL", "Norwegian Nynorsk": "nn", "Norwegian": "no", "not": "not", "Navajo": "nv", "Occitan": "oc", "Oromo": "om", "Odia": "or", "Punjabi": "pa", "Polish": "pl", "Polish (Poland)": "pl-PL", "Pashto": "ps", "Portuguese": "pt", "Portuguese (Brazil)": "pt-BR", "Portuguese (Portugal)": "pt-PT", "Quechua": "qu", "Romansh": "rm", "Rundi": "rn", "Romanian": "ro", "Romanian (Moldova)": "ro-MD", "Russian": "ru-Latn", "Russian (Russia)": "ru-RU", "Kinyarwanda": "rw", "Sanskrit": "sa", "Sardinian": "sc", "Sicilian": "scn", "Scots": "sco", "Sindhi": "sd", "Sherdukpen": "sdp", "Northern Sami": "se", "Sango": "sg", "Serbo-Croatian": "sh", "Sinhala": "si", "Slovak": "sk", "Slovenian": "sl", "Samoan": "sm", "Shona": "sn", "Somali": "so", "Albanian": "sq", "Serbian": "sr", "Serbian (Cyrillic)": "sr-Cyrl", "Serbian (Latin)": "sr-Latn", "Swati": "ss", "Southern Sotho": "st", "Sundanese": "su", "Swedish": "sv", "Swahili": "sw", "Tamil": "ta", "Telugu": "te", "Tajik": "tg", "Thai": "th", 
"Tigrinya": "ti", "Turkmen": "tk", "Tagalog": "tl", "Klingon": "tlh", "Tswana": "tn", "Tongan": "to", "Turkish": "tr", "Turkish (Turkey)": "tr-TR", "Tsonga": "ts", "Tatar": "tt", "Twi": "tw", "Ukrainian": "uk", "Urdu": "ur", "Uzbek": "uz", "Vietnamese": "vi", "Volap\\xFCk": "vo", "Wolof": "wo", "Xhosa": "xh", "Yiddish": "yi", "Yoruba": "yo", "Cantonese": "yue", "Cantonese (Hong Kong)": "yue-HK", "Chinese": "zh", "Chinese (China)": "zh-CN", "Chinese (Hong Kong)": "zh-HK", "Chinese (Simplified)": "zh-Hans", "Chinese (Simplified, China)": "zh-Hans-CN", "Chinese (Simplified, Singapore)": "zh-Hans-SG", "Chinese (Traditional)": "zh-Hant", "Chinese (Traditional, Hong Kong)": "zh-Hant-HK", "Chinese (Traditional, Taiwan)": "zh-Hant-TW", "Chinese (Singapore)": "zh-SG", "Chinese (Taiwan)": "zh-TW", "Zulu": "zu", "Hiri Motu": "ho", "Tok Pisin": "tpi", "Voro": "vor"}
  5. def getmetadata(vid):
  6. params = (
  7. ("v", vid),
  8. )
  9. headers = {
  10. "Accept-Language": "en-US",
  11. }
  12. wpage = requests.get("https://www.youtube.com/watch", headers=headers, params=params)
  13. wptext = wpage.text
  14. initplay = None
  15. initdata = None
  16. recvids = set()
  17. recchans = set()
  18. recmixes = set()
  19. recplayl = set()
  20. ccenabled = False #default values
  21. creditdata = {}
  22. for line in wptext.splitlines():
  23. if line.strip().startswith('window["ytInitialPlayerResponse"] = '):
  24. initplay = loads(line.split('window["ytInitialPlayerResponse"] = ', 1)[1].strip()[:-1])
  25. if initplay["playabilityStatus"]["status"] == "ERROR":
  26. print(vid, "unavailable")
  27. return False, {}, recvids, recchans, recmixes, recplayl
  28. if "endscreen" in initplay.keys():
  29. if "endscreenRenderer" in initplay["endscreen"].keys():
  30. for el in initplay["endscreen"]["endscreenRenderer"]:
  31. if type(el) == Dict:
  32. elint = el["endscreenElementRenderer"]
  33. if "endscreenElementRenderer" in el.keys():
  34. if elint["style"] == "VIDEO":
  35. recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
  36. elif elint["style"] == "CHANNEL":
  37. try:
  38. recchans.add(elint["endpoint"]["browseEndpoint"]["browseId"])
  39. except:
  40. print("Channel endscreen error")
  41. raise
  42. elif elint["style"] == "PLAYLIST":
  43. recvids.add(elint["endpoint"]["watchEndpoint"]["videoId"])
  44. recplayl.add(elint["endpoint"]["watchEndpint"]["playlistId"])
  45. if "captions" in initplay.keys():
  46. ccenabled = "contribute" in initplay["captions"]["playerCaptionsRenderer"]
  47. else:
  48. ccenabled = False # if captions information is not present, community contributions are not enabled
  49. if "videoDetails" in initplay.keys():
  50. if "channelId" in initplay["videoDetails"].keys():
  51. recchans.add(initplay["videoDetails"]["channelId"])
  52. elif line.strip().startswith('window["ytInitialData"] = '):
  53. initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
  54. if "contents" in initdata.keys(): #prevent exception
  55. for recmd in initdata["contents"]["twoColumnWatchNextResults"]["secondaryResults"]["secondaryResults"]["results"]:
  56. #auto is like the others
  57. if "compactAutoplayRenderer" in recmd.keys():
  58. recmd = recmd["compactAutoplayRenderer"]["contents"][0]
  59. if "compactVideoRenderer" in recmd.keys():
  60. recvids.add(recmd["compactVideoRenderer"]["videoId"])
  61. try:
  62. recchans.add(recmd["compactVideoRenderer"]["channelId"])
  63. except KeyError as e:
  64. try:
  65. recchans.add(recmd["compactVideoRenderer"]["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"])
  66. except KeyError as e:
  67. print("Channel extract error")
  68. #raise
  69. #print("Unable to extract channel:")
  70. #print(recmd["compactVideoRenderer"])
  71. elif "compactPlaylistRenderer" in recmd.keys():
  72. recplayl.add(recmd["compactPlaylistRenderer"]["playlistId"])
  73. if "navigationEndpoint" in recmd["compactPlaylistRenderer"].keys():
  74. recvids.add(recmd["compactPlaylistRenderer"]["navigationEndpoint"]["watchEndpoint"]["videoId"])
  75. if "navigationEndpoint" in recmd["compactPlaylistRenderer"]["shortBylineText"].keys():
  76. recchans.add(recmd["compactPlaylistRenderer"]["shortBylineText"]["navigationEndpoint"]["browseEndpoint"]["browseId"])
  77. elif "compactRadioRenderer" in recmd.keys(): #mix playlist
  78. recmixes.add(recmd["compactRadioRenderer"]["playlistId"])
  79. # todo: find out if channels can be suggested
  80. creditdata = {}
  81. try:
  82. mdinfo = initdata["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"]["metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
  83. for item in mdinfo:
  84. if item["metadataRowRenderer"]["title"]["simpleText"].startswith("Caption author"): #the request to /watch needs to be in English for this to work
  85. try:
  86. desl = langcodes[item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]]
  87. except KeyError as e:
  88. #print(e)
  89. print("Language code conversion error, using language name")
  90. desl = item["metadataRowRenderer"]["title"]["simpleText"].split("(", 1)[1][:-1]
  91. creditdata[desl] = []
  92. for itemint in item["metadataRowRenderer"]["contents"]:
  93. creditdata[desl].append({"name": itemint["runs"][0]["text"], "channel": itemint["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]})
  94. except KeyError as e:
  95. #print("Video does not have credits")
  96. pass
  97. #raise
  98. #print(e)
  99. if initplay and initdata:
  100. break
  101. return ccenabled, creditdata, recvids, recchans, recmixes, recplayl
  102. if __name__ == "__main__":
  103. from sys import argv
  104. vidl = argv
  105. vidl.pop(0)
  106. for video in vidl:
  107. print(getmetadata(video))