# coding: utf-8 from __future__ import unicode_literals import hashlib import itertools import json import os.path import random import re import time import traceback from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_chr, compat_HTTPError, compat_kwargs, compat_parse_qs, compat_str, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) from ..jsinterp import JSInterpreter from ..utils import ( clean_html, dict_get, ExtractorError, format_field, float_or_none, int_or_none, mimetype2ext, parse_codecs, parse_duration, qualities, remove_start, smuggle_url, str_or_none, str_to_int, try_get, unescapeHTML, unified_strdate, unsmuggle_url, update_url_query, url_or_none, urlencode_postdata, urljoin, ) class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' _RESERVED_NAMES = ( r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|' r'movies|results|shared|hashtag|trending|feed|feeds|' r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' def _ids_to_results(self, ids): return [ self.url_result(vid_id, 'Youtube', video_id=vid_id) for vid_id in ids] def _login(self): """ Attempt to log in to YouTube. True is returned if successful or skipped. False is returned if login failed. If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. """ username, password = self._get_login_info() # No authentication to be performed if username is None: if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) # if self._downloader.params.get('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them. # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!') return True login_page = self._download_webpage( self._LOGIN_URL, None, note='Downloading login page', errnote='unable to fetch login page', fatal=False) if login_page is False: return login_form = self._hidden_inputs(login_page) def req(url, f_req, note, errnote): data = login_form.copy() data.update({ 'pstMsg': 1, 'checkConnection': 'youtube', 'checkedDomains': 'youtube', 'hl': 'en', 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', 'f.req': json.dumps(f_req), 'flowName': 'GlifWebSignIn', 'flowEntry': 'ServiceLogin', # TODO: reverse actual botguard identifier generation algo 'bgRequest': '["identifier",""]', }) return self._download_json( url, None, note=note, errnote=errnote, transform_source=lambda s: re.sub(r'^[^[]*', '', s), fatal=False, data=urlencode_postdata(data), headers={ 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', 'Google-Accounts-XSRF': 1, }) def warn(message): self._downloader.report_warning(message) lookup_req = [ username, None, [], None, 'US', None, None, 2, False, True, [ None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], 1, [None, None, []], None, None, None, True ], username, ] lookup_results = req( self._LOOKUP_URL, lookup_req, 'Looking up account info', 'Unable to look up account info') if lookup_results is False: return False user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) if not user_hash: warn('Unable to extract user hash') return False challenge_req = [ user_hash, None, 1, None, [1, None, None, None, [password, None, True]], [ None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], 1, [None, None, []], None, None, None, True ]] challenge_results = req( self._CHALLENGE_URL, challenge_req, 'Logging in', 'Unable to log in') if challenge_results is False: return login_res = try_get(challenge_results, lambda x: x[0][5], list) if login_res: login_msg = try_get(login_res, lambda x: x[5], compat_str) warn( 'Unable to login: %s' % 'Invalid password' if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) return False res = try_get(challenge_results, lambda x: x[0][-1], list) if not res: warn('Unable to extract result entry') return False login_challenge = try_get(res, lambda x: x[0][0], list) if login_challenge: challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) if challenge_str == 'TWO_STEP_VERIFICATION': # SEND_SUCCESS - TFA code has been successfully sent to phone # QUOTA_EXCEEDED - reached the limit of TFA codes status = try_get(login_challenge, lambda x: x[5], compat_str) if status == 'QUOTA_EXCEEDED': warn('Exceeded the limit of TFA codes, try later') return False tl = try_get(challenge_results, lambda x: x[1][2], compat_str) if not tl: warn('Unable to extract TL') return False tfa_code = self._get_tfa_info('2-step verification code') if not tfa_code: warn( 'Two-factor authentication required. Provide it either interactively or with --twofactor ' '(Note that only TOTP (Google Authenticator App) codes work at this time.)') return False tfa_code = remove_start(tfa_code, 'G-') tfa_req = [ user_hash, None, 2, None, [ 9, None, None, None, None, None, None, None, [None, tfa_code, True, 2] ]] tfa_results = req( self._TFA_URL.format(tl), tfa_req, 'Submitting TFA code', 'Unable to submit TFA code') if tfa_results is False: return False tfa_res = try_get(tfa_results, lambda x: x[0][5], list) if tfa_res: tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) warn( 'Unable to finish TFA: %s' % 'Invalid TFA code' if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) return False check_cookie_url = try_get( tfa_results, lambda x: x[0][-1][2], compat_str) else: CHALLENGES = { 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", } challenge = CHALLENGES.get( challenge_str, '%s returned error %s.' % (self.IE_NAME, challenge_str)) warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) return False else: check_cookie_url = try_get(res, lambda x: x[2], compat_str) if not check_cookie_url: warn('Unable to extract CheckCookie URL') return False check_cookie_results = self._download_webpage( check_cookie_url, None, 'Checking cookie', fatal=False) if check_cookie_results is False: return False if 'https://myaccount.google.com/' not in check_cookie_results: warn('Unable to log in') return False return True def _download_webpage_handle(self, *args, **kwargs): query = kwargs.get('query', {}).copy() kwargs['query'] = query return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) def _real_initialize(self): if self._downloader is None: return if not self._login(): return _DEFAULT_API_DATA = { 'context': { 'client': { 'clientName': 'WEB', 'clientVersion': '2.20210301.08.00', } }, } _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta| you can pass the naked ID (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?!.*?\blist= (?: %(playlist_id)s| # combined list/video URLs are handled by the playlist IE WL # WL are handled by the watch later IE ) ) (?(1).+)? # if we found the ID, everything can follow $""" % { 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, 'invidious': '|'.join(_INVIDIOUS_SITES), } _PLAYER_INFO_RE = ( r'/s/player/(?P[a-zA-Z0-9_-]{8,})/player', r'/(?P[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, # 3D videos '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, # Apple HTTP Live Streaming '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, # DASH mp4 video '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, # Dash mp4 audio '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, # Dash webm audio '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, # Dash webm audio with opus inside '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, # av01 video only formats sometimes served with "unknown" codecs '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') _GEO_BYPASS = False IE_NAME = 'youtube' _TESTS = [ { 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, 'view_count': int, 'like_count': int, 'dislike_count': int, 'start_time': 1, 'end_time': 9, } }, { 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', 'note': 'Embed-only video (#1746)', 'info_dict': { 'id': 'yZIXLfi8CZQ', 'ext': 'mp4', 'upload_date': '20120608', 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', 'uploader': 'SET India', 'uploader_id': 'setindia', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', 'age_limit': 18, }, 'skip': 'Private video', }, { 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', 'note': 'Use the first video ID in the URL', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, 'view_count': int, 'like_count': int, 'dislike_count': int, }, 'params': { 'skip_download': True, }, }, { 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', 'note': '256k DASH audio (format 141) via DASH manifest', 'info_dict': { 'id': 'a9LDPn-MO4I', 'ext': 'm4a', 'upload_date': '20121002', 'uploader_id': '8KVIDEO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', 'uploader': '8KVIDEO', 'title': 'UHDTV TEST 8K VIDEO.mp4' }, 'params': { 'youtube_include_dash_manifest': True, 'format': '141', }, 'skip': 'format 141 not served anymore', }, # DASH manifest with encrypted signature { 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', 'info_dict': { 'id': 'IB3lcPjvWLA', 'ext': 'm4a', 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', 'abr': 129.495, }, 'params': { 'youtube_include_dash_manifest': True, 'format': '141/bestaudio[ext=m4a]', }, }, # Controversy video { 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', 'info_dict': { 'id': 'T4XJQO3qol8', 'ext': 'mp4', 'duration': 219, 'upload_date': '20100909', 'uploader': 'Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } }, # Normal age-gate video (embed allowed) { 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', 'info_dict': { 'id': 'HtVdAasjOgU', 'ext': 'mp4', 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'duration': 142, 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', 'age_limit': 18, }, }, # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) # YouTube Red ad is not captured for creator { 'url': '__2ABJjxzNo', 'info_dict': { 'id': '__2ABJjxzNo', 'ext': 'mp4', 'duration': 266, 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', 'creator': 'deadmau5', 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', 'alt_title': 'Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', ] }, # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) { 'url': 'lqQg6PlCWgI', 'info_dict': { 'id': 'lqQg6PlCWgI', 'ext': 'mp4', 'duration': 6085, 'upload_date': '20150827', 'uploader_id': 'olympic', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympic', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', }, 'params': { 'skip_download': 'requires avconv', } }, # Non-square pixels { 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', 'info_dict': { 'id': '_b-2C3KPAM0', 'ext': 'mp4', 'stretched_ratio': 16 / 9., 'duration': 85, 'upload_date': '20110310', 'uploader_id': 'AllenMeow', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', 'uploader': '孫ᄋᄅ', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, }, # url_encoded_fmt_stream_map is empty string { 'url': 'qEJwOuvDf7I', 'info_dict': { 'id': 'qEJwOuvDf7I', 'ext': 'webm', 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', 'description': '', 'upload_date': '20150404', 'uploader_id': 'spbelect', 'uploader': 'Наблюдатели Петербурга', }, 'params': { 'skip_download': 'requires avconv', }, 'skip': 'This live event has ended.', }, # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) { 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'info_dict': { 'id': 'FIl7x6_3R5Y', 'ext': 'webm', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'duration': 220, 'upload_date': '20150625', 'uploader_id': 'dorappi2000', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', 'formats': 'mincount:31', }, 'skip': 'not actual anymore', }, # DASH manifest with segment_list { 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', 'md5': '8ce563a1d667b599d21064e982ab9e31', 'info_dict': { 'id': 'CsmdDsKjzN8', 'ext': 'mp4', 'upload_date': '20150501', # According to ' valid video id redirection 'url': 'DJztXj2GPfl', 'info_dict': { 'id': 'DJztXj2GPfk', 'ext': 'mp4', 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', 'description': 'md5:bf577a41da97918e94fa9798d9228825', 'upload_date': '20090125', 'uploader': 'Prochorowka', 'uploader_id': 'Prochorowka', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', 'artist': 'Panjabi MC', 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', 'album': 'Beware of the Boys (Mundian To Bach Ke)', }, 'params': { 'skip_download': True, }, 'skip': 'Video unavailable', }, { # empty description results in an empty string 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k', 'info_dict': { 'id': 'x41yOUIvK2k', 'ext': 'mp4', 'title': 'IMG 3456', 'description': '', 'upload_date': '20170613', 'uploader_id': 'ElevageOrVert', 'uploader': 'ElevageOrVert', }, 'params': { 'skip_download': True, }, }, { # with '};' inside yt initial data (see [1]) # see [2] for an example with '};' inside ytInitialPlayerResponse # 1. https://github.com/ytdl-org/youtube-dl/issues/27093 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', 'info_dict': { 'id': 'CHqg6qOn4no', 'ext': 'mp4', 'title': 'Part 77 Sort a list of simple types in c#', 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', 'upload_date': '20130831', 'uploader_id': 'kudvenkat', 'uploader': 'kudvenkat', }, 'params': { 'skip_download': True, }, }, { # another example of '};' in ytInitialData 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY', 'only_matching': True, }, { 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', 'only_matching': True, }, { # https://github.com/ytdl-org/youtube-dl/pull/28094 'url': 'OtqTfy26tG0', 'info_dict': { 'id': 'OtqTfy26tG0', 'ext': 'mp4', 'title': 'Burn Out', 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131', 'upload_date': '20141120', 'uploader': 'The Cinematic Orchestra - Topic', 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw', 'artist': 'The Cinematic Orchestra', 'track': 'Burn Out', 'album': 'Every Day', 'release_data': None, 'release_year': None, }, 'params': { 'skip_download': True, }, }, { # controversial video, only works with bpctr when authenticated with cookies 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', 'only_matching': True, }, ] def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) self._code_cache = {} self._player_cache = {} def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) @classmethod def _extract_player_info(cls, player_url): for player_re in cls._PLAYER_INFO_RE: id_m = re.search(player_re, player_url) if id_m: break else: raise ExtractorError('Cannot identify player %r' % player_url) return id_m.group('id') def _extract_signature_function(self, video_id, player_url, example_sig): player_id = self._extract_player_info(player_url) # Read from filesystem cache func_id = 'js_%s_%s' % ( player_id, self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) if player_id not in self._code_cache: self._code_cache[player_id] = self._download_webpage( player_url, video_id, note='Downloading player ' + player_id, errnote='Download of %s failed' % player_url) code = self._code_cache[player_id] res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) cache_res = res(test_string) cache_spec = [ord(c) for c in cache_res] self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) return res def _print_sig_code(self, func, example_sig): def gen_sig_code(idxs): def _genslice(start, end, step): starts = '' if start == 0 else str(start) ends = (':%d' % (end + step)) if end + step >= 0 else ':' steps = '' if step == 1 else (':%d' % step) return 's[%s%s%s]' % (starts, ends, steps) step = None # Quelch pyflakes warnings - start will be set when step is set start = '(Never used)' for i, prev in zip(idxs[1:], idxs[:-1]): if step is not None: if i - prev == step: continue yield _genslice(start, prev, step) step = None continue if i - prev in [-1, 1]: step = i - prev start = prev continue else: yield 's[%d]' % prev if step is None: yield 's[%d]' % i else: yield _genslice(start, i, step) test_string = ''.join(map(compat_chr, range(len(example_sig)))) cache_res = func(test_string) cache_spec = [ord(c) for c in cache_res] expr_code = ' + '.join(gen_sig_code(cache_spec)) signature_id_tuple = '(%s)' % ( ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' ' return %s\n') % (signature_id_tuple, expr_code) self.to_screen('Extracted signature function:\n' + code) def _parse_sig_js(self, jscode): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\bm=(?P[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)', r'\bc&&\(c=(?P[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)', r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" if player_url is None: raise ExtractorError('Cannot decrypt signature without player_url') if player_url.startswith('//'): player_url = 'https:' + player_url elif not re.match(r'https?://', player_url): player_url = compat_urlparse.urljoin( 'https://www.youtube.com', player_url) try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: func = self._extract_signature_function( video_id, player_url, s ) self._player_cache[player_id] = func func = self._player_cache[player_id] if self._downloader.params.get('youtube_print_sig_code'): self._print_sig_code(func, s) return func(s) except Exception as e: tb = traceback.format_exc() raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) def _mark_watched(self, video_id, player_response): playback_url = url_or_none(try_get( player_response, lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) if not playback_url: return parsed_playback_url = compat_urlparse.urlparse(playback_url) qs = compat_urlparse.parse_qs(parsed_playback_url.query) # cpn generation algorithm is reverse engineered from base.js. # In fact it works even with dummy cpn. CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) qs.update({ 'ver': ['2'], 'cpn': [cpn], }) playback_url = compat_urlparse.urlunparse( parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) self._download_webpage( playback_url, video_id, 'Marking watched', 'Unable to mark watched', fatal=False) @staticmethod def _extract_urls(webpage): # Embedded YouTube player entries = [ unescapeHTML(mobj.group('url')) for mobj in re.finditer(r'''(?x) (?: ]+?src=| data-video-url=| ]+?src=| embedSWF\(?:\s*| ]+data=| new\s+SWFObject\( ) (["\']) (?P(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) \1''', webpage)] # lazyYT YouTube embed entries.extend(list(map( unescapeHTML, re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) # Wordpress "YouTube Video Importer" plugin matches = re.findall(r'''(?x)]+ class=(?P[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ data-video_id=(?P[\'"])([^\'"]+)(?P=q2)''', webpage) entries.extend(m[-1] for m in matches) return entries @staticmethod def _extract_url(webpage): urls = YoutubeIE._extract_urls(webpage) return urls[0] if urls else None @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) if mobj is None: raise ExtractorError('Invalid URL: %s' % url) video_id = mobj.group(2) return video_id def _extract_chapters_from_json(self, data, video_id, duration): chapters_list = try_get( data, lambda x: x['playerOverlays'] ['playerOverlayRenderer'] ['decoratedPlayerBarRenderer'] ['decoratedPlayerBarRenderer'] ['playerBar'] ['chapteredPlayerBarRenderer'] ['chapters'], list) if not chapters_list: return def chapter_time(chapter): return float_or_none( try_get( chapter, lambda x: x['chapterRenderer']['timeRangeStartMillis'], int), scale=1000) chapters = [] for next_num, chapter in enumerate(chapters_list, start=1): start_time = chapter_time(chapter) if start_time is None: continue end_time = (chapter_time(chapters_list[next_num]) if next_num < len(chapters_list) else duration) if end_time is None: continue title = try_get( chapter, lambda x: x['chapterRenderer']['title']['simpleText'], compat_str) chapters.append({ 'start_time': start_time, 'end_time': end_time, 'title': title, }) return chapters def _extract_yt_initial_variable(self, webpage, regex, video_id, name): return self._parse_json(self._search_regex( (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), regex), webpage, name, default='{}'), video_id, fatal=False) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) video_id = self._match_id(url) base_url = self.http_scheme() + '//www.youtube.com/' webpage_url = base_url + 'watch?v=' + video_id webpage = self._download_webpage( webpage_url + '&has_verified=1&bpctr=9999999999', video_id, fatal=False) player_response = None if webpage: player_response = self._extract_yt_initial_variable( webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') if not player_response: player_response = self._call_api( 'player', {'videoId': video_id}, video_id) playability_status = player_response.get('playabilityStatus') or {} if playability_status.get('reason') == 'Sign in to confirm your age': pr = self._parse_json(try_get(compat_parse_qs( self._download_webpage( base_url + 'get_video_info', video_id, 'Refetching age-gated info webpage', 'unable to download video info webpage', query={ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, }, fatal=False)), lambda x: x['player_response'][0], compat_str) or '{}', video_id) if pr: player_response = pr trailer_video_id = try_get( playability_status, lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'], compat_str) if trailer_video_id: return self.url_result( trailer_video_id, self.ie_key(), trailer_video_id) def get_text(x): if not x: return return x.get('simpleText') or ''.join([r['text'] for r in x['runs']]) search_meta = ( lambda x: self._html_search_meta(x, webpage, default=None)) \ if webpage else lambda x: None video_details = player_response.get('videoDetails') or {} microformat = try_get( player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} video_title = video_details.get('title') \ or get_text(microformat.get('title')) \ or search_meta(['og:title', 'twitter:title', 'title']) video_description = video_details.get('shortDescription') if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): multifeed_metadata_list = try_get( player_response, lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], compat_str) if multifeed_metadata_list: entries = [] feed_ids = [] for feed in multifeed_metadata_list.split(','): # Unquote should take place before split on comma (,) since textual # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) feed_data = compat_parse_qs( compat_urllib_parse_unquote_plus(feed)) def feed_entry(name): return try_get( feed_data, lambda x: x[name][0], compat_str) feed_id = feed_entry('id') if not feed_id: continue feed_title = feed_entry('title') title = video_title if feed_title: title += ' (%s)' % feed_title entries.append({ '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( base_url + 'watch?v=' + feed_data['id'][0], {'force_singlefeed': True}), 'title': title, }) feed_ids.append(feed_id) self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) return self.playlist_result( entries, video_id, video_title, video_description) else: self.to_screen('Downloading just video %s because of --no-playlist' % video_id) formats = [] itags = [] itag_qualities = {} player_url = None q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) streaming_data = player_response.get('streamingData') or {} streaming_formats = streaming_data.get('formats') or [] streaming_formats.extend(streaming_data.get('adaptiveFormats') or []) for fmt in streaming_formats: if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): continue itag = str_or_none(fmt.get('itag')) quality = fmt.get('quality') if itag and quality: itag_qualities[itag] = quality # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment # (adding `&sq=0` to the URL) and parsing emsg box to determine the # number of fragment that would subsequently requested with (`&sq=N`) if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF': continue fmt_url = fmt.get('url') if not fmt_url: sc = compat_parse_qs(fmt.get('signatureCipher')) fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) encrypted_sig = try_get(sc, lambda x: x['s'][0]) if not (sc and fmt_url and encrypted_sig): continue if not player_url: if not webpage: continue player_url = self._search_regex( r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', webpage, 'player URL', fatal=False) if not player_url: continue signature = self._decrypt_signature(sc['s'][0], video_id, player_url) sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' fmt_url += '&' + sp + '=' + signature if itag: itags.append(itag) tbr = float_or_none( fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': itag, 'format_note': fmt.get('qualityLabel') or quality, 'fps': int_or_none(fmt.get('fps')), 'height': int_or_none(fmt.get('height')), 'quality': q(quality), 'tbr': tbr, 'url': fmt_url, 'width': fmt.get('width'), } mimetype = fmt.get('mimeType') if mimetype: mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype) if mobj: dct['ext'] = mimetype2ext(mobj.group(1)) dct.update(parse_codecs(mobj.group(2))) no_audio = dct.get('acodec') == 'none' no_video = dct.get('vcodec') == 'none' if no_audio: dct['vbr'] = tbr if no_video: dct['abr'] = tbr if no_audio or no_video: dct['downloader_options'] = { # Youtube throttles chunks >~10M 'http_chunk_size': 10485760, } if dct.get('ext'): dct['container'] = dct['ext'] + '_dash' formats.append(dct) hls_manifest_url = streaming_data.get('hlsManifestUrl') if hls_manifest_url: for f in self._extract_m3u8_formats( hls_manifest_url, video_id, 'mp4', fatal=False): itag = self._search_regex( r'/itag/(\d+)', f['url'], 'itag', default=None) if itag: f['format_id'] = itag formats.append(f) if self._downloader.params.get('youtube_include_dash_manifest'): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: for f in self._extract_mpd_formats( dash_manifest_url, video_id, fatal=False): itag = f['format_id'] if itag in itags: continue if itag in itag_qualities: # Not actually usefull since the sorting is already done with "quality,res,fps,codec" # but kept to maintain feature parity (and code similarity) with youtube-dl # Remove if this causes any issues with sorting in future f['quality'] = q(itag_qualities[itag]) filesize = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if filesize: f['filesize'] = filesize formats.append(f) if not formats: if not self._downloader.params.get('allow_unplayable_formats') and streaming_data.get('licenseInfos'): raise ExtractorError( 'This video is DRM protected.', expected=True) pemr = try_get( playability_status, lambda x: x['errorScreen']['playerErrorMessageRenderer'], dict) or {} reason = get_text(pemr.get('reason')) or playability_status.get('reason') subreason = pemr.get('subreason') if subreason: subreason = clean_html(get_text(subreason)) if subreason == 'The uploader has not made this video available in your country.': countries = microformat.get('availableCountries') if not countries: regions_allowed = search_meta('regionsAllowed') countries = regions_allowed.split(',') if regions_allowed else None self.raise_geo_restricted( subreason, countries) reason += '\n' + subreason if reason: raise ExtractorError(reason, expected=True) self._sort_formats(formats) keywords = video_details.get('keywords') or [] if not keywords and webpage: keywords = [ unescapeHTML(m.group('content')) for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] for keyword in keywords: if keyword.startswith('yt:stretch='): w, h = keyword.split('=')[1].split(':') w, h = int(w), int(h) if w > 0 and h > 0: ratio = w / h for f in formats: if f.get('vcodec') != 'none': f['stretched_ratio'] = ratio thumbnails = [] for container in (video_details, microformat): for thumbnail in (try_get( container, lambda x: x['thumbnail']['thumbnails'], list) or []): thumbnail_url = thumbnail.get('url') if not thumbnail_url: continue thumbnails.append({ 'height': int_or_none(thumbnail.get('height')), 'url': thumbnail_url, 'width': int_or_none(thumbnail.get('width')), }) if thumbnails: break else: thumbnail = search_meta(['og:image', 'twitter:image']) if thumbnail: thumbnails = [{'url': thumbnail}] category = microformat.get('category') or search_meta('genre') channel_id = video_details.get('channelId') \ or microformat.get('externalChannelId') \ or search_meta('channelId') duration = int_or_none( video_details.get('lengthSeconds') or microformat.get('lengthSeconds')) \ or parse_duration(search_meta('duration')) is_live = video_details.get('isLive') owner_profile_url = microformat.get('ownerProfileUrl') info = { 'id': video_id, 'title': self._live_title(video_title) if is_live else video_title, 'formats': formats, 'thumbnails': thumbnails, 'description': video_description, 'upload_date': unified_strdate( microformat.get('uploadDate') or search_meta('uploadDate')), 'uploader': video_details['author'], 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, 'uploader_url': owner_profile_url, 'channel_id': channel_id, 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None, 'duration': duration, 'view_count': int_or_none( video_details.get('viewCount') or microformat.get('viewCount') or search_meta('interactionCount')), 'average_rating': float_or_none(video_details.get('averageRating')), 'age_limit': 18 if ( microformat.get('isFamilySafe') is False or search_meta('isFamilyFriendly') == 'false' or search_meta('og:restrictions:age') == '18+') else 0, 'webpage_url': webpage_url, 'categories': [category] if category else None, 'tags': keywords, 'is_live': is_live, 'playable_in_embed': playability_status.get('playableInEmbed'), 'was_live': video_details.get('isLiveContent') } pctr = try_get( player_response, lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) subtitles = {} if pctr: def process_language(container, base_url, lang_code, query): lang_subs = [] for fmt in self._SUBTITLE_FORMATS: query.update({ 'fmt': fmt, }) lang_subs.append({ 'ext': fmt, 'url': update_url_query(base_url, query), }) container[lang_code] = lang_subs for caption_track in (pctr.get('captionTracks') or []): base_url = caption_track.get('baseUrl') if not base_url: continue if caption_track.get('kind') != 'asr': lang_code = caption_track.get('languageCode') if not lang_code: continue process_language( subtitles, base_url, lang_code, {}) continue automatic_captions = {} for translation_language in (pctr.get('translationLanguages') or []): translation_language_code = translation_language.get('languageCode') if not translation_language_code: continue process_language( automatic_captions, base_url, translation_language_code, {'tlang': translation_language_code}) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles parsed_url = compat_urllib_parse_urlparse(url) for component in [parsed_url.fragment, parsed_url.query]: query = compat_parse_qs(component) for k, v in query.items(): for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]: d_k += '_time' if d_k not in info and k in s_ks: info[d_k] = parse_duration(query[k][0]) # Youtube Music Auto-generated description if video_description: mobj = re.search(r'(?s)(?P[^·\n]+)·(?P[^\n]+)\n+(?P[^\n]+)(?:.+?℗\s*(?P\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: release_year = mobj.group('release_year') release_date = mobj.group('release_date') if release_date: release_date = release_date.replace('-', '') if not release_year: release_year = release_date[:4] info.update({ 'album': mobj.group('album'.strip()), 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), 'track': mobj.group('track').strip(), 'release_date': release_date, 'release_year': int_or_none(release_year), }) initial_data = None if webpage: initial_data = self._extract_yt_initial_variable( webpage, self._YT_INITIAL_DATA_RE, video_id, 'yt initial data') if not initial_data: initial_data = self._call_api( 'next', {'videoId': video_id}, video_id, fatal=False) if not is_live: try: # This will error if there is no livechat initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'] info['subtitles']['live_chat'] = [{ 'video_id': video_id, 'ext': 'json', 'protocol': 'youtube_live_chat_replay', }] except (KeyError, IndexError, TypeError): pass if initial_data: chapters = self._extract_chapters_from_json( initial_data, video_id, duration) if not chapters: for engagment_pannel in (initial_data.get('engagementPanels') or []): contents = try_get( engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], list) if not contents: continue def chapter_time(mmlir): return parse_duration( get_text(mmlir.get('timeDescription'))) chapters = [] for next_num, content in enumerate(contents, start=1): mmlir = content.get('macroMarkersListItemRenderer') or {} start_time = chapter_time(mmlir) end_time = chapter_time(try_get( contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ if next_num < len(contents) else duration if start_time is None or end_time is None: continue chapters.append({ 'start_time': start_time, 'end_time': end_time, 'title': get_text(mmlir.get('title')), }) if chapters: break if chapters: info['chapters'] = chapters contents = try_get( initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] for content in contents: vpir = content.get('videoPrimaryInfoRenderer') if vpir: stl = vpir.get('superTitleLink') if stl: stl = get_text(stl) if try_get( vpir, lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': info['location'] = stl else: mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) if mobj: info.update({ 'series': mobj.group(1), 'season_number': int(mobj.group(2)), 'episode_number': int(mobj.group(3)), }) for tlb in (try_get( vpir, lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], list) or []): tbr = tlb.get('toggleButtonRenderer') or {} for getter, regex in [( lambda x: x['defaultText']['accessibility']['accessibilityData'], r'(?P[\d,]+)\s*(?P(?:dis)?like)'), ([ lambda x: x['accessibility'], lambda x: x['accessibilityData']['accessibilityData'], ], r'(?P(?:dis)?like) this video along with (?P[\d,]+) other people')]: label = (try_get(tbr, getter, dict) or {}).get('label') if label: mobj = re.match(regex, label) if mobj: info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) break sbr_tooltip = try_get( vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) if sbr_tooltip: like_count, dislike_count = sbr_tooltip.split(' / ') info.update({ 'like_count': str_to_int(like_count), 'dislike_count': str_to_int(dislike_count), }) vsir = content.get('videoSecondaryInfoRenderer') if vsir: info['channel'] = get_text(try_get( vsir, lambda x: x['owner']['videoOwnerRenderer']['title'], compat_str)) rows = try_get( vsir, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or [] multiple_songs = False for row in rows: if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: multiple_songs = True break for row in rows: mrr = row.get('metadataRowRenderer') or {} mrr_title = mrr.get('title') if not mrr_title: continue mrr_title = get_text(mrr['title']) mrr_contents_text = get_text(mrr['contents'][0]) if mrr_title == 'License': info['license'] = mrr_contents_text elif not multiple_songs: if mrr_title == 'Album': info['album'] = mrr_contents_text elif mrr_title == 'Artist': info['artist'] = mrr_contents_text elif mrr_title == 'Song': info['track'] = mrr_contents_text fallbacks = { 'channel': 'uploader', 'channel_id': 'uploader_id', 'channel_url': 'uploader_url', } for to, frm in fallbacks.items(): if not info.get(to): info[to] = info.get(frm) for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: v = info.get(s_k) if v: info[d_k] = v # get xsrf for annotations or comments get_annotations = self._downloader.params.get('writeannotations', False) get_comments = self._downloader.params.get('getcomments', False) if get_annotations or get_comments: xsrf_token = None ytcfg = self._extract_ytcfg(video_id, webpage) if ytcfg: xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str) if not xsrf_token: xsrf_token = self._search_regex( r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P(?:(?!\2).)+)\2', webpage, 'xsrf token', group='xsrf_token', fatal=False) # annotations if get_annotations: invideo_url = try_get( player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) if xsrf_token and invideo_url: xsrf_field_name = None if ytcfg: xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str) if not xsrf_field_name: xsrf_field_name = self._search_regex( r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P\w+)\2', webpage, 'xsrf field name', group='xsrf_field_name', default='session_token') info['annotations'] = self._download_webpage( self._proto_relative_url(invideo_url), video_id, note='Downloading annotations', errnote='Unable to download video annotations', fatal=False, data=urlencode_postdata({xsrf_field_name: xsrf_token})) # Get comments # TODO: Refactor and move to seperate function def extract_comments(): expected_video_comment_count = 0 video_comments = [] comment_xsrf = xsrf_token def find_value(html, key, num_chars=2, separator='"'): pos_begin = html.find(key) + len(key) + num_chars pos_end = html.find(separator, pos_begin) return html[pos_begin: pos_end] def search_dict(partial, key): if isinstance(partial, dict): for k, v in partial.items(): if k == key: yield v else: for o in search_dict(v, key): yield o elif isinstance(partial, list): for i in partial: for o in search_dict(i, key): yield o continuations = [] if initial_data: try: ncd = next(search_dict(initial_data, 'nextContinuationData')) continuations = [ncd['continuation']] # Handle videos where comments have been disabled entirely except StopIteration: pass def get_continuation(continuation, session_token, replies=False): query = { 'pbj': 1, 'ctoken': continuation, } if replies: query['action_get_comment_replies'] = 1 else: query['action_get_comments'] = 1 while True: content, handle = self._download_webpage_handle( 'https://www.youtube.com/comment_service_ajax', video_id, note=False, expected_status=[413], data=urlencode_postdata({ 'session_token': session_token }), query=query, headers={ 'Accept': '*/*', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0', 'X-YouTube-Client-Name': '1', 'X-YouTube-Client-Version': '2.20201202.06.01' } ) response_code = handle.getcode() if (response_code == 200): return self._parse_json(content, video_id) if (response_code == 413): return None raise ExtractorError('Unexpected HTTP error code: %s' % response_code) first_continuation = True chain_msg = '' self.to_screen('Downloading comments') while continuations: continuation = continuations.pop() comment_response = get_continuation(continuation, comment_xsrf) if not comment_response: continue if list(search_dict(comment_response, 'externalErrorMessage')): raise ExtractorError('Error returned from server: ' + next(search_dict(comment_response, 'externalErrorMessage'))) if 'continuationContents' not in comment_response['response']: # Something is wrong here. Youtube won't accept this continuation token for some reason and responds with a user satisfaction dialog (error?) continue # not sure if this actually helps if 'xsrf_token' in comment_response: comment_xsrf = comment_response['xsrf_token'] item_section = comment_response['response']['continuationContents']['itemSectionContinuation'] if first_continuation: expected_video_comment_count = int(item_section['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'].replace(' Comments', '').replace('1 Comment', '1').replace(',', '')) first_continuation = False if 'contents' not in item_section: # continuation returned no comments? # set an empty array as to not break the for loop item_section['contents'] = [] for meta_comment in item_section['contents']: comment = meta_comment['commentThreadRenderer']['comment']['commentRenderer'] video_comments.append({ 'id': comment['commentId'], 'text': ''.join([c['text'] for c in try_get(comment, lambda x: x['contentText']['runs'], list) or []]), 'time_text': ''.join([c['text'] for c in comment['publishedTimeText']['runs']]), 'author': comment.get('authorText', {}).get('simpleText', ''), 'votes': comment.get('voteCount', {}).get('simpleText', '0'), 'author_thumbnail': comment['authorThumbnail']['thumbnails'][-1]['url'], 'parent': 'root' }) if 'replies' not in meta_comment['commentThreadRenderer']: continue reply_continuations = [rcn['nextContinuationData']['continuation'] for rcn in meta_comment['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']] while reply_continuations: time.sleep(1) continuation = reply_continuations.pop() replies_data = get_continuation(continuation, comment_xsrf, True) if not replies_data or 'continuationContents' not in replies_data[1]['response']: continue if self._downloader.params.get('verbose', False): chain_msg = ' (chain %s)' % comment['commentId'] self.to_screen('Comments downloaded: %d of ~%d%s' % (len(video_comments), expected_video_comment_count, chain_msg)) reply_comment_meta = replies_data[1]['response']['continuationContents']['commentRepliesContinuation'] for reply_meta in reply_comment_meta.get('contents', {}): reply_comment = reply_meta['commentRenderer'] video_comments.append({ 'id': reply_comment['commentId'], 'text': ''.join([c['text'] for c in reply_comment['contentText']['runs']]), 'time_text': ''.join([c['text'] for c in reply_comment['publishedTimeText']['runs']]), 'author': reply_comment.get('authorText', {}).get('simpleText', ''), 'votes': reply_comment.get('voteCount', {}).get('simpleText', '0'), 'author_thumbnail': reply_comment['authorThumbnail']['thumbnails'][-1]['url'], 'parent': comment['commentId'] }) if 'continuations' not in reply_comment_meta or len(reply_comment_meta['continuations']) == 0: continue reply_continuations += [rcn['nextContinuationData']['continuation'] for rcn in reply_comment_meta['continuations']] self.to_screen('Comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count)) if 'continuations' in item_section: continuations += [ncd['nextContinuationData']['continuation'] for ncd in item_section['continuations']] time.sleep(1) self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count)) return { 'comments': video_comments, 'comment_count': expected_video_comment_count } if get_comments: info['__post_extractor'] = extract_comments self.mark_watched(video_id, player_response) return info class YoutubeTabIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com tab' _VALID_URL = r'''(?x) https?:// (?:\w+\.)? (?: youtube(?:kids)?\.com| invidio\.us )/ (?: (?:channel|c|user)/| (?P feed/|hashtag/| (?:playlist|watch)\?.*?\blist= )| (?!(?:%s)\b) # Direct URLs ) (?P[^/?\#&]+) ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES IE_NAME = 'youtube:tab' _TESTS = [{ # playlists, multipage 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', 'title': 'Игорь Клейнер - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', 'uploader': 'Игорь Клейнер', 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', }, }, { # playlists, multipage, different order 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', 'title': 'Игорь Клейнер - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', 'uploader': 'Игорь Клейнер', }, }, { # playlists, singlepage 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', 'playlist_mincount': 4, 'info_dict': { 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', 'title': 'ThirstForScience - Playlists', 'description': 'md5:609399d937ea957b0f53cbffb747a14c', 'uploader': 'ThirstForScience', 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', } }, { 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', 'only_matching': True, }, { # basic, single video playlist 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'uploader': 'Sergey M.', 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'title': 'youtube-dl public playlist', }, 'playlist_count': 1, }, { # empty playlist 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'uploader': 'Sergey M.', 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'title': 'youtube-dl empty playlist', }, 'playlist_count': 0, }, { # Home tab 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Home', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'uploader': 'lex will', 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', }, 'playlist_mincount': 2, }, { # Videos tab 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Videos', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'uploader': 'lex will', 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', }, 'playlist_mincount': 975, }, { # Videos tab, sorted by popular 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Videos', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'uploader': 'lex will', 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', }, 'playlist_mincount': 199, }, { # Playlists tab 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Playlists', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'uploader': 'lex will', 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', }, 'playlist_mincount': 17, }, { # Community tab 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Community', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'uploader': 'lex will', 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', }, 'playlist_mincount': 18, }, { # Channels tab 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Channels', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'uploader': 'lex will', 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', }, 'playlist_mincount': 12, }, { 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'info_dict': { 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'uploader': 'Christiaan008', 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', }, 'playlist_count': 96, }, { 'note': 'Large playlist', 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', 'info_dict': { 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', 'uploader': 'Cauchemar', 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', }, 'playlist_mincount': 1123, }, { # even larger playlist, 8832 videos 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', 'only_matching': True, }, { 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', 'info_dict': { 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', 'uploader': 'Interstellar Movie', 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincount': 21, }, { # https://github.com/ytdl-org/youtube-dl/issues/21844 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', 'info_dict': { 'title': 'Data Analysis with Dr Mike Pound', 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', 'uploader': 'Computerphile', 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', }, 'playlist_mincount': 11, }, { 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'only_matching': True, }, { # Playlist URL that does not actually serve a playlist 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', 'info_dict': { 'id': 'FqZTN594JQw', 'ext': 'webm', 'title': "Smiley's People 01 detective, Adventure Series, Action", 'uploader': 'STREEM', 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', 'upload_date': '20150526', 'license': 'Standard YouTube License', 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', 'categories': ['People & Blogs'], 'tags': list, 'view_count': int, 'like_count': int, 'dislike_count': int, }, 'params': { 'skip_download': True, }, 'skip': 'This video is not available.', 'add_ie': [YoutubeIE.ie_key()], }, { 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', 'only_matching': True, }, { 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', 'only_matching': True, }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { 'id': '9Auq9mYxFEE', 'ext': 'mp4', 'title': compat_str, 'uploader': 'Sky News', 'uploader_id': 'skynews', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', 'upload_date': '20191102', 'description': 'md5:85ddd75d888674631aaf9599a9a0b0ae', 'categories': ['News & Politics'], 'tags': list, 'like_count': int, 'dislike_count': int, }, 'params': { 'skip_download': True, }, }, { 'url': 'https://www.youtube.com/user/TheYoungTurks/live', 'info_dict': { 'id': 'a48o2S1cPoo', 'ext': 'mp4', 'title': 'The Young Turks - Live Main Show', 'uploader': 'The Young Turks', 'uploader_id': 'TheYoungTurks', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', 'upload_date': '20150715', 'license': 'Standard YouTube License', 'description': 'md5:438179573adcdff3c97ebb1ee632b891', 'categories': ['News & Politics'], 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], 'like_count': int, 'dislike_count': int, }, 'params': { 'skip_download': True, }, 'only_matching': True, }, { 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', 'only_matching': True, }, { 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', 'only_matching': True, }, { 'url': 'https://www.youtube.com/feed/trending', 'only_matching': True, }, { # needs auth 'url': 'https://www.youtube.com/feed/library', 'only_matching': True, }, { # needs auth 'url': 'https://www.youtube.com/feed/history', 'only_matching': True, }, { # needs auth 'url': 'https://www.youtube.com/feed/subscriptions', 'only_matching': True, }, { # needs auth 'url': 'https://www.youtube.com/feed/watch_later', 'only_matching': True, }, { # no longer available? 'url': 'https://www.youtube.com/feed/recommended', 'only_matching': True, }, { # inline playlist with not always working continuations 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', 'only_matching': True, }, { 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8', 'only_matching': True, }, { 'url': 'https://www.youtube.com/course', 'only_matching': True, }, { 'url': 'https://www.youtube.com/zsecurity', 'only_matching': True, }, { 'url': 'http://www.youtube.com/NASAgovVideo/videos', 'only_matching': True, }, { 'url': 'https://www.youtube.com/TheYoungTurks/live', 'only_matching': True, }] @classmethod def suitable(cls, url): return False if YoutubeIE.suitable(url) else super( YoutubeTabIE, cls).suitable(url) def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( 'channelId', webpage, 'channel id', default=None) if channel_id: return channel_id channel_url = self._html_search_meta( ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url', 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad', 'twitter:app:url:googleplay'), webpage, 'channel url') return self._search_regex( r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+', channel_url, 'channel id') @staticmethod def _extract_basic_item_renderer(item): # Modified from _extract_grid_item_renderer known_renderers = ( 'playlistRenderer', 'videoRenderer', 'channelRenderer' 'gridPlaylistRenderer', 'gridVideoRenderer', 'gridChannelRenderer' ) for key, renderer in item.items(): if key not in known_renderers: continue return renderer def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: if not isinstance(item, dict): continue renderer = self._extract_basic_item_renderer(item) if not isinstance(renderer, dict): continue title = try_get( renderer, lambda x: x['title']['runs'][0]['text'], compat_str) # playlist playlist_id = renderer.get('playlistId') if playlist_id: yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) # video video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) # channel channel_id = renderer.get('channelId') if channel_id: title = try_get( renderer, lambda x: x['title']['simpleText'], compat_str) yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') if not isinstance(content, dict): return renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer') if renderer: # TODO: add support for nested playlists so each shelf is processed # as separate playlist # TODO: this includes only first N items for entry in self._grid_entries(renderer): yield entry renderer = content.get('horizontalListRenderer') if renderer: # TODO pass def _shelf_entries(self, shelf_renderer, skip_channels=False): ep = try_get( shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], compat_str) shelf_url = urljoin('https://www.youtube.com', ep) if shelf_url: # Skipping links to another channels, note that checking for # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL # will not work if skip_channels and '/channels?' in shelf_url: return title = try_get( shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) yield self.url_result(shelf_url, video_title=title) # Shelf may not contain shelf URL, fallback to extraction from content for entry in self._shelf_entries_from_content(shelf_renderer): yield entry def _playlist_entries(self, video_list_renderer): for content in video_list_renderer['contents']: if not isinstance(content, dict): continue renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer') if not isinstance(renderer, dict): continue video_id = renderer.get('videoId') if not video_id: continue yield self._extract_video(renderer) def _rich_entries(self, rich_grid_renderer): renderer = try_get( rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {} video_id = renderer.get('videoId') if not video_id: return yield self._extract_video(renderer) def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') if video_id: return self._extract_video(video_renderer) def _post_thread_entries(self, post_thread_renderer): post_renderer = try_get( post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict) if not post_renderer: return # video attachment video_renderer = try_get( post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) video_id = None if video_renderer: entry = self._video_entry(video_renderer) if entry: yield entry # inline video links runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or [] for run in runs: if not isinstance(run, dict): continue ep_url = try_get( run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str) if not ep_url: continue if not YoutubeIE.suitable(ep_url): continue ep_video_id = YoutubeIE._match_id(ep_url) if video_id == ep_video_id: continue yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id) def _post_thread_continuation_entries(self, post_thread_continuation): contents = post_thread_continuation.get('contents') if not isinstance(contents, list): return for content in contents: renderer = content.get('backstagePostThreadRenderer') if not isinstance(renderer, dict): continue for entry in self._post_thread_entries(renderer): yield entry @staticmethod def _build_continuation_query(continuation, ctp=None): query = { 'ctoken': continuation, 'continuation': continuation, } if ctp: query['itct'] = ctp return query @staticmethod def _extract_next_continuation_data(renderer): next_continuation = try_get( renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict) if not next_continuation: return continuation = next_continuation.get('continuation') if not continuation: return ctp = next_continuation.get('clickTrackingParams') return YoutubeTabIE._build_continuation_query(continuation, ctp) @classmethod def _extract_continuation(cls, renderer): next_continuation = cls._extract_next_continuation_data(renderer) if next_continuation: return next_continuation contents = [] for key in ('contents', 'items'): contents.extend(try_get(renderer, lambda x: x[key], list) or []) for content in contents: if not isinstance(content, dict): continue continuation_ep = try_get( content, lambda x: x['continuationItemRenderer']['continuationEndpoint'], dict) if not continuation_ep: continue continuation = try_get( continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) if not continuation: continue ctp = continuation_ep.get('clickTrackingParams') return YoutubeTabIE._build_continuation_query(continuation, ctp) def _entries(self, tab, item_id, identity_token, account_syncid): def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] for content in contents: if not isinstance(content, dict): continue is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict) if not is_renderer: renderer = content.get('richItemRenderer') if renderer: for entry in self._rich_entries(renderer): yield entry continuation_list[0] = self._extract_continuation(parent_renderer) continue isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] for isr_content in isr_contents: if not isinstance(isr_content, dict): continue known_renderers = { 'playlistVideoListRenderer': self._playlist_entries, 'gridRenderer': self._grid_entries, 'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'), 'backstagePostThreadRenderer': self._post_thread_entries, 'videoRenderer': lambda x: [self._video_entry(x)], } for key, renderer in isr_content.items(): if key not in known_renderers: continue for entry in known_renderers[key](renderer): if entry: yield entry continuation_list[0] = self._extract_continuation(renderer) break if not continuation_list[0]: continuation_list[0] = self._extract_continuation(is_renderer) if not continuation_list[0]: continuation_list[0] = self._extract_continuation(parent_renderer) continuation_list = [None] # Python 2 doesnot support nonlocal tab_content = try_get(tab, lambda x: x['content'], dict) if not tab_content: return parent_renderer = ( try_get(tab_content, lambda x: x['sectionListRenderer'], dict) or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {}) for entry in extract_entries(parent_renderer): yield entry continuation = continuation_list[0] headers = { 'x-youtube-client-name': '1', 'x-youtube-client-version': '2.20201112.04.01', } if identity_token: headers['x-youtube-identity-token'] = identity_token if account_syncid: headers['X-Goog-PageId'] = account_syncid headers['X-Goog-AuthUser'] = 0 for page_num in itertools.count(1): if not continuation: break retries = self._downloader.params.get('extractor_retries', 3) count = -1 last_error = None while count < retries: count += 1 if last_error: self.report_warning('%s. Retrying ...' % last_error) try: response = self._call_api( ep="browse", fatal=True, headers=headers, video_id='%s page %s' % (item_id, page_num), query={ 'continuation': continuation['continuation'], 'clickTracking': {'clickTrackingParams': continuation['itct']}, }, note='Downloading API JSON%s' % (' (retry #%d)' % count if count else '')) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404): # Downloading page may result in intermittent 5xx HTTP error # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289 last_error = 'HTTP Error %s' % e.cause.code if count < retries: continue raise else: # Youtube sometimes sends incomplete data # See: https://github.com/ytdl-org/youtube-dl/issues/28194 if dict_get(response, ('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints')): break # Youtube may send alerts if there was an issue with the continuation page self._extract_alerts(response, expected=False) last_error = 'Incomplete data received' if count >= retries: self._downloader.report_error(last_error) if not response: break known_continuation_renderers = { 'playlistVideoListContinuation': self._playlist_entries, 'gridContinuation': self._grid_entries, 'itemSectionContinuation': self._post_thread_continuation_entries, 'sectionListContinuation': extract_entries, # for feeds } continuation_contents = try_get( response, lambda x: x['continuationContents'], dict) or {} continuation_renderer = None for key, value in continuation_contents.items(): if key not in known_continuation_renderers: continue continuation_renderer = value continuation_list = [None] for entry in known_continuation_renderers[key](continuation_renderer): yield entry continuation = continuation_list[0] or self._extract_continuation(continuation_renderer) break if continuation_renderer: continue known_renderers = { 'gridPlaylistRenderer': (self._grid_entries, 'items'), 'gridVideoRenderer': (self._grid_entries, 'items'), 'playlistVideoRenderer': (self._playlist_entries, 'contents'), 'itemSectionRenderer': (extract_entries, 'contents'), # for feeds 'richItemRenderer': (extract_entries, 'contents'), # for hashtag 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents') } continuation_items = try_get( response, lambda x: dict_get(x, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))[0]['appendContinuationItemsAction']['continuationItems'], list) continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {} video_items_renderer = None for key, value in continuation_item.items(): if key not in known_renderers: continue video_items_renderer = {known_renderers[key][1]: continuation_items} continuation_list = [None] for entry in known_renderers[key][0](video_items_renderer): yield entry continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) break if video_items_renderer: continue break @staticmethod def _extract_selected_tab(tabs): for tab in tabs: if try_get(tab, lambda x: x['tabRenderer']['selected'], bool): return tab['tabRenderer'] else: raise ExtractorError('Unable to find selected tab') @staticmethod def _extract_uploader(data): uploader = {} sidebar_renderer = try_get( data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) if sidebar_renderer: for item in sidebar_renderer: if not isinstance(item, dict): continue renderer = item.get('playlistSidebarSecondaryInfoRenderer') if not isinstance(renderer, dict): continue owner = try_get( renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) if owner: uploader['uploader'] = owner.get('text') uploader['uploader_id'] = try_get( owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) uploader['uploader_url'] = urljoin( 'https://www.youtube.com/', try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) return {k: v for k, v in uploader.items() if v is not None} def _extract_from_tabs(self, item_id, webpage, data, tabs): playlist_id = title = description = channel_url = channel_name = channel_id = None thumbnails_list = tags = [] selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) if renderer: channel_name = renderer.get('title') channel_url = renderer.get('channelUrl') channel_id = renderer.get('externalId') if not renderer: renderer = try_get( data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) if renderer: title = renderer.get('title') description = renderer.get('description', '') playlist_id = channel_id tags = renderer.get('keywords', '').split() thumbnails_list = ( try_get(renderer, lambda x: x['avatar']['thumbnails'], list) or try_get( data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'], list) or []) thumbnails = [] for t in thumbnails_list: if not isinstance(t, dict): continue thumbnail_url = url_or_none(t.get('url')) if not thumbnail_url: continue thumbnails.append({ 'url': thumbnail_url, 'width': int_or_none(t.get('width')), 'height': int_or_none(t.get('height')), }) if playlist_id is None: playlist_id = item_id if title is None: title = playlist_id title += format_field(selected_tab, 'title', ' - %s') metadata = { 'playlist_id': playlist_id, 'playlist_title': title, 'playlist_description': description, 'uploader': channel_name, 'uploader_id': channel_id, 'uploader_url': channel_url, 'thumbnails': thumbnails, 'tags': tags, } if not channel_id: metadata.update(self._extract_uploader(data)) metadata.update({ 'channel': metadata['uploader'], 'channel_id': metadata['uploader_id'], 'channel_url': metadata['uploader_url']}) return self.playlist_result( self._entries( selected_tab, playlist_id, self._extract_identity_token(webpage, item_id), self._extract_account_syncid(data)), **metadata) def _extract_mix_playlist(self, playlist, playlist_id): page_num = 0 while True: videos = list(self._playlist_entries(playlist)) if not videos: return video_count = len(videos) start = min(video_count - 24, 26) if video_count > 25 else 0 for item in videos[start:]: yield item page_num += 1 _, data = self._extract_webpage( 'https://www.youtube.com/watch?list=%s&v=%s' % (playlist_id, videos[-1]['id']), '%s page %d' % (playlist_id, page_num)) playlist = try_get( data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) def _extract_from_playlist(self, item_id, url, data, playlist): title = playlist.get('title') or try_get( data, lambda x: x['titleText']['simpleText'], compat_str) playlist_id = playlist.get('playlistId') or item_id # Delegating everything except mix playlists to regular tab-based playlist URL playlist_url = urljoin(url, try_get( playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], compat_str)) if playlist_url and playlist_url != url: return self.url_result( playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) return self.playlist_result( self._extract_mix_playlist(playlist, playlist_id), playlist_id=playlist_id, playlist_title=title) def _extract_alerts(self, data, expected=False): def _real_extract_alerts(): for alert_dict in try_get(data, lambda x: x['alerts'], list) or []: if not isinstance(alert_dict, dict): continue for alert in alert_dict.values(): alert_type = alert.get('type') if not alert_type: continue message = try_get(alert, lambda x: x['text']['simpleText'], compat_str) if message: yield alert_type, message for run in try_get(alert, lambda x: x['text']['runs'], list) or []: message = try_get(run, lambda x: x['text'], compat_str) if message: yield alert_type, message err_msg = None for alert_type, alert_message in _real_extract_alerts(): if alert_type.lower() == 'error': if err_msg: self._downloader.report_warning('YouTube said: %s - %s' % ('ERROR', err_msg)) err_msg = alert_message else: self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message)) if err_msg: raise ExtractorError('YouTube said: %s' % err_msg, expected=expected) def _extract_identity_token(self, webpage, item_id): ytcfg = self._extract_ytcfg(item_id, webpage) if ytcfg: token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) if token: return token return self._search_regex( r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, 'identity token', default=None) @staticmethod def _extract_account_syncid(data): """Extract syncId required to download private playlists of secondary channels""" sync_ids = ( try_get(data, lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], compat_str) or '').split("||") if len(sync_ids) >= 2 and sync_ids[1]: # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel # and just "user_syncid||" for primary channel. We only want the channel_syncid return sync_ids[0] def _extract_webpage(self, url, item_id): retries = self._downloader.params.get('extractor_retries', 3) count = -1 last_error = 'Incomplete yt initial data recieved' while count < retries: count += 1 # Sometimes youtube returns a webpage with incomplete ytInitialData # See: https://github.com/yt-dlp/yt-dlp/issues/116 if count: self.report_warning('%s. Retrying ...' % last_error) webpage = self._download_webpage( url, item_id, 'Downloading webpage%s' % (' (retry #%d)' % count if count else '')) data = self._extract_yt_initial_data(item_id, webpage) self._extract_alerts(data, expected=True) if data.get('contents') or data.get('currentVideoEndpoint'): break if count >= retries: self._downloader.report_error(last_error) return webpage, data def _real_extract(self, url): item_id = self._match_id(url) url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) # This is not matched in a channel page with a tab selected mobj = re.match(r'(?P
%s)(?P/?(?![^#?]).*$)' % self._VALID_URL, url)
        mobj = mobj.groupdict() if mobj else {}
        if mobj and not mobj.get('not_channel'):
            self._downloader.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            url = '%s/videos%s' % (mobj.get('pre'), mobj.get('post') or '')

        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        if not video_id and (mobj.get('not_channel') or '').startswith('watch'):
            if not playlist_id:
                # If there is neither video or playlist ids,
                # youtube redirects to home page, which is undesirable
                raise ExtractorError('Unable to recognize tab page')
            self._downloader.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
            url = 'https://www.youtube.com/playlist?list=%s' % playlist_id

        if video_id and playlist_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage, data = self._extract_webpage(url, item_id)

        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs)

        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist)

        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            self._downloader.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)

        raise ExtractorError('Unable to recognize tab page')


class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        return False if YoutubeTabIE.suitable(url) else super(
            YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if not qs:
            qs = {'list': playlist_id}
        return self.url_result(
            update_url_query('https://www.youtube.com/playlist', qs),
            ie=YoutubeTabIE.ie_key(), video_id=playlist_id)


class YoutubeYtBeIE(InfoExtractor):
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P[0-9A-Za-z_-]{11})/*?.*?\blist=(?P%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        playlist_id = mobj.group('playlist_id')
        return self.url_result(
            update_url_query('https://www.youtube.com/watch', {
                'v': video_id,
                'list': playlist_id,
                'feature': 'youtu.be',
            }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id)


class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)
        return self.url_result(
            'https://www.youtube.com/user/%s' % user_id,
            ie=YoutubeTabIE.ie_key(), video_id=user_id)


class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        return self.url_result(
            'https://www.youtube.com/playlist?list=LL',
            ie=YoutubeTabIE.ie_key())


class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        data = {'query': query}
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._call_api(
                ep='search', video_id='query "%s"' % query, fatal=False,
                note='Downloading page %s' % page_num, query=data)
            if not search:
                break
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation_token = None
            for slr_content in slr_contents:
                if continuation_token is None:
                    continuation_token = try_get(
                        slr_content,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                        compat_str)

                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    if not isinstance(content, dict):
                        continue
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

            if not continuation_token:
                break
            data['continuation'] = continuation_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)


class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    _SEARCH_PARAMS = 'CAI%3D'


class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        return cls._VALID_URL

    def _real_extract(self, url):
        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        query = (qs.get('search_query') or qs.get('q'))[0]
        self._SEARCH_PARAMS = qs.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)


class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        return self.url_result(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            ie=YoutubeTabIE.ie_key())


class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        return self.url_result(
            'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key())


class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]


class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]


class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    _VALID_URL = r':ythis(?:tory)?'
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]


class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .',
            expected=True)


class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)