@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter
from . . swfinterp import SWFInterpreter
from . . swfinterp import SWFInterpreter
from . . compat import (
from . . compat import (
compat_chr ,
compat_chr ,
compat_HTTPError ,
compat_kwargs ,
compat_kwargs ,
compat_parse_qs ,
compat_parse_qs ,
compat_urllib_parse_unquote ,
compat_urllib_parse_unquote ,
@ -64,9 +65,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_TFA_URL = ' https://accounts.google.com/_/signin/challenge?hl=en&TL= {0} '
_TFA_URL = ' https://accounts.google.com/_/signin/challenge?hl=en&TL= {0} '
_RESERVED_NAMES = (
_RESERVED_NAMES = (
r ' course|embed|channel|c|user|playlist|watch|w|results|storefront|oops |'
r ' embed|e|watch_popup|channel|c|user|playlist|watch|w|v|movies|results|shared |'
r ' s hared |index|account|reporthistory|t/terms|about|upload|signin|logout|'
r ' s torefront|oops |index|account|reporthistory|t/terms|about|upload|signin|logout|'
r ' feed/( watch_later|history|subscriptions|library|trending|recommended)' )
r ' feed/( ?: watch_later|history|subscriptions|library|trending|recommended)' )
_NETRC_MACHINE = ' youtube '
_NETRC_MACHINE = ' youtube '
# If True it will raise an error if no login info is provided
# If True it will raise an error if no login info is provided
@ -74,11 +75,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_PLAYLIST_ID_RE = r ' (?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_] { 10,}|RDMM|WL|LL|LM) '
_PLAYLIST_ID_RE = r ' (?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_] { 10,}|RDMM|WL|LL|LM) '
_YOUTUBE_CLIENT_HEADERS = {
' x-youtube-client-name ' : ' 1 ' ,
' x-youtube-client-version ' : ' 1.20200609.04.02 ' ,
}
def _set_language ( self ) :
def _set_language ( self ) :
self . _set_cookie (
self . _set_cookie (
' .youtube.com ' , ' PREF ' , ' f1=50000000&f6=8&hl=en ' ,
' .youtube.com ' , ' PREF ' , ' f1=50000000&f6=8&hl=en ' ,
@ -307,6 +303,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
}
}
_YT_INITIAL_DATA_RE = r ' (?:window \ s* \ [ \ s*[ " \' ]ytInitialData[ " \' ] \ s* \ ]|ytInitialData) \ s*= \ s*( { .+?}) \ s*; '
_YT_INITIAL_DATA_RE = r ' (?:window \ s* \ [ \ s*[ " \' ]ytInitialData[ " \' ] \ s* \ ]|ytInitialData) \ s*= \ s*( { .+?}) \ s*; '
_YT_INITIAL_PLAYER_RESPONSE_RE = r ' ytInitialPlayerResponse \ s*= \ s*( { .+?}) \ s*; '
_YT_INITIAL_BOUNDARY_RE = r ' (?:var \ s+meta|</script| \ n) '
def _call_api ( self , ep , query , video_id ) :
def _call_api ( self , ep , query , video_id ) :
data = self . _DEFAULT_API_DATA . copy ( )
data = self . _DEFAULT_API_DATA . copy ( )
@ -324,10 +322,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _extract_yt_initial_data ( self , video_id , webpage ) :
def _extract_yt_initial_data ( self , video_id , webpage ) :
return self . _parse_json (
return self . _parse_json (
self . _search_regex (
self . _search_regex (
( r ' %s \ s* \ n ' % self . _YT_INITIAL_DATA_RE ,
( r ' %s \ s* %s ' % ( self . _YT_INITIAL_DATA_RE , self . _YT_INITIAL_BOUNDARY_RE ) ,
self . _YT_INITIAL_DATA_RE ) , webpage , ' yt initial data ' ) ,
self . _YT_INITIAL_DATA_RE ) , webpage , ' yt initial data ' ) ,
video_id )
video_id )
def _extract_ytcfg ( self , video_id , webpage ) :
return self . _parse_json (
self . _search_regex (
r ' ytcfg \ .set \ s* \ ( \ s*( { .+?}) \ s* \ ) \ s*; ' , webpage , ' ytcfg ' ,
default = ' {} ' ) , video_id , fatal = False )
class YoutubeIE ( YoutubeBaseInfoExtractor ) :
class YoutubeIE ( YoutubeBaseInfoExtractor ) :
IE_DESC = ' YouTube.com '
IE_DESC = ' YouTube.com '
@ -343,14 +347,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
# Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
( ? : ( ? : www | dev ) \. ) ? invidio \. us / |
( ? : ( ? : www | dev ) \. ) ? invidio \. us / |
( ? : ( ? : www | no ) \. ) ? invidiou \. sh / |
( ? : ( ? : www | no ) \. ) ? invidiou \. sh / |
( ? : ( ? : www | fi | de ) \. ) ? invidious \. snopyta \. org / |
( ? : ( ? : www | fi ) \. ) ? invidious \. snopyta \. org / |
( ? : www \. ) ? invidious \. kabi \. tk / |
( ? : www \. ) ? invidious \. kabi \. tk / |
( ? : www \. ) ? invidious \.13 ad \. de / |
( ? : www \. ) ? invidious \.13 ad \. de / |
( ? : www \. ) ? invidious \. mastodon \. host / |
( ? : www \. ) ? invidious \. mastodon \. host / |
( ? : www \. ) ? invidious \. zapashcanon \. fr / |
( ? : www \. ) ? invidious \. kavin \. rocks / |
( ? : www \. ) ? invidious \. tube / |
( ? : www \. ) ? invidiou \. site / |
( ? : www \. ) ? invidious \. site / |
( ? : www \. ) ? invidious \. xyz / |
( ? : www \. ) ? invidious \. nixnet \. xyz / |
( ? : www \. ) ? invidious \. nixnet \. xyz / |
( ? : www \. ) ? invidious \. drycat \. fr / |
( ? : www \. ) ? invidious \. drycat \. fr / |
( ? : www \. ) ? tube \. poal \. co / |
( ? : www \. ) ? tube \. poal \. co / |
( ? : www \. ) ? tube \. connect \. cafe / |
( ? : www \. ) ? vid \. wxzm \. sx / |
( ? : www \. ) ? vid \. wxzm \. sx / |
( ? : www \. ) ? vid \. mint \. lgbt / |
( ? : www \. ) ? yewtu \. be / |
( ? : www \. ) ? yewtu \. be / |
( ? : www \. ) ? yt \. elukerio \. org / |
( ? : www \. ) ? yt \. elukerio \. org / |
( ? : www \. ) ? yt \. lelux \. fi / |
( ? : www \. ) ? yt \. lelux \. fi / |
@ -506,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
' 396 ' : { ' acodec ' : ' none ' , ' vcodec ' : ' av01.0.05M.08 ' } ,
' 396 ' : { ' acodec ' : ' none ' , ' vcodec ' : ' av01.0.05M.08 ' } ,
' 397 ' : { ' acodec ' : ' none ' , ' vcodec ' : ' av01.0.05M.08 ' } ,
' 397 ' : { ' acodec ' : ' none ' , ' vcodec ' : ' av01.0.05M.08 ' } ,
}
}
_SUBTITLE_FORMATS = ( ' srv1' , ' srv2 ' , ' srv3 ' , ' ttml ' , ' vtt ' ) # TODO 'json3' raising issues with automatic captions
_SUBTITLE_FORMATS = ( ' json3' , ' srv1' , ' srv2 ' , ' srv3 ' , ' ttml ' , ' vtt ' )
_GEO_BYPASS = False
_GEO_BYPASS = False
@ -1092,7 +1104,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
} ,
} ,
} ,
} ,
{
{
# with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093)
# with '};' inside yt initial data (see [1])
# see [2] for an example with '};' inside ytInitialPlayerResponse
# 1. https://github.com/ytdl-org/youtube-dl/issues/27093
# 2. https://github.com/ytdl-org/youtube-dl/issues/27216
' url ' : ' https://www.youtube.com/watch?v=CHqg6qOn4no ' ,
' url ' : ' https://www.youtube.com/watch?v=CHqg6qOn4no ' ,
' info_dict ' : {
' info_dict ' : {
' id ' : ' CHqg6qOn4no ' ,
' id ' : ' CHqg6qOn4no ' ,
@ -1107,6 +1122,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
' skip_download ' : True ,
' skip_download ' : True ,
} ,
} ,
} ,
} ,
{
# another example of '};' in ytInitialData
' url ' : ' https://www.youtube.com/watch?v=gVfgbahppCY ' ,
' only_matching ' : True ,
} ,
{
' url ' : ' https://www.youtube.com/watch_popup?v=63RmMXCd_bQ ' ,
' only_matching ' : True ,
} ,
]
]
def __init__ ( self , * args , * * kwargs ) :
def __init__ ( self , * args , * * kwargs ) :
@ -1335,17 +1359,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self . _parse_json (
return self . _parse_json (
uppercase_escape ( config ) , video_id , fatal = False )
uppercase_escape ( config ) , video_id , fatal = False )
def _get_automatic_captions ( self , video_id , webpage ) :
def _get_automatic_captions ( self , video_id , player_response, player_config ) :
""" We need the webpage for getting the captions url, pass it as an
""" We need the webpage for getting the captions url, pass it as an
argument to speed up the process . """
argument to speed up the process . """
self . to_screen ( ' %s : Looking for automatic captions ' % video_id )
self . to_screen ( ' %s : Looking for automatic captions ' % video_id )
player_config = self . _get_ytplayer_config ( video_id , webpage )
err_msg = ' Couldn \' t find automatic captions for %s ' % video_id
err_msg = ' Couldn \' t find automatic captions for %s ' % video_id
if not player_config :
if not ( player_response or player_config ) :
self . _downloader . report_warning ( err_msg )
self . _downloader . report_warning ( err_msg )
return { }
return { }
try :
try :
args = player_config [ ' args ' ]
args = player_config . get ( ' args ' ) if player_config else { }
caption_url = args . get ( ' ttsurl ' )
caption_url = args . get ( ' ttsurl ' )
if caption_url :
if caption_url :
timestamp = args [ ' timestamp ' ]
timestamp = args [ ' timestamp ' ]
@ -1404,19 +1427,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return captions
return captions
# New captions format as of 22.06.2017
# New captions format as of 22.06.2017
player_response = args . get ( ' player_response ' )
if player_response :
if player_response and isinstance ( player_response , compat_str ) :
renderer = player_response [ ' captions ' ] [ ' playerCaptionsTracklistRenderer ' ]
player_response = self . _parse_json (
base_url = renderer [ ' captionTracks ' ] [ 0 ] [ ' baseUrl ' ]
player_response , video_id , fatal = False )
sub_lang_list = [ ]
if player_response :
for lang in renderer [ ' translationLanguages ' ] :
renderer = player_response [ ' captions ' ] [ ' playerCaptionsTracklistRenderer ' ]
lang_code = lang . get ( ' languageCode ' )
base_url = renderer [ ' captionTracks ' ] [ 0 ] [ ' baseUrl ' ]
if lang_code :
sub_lang_list = [ ]
sub_lang_list . append ( lang_code )
for lang in renderer [ ' translationLanguages ' ] :
return make_captions ( base_url , sub_lang_list )
lang_code = lang . get ( ' languageCode ' )
if lang_code :
sub_lang_list . append ( lang_code )
return make_captions ( base_url , sub_lang_list )
# Some videos don't provide ttsurl but rather caption_tracks and
# Some videos don't provide ttsurl but rather caption_tracks and
# caption_translation_languages (e.g. 20LmZk1hakA)
# caption_translation_languages (e.g. 20LmZk1hakA)
@ -1771,7 +1790,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not video_info and not player_response :
if not video_info and not player_response :
player_response = extract_player_response (
player_response = extract_player_response (
self . _search_regex (
self . _search_regex (
r ' ytInitialPlayerResponse \ s*= \ s*( { .+?}) \ s*; ' , video_webpage ,
( r ' %s \ s* %s ' % ( self . _YT_INITIAL_PLAYER_RESPONSE_RE , self . _YT_INITIAL_BOUNDARY_RE ) ,
self . _YT_INITIAL_PLAYER_RESPONSE_RE ) , video_webpage ,
' initial player response ' , default = ' {} ' ) ,
' initial player response ' , default = ' {} ' ) ,
video_id )
video_id )
@ -2352,7 +2372,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# subtitles
# subtitles
video_subtitles = self . extract_subtitles (
video_subtitles = self . extract_subtitles (
video_id , video_webpage , has_live_chat_replay )
video_id , video_webpage , has_live_chat_replay )
automatic_captions = self . extract_automatic_captions ( video_id , video_webpage )
automatic_captions = self . extract_automatic_captions ( video_id , player_response, ytplayer_config )
video_duration = try_get (
video_duration = try_get (
video_info , lambda x : int_or_none ( x [ ' length_seconds ' ] [ 0 ] ) )
video_info , lambda x : int_or_none ( x [ ' length_seconds ' ] [ 0 ] ) )
@ -2373,16 +2393,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# annotations
# annotations
video_annotations = None
video_annotations = None
if self . _downloader . params . get ( ' writeannotations ' , False ) :
if self . _downloader . params . get ( ' writeannotations ' , False ) :
xsrf_token = self . _search_regex (
xsrf_token = None
r ' ([ \' " ])XSRF_TOKEN \ 1 \ s*: \ s*([ \' " ])(?P<xsrf_token>[A-Za-z0-9+/=]+) \ 2 ' ,
ytcfg = self . _extract_ytcfg ( video_id , video_webpage )
video_webpage , ' xsrf token ' , group = ' xsrf_token ' , fatal = False )
if ytcfg :
xsrf_token = try_get ( ytcfg , lambda x : x [ ' XSRF_TOKEN ' ] , compat_str )
if not xsrf_token :
xsrf_token = self . _search_regex (
r ' ([ \' " ])XSRF_TOKEN \ 1 \ s*: \ s*([ \' " ])(?P<xsrf_token>(?:(?! \ 2).)+) \ 2 ' ,
video_webpage , ' xsrf token ' , group = ' xsrf_token ' , fatal = False )
invideo_url = try_get (
invideo_url = try_get (
player_response , lambda x : x [ ' annotations ' ] [ 0 ] [ ' playerAnnotationsUrlsRenderer ' ] [ ' invideoUrl ' ] , compat_str )
player_response , lambda x : x [ ' annotations ' ] [ 0 ] [ ' playerAnnotationsUrlsRenderer ' ] [ ' invideoUrl ' ] , compat_str )
if xsrf_token and invideo_url :
if xsrf_token and invideo_url :
xsrf_field_name = self . _search_regex (
xsrf_field_name = None
r ' ([ \' " ])XSRF_FIELD_NAME \ 1 \ s*: \ s*([ \' " ])(?P<xsrf_field_name> \ w+) \ 2 ' ,
if ytcfg :
video_webpage , ' xsrf field name ' ,
xsrf_field_name = try_get ( ytcfg , lambda x : x [ ' XSRF_FIELD_NAME ' ] , compat_str )
group = ' xsrf_field_name ' , default = ' session_token ' )
if not xsrf_field_name :
xsrf_field_name = self . _search_regex (
r ' ([ \' " ])XSRF_FIELD_NAME \ 1 \ s*: \ s*([ \' " ])(?P<xsrf_field_name> \ w+) \ 2 ' ,
video_webpage , ' xsrf field name ' ,
group = ' xsrf_field_name ' , default = ' session_token ' )
video_annotations = self . _download_webpage (
video_annotations = self . _download_webpage (
self . _proto_relative_url ( invideo_url ) ,
self . _proto_relative_url ( invideo_url ) ,
video_id , note = ' Downloading annotations ' ,
video_id , note = ' Downloading annotations ' ,
@ -2526,7 +2555,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
feed / |
feed / |
( ? : playlist | watch ) \? . * ? \blist =
( ? : playlist | watch ) \? . * ? \blist =
) |
) |
( ? ! ( % s ) ( [ / #?]|$)) # Direct URLs
( ? ! ( ? : % s ) \b ) # Direct URLs
)
)
( ? P < id > [ ^ / ? \#&]+)
( ? P < id > [ ^ / ? \#&]+)
''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
@ -2791,13 +2820,31 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
# no longer available?
# no longer available?
' url ' : ' https://www.youtube.com/feed/recommended ' ,
' url ' : ' https://www.youtube.com/feed/recommended ' ,
' only_matching ' : True ,
' only_matching ' : True ,
}
} , {
# TODO
# inline playlist with not always working continuations
# {
' url ' : ' https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C ' ,
# 'url': 'https://www.youtube.com/TheYoungTurks/live',
' only_matching ' : True ,
# 'only_matching': True,
} , {
# }
' url ' : ' https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8 ' ,
]
' only_matching ' : True ,
} , {
' url ' : ' https://www.youtube.com/course ' ,
' only_matching ' : True ,
} , {
' url ' : ' https://www.youtube.com/zsecurity ' ,
' only_matching ' : True ,
} , {
' url ' : ' http://www.youtube.com/NASAgovVideo/videos ' ,
' only_matching ' : True ,
} , {
' url ' : ' https://www.youtube.com/TheYoungTurks/live ' ,
' only_matching ' : True ,
} ]
@classmethod
def suitable ( cls , url ) :
return False if YoutubeIE . suitable ( url ) else super (
YoutubeTabIE , cls ) . suitable ( url )
def _extract_channel_id ( self , webpage ) :
def _extract_channel_id ( self , webpage ) :
channel_id = self . _html_search_meta (
channel_id = self . _html_search_meta (
@ -2894,12 +2941,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
# TODO
# TODO
pass
pass
def _shelf_entries ( self , shelf_renderer ):
def _shelf_entries ( self , shelf_renderer , skip_channels = False ):
ep = try_get (
ep = try_get (
shelf_renderer , lambda x : x [ ' endpoint ' ] [ ' commandMetadata ' ] [ ' webCommandMetadata ' ] [ ' url ' ] ,
shelf_renderer , lambda x : x [ ' endpoint ' ] [ ' commandMetadata ' ] [ ' webCommandMetadata ' ] [ ' url ' ] ,
compat_str )
compat_str )
shelf_url = urljoin ( ' https://www.youtube.com ' , ep )
shelf_url = urljoin ( ' https://www.youtube.com ' , ep )
if shelf_url :
if shelf_url :
# Skipping links to another channels, note that checking for
# endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
# will not work
if skip_channels and ' /channels? ' in shelf_url :
return
title = try_get (
title = try_get (
shelf_renderer , lambda x : x [ ' title ' ] [ ' runs ' ] [ 0 ] [ ' text ' ] , compat_str )
shelf_renderer , lambda x : x [ ' title ' ] [ ' runs ' ] [ 0 ] [ ' text ' ] , compat_str )
yield self . url_result ( shelf_url , video_title = title )
yield self . url_result ( shelf_url , video_title = title )
@ -2986,6 +3038,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
for entry in self . _post_thread_entries ( renderer ) :
for entry in self . _post_thread_entries ( renderer ) :
yield entry
yield entry
@staticmethod
def _build_continuation_query ( continuation , ctp = None ) :
query = {
' ctoken ' : continuation ,
' continuation ' : continuation ,
}
if ctp :
query [ ' itct ' ] = ctp
return query
@staticmethod
@staticmethod
def _extract_next_continuation_data ( renderer ) :
def _extract_next_continuation_data ( renderer ) :
next_continuation = try_get (
next_continuation = try_get (
@ -2996,11 +3058,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if not continuation :
if not continuation :
return
return
ctp = next_continuation . get ( ' clickTrackingParams ' )
ctp = next_continuation . get ( ' clickTrackingParams ' )
return {
return YoutubeTabIE . _build_continuation_query ( continuation , ctp )
' ctoken ' : continuation ,
' continuation ' : continuation ,
' itct ' : ctp ,
}
@classmethod
@classmethod
def _extract_continuation ( cls , renderer ) :
def _extract_continuation ( cls , renderer ) :
@ -3023,13 +3081,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if not continuation :
if not continuation :
continue
continue
ctp = continuation_ep . get ( ' clickTrackingParams ' )
ctp = continuation_ep . get ( ' clickTrackingParams ' )
if not ctp :
return YoutubeTabIE . _build_continuation_query ( continuation , ctp )
continue
return {
' ctoken ' : continuation ,
' continuation ' : continuation ,
' itct ' : ctp ,
}
def _entries ( self , tab , identity_token ) :
def _entries ( self , tab , identity_token ) :
@ -3064,7 +3116,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
continue
continue
renderer = isr_content . get ( ' shelfRenderer ' )
renderer = isr_content . get ( ' shelfRenderer ' )
if renderer :
if renderer :
for entry in self . _shelf_entries ( renderer ) :
is_channels_tab = tab . get ( ' title ' ) == ' Channels '
for entry in self . _shelf_entries ( renderer , not is_channels_tab ) :
yield entry
yield entry
continue
continue
renderer = isr_content . get ( ' backstagePostThreadRenderer ' )
renderer = isr_content . get ( ' backstagePostThreadRenderer ' )
@ -3086,9 +3139,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
continuation_list [ 0 ] = self . _extract_continuation ( parent_renderer )
continuation_list [ 0 ] = self . _extract_continuation ( parent_renderer )
continuation_list = [ None ] # Python 2 doesnot support nonlocal
continuation_list = [ None ] # Python 2 doesnot support nonlocal
tab_content = try_get ( tab , lambda x : x [ ' content ' ] , dict )
if not tab_content :
return
parent_renderer = (
parent_renderer = (
try_get ( tab , lambda x : x [ ' sectionListRenderer ' ] , dict )
try_get ( tab _content , lambda x : x [ ' sectionListRenderer ' ] , dict )
or try_get ( tab , lambda x : x [ ' richGridRenderer ' ] , dict ) or { } )
or try_get ( tab _content , lambda x : x [ ' richGridRenderer ' ] , dict ) or { } )
for entry in extract_entries ( parent_renderer ) :
for entry in extract_entries ( parent_renderer ) :
yield entry
yield entry
continuation = continuation_list [ 0 ]
continuation = continuation_list [ 0 ]
@ -3103,10 +3159,24 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
for page_num in itertools . count ( 1 ) :
for page_num in itertools . count ( 1 ) :
if not continuation :
if not continuation :
break
break
browse = self . _download_json (
count = 0
' https://www.youtube.com/browse_ajax ' , None ,
retries = 3
' Downloading page %d ' % page_num ,
while count < = retries :
headers = headers , query = continuation , fatal = False )
try :
# Downloading page may result in intermittent 5xx HTTP error
# that is usually worked around with a retry
browse = self . _download_json (
' https://www.youtube.com/browse_ajax ' , None ,
' Downloading page %d %s '
% ( page_num , ' (retry # %d ) ' % count if count else ' ' ) ,
headers = headers , query = continuation )
break
except ExtractorError as e :
if isinstance ( e . cause , compat_HTTPError ) and e . cause . code in ( 500 , 503 ) :
count + = 1
if count < = retries :
continue
raise
if not browse :
if not browse :
break
break
response = try_get ( browse , lambda x : x [ 1 ] [ ' response ' ] , dict )
response = try_get ( browse , lambda x : x [ 1 ] [ ' response ' ] , dict )
@ -3212,22 +3282,35 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if title is None :
if title is None :
title = " Youtube " + playlist_id . title ( )
title = " Youtube " + playlist_id . title ( )
playlist = self . playlist_result (
playlist = self . playlist_result (
self . _entries ( selected_tab [' content ' ] , identity_token ) ,
self . _entries ( selected_tab , identity_token ) ,
playlist_id = playlist_id , playlist_title = title ,
playlist_id = playlist_id , playlist_title = title ,
playlist_description = description )
playlist_description = description )
playlist . update ( self . _extract_uploader ( data ) )
playlist . update ( self . _extract_uploader ( data ) )
return playlist
return playlist
def _extract_from_playlist ( self , item_id , data, playlist ) :
def _extract_from_playlist ( self , item_id , url, data, playlist ) :
title = playlist . get ( ' title ' ) or try_get (
title = playlist . get ( ' title ' ) or try_get (
data , lambda x : x [ ' titleText ' ] [ ' simpleText ' ] , compat_str )
data , lambda x : x [ ' titleText ' ] [ ' simpleText ' ] , compat_str )
playlist_id = playlist . get ( ' playlistId ' ) or item_id
playlist_id = playlist . get ( ' playlistId ' ) or item_id
# Inline playlist rendition continuation does not always work
# at Youtube side, so delegating regular tab-based playlist URL
# processing whenever possible.
playlist_url = urljoin ( url , try_get (
playlist , lambda x : x [ ' endpoint ' ] [ ' commandMetadata ' ] [ ' webCommandMetadata ' ] [ ' url ' ] ,
compat_str ) )
if playlist_url and playlist_url != url :
return self . url_result (
playlist_url , ie = YoutubeTabIE . ie_key ( ) , video_id = playlist_id ,
video_title = title )
return self . playlist_result (
return self . playlist_result (
self . _playlist_entries ( playlist ) , playlist_id = playlist_id ,
self . _playlist_entries ( playlist ) , playlist_id = playlist_id ,
playlist_title = title )
playlist_title = title )
def _extract_alerts ( self , data ) :
@staticmethod
def _extract_alerts ( data ) :
for alert_dict in try_get ( data , lambda x : x [ ' alerts ' ] , list ) or [ ] :
for alert_dict in try_get ( data , lambda x : x [ ' alerts ' ] , list ) or [ ] :
if not isinstance ( alert_dict , dict ) :
continue
for renderer in alert_dict :
for renderer in alert_dict :
alert = alert_dict [ renderer ]
alert = alert_dict [ renderer ]
alert_type = alert . get ( ' type ' )
alert_type = alert . get ( ' type ' )
@ -3241,6 +3324,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if message :
if message :
yield alert_type , message
yield alert_type , message
def _extract_identity_token ( self , webpage , item_id ) :
ytcfg = self . _extract_ytcfg ( item_id , webpage )
if ytcfg :
token = try_get ( ytcfg , lambda x : x [ ' ID_TOKEN ' ] , compat_str )
if token :
return token
return self . _search_regex (
r ' \ bID_TOKEN[ " \' ] \ s*: \ s*[ " \' ](.+?)[ " \' ] ' , webpage ,
' identity token ' , default = None )
def _real_extract ( self , url ) :
def _real_extract ( self , url ) :
item_id = self . _match_id ( url )
item_id = self . _match_id ( url )
url = compat_urlparse . urlunparse (
url = compat_urlparse . urlunparse (
@ -3257,7 +3350,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
video_id = qs . get ( ' v ' , [ None ] ) [ 0 ]
video_id = qs . get ( ' v ' , [ None ] ) [ 0 ]
playlist_id = qs . get ( ' list ' , [ None ] ) [ 0 ]
playlist_id = qs . get ( ' list ' , [ None ] ) [ 0 ]
if is_home . group ( ' not_channel ' ) is not None and is_home . group ( ' not_channel ' ) . startswith ( ' watch ' ) and not video_id :
if is_home is not None and is_home . group ( ' not_channel ' ) is not None and is_home . group ( ' not_channel ' ) . startswith ( ' watch ' ) and not video_id :
if playlist_id :
if playlist_id :
self . _downloader . report_warning ( ' %s is not a valid Youtube URL. Trying to download playlist %s ' % ( url , playlist_id ) )
self . _downloader . report_warning ( ' %s is not a valid Youtube URL. Trying to download playlist %s ' % ( url , playlist_id ) )
url = ' https://www.youtube.com/playlist?list= %s ' % playlist_id
url = ' https://www.youtube.com/playlist?list= %s ' % playlist_id
@ -3271,9 +3364,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
self . to_screen ( ' Downloading playlist %s - add --no-playlist to just download video %s ' % ( playlist_id , video_id ) )
self . to_screen ( ' Downloading playlist %s - add --no-playlist to just download video %s ' % ( playlist_id , video_id ) )
webpage = self . _download_webpage ( url , item_id )
webpage = self . _download_webpage ( url , item_id )
identity_token = self . _search_regex (
identity_token = self . _extract_identity_token ( webpage , item_id )
r ' \ bID_TOKEN[ " \' ] \ s*: \ s*[ " \' ](.+?)[ " \' ] ' , webpage ,
' identity token ' , default = None )
data = self . _extract_yt_initial_data ( item_id , webpage )
data = self . _extract_yt_initial_data ( item_id , webpage )
for alert_type , alert_message in self . _extract_alerts ( data ) :
for alert_type , alert_message in self . _extract_alerts ( data ) :
self . _downloader . report_warning ( ' YouTube said: %s - %s ' % ( alert_type , alert_message ) )
self . _downloader . report_warning ( ' YouTube said: %s - %s ' % ( alert_type , alert_message ) )
@ -3284,7 +3375,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
playlist = try_get (
playlist = try_get (
data , lambda x : x [ ' contents ' ] [ ' twoColumnWatchNextResults ' ] [ ' playlist ' ] [ ' playlist ' ] , dict )
data , lambda x : x [ ' contents ' ] [ ' twoColumnWatchNextResults ' ] [ ' playlist ' ] [ ' playlist ' ] , dict )
if playlist :
if playlist :
return self . _extract_from_playlist ( item_id , data, playlist )
return self . _extract_from_playlist ( item_id , url, data, playlist )
# Fallback to video extraction if no playlist alike page is recognized.
# Fallback to video extraction if no playlist alike page is recognized.
# First check for the current video then try the v attribute of URL query.
# First check for the current video then try the v attribute of URL query.
video_id = try_get (
video_id = try_get (
@ -3304,8 +3395,7 @@ class YoutubePlaylistIE(InfoExtractor):
( ? :
( ? :
( ? :
( ? :
youtube ( ? : kids ) ? \. com |
youtube ( ? : kids ) ? \. com |
invidio \. us |
invidio \. us
youtu \. be
)
)
/ . * ? \? . * ? \blist =
/ . * ? \? . * ? \blist =
) ?
) ?
@ -3350,6 +3440,32 @@ class YoutubePlaylistIE(InfoExtractor):
' uploader_id ' : ' UC21nz3_MesPLqtDqwdvnoxA ' ,
' uploader_id ' : ' UC21nz3_MesPLqtDqwdvnoxA ' ,
}
}
} , {
} , {
' url ' : ' TLGGrESM50VT6acwMjAyMjAxNw ' ,
' only_matching ' : True ,
} , {
# music album playlist
' url ' : ' OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM ' ,
' only_matching ' : True ,
} ]
@classmethod
def suitable ( cls , url ) :
return False if YoutubeTabIE . suitable ( url ) else super (
YoutubePlaylistIE , cls ) . suitable ( url )
def _real_extract ( self , url ) :
playlist_id = self . _match_id ( url )
qs = compat_urlparse . parse_qs ( compat_urlparse . urlparse ( url ) . query )
if not qs :
qs = { ' list ' : playlist_id }
return self . url_result (
update_url_query ( ' https://www.youtube.com/playlist ' , qs ) ,
ie = YoutubeTabIE . ie_key ( ) , video_id = playlist_id )
class YoutubeYtBeIE ( InfoExtractor ) :
_VALID_URL = r ' https?://youtu \ .be/(?P<id>[0-9A-Za-z_-] {11} )/*?.*? \ blist=(?P<playlist_id> %(playlist_id)s ) ' % { ' playlist_id ' : YoutubeBaseInfoExtractor . _PLAYLIST_ID_RE }
_TESTS = [ {
' url ' : ' https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5 ' ,
' url ' : ' https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5 ' ,
' info_dict ' : {
' info_dict ' : {
' id ' : ' yeWKywCrFtk ' ,
' id ' : ' yeWKywCrFtk ' ,
@ -3372,28 +3488,18 @@ class YoutubePlaylistIE(InfoExtractor):
} , {
} , {
' url ' : ' https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21 ' ,
' url ' : ' https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21 ' ,
' only_matching ' : True ,
' only_matching ' : True ,
} , {
' url ' : ' TLGGrESM50VT6acwMjAyMjAxNw ' ,
' only_matching ' : True ,
} , {
# music album playlist
' url ' : ' OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM ' ,
' only_matching ' : True ,
} ]
} ]
@classmethod
def suitable ( cls , url ) :
return False if YoutubeTabIE . suitable ( url ) else super (
YoutubePlaylistIE , cls ) . suitable ( url )
def _real_extract ( self , url ) :
def _real_extract ( self , url ) :
playlist_id = self . _match_id ( url )
mobj = re . match ( self . _VALID_URL , url )
qs = compat_urlparse . parse_qs ( compat_urlparse . urlparse ( url ) . query )
video_id = mobj . group ( ' id ' )
if not qs :
playlist_id = mobj . group ( ' playlist_id ' )
qs = { ' list ' : playlist_id }
return self . url_result (
return self . url_result (
update_url_query ( ' https://www.youtube.com/playlist ' , qs ) ,
update_url_query ( ' https://www.youtube.com/watch ' , {
ie = YoutubeTabIE . ie_key ( ) , video_id = playlist_id )
' v ' : video_id ,
' list ' : playlist_id ,
' feature ' : ' youtu.be ' ,
} ) , ie = YoutubeTabIE . ie_key ( ) , video_id = playlist_id )
class YoutubeYtUserIE ( InfoExtractor ) :
class YoutubeYtUserIE ( InfoExtractor ) :