@ -1,22 +1,28 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import re
from . common import InfoExtractor
from . common import InfoExtractor
from . . utils import (
from . . utils import (
determine_ext ,
ExtractorError ,
ExtractorError ,
)
)
class ARDIE(InfoExtractor):
    """Extractor for videos hosted on the ARD Mediathek.

    Handles pages on ``ardmediathek.de`` and ``mediathek.daserste.de``.
    The page is scraped for its headline, abstract and thumbnail, and for
    every ``mediaCollection.addMediaStream(...)`` call, each of which is
    turned into one entry of the returned ``formats`` list.
    """

    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'

    # Page headline; the ``class`` attribute is optional in the markup.
    _TITLE = r'<h1(?:\s+class="boxTopHeadline")?>(?P<title>.*?)</h1>'
    # One JavaScript call per available stream:
    # addMediaStream(media_type, quality, "rtmp_url", "video_url", "...")
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    _TEST = {
        'url': 'http://www.ardmediathek.de/das-erste/guenther-jauch/edward-snowden-im-interview-held-oder-verraeter?documentId=19288786',
        'file': '19288786.mp4',
        'md5': '515bf47ce209fb3f5a61b7aad364634c',
        'info_dict': {
            'title': 'Edward Snowden im Interview - Held oder Verräter?',
            'description': 'Edward Snowden hat alles aufs Spiel gesetzt, um die weltweite \xdcberwachung durch die Geheimdienste zu enttarnen. Nun stellt sich der ehemalige NSA-Mitarbeiter erstmals weltweit in einem TV-Interview den Fragen eines NDR-Journalisten. Die Sendung vom Sonntagabend.',
            'thumbnail': 'http://www.ardmediathek.de/ard/servlet/contentblob/19/28/87/90/19288790/bild/2250037',
        },
        'skip': 'Blocked outside of Germany',
    }

    def _real_extract(self, url):
        """Scrape the Mediathek page at *url* and describe its video.

        Returns a dict with ``id``, ``title``, ``description``,
        ``thumbnail`` and ``formats`` (one entry per media stream found on
        the page, sorted by ``self._sort_formats``).

        Raises ExtractorError when the page exposes no media stream,
        with a dedicated message when the page carries an ``"fsk"``
        marker.
        """
        # NOTE(review): the lines that pick the video id are elided by the
        # diff hunk; only the fallback ``m.group('video_id')`` is visible.
        # The numeric lookup below is reconstructed from the test case,
        # whose expected id equals the ``documentId`` query parameter.
        # Confirm against the full file before merging.
        mobj = re.match(self._VALID_URL, url)
        doc_id = re.search(r'documentId=([0-9]+)', url)
        if doc_id:
            video_id = doc_id.group(1)
        else:
            video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(self._TITLE, webpage, 'title')
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description')
        thumbnail = self._og_search_thumbnail(webpage)

        streams = [
            mo.groupdict()
            for mo in re.finditer(self._MEDIA_STREAM, webpage)]
        if not streams:
            if '"fsk"' in webpage:
                raise ExtractorError('This video is only available after 20:00')
            # Fail here with a clear message instead of carrying an empty
            # stream list through the rest of the method.
            raise ExtractorError('No media streams found on the page')

        formats = []
        for stream in streams:
            # ``fmt`` rather than ``format`` so the builtin is not shadowed.
            fmt = {
                'quality': int(stream['quality']),
            }
            if stream.get('rtmp_url'):
                # RTMP: the server URL and the play path are separate values.
                fmt['protocol'] = 'rtmp'
                fmt['url'] = stream['rtmp_url']
                # The code this replaces used the key ``play_path``; keep
                # that spelling. NOTE(review): confirm the RTMP downloader
                # reads ``play_path``.
                fmt['play_path'] = stream['video_url']
            else:
                # Plain HTTP download.
                fmt['url'] = stream['video_url']

            # Quality label embedded in the file name just before ".mp4";
            # 'NA' when the URL carries no such label.
            quality_name = self._search_regex(
                r'[,.]([a-zA-Z0-9_-]+),?\.mp4', fmt['url'],
                'quality name', default='NA')
            fmt['format_id'] = '%s-%s-%s-%s' % (
                determine_ext(fmt['url']), quality_name,
                stream['media_type'], stream['quality'])

            formats.append(fmt)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'thumbnail': thumbnail,
        }