Merge branch 'remitamine-brightcove_in_page_embed'

9 years ago · 967e0955f0
parent 4e21b3a94f e01b432ad3
commit 967e0955f0
8 changed files with 200 additions and 24 deletions
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -60,7 +60,10 @@ from .bloomberg import BloombergIE
 from .bpb import BpbIE
 from .br import BRIE
 from .breakcom import BreakIE
-from .brightcove import BrightcoveIE
+from .brightcove import (
    BrightcoveLegacyIE,
    BrightcoveNewIE,
 )
 from .buzzfeed import BuzzFeedIE
 from .byutv import BYUtvIE
 from .c56 import C56IE
--- a/youtube_dl/extractor/aljazeera.py
+++ b/youtube_dl/extractor/aljazeera.py
@ -15,7 +15,7 @@ class AlJazeeraIE(InfoExtractor):
            'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',
            'uploader': 'Al Jazeera English',
        },
-        'add_ie': ['Brightcove'],
+        'add_ie': ['BrightcoveLegacy'],
        'skip': 'Not accessible from Travis CI server',
    }
@ -32,5 +32,5 @@ class AlJazeeraIE(InfoExtractor):
                'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc'
                '&%40videoPlayer={0}'.format(brightcove_id)
            ),
-            'ie_key': 'Brightcove',
+            'ie_key': 'BrightcoveLegacy',
        }
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@ -20,12 +20,17 @@ from ..utils import (
    ExtractorError,
    find_xpath_attr,
    fix_xml_ampersands,
    float_or_none,
    js_to_json,
    int_or_none,
    parse_iso8601,
    unescapeHTML,
    unsmuggle_url,
 )
-class BrightcoveIE(InfoExtractor):
+class BrightcoveLegacyIE(InfoExtractor):
    IE_NAME = 'brightcove:legacy'
    _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
@ -346,3 +351,152 @@ class BrightcoveIE(InfoExtractor):
        if 'url' not in info and not info.get('formats'):
            raise ExtractorError('Unable to extract video url for %s' % info['id'])
        return info
 class BrightcoveNewIE(InfoExtractor):
    """Extractor for videos published through players.brightcove.net.

    URLs of the form
    ``players.brightcove.net/<account>/<player>_<embed>/index.html?videoId=<id>``
    are handled.  The player's ``index.min.js`` is downloaded to obtain the
    policy key, which is then sent to the edge.api.brightcove.com playback
    endpoint to fetch the video metadata and its sources.
    """
    IE_NAME = 'brightcove:new'
    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+)'
    _TEST = {
        'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
        'md5': 'c8100925723840d4b0d243f7025703be',
        'info_dict': {
            'id': '4463358922001',
            'ext': 'mp4',
            'title': 'Meet the man behind Popcorn Time',
            'description': 'md5:eac376a4fe366edc70279bfb681aea16',
            'timestamp': 1441391203,
            'upload_date': '20150904',
            'duration': 165.768,
            'uploader_id': '929656772001',
        }
    }

    @staticmethod
    def _extract_urls(webpage):
        """Return the list of Brightcove player URLs embedded in webpage.

        Both iframe embeds and in-page (<video> + <script>) embeds are
        recognised; in-page embeds are converted to the equivalent
        ``index.html?videoId=...`` URL so that _VALID_URL matches them.
        """
        # Reference:
        # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
        # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
        # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html

        entries = []

        # Look for iframe embeds [1]
        for _, url in re.findall(
                r'<iframe[^>]+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
            entries.append(url)

        # Look for embed_in_page embeds [2]
        # According to examples from [3] it's unclear whether video id may be optional
        # and what to do when it is
        for video_id, account_id, player_id, embed in re.findall(
                r'''(?sx)
                    <video[^>]+
                        data-video-id=["\'](\d+)["\'][^>]*>.*?
                    </video>.*?
                    <script[^>]+
                        src=["\'](?:https?:)?//players\.brightcove\.net/
                        (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js
                ''', webpage):
            entries.append(
                'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
                % (account_id, player_id, embed, video_id))

        return entries

    @staticmethod
    def _build_format_id(kind, tbr, height):
        """Compose a format id such as 'http-1200k-720p' from the known parts.

        The bitrate and height suffixes are appended only when the
        corresponding value is truthy.
        """
        format_id = kind
        if tbr:
            format_id += '-%dk' % int(tbr)
        if height:
            format_id += '-%dp' % height
        return format_id

    def _real_extract(self, url):
        """Extract metadata and formats for the video referenced by url.

        Raises KeyError if the playback API response has no 'name' field;
        download and regex helpers raise their own errors on failure.
        """
        account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups()

        webpage = self._download_webpage(
            'http://players.brightcove.net/%s/%s_%s/index.min.js'
            % (account_id, player_id, embed), video_id)

        policy_key = None

        # First try the catalog(...) call in the player script; parsing is
        # non-fatal so that the plain regex below can act as a fallback.
        catalog = self._search_regex(
            r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
        if catalog:
            catalog = self._parse_json(
                js_to_json(catalog), video_id, fatal=False)
            if catalog:
                policy_key = catalog.get('policyKey')

        if not policy_key:
            policy_key = self._search_regex(
                r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
                webpage, 'policy key', group='pk')

        # The policy key is passed to the playback API via the Accept header.
        req = compat_urllib_request.Request(
            'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s'
            % (account_id, video_id),
            headers={'Accept': 'application/json;pk=%s' % policy_key})
        json_data = self._download_json(req, video_id)

        title = json_data['name']

        formats = []
        for source in json_data.get('sources', []):
            source_type = source.get('type')
            src = source.get('src')
            if source_type == 'application/x-mpegURL':
                if not src:
                    continue
                m3u8_formats = self._extract_m3u8_formats(
                    src, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                if m3u8_formats:
                    formats.extend(m3u8_formats)
            else:
                streaming_src = source.get('streaming_src')
                stream_name, app_name = source.get('stream_name'), source.get('app_name')
                # Skip sources that provide neither a direct URL nor a
                # complete stream_name/app_name pair.
                if not src and not streaming_src and (not stream_name or not app_name):
                    continue
                tbr = float_or_none(source.get('avg_bitrate'), 1000)
                height = int_or_none(source.get('height'))
                container = source.get('container')
                f = {
                    'tbr': tbr,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'filesize': int_or_none(source.get('size')),
                    'container': container,
                    'vcodec': source.get('codec'),
                }
                # A source may lack 'container'; calling .lower() on None
                # would abort the whole extraction, so only set 'ext' when
                # a container is actually reported.
                if container:
                    f['ext'] = container.lower()

                if src or streaming_src:
                    f.update({
                        'url': src or streaming_src,
                        'format_id': self._build_format_id(
                            'http' if src else 'http-streaming', tbr, height),
                        'preference': 2 if src else 1,
                    })
                else:
                    f.update({
                        'url': app_name,
                        'play_path': stream_name,
                        'format_id': self._build_format_id('rtmp', tbr, height),
                    })
                formats.append(f)
        self._sort_formats(formats)

        description = json_data.get('description')
        thumbnail = json_data.get('thumbnail')
        timestamp = parse_iso8601(json_data.get('published_at'))
        # 'duration' is divided by 1000 before being returned.
        duration = float_or_none(json_data.get('duration'), 1000)
        tags = json_data.get('tags', [])

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'timestamp': timestamp,
            'uploader_id': account_id,
            'formats': formats,
            'tags': tags,
        }
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -30,7 +30,10 @@ from ..utils import (
    url_basename,
    xpath_text,
 )
-from .brightcove import BrightcoveIE
+from .brightcove import (
    BrightcoveLegacyIE,
    BrightcoveNewIE,
 )
 from .nbc import NBCSportsVPlayerIE
 from .ooyala import OoyalaIE
 from .rutv import RUTVIE
@ -275,7 +278,7 @@ class GenericIE(InfoExtractor):
        # it also tests brightcove videos that need to set the 'Referer' in the
        # http requests
        {
-            'add_ie': ['Brightcove'],
+            'add_ie': ['BrightcoveLegacy'],
            'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
            'info_dict': {
                'id': '2765128793001',
@ -299,7 +302,7 @@ class GenericIE(InfoExtractor):
                'uploader': 'thestar.com',
                'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
            },
-            'add_ie': ['Brightcove'],
+            'add_ie': ['BrightcoveLegacy'],
        },
        {
            'url': 'http://www.championat.com/video/football/v/87/87499.html',
@ -314,7 +317,7 @@ class GenericIE(InfoExtractor):
        },
        {
            # https://github.com/rg3/youtube-dl/issues/3541
-            'add_ie': ['Brightcove'],
+            'add_ie': ['BrightcoveLegacy'],
            'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
            'info_dict': {
                'id': '3866516442001',
@ -1031,6 +1034,17 @@ class GenericIE(InfoExtractor):
                'ext': 'mp4',
                'title': 'cinemasnob',
            },
        },
        # Brightcove in-page embed
        {
            'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/',
            'info_dict': {
                'id': '4238694884001',
                'ext': 'flv',
                'title': 'Tabletop: Dread, Last Thoughts',
                'description': 'Tabletop: Dread, Last Thoughts',
                'duration': 51690,
            },
        }
    ]
@ -1290,14 +1304,14 @@ class GenericIE(InfoExtractor):
            return self.playlist_result(
                urlrs, playlist_id=video_id, playlist_title=video_title)
-        # Look for BrightCove:
+        # Look for Brightcove Legacy Studio embeds
-        bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
+        bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
        if bc_urls:
            self.to_screen('Brightcove video detected.')
            entries = [{
                '_type': 'url',
                'url': smuggle_url(bc_url, {'Referer': url}),
-                'ie_key': 'Brightcove'
+                'ie_key': 'BrightcoveLegacy'
            } for bc_url in bc_urls]
            return {
@ -1307,6 +1321,11 @@ class GenericIE(InfoExtractor):
                'entries': entries,
            }
        # Look for Brightcove New Studio embeds
        bc_urls = BrightcoveNewIE._extract_urls(webpage)
        if bc_urls:
            return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
        # Look for embedded rtl.nl player
        matches = re.findall(
            r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
--- a/youtube_dl/extractor/nowness.py
+++ b/youtube_dl/extractor/nowness.py
@ -1,7 +1,7 @@
 # encoding: utf-8
 from __future__ import unicode_literals
-from .brightcove import BrightcoveIE
+from .brightcove import BrightcoveLegacyIE
 from .common import InfoExtractor
 from ..utils import ExtractorError
 from ..compat import (
@ -22,10 +22,10 @@ class NownessBaseIE(InfoExtractor):
                            'http://www.nowness.com/iframe?id=%s' % video_id, video_id,
                            note='Downloading player JavaScript',
                            errnote='Unable to download player JavaScript')
-                        bc_url = BrightcoveIE._extract_brightcove_url(player_code)
+                        bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code)
                        if bc_url is None:
                            raise ExtractorError('Could not find player definition')
-                        return self.url_result(bc_url, 'Brightcove')
+                        return self.url_result(bc_url, 'BrightcoveLegacy')
                    elif source == 'vimeo':
                        return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
                    elif source == 'youtube':
--- a/youtube_dl/extractor/safari.py
+++ b/youtube_dl/extractor/safari.py
@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
-from .brightcove import BrightcoveIE
+from .brightcove import BrightcoveLegacyIE
 from ..compat import (
    compat_urllib_parse,
@ -112,11 +112,11 @@ class SafariIE(SafariBaseIE):
            '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part),
            part)
-        bc_url = BrightcoveIE._extract_brightcove_url(webpage)
+        bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
        if not bc_url:
            raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True)
-        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'Brightcove')
+        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy')
 class SafariCourseIE(SafariBaseIE):
--- a/youtube_dl/extractor/space.py
+++ b/youtube_dl/extractor/space.py
@ -3,14 +3,14 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
-from .brightcove import BrightcoveIE
+from .brightcove import BrightcoveLegacyIE
 from ..utils import RegexNotFoundError, ExtractorError
 class SpaceIE(InfoExtractor):
    _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
    _TEST = {
-        'add_ie': ['Brightcove'],
+        'add_ie': ['BrightcoveLegacy'],
        'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
        'info_dict': {
            'id': '2780937028001',
@ -31,8 +31,8 @@ class SpaceIE(InfoExtractor):
            brightcove_url = self._og_search_video_url(webpage)
        except RegexNotFoundError:
            # Other videos works fine with the info from the object
-            brightcove_url = BrightcoveIE._extract_brightcove_url(webpage)
+            brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
        if brightcove_url is None:
            raise ExtractorError(
                'The webpage does not contain a video', expected=True)
-        return self.url_result(brightcove_url, BrightcoveIE.ie_key())
+        return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key())
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/tlc.py
@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
-from .brightcove import BrightcoveIE
+from .brightcove import BrightcoveLegacyIE
 from .discovery import DiscoveryIE
 from ..compat import compat_urlparse
@ -66,6 +66,6 @@ class TlcDeIE(InfoExtractor):
        return {
            '_type': 'url',
-            'url': BrightcoveIE._extract_brightcove_url(iframe),
+            'url': BrightcoveLegacyIE._extract_brightcove_url(iframe),
-            'ie': BrightcoveIE.ie_key(),
+            'ie': BrightcoveLegacyIE.ie_key(),
        }