yt-dlp/youtube_dl/extractor/ted.py

from __future__ import unicode_literals

import json
import re

from .subtitles import SubtitlesInfoExtractor

from ..utils import (
    compat_str,
)


class TEDIE(SubtitlesInfoExtractor):
    """Information extractor for ted.com talks, playlists and watch pages.

    URLs on the ``embed`` host are rewritten to their ``www`` equivalent and
    handed back to this same extractor.
    """

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                'argument that not only don\'t we understand our own '
                'consciousness, but that half the time our brains are '
                'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra metadata merged into a format dict whose format_id matches a key
    # of the talk's 'nativeDownloads' mapping.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON argument of the page's ``q("<name>.init", ...)`` call.

        The JSON text is located in *webpage* with a regular expression and
        parsed with ``json.loads``.
        """
        # The dot before "init" is escaped so that only a literal '.' matches.
        info_json = self._search_regex(
            r'q\("\w+\.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch-page or playlist handler."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            # Embed URLs share the path of the desktop page; swap the host
            # and let this extractor process the resulting www URL.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk of the playlist."""
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Each entry points at the talk page and is resolved by this extractor.
        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Extract the info dict for a single talk page.

        Returns a plain ``url`` result when the talk is hosted externally,
        ``None`` after listing subtitles when the ``listsubtitles`` option is
        set, and otherwise a dict with id, title, uploader, thumbnail,
        description, subtitles and formats.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external is not None:
            # The talk lives on another service; delegate to whichever
            # extractor handles that URL.
            self.to_screen('Found video from %s' % external['service'])
            return {
                '_type': 'url',
                'url': external['uri'],
            }

        # 'nativeDownloads' may be absent or null; treat both as "no http
        # downloads" so that the rtmp fallback below is reached.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            known = self._NATIVE_FORMATS.get(format_id)
            if known:
                fmt.update(known)
            formats.append(fmt)
        if not formats:
            # Use rtmp downloads
            formats = [{
                'format_id': res['name'],
                'url': talk_info['streamer'],
                'play_path': res['file'],
                'ext': 'flv',
                'width': res['width'],
                'height': res['height'],
                'tbr': res['bitrate'],
            } for res in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        # The thumbnail is optional; when present, make sure it carries a scheme.
        thumbnail = talk_info.get('thumb') or None
        if thumbnail is not None:
            if thumbnail.startswith('//'):
                # Protocol-relative URL: only the scheme is missing.
                thumbnail = 'http:' + thumbnail
            elif not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info.get('speaker'),
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Map every language code listed in *talk_info* to its SRT subtitle URL.

        Reports a warning and returns an empty dict when no language is listed.
        """
        lang_codes = [
            lang['languageCode'] for lang in talk_info.get('languages') or []]
        if not lang_codes:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        sub_lang_list = {}
        for code in lang_codes:
            sub_lang_list[code] = (
                'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt'
                % (video_id, code))
        return sub_lang_list

    def _watch_info(self, url, name):
        """Extract the info dict for a ``/watch/`` page.

        The video URL and the optional thumbnail come from the JSON stored in
        the page's ``data-config`` attribute; title and description are
        scraped from the HTML. *name* is used as the video id.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        # 'image' may be missing or null.
        thumbnail = (config.get('image') or {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
[ted] Use unicode_literals 11 years ago			`from __future__ import unicode_literals`

Move TED IE into its own file 12 years ago			`import json`
			`import re`

[ted] Added support for subtitle download 11 years ago			`from .subtitles import SubtitlesInfoExtractor`
Move TED IE into its own file 12 years ago
[ted] fixed error in case of no subtitles present I created a test, but I leave it commented since TED videos get new subtitles frequently. 11 years ago			`from ..utils import (`
[ted] Fix playlist extraction and add a test 11 years ago			`compat_str,`
[ted] fixed error in case of no subtitles present I created a test, but I leave it commented since TED videos get new subtitles frequently. 11 years ago			`)`

[ted] Use unicode_literals 11 years ago
[ted] Added support for subtitle download 11 years ago			`class TEDIE(SubtitlesInfoExtractor):`
[ted] Simplify embed code (#2587) 11 years ago			`_VALID_URL = r'''(?x)`
			`(?P<proto>https?://)`
			`(?P<type>www|embed)(?P<urlmain>\.ted\.com/`
[ted] Style fixes 11 years ago			`(`
			`(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist`
			`|`
			`((?P<type_talk>talks)) # We have a simple talk`
[ted] Add support for watch/ URLs (Fixes #2637) 11 years ago			`|`
			`(?P<type_watch>watch)/[^/]+/[^/]+`
[ted] Style fixes 11 years ago			`)`
			`(/lang/(.*?))? # The url may contain the language`
[ted] Add support for watch/ URLs (Fixes #2637) 11 years ago			`/(?P<name>[\w-]+) # Here goes the name and then ".html"`
[ted] Simplify embed code (#2587) 11 years ago			`.*)$`
[ted] Style fixes 11 years ago			`'''`
[ted] Add support for watch/ URLs (Fixes #2637) 11 years ago			`_TESTS = [{`
[ted] Use unicode_literals 11 years ago			`'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',`
[ted] Update test md5 11 years ago			`'md5': 'fc94ac279feebbce69f21c0c6ee82810',`
[ted] Use unicode_literals 11 years ago			`'info_dict': {`
[ted] Remove unused import and modernize test 11 years ago			`'id': '102',`
			`'ext': 'mp4',`
[ted] Fix video extraction The site has been redesigned 11 years ago			`'title': 'The illusion of consciousness',`
[ted] Style fixes 11 years ago			`'description': ('Philosopher Dan Dennett makes a compelling '`
			`'argument that not only don\'t we understand our own '`
			`'consciousness, but that half the time our brains are '`
			`'actively fooling us.'),`
[ted] Fix video extraction The site has been redesigned 11 years ago			`'uploader': 'Dan Dennett',`
[ted] Add width and height (Fixes #2716) 11 years ago			`'width': 854,`
Move tests to the IE definitions 12 years ago			`}`
[ted] Add support for watch/ URLs (Fixes #2637) 11 years ago			`}, {`
			`'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',`
			`'md5': '226f4fb9c62380d11b7995efa4c87994',`
			`'info_dict': {`
			`'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',`
			`'ext': 'mp4',`
			`'title': 'Vishal Sikka: The beauty and power of algorithms',`
			`'thumbnail': 're:^https?://.+\.jpg',`
			`'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',`
			`}`
[ted] Use the rtmp links if there http downloads are not available. 11 years ago			`}, {`
			`'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',`
			`'info_dict': {`
			`'id': '1972',`
[ted] Update test 11 years ago			`'ext': 'mp4',`
[ted] Use the rtmp links if there http downloads are not available. 11 years ago			`'title': 'Be passionate. Be courageous. Be your best.',`
			`'uploader': 'Gabby Giffords and Mark Kelly',`
[ted] Update test 11 years ago			`'description': 'md5:5174aed4d0f16021b704120360f72b92',`
[ted] Use the rtmp links if there http downloads are not available. 11 years ago			`},`
Move playlist tests to extractors. From now on, test_download will run these tests. That means we benefit not only from the networking setup in there, but also from the other tests (for example test_all_urls to find problems with _VALID_URLs). 10 years ago			`}, {`
			`'url': 'http://www.ted.com/playlists/who_are_the_hackers',`
			`'info_dict': {`
			`'id': '10',`
			`'title': 'Who are the hackers?',`
			`},`
			`'playlist_mincount': 6,`
[ted] Add support for external videos (fixes #3948) 10 years ago			`}, {`
			`# contains a youtube video`
			`'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',`
			`'add_ie': ['Youtube'],`
			`'info_dict': {`
			`'id': '_ZG8HBuDjgc',`
			`'ext': 'mp4',`
			`'title': 'Douglas Adams: Parrots the Universe and Everything',`
			`'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',`
			`'uploader': 'University of California Television (UCTV)',`
			`'uploader_id': 'UCtelevision',`
			`'upload_date': '20080522',`
			`},`
			`'params': {`
			`'skip_download': True,`
			`},`
[ted] Add support for watch/ URLs (Fixes #2637) 11 years ago			`}]`
Move TED IE into its own file 12 years ago
[ted] Add width and height (Fixes #2716) 11 years ago			`_NATIVE_FORMATS = {`
			`'low': {'preference': 1, 'width': 320, 'height': 180},`
			`'medium': {'preference': 2, 'width': 512, 'height': 288},`
			`'high': {'preference': 3, 'width': 854, 'height': 480},`
[ted] Fix video extraction The site has been redesigned 11 years ago			`}`
Move TED IE into its own file 12 years ago
[ted] Fix playlist extraction and add a test 11 years ago			`def _extract_info(self, webpage):`
[ted] Style fixes 11 years ago			`info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',`
			`webpage, 'info json')`
[ted] Fix playlist extraction and add a test 11 years ago			`return json.loads(info_json)`

Move TED IE into its own file 12 years ago			`def _real_extract(self, url):`
[ted] Style fixes 11 years ago			`m = re.match(self._VALID_URL, url, re.VERBOSE)`
[ted] Simplify embed code (#2587) 11 years ago			`if m.group('type') == 'embed':`
			`desktop_url = m.group('proto') + 'www' + m.group('urlmain')`
			`return self.url_result(desktop_url, 'TED')`
[ted] Style fixes 11 years ago			`name = m.group('name')`
Move TED IE into its own file 12 years ago			`if m.group('type_talk'):`
[ted] Style fixes 11 years ago			`return self._talk_info(url, name)`
[ted] Add support for watch/ URLs (Fixes #2637) 11 years ago			`elif m.group('type_watch'):`
			`return self._watch_info(url, name)`
[ted] Style fixes 11 years ago			`else:`
[ted] Fix playlist extraction and add a test 11 years ago			`return self._playlist_videos_info(url, name)`
Move TED IE into its own file 12 years ago
[ted] Fix playlist extraction and add a test 11 years ago			`def _playlist_videos_info(self, url, name):`
Move TED IE into its own file 12 years ago			`'''Returns the videos of the playlist'''`
[ted] Fix playlists (Fixes #1770) 11 years ago
[ted] Fix playlist extraction and add a test 11 years ago			`webpage = self._download_webpage(url, name,`
			`'Downloading playlist webpage')`
			`info = self._extract_info(webpage)`
			`playlist_info = info['playlist']`
Move TED IE into its own file 12 years ago
[ted] Fix playlists (Fixes #1770) 11 years ago			`playlist_entries = [`
[ted] Remove superfluous u prefixes 11 years ago			`self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())`
[ted] Fix playlist extraction and add a test 11 years ago			`for talk in info['talks']`
[ted] Fix playlists (Fixes #1770) 11 years ago			`]`
			`return self.playlist_result(`
[ted] Fix playlist extraction and add a test 11 years ago			`playlist_entries,`
			`playlist_id=compat_str(playlist_info['id']),`
			`playlist_title=playlist_info['title'])`
Move TED IE into its own file 12 years ago
[ted] Style fixes 11 years ago			`def _talk_info(self, url, video_name):`
			`webpage = self._download_webpage(url, video_name)`
Move TED IE into its own file 12 years ago			`self.report_extraction(video_name)`
[ted] Added support for subtitle download 11 years ago
[ted] Fix playlist extraction and add a test 11 years ago			`talk_info = self._extract_info(webpage)['talks'][0]`
[ted] Added support for subtitle download 11 years ago
[ted] Add support for external videos (fixes #3948) 10 years ago			`if talk_info.get('external') is not None:`
			`self.to_screen('Found video from %s' % talk_info['external']['service'])`
			`return {`
			`'_type': 'url',`
			`'url': talk_info['external']['uri'],`
			`}`

[ted] Fix video extraction The site has been redesigned 11 years ago			`formats = [{`
			`'url': format_url,`
			`'format_id': format_id,`
			`'format': format_id,`
[ted] Use the rtmp links if there http downloads are not available. 11 years ago			`} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]`
			`if formats:`
			`for f in formats:`
			`finfo = self._NATIVE_FORMATS.get(f['format_id'])`
			`if finfo:`
			`f.update(finfo)`
			`else:`
			`# Use rtmp downloads`
			`formats = [{`
			`'format_id': f['name'],`
			`'url': talk_info['streamer'],`
			`'play_path': f['file'],`
			`'ext': 'flv',`
			`'width': f['width'],`
			`'height': f['height'],`
			`'tbr': f['bitrate'],`
			`} for f in talk_info['resources']['rtmp']]`
[ted] Fix video extraction The site has been redesigned 11 years ago			`self._sort_formats(formats)`

[ted] Remove unused import and modernize test 11 years ago			`video_id = compat_str(talk_info['id'])`
[ted] Added support for subtitle download 11 years ago			`# subtitles`
[ted] Fix video extraction The site has been redesigned 11 years ago			`video_subtitles = self.extract_subtitles(video_id, talk_info)`
[ted] Added support for subtitle download 11 years ago			`if self._downloader.params.get('listsubtitles', False):`
[ted] Fix video extraction The site has been redesigned 11 years ago			`self._list_available_subtitles(video_id, talk_info)`
[ted] Added support for subtitle download 11 years ago			`return`

[ted] Add 'http://' to the thumbnail url if it's missing 11 years ago			`thumbnail = talk_info['thumb']`
			`if not thumbnail.startswith('http'):`
			`thumbnail = 'http://' + thumbnail`
[ted] simplify 11 years ago			`return {`
[ted] Added support for subtitle download 11 years ago			`'id': video_id,`
[generic] Fix testcases 10 years ago			`'title': talk_info['title'].strip(),`
[ted] Fix video extraction The site has been redesigned 11 years ago			`'uploader': talk_info['speaker'],`
[ted] Add 'http://' to the thumbnail url if it's missing 11 years ago			`'thumbnail': thumbnail,`
[ted] Fix video extraction The site has been redesigned 11 years ago			`'description': self._og_search_description(webpage),`
[ted] Added support for subtitle download 11 years ago			`'subtitles': video_subtitles,`
[ted] Prepare #980 merge 11 years ago			`'formats': formats,`
			`}`

[ted] Fix video extraction The site has been redesigned 11 years ago			`def _get_available_subtitles(self, video_id, talk_info):`
			`languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]`
			`if languages:`
			`sub_lang_list = {}`
			`for l in languages:`
			`url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)`
			`sub_lang_list[l] = url`
			`return sub_lang_list`
			`else:`
[ted] Remove superfluous u prefixes 11 years ago			`self._downloader.report_warning('video doesn\'t have subtitles')`
[ted] Fix video extraction The site has been redesigned 11 years ago			`return {}`
[ted] Add support for watch/ URLs (Fixes #2637) 11 years ago
			`def _watch_info(self, url, name):`
			`webpage = self._download_webpage(url, name)`

			`config_json = self._html_search_regex(`
			`r"data-config='([^']+)", webpage, 'config')`
			`config = json.loads(config_json)`
			`video_url = config['video']['url']`
			`thumbnail = config.get('image', {}).get('url')`

			`title = self._html_search_regex(`
			`r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')`
			`description = self._html_search_regex(`
[ted] Extend search for description 11 years ago			`[`
			`r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',`
			`r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',`
			`],`
[ted] Add support for watch/ URLs (Fixes #2637) 11 years ago			`webpage, 'description', fatal=False)`

			`return {`
			`'id': name,`
			`'url': video_url,`
			`'title': title,`
			`'thumbnail': thumbnail,`
			`'description': description,`
			`}`