Add an extractor for internetvideoarchive.com videos

It's used by videodetective.com
11 years ago · d7e66d39a0
parent d3f46b9aa5
commit d7e66d39a0
4 changed files with 99 additions and 0 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -20,6 +20,7 @@ from youtube_dl.utils import (
    unified_strdate,
    find_xpath_attr,
    get_meta_content,
    xpath_with_ns,
 )
 if sys.version_info < (3, 0):
@ -141,5 +142,18 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(get_meta('description'), u'foo & bar')
        self.assertEqual(get_meta('author'), 'Plato')
    def test_xpath_with_ns(self):
        # Prefixed steps ("media:...") must be resolved through the namespace
        # map, while un-prefixed steps ("url") have to pass through untouched.
        testxml = u'''<root xmlns:media="http://example.com/">
            <media:song>
                <media:author>The Author</media:author>
                <url>http://server.com/download.mp3</url>
            </media:song>
        </root>'''
        namespaces = {'media': 'http://example.com/'}
        root = xml.etree.ElementTree.fromstring(testxml)

        def lookup(path):
            # Expand the prefixes first, then search below the document root.
            return root.find(xpath_with_ns(path, namespaces))

        song = lookup('media:song')
        self.assertTrue(song is not None)
        author = lookup('media:song/media:author')
        self.assertEqual(author.text, u'The Author')
        plain_url = lookup('media:song/url')
        self.assertEqual(plain_url.text, u'http://server.com/download.mp3')
 # Run the test cases of this module when it is executed as a script.
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -62,6 +62,7 @@ from .ign import IGNIE, OneUPIE
 from .ina import InaIE
 from .infoq import InfoQIE
 from .instagram import InstagramIE
 from .internetvideoarchive import InternetVideoArchiveIE
 from .jeuxvideo import JeuxVideoIE
 from .jukebox import JukeboxIE
 from .justintv import JustinTVIE
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@ -0,0 +1,71 @@
 import re
 import xml.etree.ElementTree
 from .common import InfoExtractor
 from ..utils import (
    compat_urlparse,
    xpath_with_ns,
    determine_ext,
    ExtractorError,
 )
 class InternetVideoArchiveIE(InfoExtractor):
    """Information extractor for video.internetvideoarchive.net player URLs."""

    _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
    _TEST = {
        u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
        u'file': u'452693.mp4',
        u'info_dict': {
            u'title': u'SKYFALL',
            u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
            u'duration': 156,
        },
    }

    @staticmethod
    def _build_url(query):
        """Return the flash configuration URL carrying the given query string."""
        return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query

    def _real_extract(self, url):
        """Download the player configuration and playlist and build the info dict.

        The video id is the first ``publishedid`` value of the URL's query
        string.  Raises ExtractorError when the playlist has no
        ``channel/item`` element or the item lists no ``media:content``.
        """
        query = compat_urlparse.urlparse(url).query
        query_dic = compat_urlparse.parse_qs(query)
        video_id = query_dic['publishedid'][0]
        # Only the query string of the input URL is kept; the path is always
        # replaced by the flashconfiguration.aspx endpoint.
        url = self._build_url(query)

        flashconfiguration_xml = self._download_webpage(url, video_id,
            u'Downloading flash configuration')
        flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8'))
        file_url = flashconfiguration.find('file').text
        # Request the mrssplaylist.aspx variant of the playlist URL instead of
        # the playlist.aspx one named in the configuration.
        file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')

        info_xml = self._download_webpage(file_url, video_id,
            u'Downloading video info')
        playlist = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
        item = playlist.find('channel/item')
        # Compare against None explicitly: an Element without children is
        # falsy, so a plain truth test would be wrong here.
        if item is None:
            raise ExtractorError(u'Unable to find the video item in the playlist')

        def _bp(p):
            return xpath_with_ns(p,
                {'media': 'http://search.yahoo.com/mrss/',
                'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'})

        contents = list(item.findall(_bp('media:group/media:content')))
        if not contents:
            # Without this check the code below would fail with an unrelated
            # NameError/IndexError instead of a meaningful message.
            raise ExtractorError(u'Unable to find any media:content element')

        formats = []
        for content in contents:
            attr = content.attrib
            f_url = attr['url']
            formats.append({
                'url': f_url,
                'ext': determine_ext(f_url),
                'width': int(attr['width']),
                'bitrate': int(attr['bitrate']),
            })
        # Ascending bitrate: the last entry ends up being the best one.
        formats.sort(key=lambda f: f['bitrate'])

        # The duration is read from the last media:content element in document
        # order, named explicitly rather than through a leaked loop variable.
        duration = int(contents[-1].attrib['duration'])

        result = {
            'id': video_id,
            'title': item.find('title').text,
            'formats': formats,
            'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'],
            'description': item.find('description').text,
            'duration': duration,
        }
        # TODO: Remove when #980 has been merged
        result.update(formats[-1])
        return result
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -230,6 +230,19 @@ else:
                return f
        return None
 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 # the namespace parameter
 def xpath_with_ns(path, ns_map):
    """Expand the namespace prefixes of a '/'-separated path.

    Every step written as ``prefix:tag`` is turned into ``{uri}tag`` where
    ``uri`` is ``ns_map[prefix]``; steps without a colon are kept as they are.
    A prefix missing from ``ns_map`` or a step holding more than one colon
    makes the lookup/unpacking below raise.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, local_name = pieces
        return '{%s}%s' % (ns_map[prefix], local_name)

    return '/'.join(_expand(step) for step in path.split('/'))
 def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.