[udn] Add new extractor

pull/5407/merge
Yen Chi Hsuan 10 years ago
parent de5c545648
commit 418c5cc3fc

@ -53,6 +53,7 @@ from youtube_dl.utils import (
uppercase_escape, uppercase_escape,
url_basename, url_basename,
urlencode_postdata, urlencode_postdata,
url_infer_protocol,
version_tuple, version_tuple,
xpath_with_ns, xpath_with_ns,
xpath_text, xpath_text,
@ -296,6 +297,10 @@ class TestUtil(unittest.TestCase):
url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4'), url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4'),
'trailer.mp4') 'trailer.mp4')
def test_url_infer_protocol(self):
    # Each case is (reference URL, target URL, expected result).
    cases = (
        # A protocol-relative target borrows the reference URL's scheme.
        ('http://foo.com/', '//bar.com/', 'http://bar.com/'),
        # A target that already has a scheme is returned untouched.
        ('http://foo.com/', 'https://bar.com/', 'https://bar.com/'),
    )
    for ref_url, target_url, expected in cases:
        self.assertEqual(url_infer_protocol(ref_url, target_url), expected)
def test_parse_duration(self): def test_parse_duration(self):
self.assertEqual(parse_duration(None), None) self.assertEqual(parse_duration(None), None)
self.assertEqual(parse_duration(False), None) self.assertEqual(parse_duration(False), None)

@ -557,6 +557,7 @@ from .udemy import (
UdemyIE, UdemyIE,
UdemyCourseIE UdemyCourseIE
) )
from .udn import UDNEmbedIE
from .ultimedia import UltimediaIE from .ultimedia import UltimediaIE
from .unistra import UnistraIE from .unistra import UnistraIE
from .urort import UrortIE from .urort import UrortIE

@ -26,6 +26,7 @@ from ..utils import (
unsmuggle_url, unsmuggle_url,
UnsupportedError, UnsupportedError,
url_basename, url_basename,
url_infer_protocol,
xpath_text, xpath_text,
) )
from .brightcove import BrightcoveIE from .brightcove import BrightcoveIE
@ -34,6 +35,7 @@ from .ooyala import OoyalaIE
from .rutv import RUTVIE from .rutv import RUTVIE
from .smotri import SmotriIE from .smotri import SmotriIE
from .condenast import CondeNastIE from .condenast import CondeNastIE
from .udn import UDNEmbedIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -650,6 +652,17 @@ class GenericIE(InfoExtractor):
'title': "PFT Live: New leader in the 'new-look' defense", 'title': "PFT Live: New leader in the 'new-look' defense",
'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
}, },
},
# UDN embed
{
'url': 'http://www.udn.com/news/story/7314/822787',
'md5': 'de06b4c90b042c128395a88f0384817e',
'info_dict': {
'id': '300040',
'ext': 'mp4',
'title': '生物老師男變女 全校挺"做自己"',
'thumbnail': 're:^https?://.*\.jpg$',
}
} }
] ]
@ -1268,6 +1281,13 @@ class GenericIE(InfoExtractor):
if nbc_sports_url: if nbc_sports_url:
return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
# Look for UDN embeds
mobj = re.search(
r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
if mobj is not None:
return self.url_result(
url_infer_protocol(url, mobj.group('url')), 'UDNEmbed')
def check_video(vurl): def check_video(vurl):
if YoutubeIE.suitable(vurl): if YoutubeIE.suitable(vurl):
return True return True

@ -0,0 +1,66 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..utils import (
url_infer_protocol,
js_to_json
)
class UDNEmbedIE(InfoExtractor):
    """Extractor for video.udn.com embed pages (/embed/news/<id>)."""

    # The scheme is optional so that protocol-relative iframe sources
    # ('//video.udn.com/...') are matched as well.
    _VALID_URL = r'(?:https?:)?//video\.udn\.com/embed/news/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://video.udn.com/embed/news/300040',
        'md5': 'de06b4c90b042c128395a88f0384817e',
        'info_dict': {
            'id': '300040',
            'ext': 'mp4',
            'title': '生物老師男變女 全校挺"做自己"',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'url': '//video.udn.com/embed/news/300040',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the embed page at *url*.

        Returns a url_result delegating to the 'Youtube' extractor when the
        page's options carry a YouTube URL; otherwise an info dict with
        'id', 'formats', 'title' and 'thumbnail'.
        """
        video_id = self._match_id(url)

        page = self._download_webpage(url, video_id)

        # The page assigns a JavaScript object literal to `options`;
        # js_to_json turns it into something json.loads accepts.
        options = json.loads(js_to_json(self._html_search_regex(
            r'var options\s*=\s*([^;]+);', page, 'video urls dictionary')))

        video_urls = options['video']

        # Remove the 'youtube' entry in any case so that it never ends up
        # among the formats built below.
        youtube_url = video_urls.pop('youtube', None)
        if youtube_url:
            return self.url_result(youtube_url, 'Youtube')

        # Each remaining entry maps a format name to an API URL; the body
        # of the API response is used as the media URL. Entries without an
        # API URL are skipped instead of triggering a pointless request.
        formats = [{
            'url': self._download_webpage(
                url_infer_protocol(url, api_url), video_id,
                'retrieve url for %s video' % video_type),
            'format_id': video_type,
            'preference': 0 if video_type == 'mp4' else -1,
        } for video_type, api_url in video_urls.items() if api_url]

        self._sort_formats(formats)

        # Use the first gallery entry, if any, as the thumbnail.
        gallery = options.get('gallery')
        thumbnail = gallery[0].get('original') if gallery else None

        return {
            'id': video_id,
            'formats': formats,
            'title': options['title'],
            'thumbnail': thumbnail
        }

@ -1711,6 +1711,17 @@ def determine_protocol(info_dict):
return compat_urllib_parse_urlparse(url).scheme return compat_urllib_parse_urlparse(url).scheme
def url_infer_protocol(ref_url, target_url):
    """ Infer protocol for protocol independent target urls

    ref_url    -- URL whose scheme is borrowed, e.g. the page on which
                  target_url was found
    target_url -- URL that may lack a scheme, e.g. '//host/path'

    Returns target_url with ref_url's scheme applied when target_url names a
    host but no scheme. Any other target_url (one that already has a scheme,
    or one without a host such as a relative path or an empty string) is
    returned unchanged, since grafting a scheme onto it would yield a
    malformed URL like 'http:///path'.
    """
    parsed_target_url = list(compat_urllib_parse_urlparse(target_url))
    # Index 0 is the scheme, index 1 the network location (host).
    if parsed_target_url[0] or not parsed_target_url[1]:
        return target_url

    parsed_target_url[0] = compat_urllib_parse_urlparse(ref_url).scheme
    return compat_urlparse.urlunparse(parsed_target_url)
def render_table(header_row, data): def render_table(header_row, data):
""" Render a list of rows, each as a list of values """ """ Render a list of rows, each as a list of values """
table = [header_row] + data table = [header_row] + data

Loading…
Cancel
Save