[polskieradio] Add support for downloading whole programmes.

This extends the Polskie Radio (the Polish national radio) extractor to
enable the user to download all the broadcasts of a single programme.
pull/8/head
Jakub Adam Wieczorek 8 years ago
parent 155bc674c4
commit 8d3737cda7

@ -667,7 +667,7 @@ from .pluralsight import (
) )
from .podomatic import PodomaticIE from .podomatic import PodomaticIE
from .pokemon import PokemonIE from .pokemon import PokemonIE
from .polskieradio import PolskieRadioIE from .polskieradio import PolskieRadioIE, PolskieRadioProgrammeIE
from .porn91 import Porn91IE from .porn91 import Porn91IE
from .porncom import PornComIE from .porncom import PornComIE
from .pornhd import PornHdIE from .pornhd import PornHdIE

@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
compat_urlparse
) )
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
@ -15,6 +16,84 @@ from ..utils import (
) )
class PolskieRadioProgrammeIE(InfoExtractor):
    """Playlist extractor for a whole Polskie Radio programme.

    A programme page lists its broadcasts as ``<article>`` elements which may
    be spread over several numbered pages (``.../Strona/<n>``).  Each
    broadcast becomes a ``url_result`` entry handled by the ``PolskieRadio``
    extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(,[^/]+)?/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
        'info_dict': {
            'id': '5102',
            'title': 'HISTORIA ŻYWA',
        },
        'playlist_mincount': 34,
    }, {
        'url': 'http://www.polskieradio.pl/7/4807',
        'info_dict': {
            'id': '4807',
            'title': 'Vademecum 1050. rocznicy Chrztu Polski'
        },
        'playlist_mincount': 5
    }, {
        'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
        'only_matching': True
    }, {
        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
        'info_dict': {
            'id': '4143',
            'title': 'Kierunek Kraków',
        },
        'playlist_mincount': 61
    }, {
        'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
        'only_matching': True
    }]

    def _get_entries_from_page_content(self, base_url, content):
        """Build playlist entries for every article linked in ``content``.

        ``base_url`` is the URL against which (possibly relative) article
        links are resolved; ``content`` is the HTML of one listing page.
        Returns a list of ``url_result`` dicts, empty when nothing matches.
        """
        articles = re.findall(
            r'<article class="ID-(\d+) article">\s+<a href="([^"]+)"( data-layer="[^"]*")? class="[^"]*" title="([^"]+)">',
            content)
        # The third group (optional data-layer attribute) only exists to keep
        # the pattern permissive; its value is not needed.
        return [
            self.url_result(
                compat_urlparse.urljoin(base_url, article_url),
                ie='PolskieRadio',
                video_id=article_id,
                video_title=article_title)
            for article_id, article_url, _, article_title in articles]

    @classmethod
    def suitable(cls, url):
        # Article URLs begin with the same "/<digits>/<digits>" prefix as
        # programme URLs, so give PolskieRadioIE precedence whenever it
        # claims the URL.
        if PolskieRadioIE.suitable(url):
            return False
        return super(PolskieRadioProgrammeIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist with the broadcasts found on all listing pages."""
        programme_id = self._match_id(url)
        webpage = self._download_webpage(url, programme_id)

        title = self._html_search_regex(
            r'<a href="[^"]+" id=".*_linkCategory" title="[^"]+">(.+?)</a>',
            webpage, 'title', fatal=False)

        entries = self._get_entries_from_page_content(url, webpage)

        # Each tuple is (whole href attribute, page URL root, page number).
        # The href groups are empty strings for anchors without an href.
        pages = re.findall(
            r'<a( href="([^"]+/Strona/)\d+")? id="[^"]+" title="strona&#32;(\d+)"',
            webpage)
        page_count = max(int(page_number) for _, _, page_number in pages) if pages else 1

        if page_count > 1:
            # A default is required here: without it a page whose pagination
            # anchors all lack an href would raise a bare StopIteration.
            page_url_root = next(
                (page_href for _, page_href, _ in pages if page_href), None)
            if page_url_root is None:
                self.report_warning(
                    'Unable to find pagination URL for programme %s; '
                    'only the first page has been extracted' % programme_id)
            else:
                for page_number in range(2, page_count + 1):
                    resolved_page_url = compat_urlparse.urljoin(
                        url, page_url_root + str(page_number))
                    page_content = self._download_webpage(
                        resolved_page_url, programme_id,
                        note="Downloading page number %d" % page_number)
                    entries.extend(
                        self._get_entries_from_page_content(url, page_content))

        return self.playlist_result(entries, programme_id, title)
class PolskieRadioIE(InfoExtractor): class PolskieRadioIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
_TESTS = [{ _TESTS = [{

Loading…
Cancel
Save