[extractor/harpodeon] Add extractor (#4540)

Closes #4450
Authored by: eren-kemer
pull/4606/head
Eren Kemer 2 years ago committed by GitHub
parent f0ad6f8c51
commit e251986cbe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -631,6 +631,7 @@ from .gronkh import (
GronkhVodsIE GronkhVodsIE
) )
from .groupon import GrouponIE from .groupon import GrouponIE
from .harpodeon import HarpodeonIE
from .hbo import HBOIE from .hbo import HBOIE
from .hearthisat import HearThisAtIE from .hearthisat import HearThisAtIE
from .heise import HeiseIE from .heise import HeiseIE

@ -0,0 +1,70 @@
from .common import InfoExtractor
from ..utils import unified_strdate
class HarpodeonIE(InfoExtractor):
    """Extractor for harpodeon.com ``/video/`` and ``/preview/`` pages.

    The direct media URL is assembled from the arguments of the
    ``hpBase(...)`` and ``hpInjectVideo(...)`` calls found in the page source.
    """

    _VALID_URL = r'https?://(?:www\.)?harpodeon\.com/(?:video|preview)/\w+/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.harpodeon.com/video/The_Smoking_Out_of_Bella_Butts/268068288',
        'md5': '727371564a6a9ebccef2073535b5b6bd',
        'skip': 'Free video could become unavailable',
        'info_dict': {
            'id': '268068288',
            'ext': 'mp4',
            'title': 'The Smoking Out of Bella Butts',
            'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77',
            'creator': 'Vitagraph Company of America',
            'release_date': '19150101'
        }
    }, {
        'url': 'https://www.harpodeon.com/preview/The_Smoking_Out_of_Bella_Butts/268068288',
        'md5': '6dfea5412845f690c7331be703f884db',
        'info_dict': {
            'id': '268068288',
            'ext': 'mp4',
            'title': 'The Smoking Out of Bella Butts',
            'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77',
            'creator': 'Vitagraph Company of America',
            'release_date': '19150101'
        }
    }, {
        'url': 'https://www.harpodeon.com/preview/Behind_the_Screen/421838710',
        'md5': '7979df9ca04637282cb7d172ab3a9c3b',
        'info_dict': {
            'id': '421838710',
            'ext': 'mp4',
            'title': 'Behind the Screen',
            'description': 'md5:008972a3dc51fba3965ee517d2ba9155',
            'creator': 'Lone Star Corporation',
            'release_date': '19160101'
        }
    }]

    def _real_extract(self, url):
        """Download the page at *url* and build the info dict for its video.

        Title, creator and release year are looked up non-fatally, so any of
        them may be ``None`` in the result. The ``hpBase`` and
        ``hpInjectVideo`` lookups use the helpers' default (fatal) behaviour.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Creator and year are optional groups; the whole match is non-fatal,
        # hence the all-None fallback when nothing is found.
        title, creator, release_year = self._search_regex(
            r'''(?x)
                <div[^>]+videoInfo[^<]*<h2[^>]*>(?P<title>[^>]+)</h2>
                (?:\s*<p[^>]*>\((?P<creator>.+),\s*)?(?P<release_year>\d{4})?''',
            webpage, 'title', group=('title', 'creator', 'release_year'),
            fatal=False) or (None, None, None)

        hp_base = self._html_search_regex(r'hpBase\(\s*["\']([^"\']+)', webpage, 'hp_base')

        hp_inject_video, hp_resolution = self._search_regex(
            r'''(?x)
                hpInjectVideo\([\'\"](?P<hp_inject_video>\w+)[\'\"],
                [\'\"](?P<hp_resolution>\d+)[\'\"]''',
            webpage, 'hp_inject_video', group=('hp_inject_video', 'hp_resolution'))

        # Only a year is available on the page, so the date is pinned to
        # January 1st. Skip the conversion entirely when no year was captured
        # instead of handing the junk string 'None0101' to the date parser.
        release_date = unified_strdate(f'{release_year}0101') if release_year else None

        return {
            'id': video_id,
            'title': title,
            'url': f'{hp_base}{hp_inject_video}_{hp_resolution}.mp4',
            # NOTE(review): presumably the media host checks the Referer - confirm.
            'http_headers': {'Referer': url},
            'description': self._html_search_meta('description', webpage, fatal=False),
            'creator': creator,
            'release_date': release_date,
        }
Loading…
Cancel
Save