[vice] Fix extraction and rework extractors (closes #11101, closes #13019, closes #13622, closes #13778)

pull/15764/head
Sergey M․ 7 years ago
parent c01db237b5
commit 86c8cfc555
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D

@ -1210,7 +1210,6 @@ from .vice import (
ViceArticleIE, ViceArticleIE,
ViceShowIE, ViceShowIE,
) )
from .viceland import VicelandIE
from .vidbit import VidbitIE from .vidbit import VidbitIE
from .viddler import ViddlerIE from .viddler import ViddlerIE
from .videa import VideaIE from .videa import VideaIE

@ -103,6 +103,7 @@ from .vshare import VShareIE
from .mediasite import MediasiteIE from .mediasite import MediasiteIE
from .springboardplatform import SpringboardPlatformIE from .springboardplatform import SpringboardPlatformIE
from .yapfiles import YapFilesIE from .yapfiles import YapFilesIE
from .vice import ViceIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -2965,6 +2966,11 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key())
vice_urls = ViceIE._extract_urls(webpage)
if vice_urls:
return self.playlist_from_matches(
vice_urls, video_id, video_title, ie=ViceIE.ie_key())
def merge_dicts(dict1, dict2): def merge_dicts(dict1, dict2):
merged = {} merged = {}
for k, v in dict1.items(): for k, v in dict1.items():

@ -5,56 +5,169 @@ import re
import time import time
import hashlib import hashlib
import json import json
import random
from .adobepass import AdobePassIE from .adobepass import AdobePassIE
from .youtube import YoutubeIE from .youtube import YoutubeIE
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_HTTPError from ..compat import (
compat_HTTPError,
compat_str,
)
from ..utils import ( from ..utils import (
ExtractorError,
int_or_none, int_or_none,
parse_age_limit, parse_age_limit,
str_or_none, str_or_none,
parse_duration, try_get,
ExtractorError,
extract_attributes,
) )
class ViceBaseIE(AdobePassIE): class ViceIE(AdobePassIE):
def _extract_preplay_video(self, url, locale, webpage): IE_NAME = 'vice'
watch_hub_data = extract_attributes(self._search_regex( _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?viceland)\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]+)'
r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub')) _TESTS = [{
video_id = watch_hub_data['vms-id'] 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7',
title = watch_hub_data['video-title'] 'info_dict': {
'id': '5e647f0125e145c9aef2069412c0cbde',
'ext': 'mp4',
'title': '10 Questions You Always Wanted To Ask: Pet Cremator',
'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5',
'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1489664942,
'upload_date': '20170316',
'age_limit': 14,
},
'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
}, {
# geo restricted to US
'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56',
'info_dict': {
'id': '930c0ad1f47141cc955087eecaddb0e2',
'ext': 'mp4',
'uploader': 'waypoint',
'title': 'The Signal From Tölva',
'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',
'uploader_id': '57f7d621e05ca860fa9ccaf9',
'timestamp': 1477941983,
'upload_date': '20161031',
},
'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
}, {
'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f',
'info_dict': {
'id': '581b12b60a0e1f4c0fb6ea2f',
'ext': 'mp4',
'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1',
'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>',
'uploader': 'VICE',
'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1485368119,
'upload_date': '20170125',
'age_limit': 14,
},
'params': {
# AES-encrypted m3u8
'skip_download': True,
'proxy': '127.0.0.1:8118',
},
'add_ie': ['UplynkPreplay'],
}, {
'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',
'only_matching': True,
}, {
'url': 'https://video.vice.com/en_us/embed/57f41d3556a0a80f54726060',
'only_matching': True,
}, {
'url': 'https://vms.vice.com/en_us/video/preplay/58c69e38a55424f1227dc3f7',
'only_matching': True,
}, {
'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1',
'only_matching': True,
}]
_PREPLAY_HOST = 'vms.vice'
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]+)',
webpage)
@staticmethod
def _extract_url(webpage):
urls = ViceIE._extract_urls(webpage)
return urls[0] if urls else None
def _real_extract(self, url):
locale, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(
'https://video.vice.com/%s/embed/%s' % (locale, video_id),
video_id)
video = self._parse_json(
self._search_regex(
r'PREFETCH_DATA\s*=\s*({.+?})\s*;\s*\n', webpage,
'app state'), video_id)['video']
video_id = video.get('vms_id') or video.get('id') or video_id
title = video['title']
is_locked = video.get('locked')
rating = video.get('rating')
thumbnail = video.get('thumbnail_url')
duration = int_or_none(video.get('duration'))
series = try_get(
video, lambda x: x['episode']['season']['show']['title'],
compat_str)
episode_number = try_get(
video, lambda x: x['episode']['episode_number'])
season_number = try_get(
video, lambda x: x['episode']['season']['season_number'])
uploader = None
query = {} query = {}
is_locked = watch_hub_data.get('video-locked') == '1'
if is_locked: if is_locked:
resource = self._get_mvpd_resource( resource = self._get_mvpd_resource(
'VICELAND', title, video_id, 'VICELAND', title, video_id, rating)
watch_hub_data.get('video-rating'))
query['tvetoken'] = self._extract_mvpd_auth( query['tvetoken'] = self._extract_mvpd_auth(
url, video_id, 'VICELAND', resource) url, video_id, 'VICELAND', resource)
# signature generation algorithm is reverse engineered from signatureGenerator in # signature generation algorithm is reverse engineered from signatureGenerator in
# webpack:///../shared/~/vice-player/dist/js/vice-player.js in # webpack:///../shared/~/vice-player/dist/js/vice-player.js in
# https://www.viceland.com/assets/common/js/web.vendor.bundle.js # https://www.viceland.com/assets/common/js/web.vendor.bundle.js
exp = int(time.time()) + 14400 # new JS is located here https://vice-web-statics-cdn.vice.com/vice-player/player-embed.js
exp = int(time.time()) + 1440
query.update({ query.update({
'exp': exp, 'exp': exp,
'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(),
'_ad_blocked': None,
'_ad_unit': '',
'_debug': '',
'platform': 'desktop',
'rn': random.randint(10000, 100000),
'fbprebidtoken': '',
}) })
try: try:
host = 'www.viceland' if is_locked else self._PREPLAY_HOST host = 'www.viceland' if is_locked else self._PREPLAY_HOST
preplay = self._download_json( preplay = self._download_json(
'https://%s.com/%s/preplay/%s' % (host, locale, video_id), 'https://%s.com/%s/video/preplay/%s' % (host, locale, video_id),
video_id, query=query) video_id, query=query)
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401):
error = json.loads(e.cause.read().decode()) error = json.loads(e.cause.read().decode())
error_message = error.get('error_description') or error['details']
raise ExtractorError('%s said: %s' % ( raise ExtractorError('%s said: %s' % (
self.IE_NAME, error['details']), expected=True) self.IE_NAME, error_message), expected=True)
raise raise
video_data = preplay['video'] video_data = preplay['video']
@ -76,92 +189,22 @@ class ViceBaseIE(AdobePassIE):
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'description': base.get('body') or base.get('display_body'), 'description': base.get('body') or base.get('display_body'),
'thumbnail': watch_hub_data.get('cover-image') or watch_hub_data.get('thumbnail'), 'thumbnail': thumbnail,
'duration': int_or_none(video_data.get('video_duration')) or parse_duration(watch_hub_data.get('video-duration')), 'duration': int_or_none(video_data.get('video_duration')) or duration,
'timestamp': int_or_none(video_data.get('created_at'), 1000), 'timestamp': int_or_none(video_data.get('created_at'), 1000),
'age_limit': parse_age_limit(video_data.get('video_rating')), 'age_limit': parse_age_limit(video_data.get('video_rating')),
'series': video_data.get('show_title') or watch_hub_data.get('show-title'), 'series': video_data.get('show_title') or series,
'episode_number': int_or_none(episode.get('episode_number') or watch_hub_data.get('episode')), 'episode_number': int_or_none(episode.get('episode_number') or episode_number),
'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')),
'season_number': int_or_none(watch_hub_data.get('season')), 'season_number': int_or_none(season_number),
'season_id': str_or_none(episode.get('season_id')), 'season_id': str_or_none(episode.get('season_id')),
'uploader': channel.get('base', {}).get('title') or watch_hub_data.get('channel-title'), 'uploader': channel.get('base', {}).get('title') or channel.get('name') or uploader,
'uploader_id': str_or_none(channel.get('id')), 'uploader_id': str_or_none(channel.get('id')),
'subtitles': subtitles, 'subtitles': subtitles,
'ie_key': 'UplynkPreplay', 'ie_key': 'UplynkPreplay',
} }
class ViceIE(ViceBaseIE):
IE_NAME = 'vice'
_VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2',
'info_dict': {
'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj',
'ext': 'flv',
'title': 'Monkey Labs of Holland',
'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149',
},
'add_ie': ['Ooyala'],
}, {
'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56',
'info_dict': {
'id': '5816510690b70e6c5fd39a56',
'ext': 'mp4',
'uploader': 'Waypoint',
'title': 'The Signal From Tölva',
'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',
'uploader_id': '57f7d621e05ca860fa9ccaf9',
'timestamp': 1477941983,
'upload_date': '20161031',
},
'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
}, {
'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f',
'info_dict': {
'id': '581b12b60a0e1f4c0fb6ea2f',
'ext': 'mp4',
'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1',
'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>',
'uploader': 'VICE',
'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1485368119,
'upload_date': '20170125',
'age_limit': 14,
},
'params': {
# AES-encrypted m3u8
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
}, {
'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',
'only_matching': True,
}]
_PREPLAY_HOST = 'video.vice'
def _real_extract(self, url):
locale, video_id = re.match(self._VALID_URL, url).groups()
webpage, urlh = self._download_webpage_handle(url, video_id)
embed_code = self._search_regex(
r'embedCode=([^&\'"]+)', webpage,
'ooyala embed code', default=None)
if embed_code:
return self.url_result('ooyala:%s' % embed_code, 'Ooyala')
youtube_id = self._search_regex(
r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None)
if youtube_id:
return self.url_result(youtube_id, 'Youtube')
return self._extract_preplay_video(urlh.geturl(), locale, webpage)
class ViceShowIE(InfoExtractor): class ViceShowIE(InfoExtractor):
IE_NAME = 'vice:show' IE_NAME = 'vice:show'
_VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'
@ -203,14 +246,15 @@ class ViceArticleIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah',
'info_dict': { 'info_dict': {
'id': '58dc0a3dee202d2a0ccfcbd8', 'id': '41eae2a47b174a1398357cec55f1f6fc',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Mormon War on Porn ', 'title': 'Mormon War on Porn ',
'description': 'md5:ad396a2481e7f8afb5ed486878421090', 'description': 'md5:6394a8398506581d0346b9ab89093fef',
'uploader': 'VICE', 'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c693', 'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1489160690, 'timestamp': 1491883129,
'upload_date': '20170310', 'upload_date': '20170411',
'age_limit': 17,
}, },
'params': { 'params': {
# AES-encrypted m3u8 # AES-encrypted m3u8
@ -219,17 +263,35 @@ class ViceArticleIE(InfoExtractor):
'add_ie': ['UplynkPreplay'], 'add_ie': ['UplynkPreplay'],
}, { }, {
'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car',
'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', 'md5': '7fe8ebc4fa3323efafc127b82bd821d9',
'info_dict': { 'info_dict': {
'id': '3jstaBeXgAs', 'id': '3jstaBeXgAs',
'ext': 'mp4', 'ext': 'mp4',
'title': 'How to Hack a Car: Phreaked Out (Episode 2)', 'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
'uploader_id': 'MotherboardTV',
'uploader': 'Motherboard', 'uploader': 'Motherboard',
'uploader_id': 'MotherboardTV',
'upload_date': '20140529', 'upload_date': '20140529',
}, },
'add_ie': ['Youtube'], 'add_ie': ['Youtube'],
}, {
'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded',
'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
'info_dict': {
'id': 'e2ed435eb67e43efb66e6ef9a6930a88',
'ext': 'mp4',
'title': "Making The World's First Male Sex Doll",
'description': 'md5:916078ef0e032d76343116208b6cc2c4',
'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1476919911,
'upload_date': '20161019',
'age_limit': 17,
},
'params': {
'skip_download': True,
},
'add_ie': [ViceIE.ie_key()],
}, { }, {
'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1',
'only_matching': True, 'only_matching': True,
@ -244,8 +306,8 @@ class ViceArticleIE(InfoExtractor):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
prefetch_data = self._parse_json(self._search_regex( prefetch_data = self._parse_json(self._search_regex(
r'window\.__PREFETCH_DATA\s*=\s*({.*});', r'__APP_STATE\s*=\s*({.+?})(?:\s*\|\|\s*{}\s*)?;\s*\n',
webpage, 'prefetch data'), display_id) webpage, 'app state'), display_id)['pageData']
body = prefetch_data['body'] body = prefetch_data['body']
def _url_res(video_url, ie_key): def _url_res(video_url, ie_key):
@ -256,6 +318,10 @@ class ViceArticleIE(InfoExtractor):
'ie_key': ie_key, 'ie_key': ie_key,
} }
vice_url = ViceIE._extract_url(webpage)
if vice_url:
return _url_res(vice_url, ViceIE.ie_key())
embed_code = self._search_regex( embed_code = self._search_regex(
r'embedCode=([^&\'"]+)', body, r'embedCode=([^&\'"]+)', body,
'ooyala embed code', default=None) 'ooyala embed code', default=None)

@ -1,38 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .vice import ViceBaseIE
class VicelandIE(ViceBaseIE):
_VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)'
_TEST = {
'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316',
'info_dict': {
'id': '588a70d0dba8a16007de7316',
'ext': 'mp4',
'title': 'TRAPPED (Series Trailer)',
'description': 'md5:7a8e95c2b6cd86461502a2845e581ccf',
'age_limit': 14,
'timestamp': 1485474122,
'upload_date': '20170126',
'uploader_id': '57a204098cb727dec794c6a3',
'uploader': 'Viceland',
},
'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
'skip': '404',
}
_PREPLAY_HOST = 'www.viceland'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
locale = mobj.group('locale')
webpage = self._download_webpage(url, video_id)
return self._extract_preplay_video(url, locale, webpage)
Loading…
Cancel
Save