[twitch] Refactor and add support for past broadcasts

pull/8/head
Sergey M․ 10 years ago
parent 47e0e1e0e2
commit c5db6bb32b

@ -458,7 +458,13 @@ from .tvigle import TvigleIE
from .tvp import TvpIE, TvpSeriesIE from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE from .tvplay import TVPlayIE
from .twentyfourvideo import TwentyFourVideoIE from .twentyfourvideo import TwentyFourVideoIE
from .twitch import TwitchIE from .twitch import (
TwitchVideoIE,
TwitchChapterIE,
TwitchVodIE,
TwitchProfileIE,
TwitchPastBroadcastsIE,
)
from .ubu import UbuIE from .ubu import UbuIE
from .udemy import ( from .udemy import (
UdemyIE, UdemyIE,

@ -15,44 +15,11 @@ from ..utils import (
) )
class TwitchIE(InfoExtractor): class TwitchBaseIE(InfoExtractor):
# TODO: One broadcast may be split into multiple videos. The key _VALID_URL_BASE = r'http://(?:www\.)?twitch\.tv'
# 'broadcast_id' is the same for all parts, and 'broadcast_part'
# starts at 1 and increases. Can we treat all parts as one video?
_VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/
(?:
(?P<channelid>[^/]+)|
(?:(?:[^/]+)/v/(?P<vodid>[^/]+))|
(?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
(?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
)
/?(?:\#.*)?$
"""
_PAGE_LIMIT = 100
_API_BASE = 'https://api.twitch.tv' _API_BASE = 'https://api.twitch.tv'
_LOGIN_URL = 'https://secure.twitch.tv/user/login' _LOGIN_URL = 'https://secure.twitch.tv/user/login'
_TESTS = [{
'url': 'http://www.twitch.tv/riotgames/b/577357806',
'info_dict': {
'id': 'a577357806',
'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
},
'playlist_mincount': 12,
}, {
'url': 'http://www.twitch.tv/acracingleague/c/5285812',
'info_dict': {
'id': 'c5285812',
'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
},
'playlist_mincount': 3,
}, {
'url': 'http://www.twitch.tv/vanillatv',
'info_dict': {
'id': 'vanillatv',
'title': 'VanillaTV',
},
'playlist_mincount': 412,
}]
def _handle_error(self, response): def _handle_error(self, response):
if not isinstance(response, dict): if not isinstance(response, dict):
@ -64,34 +31,60 @@ class TwitchIE(InfoExtractor):
expected=True) expected=True)
def _download_json(self, url, video_id, note='Downloading JSON metadata'): def _download_json(self, url, video_id, note='Downloading JSON metadata'):
response = super(TwitchIE, self)._download_json(url, video_id, note) response = super(TwitchBaseIE, self)._download_json(url, video_id, note)
self._handle_error(response) self._handle_error(response)
return response return response
def _extract_media(self, item, item_id): def _real_initialize(self):
ITEMS = { self._login()
'a': 'video',
'v': 'vod', def _login(self):
'c': 'chapter', (username, password) = self._get_login_info()
if username is None:
return
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
authenticity_token = self._search_regex(
r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
login_page, 'authenticity token')
login_form = {
'utf8': ''.encode('utf-8'),
'authenticity_token': authenticity_token,
'redirect_on_login': '',
'embed_form': 'false',
'mp_source_action': '',
'follow': '',
'user[login]': username,
'user[password]': password,
} }
info = self._extract_info(self._download_json(
'%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
'Downloading %s info JSON' % ITEMS[item]))
if item == 'v': request = compat_urllib_request.Request(
access_token = self._download_json( self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, request.add_header('Referer', self._LOGIN_URL)
'Downloading %s access token' % ITEMS[item]) response = self._download_webpage(
formats = self._extract_m3u8_formats( request, None, 'Logging in as %s' % username)
'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
% (item_id, access_token['token'], access_token['sig']), m = re.search(
item_id, 'mp4') r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
info['formats'] = formats if m:
return info raise ExtractorError(
'Unable to login: %s' % m.group('msg').strip(), expected=True)
class TwitchItemBaseIE(TwitchBaseIE):
def _download_info(self, item, item_id):
return self._extract_info(self._download_json(
'%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
'Downloading %s info JSON' % self._ITEM_TYPE))
def _extract_media(self, item_id):
info = self._download_info(self._ITEM_SHORTCUT, item_id)
response = self._download_json( response = self._download_json(
'%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id, '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
'Downloading %s playlist JSON' % ITEMS[item]) 'Downloading %s playlist JSON' % self._ITEM_TYPE)
entries = [] entries = []
chunks = response['chunks'] chunks = response['chunks']
qualities = list(chunks.keys()) qualities = list(chunks.keys())
@ -129,104 +122,87 @@ class TwitchIE(InfoExtractor):
'view_count': info['views'], 'view_count': info['views'],
} }
def _real_initialize(self): def _real_extract(self, url):
self._login() return self._extract_media(self._match_id(url))
def _login(self):
(username, password) = self._get_login_info()
if username is None:
return
login_page = self._download_webpage( class TwitchVideoIE(TwitchItemBaseIE):
self._LOGIN_URL, None, 'Downloading login page') IE_NAME = 'twitch:video'
_VALID_URL = r'%s/[^/]+/b/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
_ITEM_TYPE = 'video'
_ITEM_SHORTCUT = 'a'
authenticity_token = self._search_regex( _TEST = {
r'<input name="authenticity_token" type="hidden" value="([^"]+)"', 'url': 'http://www.twitch.tv/riotgames/b/577357806',
login_page, 'authenticity token') 'info_dict': {
'id': 'a577357806',
'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
},
'playlist_mincount': 12,
}
login_form = {
'utf8': ''.encode('utf-8'), class TwitchChapterIE(TwitchItemBaseIE):
'authenticity_token': authenticity_token, IE_NAME = 'twitch:chapter'
'redirect_on_login': '', _VALID_URL = r'%s/[^/]+/c/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
'embed_form': 'false', _ITEM_TYPE = 'chapter'
'mp_source_action': '', _ITEM_SHORTCUT = 'c'
'follow': '',
'user[login]': username, _TEST = {
'user[password]': password, 'url': 'http://www.twitch.tv/acracingleague/c/5285812',
'info_dict': {
'id': 'c5285812',
'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
},
'playlist_mincount': 3,
} }
request = compat_urllib_request.Request(
self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
request.add_header('Referer', self._LOGIN_URL)
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
m = re.search( class TwitchVodIE(TwitchItemBaseIE):
r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response) IE_NAME = 'twitch:vod'
if m: _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
raise ExtractorError( _ITEM_TYPE = 'vod'
'Unable to login: %s' % m.group('msg').strip(), expected=True) _ITEM_SHORTCUT = 'v'
def _real_extract(self, url): _TEST = {
mobj = re.match(self._VALID_URL, url) 'url': 'http://www.twitch.tv/ksptv/v/3622000',
if mobj.group('chapterid'): 'info_dict': {
return self._extract_media('c', mobj.group('chapterid')) 'id': 'v3622000',
'ext': 'mp4',
""" 'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
webpage = self._download_webpage(url, chapter_id) 'thumbnail': 're:^https?://.*\.jpg$',
m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage) 'duration': 6951,
if not m: 'timestamp': 1419028564,
raise ExtractorError('Cannot find archive of a chapter') 'upload_date': '20141219',
archive_id = m.group(1) 'uploader': 'KSPTV',
'uploader_id': 'ksptv',
api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id 'view_count': int,
doc = self._download_xml( },
api, chapter_id, 'params': {
note='Downloading chapter information', # m3u8 download
errnote='Chapter information download failed') 'skip_download': True,
for a in doc.findall('.//archive'): },
if archive_id == a.find('./id').text:
break
else:
raise ExtractorError('Could not find chapter in chapter information')
video_url = a.find('./video_file_url').text
video_ext = video_url.rpartition('.')[2] or 'flv'
chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
chapter_info = self._download_json(
chapter_api_url, 'c' + chapter_id,
note='Downloading chapter metadata',
errnote='Download of chapter metadata failed')
bracket_start = int(doc.find('.//bracket_start').text)
bracket_end = int(doc.find('.//bracket_end').text)
# TODO determine start (and probably fix up file)
# youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
#video_url += '?start=' + TODO:start_timestamp
# bracket_start is 13290, but we want 51670615
self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
info = {
'id': 'c' + chapter_id,
'url': video_url,
'ext': video_ext,
'title': chapter_info['title'],
'thumbnail': chapter_info['preview'],
'description': chapter_info['description'],
'uploader': chapter_info['channel']['display_name'],
'uploader_id': chapter_info['channel']['name'],
} }
def _real_extract(self, url):
item_id = self._match_id(url)
info = self._download_info(self._ITEM_SHORTCUT, item_id)
access_token = self._download_json(
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
'Downloading %s access token' % self._ITEM_TYPE)
formats = self._extract_m3u8_formats(
'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
% (item_id, access_token['token'], access_token['sig']),
item_id, 'mp4')
info['formats'] = formats
return info return info
"""
elif mobj.group('videoid'):
return self._extract_media('a', mobj.group('videoid')) class TwitchPlaylistBaseIE(TwitchBaseIE):
elif mobj.group('vodid'): _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
return self._extract_media('v', mobj.group('vodid')) _PAGE_LIMIT = 100
elif mobj.group('channelid'):
channel_id = mobj.group('channelid') def _extract_playlist(self, channel_id):
info = self._download_json( info = self._download_json(
'%s/kraken/channels/%s' % (self._API_BASE, channel_id), '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
channel_id, 'Downloading channel info JSON') channel_id, 'Downloading channel info JSON')
@ -236,12 +212,45 @@ class TwitchIE(InfoExtractor):
limit = self._PAGE_LIMIT limit = self._PAGE_LIMIT
for counter in itertools.count(1): for counter in itertools.count(1):
response = self._download_json( response = self._download_json(
'%s/kraken/channels/%s/videos/?offset=%d&limit=%d' self._PLAYLIST_URL % (channel_id, offset, limit),
% (self._API_BASE, channel_id, offset, limit), channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
channel_id, 'Downloading channel videos JSON page %d' % counter)
videos = response['videos'] videos = response['videos']
if not videos: if not videos:
break break
entries.extend([self.url_result(video['url'], 'Twitch') for video in videos]) entries.extend([self.url_result(video['url']) for video in videos])
offset += limit offset += limit
return self.playlist_result(entries, channel_id, channel_name) return self.playlist_result(entries, channel_id, channel_name)
def _real_extract(self, url):
return self._extract_playlist(self._match_id(url))
class TwitchProfileIE(TwitchPlaylistBaseIE):
    """Playlist extractor for Twitch channel profile URLs (``/<channel>/profile``)."""

    IE_NAME = 'twitch:profile'

    # Label interpolated into the per-page "Downloading ... videos JSON page"
    # note emitted by the playlist base class.
    _PLAYLIST_TYPE = 'profile'

    # The channel name is captured as ``id``; an optional trailing slash and
    # URL fragment are tolerated, anything else after ``/profile`` is rejected.
    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE

    _TEST = {
        'url': 'http://www.twitch.tv/vanillatv/profile',
        'info_dict': {
            'id': 'vanillatv',
            'title': 'VanillaTV',
        },
        'playlist_mincount': 412,
    }
class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
    """Playlist extractor for ``/<channel>/profile/past_broadcasts`` URLs."""

    # Each extractor needs its own name; this was previously a copy of
    # TwitchProfileIE's 'twitch:profile', making the two indistinguishable.
    IE_NAME = 'twitch:past_broadcasts'

    # The channel name is captured as ``id``; an optional trailing slash and
    # URL fragment are tolerated.
    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE

    # Same paged listing URL template as the base class, with
    # ``broadcasts=true`` appended to the query string.
    # NOTE(review): presumably this limits the listing to past broadcasts --
    # confirm against the Twitch API documentation.
    _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'

    # Label interpolated into the per-page download note by the base class.
    _PLAYLIST_TYPE = 'past broadcasts'

    _TEST = {
        'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts',
        'info_dict': {
            'id': 'spamfish',
            'title': 'Spamfish',
        },
        'playlist_mincount': 54,
    }

Loading…
Cancel
Save