[twitch] Refactor and add support for past broadcasts

pull/8/head
Sergey M․ 10 years ago
parent 47e0e1e0e2
commit c5db6bb32b

@ -458,7 +458,13 @@ from .tvigle import TvigleIE
from .tvp import TvpIE, TvpSeriesIE from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE from .tvplay import TVPlayIE
from .twentyfourvideo import TwentyFourVideoIE from .twentyfourvideo import TwentyFourVideoIE
from .twitch import TwitchIE from .twitch import (
TwitchVideoIE,
TwitchChapterIE,
TwitchVodIE,
TwitchProfileIE,
TwitchPastBroadcastsIE,
)
from .ubu import UbuIE from .ubu import UbuIE
from .udemy import ( from .udemy import (
UdemyIE, UdemyIE,

@ -15,44 +15,11 @@ from ..utils import (
) )
class TwitchIE(InfoExtractor): class TwitchBaseIE(InfoExtractor):
# TODO: One broadcast may be split into multiple videos. The key _VALID_URL_BASE = r'http://(?:www\.)?twitch\.tv'
# 'broadcast_id' is the same for all parts, and 'broadcast_part'
# starts at 1 and increases. Can we treat all parts as one video?
_VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/
(?:
(?P<channelid>[^/]+)|
(?:(?:[^/]+)/v/(?P<vodid>[^/]+))|
(?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
(?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
)
/?(?:\#.*)?$
"""
_PAGE_LIMIT = 100
_API_BASE = 'https://api.twitch.tv' _API_BASE = 'https://api.twitch.tv'
_LOGIN_URL = 'https://secure.twitch.tv/user/login' _LOGIN_URL = 'https://secure.twitch.tv/user/login'
_TESTS = [{
'url': 'http://www.twitch.tv/riotgames/b/577357806',
'info_dict': {
'id': 'a577357806',
'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
},
'playlist_mincount': 12,
}, {
'url': 'http://www.twitch.tv/acracingleague/c/5285812',
'info_dict': {
'id': 'c5285812',
'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
},
'playlist_mincount': 3,
}, {
'url': 'http://www.twitch.tv/vanillatv',
'info_dict': {
'id': 'vanillatv',
'title': 'VanillaTV',
},
'playlist_mincount': 412,
}]
def _handle_error(self, response): def _handle_error(self, response):
if not isinstance(response, dict): if not isinstance(response, dict):
@ -64,34 +31,60 @@ class TwitchIE(InfoExtractor):
expected=True) expected=True)
def _download_json(self, url, video_id, note='Downloading JSON metadata'): def _download_json(self, url, video_id, note='Downloading JSON metadata'):
response = super(TwitchIE, self)._download_json(url, video_id, note) response = super(TwitchBaseIE, self)._download_json(url, video_id, note)
self._handle_error(response) self._handle_error(response)
return response return response
def _extract_media(self, item, item_id): def _real_initialize(self):
ITEMS = { self._login()
'a': 'video',
'v': 'vod', def _login(self):
'c': 'chapter', (username, password) = self._get_login_info()
if username is None:
return
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
authenticity_token = self._search_regex(
r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
login_page, 'authenticity token')
login_form = {
'utf8': ''.encode('utf-8'),
'authenticity_token': authenticity_token,
'redirect_on_login': '',
'embed_form': 'false',
'mp_source_action': '',
'follow': '',
'user[login]': username,
'user[password]': password,
} }
info = self._extract_info(self._download_json(
request = compat_urllib_request.Request(
self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
request.add_header('Referer', self._LOGIN_URL)
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
m = re.search(
r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
if m:
raise ExtractorError(
'Unable to login: %s' % m.group('msg').strip(), expected=True)
class TwitchItemBaseIE(TwitchBaseIE):
def _download_info(self, item, item_id):
return self._extract_info(self._download_json(
'%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id, '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
'Downloading %s info JSON' % ITEMS[item])) 'Downloading %s info JSON' % self._ITEM_TYPE))
if item == 'v':
access_token = self._download_json(
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
'Downloading %s access token' % ITEMS[item])
formats = self._extract_m3u8_formats(
'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
% (item_id, access_token['token'], access_token['sig']),
item_id, 'mp4')
info['formats'] = formats
return info
def _extract_media(self, item_id):
info = self._download_info(self._ITEM_SHORTCUT, item_id)
response = self._download_json( response = self._download_json(
'%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id, '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
'Downloading %s playlist JSON' % ITEMS[item]) 'Downloading %s playlist JSON' % self._ITEM_TYPE)
entries = [] entries = []
chunks = response['chunks'] chunks = response['chunks']
qualities = list(chunks.keys()) qualities = list(chunks.keys())
@ -129,119 +122,135 @@ class TwitchIE(InfoExtractor):
'view_count': info['views'], 'view_count': info['views'],
} }
def _real_initialize(self): def _real_extract(self, url):
self._login() return self._extract_media(self._match_id(url))
def _login(self):
(username, password) = self._get_login_info()
if username is None:
return
login_page = self._download_webpage( class TwitchVideoIE(TwitchItemBaseIE):
self._LOGIN_URL, None, 'Downloading login page') IE_NAME = 'twitch:video'
_VALID_URL = r'%s/[^/]+/b/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
_ITEM_TYPE = 'video'
_ITEM_SHORTCUT = 'a'
authenticity_token = self._search_regex( _TEST = {
r'<input name="authenticity_token" type="hidden" value="([^"]+)"', 'url': 'http://www.twitch.tv/riotgames/b/577357806',
login_page, 'authenticity token') 'info_dict': {
'id': 'a577357806',
'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
},
'playlist_mincount': 12,
}
login_form = {
'utf8': ''.encode('utf-8'),
'authenticity_token': authenticity_token,
'redirect_on_login': '',
'embed_form': 'false',
'mp_source_action': '',
'follow': '',
'user[login]': username,
'user[password]': password,
}
request = compat_urllib_request.Request( class TwitchChapterIE(TwitchItemBaseIE):
self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) IE_NAME = 'twitch:chapter'
request.add_header('Referer', self._LOGIN_URL) _VALID_URL = r'%s/[^/]+/c/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
response = self._download_webpage( _ITEM_TYPE = 'chapter'
request, None, 'Logging in as %s' % username) _ITEM_SHORTCUT = 'c'
m = re.search( _TEST = {
r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response) 'url': 'http://www.twitch.tv/acracingleague/c/5285812',
if m: 'info_dict': {
raise ExtractorError( 'id': 'c5285812',
'Unable to login: %s' % m.group('msg').strip(), expected=True) 'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
},
'playlist_mincount': 3,
}
class TwitchVodIE(TwitchItemBaseIE):
IE_NAME = 'twitch:vod'
_VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
_ITEM_TYPE = 'vod'
_ITEM_SHORTCUT = 'v'
_TEST = {
'url': 'http://www.twitch.tv/ksptv/v/3622000',
'info_dict': {
'id': 'v3622000',
'ext': 'mp4',
'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 6951,
'timestamp': 1419028564,
'upload_date': '20141219',
'uploader': 'KSPTV',
'uploader_id': 'ksptv',
'view_count': int,
},
'params': {
# m3u8 download
'skip_download': True,
},
}
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) item_id = self._match_id(url)
if mobj.group('chapterid'): info = self._download_info(self._ITEM_SHORTCUT, item_id)
return self._extract_media('c', mobj.group('chapterid')) access_token = self._download_json(
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
""" 'Downloading %s access token' % self._ITEM_TYPE)
webpage = self._download_webpage(url, chapter_id) formats = self._extract_m3u8_formats(
m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage) 'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
if not m: % (item_id, access_token['token'], access_token['sig']),
raise ExtractorError('Cannot find archive of a chapter') item_id, 'mp4')
archive_id = m.group(1) info['formats'] = formats
return info
api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
doc = self._download_xml(
api, chapter_id, class TwitchPlaylistBaseIE(TwitchBaseIE):
note='Downloading chapter information', _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
errnote='Chapter information download failed') _PAGE_LIMIT = 100
for a in doc.findall('.//archive'):
if archive_id == a.find('./id').text: def _extract_playlist(self, channel_id):
break info = self._download_json(
else: '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
raise ExtractorError('Could not find chapter in chapter information') channel_id, 'Downloading channel info JSON')
channel_name = info.get('display_name') or info.get('name')
video_url = a.find('./video_file_url').text entries = []
video_ext = video_url.rpartition('.')[2] or 'flv' offset = 0
limit = self._PAGE_LIMIT
chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id for counter in itertools.count(1):
chapter_info = self._download_json( response = self._download_json(
chapter_api_url, 'c' + chapter_id, self._PLAYLIST_URL % (channel_id, offset, limit),
note='Downloading chapter metadata', channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
errnote='Download of chapter metadata failed') videos = response['videos']
if not videos:
bracket_start = int(doc.find('.//bracket_start').text) break
bracket_end = int(doc.find('.//bracket_end').text) entries.extend([self.url_result(video['url']) for video in videos])
offset += limit
# TODO determine start (and probably fix up file) return self.playlist_result(entries, channel_id, channel_name)
# youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
#video_url += '?start=' + TODO:start_timestamp def _real_extract(self, url):
# bracket_start is 13290, but we want 51670615 return self._extract_playlist(self._match_id(url))
self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
class TwitchProfileIE(TwitchPlaylistBaseIE):
info = { IE_NAME = 'twitch:profile'
'id': 'c' + chapter_id, _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
'url': video_url, _PLAYLIST_TYPE = 'profile'
'ext': video_ext,
'title': chapter_info['title'], _TEST = {
'thumbnail': chapter_info['preview'], 'url': 'http://www.twitch.tv/vanillatv/profile',
'description': chapter_info['description'], 'info_dict': {
'uploader': chapter_info['channel']['display_name'], 'id': 'vanillatv',
'uploader_id': chapter_info['channel']['name'], 'title': 'VanillaTV',
} },
return info 'playlist_mincount': 412,
""" }
elif mobj.group('videoid'):
return self._extract_media('a', mobj.group('videoid'))
elif mobj.group('vodid'): class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
return self._extract_media('v', mobj.group('vodid')) IE_NAME = 'twitch:profile'
elif mobj.group('channelid'): _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
channel_id = mobj.group('channelid') _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'
info = self._download_json( _PLAYLIST_TYPE = 'past broadcasts'
'%s/kraken/channels/%s' % (self._API_BASE, channel_id),
channel_id, 'Downloading channel info JSON') _TEST = {
channel_name = info.get('display_name') or info.get('name') 'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts',
entries = [] 'info_dict': {
offset = 0 'id': 'spamfish',
limit = self._PAGE_LIMIT 'title': 'Spamfish',
for counter in itertools.count(1): },
response = self._download_json( 'playlist_mincount': 54,
'%s/kraken/channels/%s/videos/?offset=%d&limit=%d' }
% (self._API_BASE, channel_id, offset, limit),
channel_id, 'Downloading channel videos JSON page %d' % counter)
videos = response['videos']
if not videos:
break
entries.extend([self.url_result(video['url'], 'Twitch') for video in videos])
offset += limit
return self.playlist_result(entries, channel_id, channel_name)

Loading…
Cancel
Save