Merge pull request #12 from siikamiika/youtube-live-chat

Youtube live chat
4 years ago · a9c069012f
parent ba304106de 15eae44d74
commit a9c069012f
4 changed files with 138 additions and 17 deletions
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -1805,6 +1805,14 @@ class YoutubeDL(object):
                    self.report_error('Cannot write annotations file: ' + annofn)
                    return
        def dl(name, info):
            # Shared helper: run the downloader suited to `info` and hand
            # back whatever its download() call returns.
            downloader_cls = get_suitable_downloader(info, self.params)
            downloader = downloader_cls(self, self.params)
            # Forward every registered progress hook to the new downloader.
            for hook in self._progress_hooks:
                downloader.add_progress_hook(hook)
            verbose = self.params.get('verbose')
            if verbose:
                self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
            return downloader.download(name, info)
        subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                       self.params.get('writeautomaticsub')])
@ -1812,14 +1820,12 @@ class YoutubeDL(object):
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            subtitles = info_dict['requested_subtitles']
            ie = self.get_info_extractor(info_dict['extractor_key'])
            for sub_lang, sub_info in subtitles.items():
                sub_format = sub_info['ext']
                sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
                    self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
                else:
                    self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
                    if sub_info.get('data') is not None:
                        try:
                            # Use newline='' to prevent conversion of newline characters
@ -1831,11 +1837,11 @@ class YoutubeDL(object):
                            return
                    else:
                        try:
-                            sub_data = ie._request_webpage(
+                            dl(sub_filename, sub_info)
-                                sub_info['url'], info_dict['id'], note=False).read()
+                        except (ExtractorError, IOError, OSError, ValueError,
-                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
+                                compat_urllib_error.URLError,
-                                subfile.write(sub_data)
+                                compat_http_client.HTTPException,
-                        except (ExtractorError, IOError, OSError, ValueError) as err:
+                                socket.error) as err:
                            self.report_warning('Unable to download subtitle for "%s": %s' %
                                                (sub_lang, error_to_compat_str(err)))
                            continue
@ -1856,14 +1862,6 @@ class YoutubeDL(object):
        if not self.params.get('skip_download', False):
            try:
                def dl(name, info):
                    # Instantiate the downloader class selected for `info`.
                    selected_fd = get_suitable_downloader(info, self.params)
                    file_downloader = selected_fd(self, self.params)
                    # Attach all progress hooks before starting the transfer.
                    for progress_hook in self._progress_hooks:
                        file_downloader.add_progress_hook(progress_hook)
                    debug_enabled = self.params.get('verbose')
                    if debug_enabled:
                        self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
                    return file_downloader.download(name, info)
                if info_dict.get('requested_formats') is not None:
                    downloaded = []
                    success = True
--- a/youtube_dl/downloader/__init__.py
+++ b/youtube_dl/downloader/__init__.py
@ -8,6 +8,7 @@ from .rtmp import RtmpFD
 from .dash import DashSegmentsFD
 from .rtsp import RtspFD
 from .ism import IsmFD
 from .youtube_live_chat import YoutubeLiveChatReplayFD
 from .external import (
    get_external_downloader,
    FFmpegFD,
@ -26,6 +27,7 @@ PROTOCOL_MAP = {
    'f4m': F4mFD,
    'http_dash_segments': DashSegmentsFD,
    'ism': IsmFD,
    'youtube_live_chat_replay': YoutubeLiveChatReplayFD,
 }
--- a/youtube_dl/downloader/youtube_live_chat.py
+++ b/youtube_dl/downloader/youtube_live_chat.py
@ -0,0 +1,94 @@
 from __future__ import division, unicode_literals
 import re
 import json
 from .fragment import FragmentFD
 class YoutubeLiveChatReplayFD(FragmentFD):
    """Download a YouTube live chat replay fragment by fragment.

    The output file contains one JSON-encoded chat action per line (UTF-8).
    """
    FD_NAME = 'youtube_live_chat_replay'

    def real_download(self, filename, info_dict):
        """Download the chat replay of ``info_dict['video_id']`` to filename.

        Reads ``info_dict['video_id']`` and the optional
        ``info_dict['http_headers']``.  When the ``test`` param is set, only
        the first chat page is fetched.

        Returns True when the replay was written, False when a page could
        not be downloaded or the watch page exposes no chat continuation.
        """
        video_id = info_dict['video_id']
        self.to_screen('[%s] Downloading live chat' % self.FD_NAME)
        test = self.params.get('test', False)
        ctx = {
            'filename': filename,
            'live': True,
            'total_frags': None,
        }

        def dl_fragment(url):
            headers = info_dict.get('http_headers', {})
            return self._download_fragment(ctx, url, info_dict, headers)

        def parse_yt_initial_data(data):
            """Return the ytInitialData object found in the page bytes, or
            None when neither known assignment form is present."""
            window_patt = b'window\\["ytInitialData"\\]\\s*=\\s*(.*?)(?<=});'
            var_patt = b'var\\s+ytInitialData\\s*=\\s*(.*?)(?<=});'
            for patt in window_patt, var_patt:
                match = re.search(patt, data)
                if match is not None:
                    # Decode explicitly: json.loads() only accepts bytes
                    # starting with Python 3.6.
                    return json.loads(match.group(1).decode('utf-8', 'replace'))
            return None

        self._prepare_and_start_frag_download(ctx)
        success, raw_fragment = dl_fragment(
            'https://www.youtube.com/watch?v={}'.format(video_id))
        if not success:
            return False
        data = parse_yt_initial_data(raw_fragment)
        try:
            continuation_id = data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
        except (KeyError, IndexError, TypeError):
            # TypeError also covers data being None (ytInitialData not found).
            self.to_screen(
                '[%s] Unable to find the live chat replay continuation' % self.FD_NAME)
            return False
        # no data yet but required to call _append_fragment
        self._append_fragment(ctx, b'')
        first = True
        offset = None
        while continuation_id is not None:
            if first:
                url = 'https://www.youtube.com/live_chat_replay?continuation={}'.format(continuation_id)
            else:
                # Request from 5 seconds before the last seen message
                # (presumably so nothing is missed between pages), but never
                # ask for a negative player offset.
                url = ('https://www.youtube.com/live_chat_replay/get_live_chat_replay'
                       + '?continuation={}'.format(continuation_id)
                       + '&playerOffsetMs={}'.format(max(offset - 5000, 0))
                       + '&hidden=false'
                       + '&pbj=1')
            success, raw_fragment = dl_fragment(url)
            if not success:
                return False
            try:
                if first:
                    data = parse_yt_initial_data(raw_fragment)
                else:
                    data = json.loads(raw_fragment.decode('utf-8', 'replace'))['response']
                live_chat_continuation = data['continuationContents']['liveChatContinuation']
            except (KeyError, TypeError):
                # A page without chat data is treated as the end of the
                # replay instead of crashing on the lookup.
                live_chat_continuation = {}
            first = False
            continuation_id = None
            offset = None
            processed_fragment = bytearray()
            if 'actions' in live_chat_continuation:
                for action in live_chat_continuation['actions']:
                    if 'replayChatItemAction' in action:
                        replay_chat_item_action = action['replayChatItemAction']
                        offset = int(replay_chat_item_action['videoOffsetTimeMsec'])
                    processed_fragment.extend(
                        json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n')
                try:
                    continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
                except (KeyError, IndexError, TypeError):
                    # No further replay page is advertised: stop after this one.
                    continuation_id = None
            self._append_fragment(ctx, processed_fragment)
            if test or offset is None:
                break
        self._finish_frag_download(ctx)
        return True
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -1435,7 +1435,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            raise ExtractorError(
                'Signature extraction failed: ' + tb, cause=e)
-    def _get_subtitles(self, video_id, webpage):
+    def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
        try:
            subs_doc = self._download_xml(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@ -1462,6 +1462,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    'ext': ext,
                })
            sub_lang_list[lang] = sub_formats
        if has_live_chat_replay:
            sub_lang_list['live_chat'] = [
                {
                    'video_id': video_id,
                    'ext': 'json',
                    'protocol': 'youtube_live_chat_replay',
                },
            ]
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
@ -1485,6 +1493,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)
    def _get_yt_initial_data(self, video_id, webpage):
        """Extract the ytInitialData blob embedded in a watch page.

        Searches `webpage` for either known assignment form and passes the
        captured text to `_parse_json` (non-fatal).  Returns None when the
        search yields nothing.
        """
        patterns = (
            r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
            r'var\s+ytInitialData\s*=\s*(.*?)(?<=});',
        )
        raw_config = self._search_regex(
            patterns, webpage, 'ytInitialData', default=None)
        if not raw_config:
            return None
        return self._parse_json(
            uppercase_escape(raw_config), video_id, fatal=False)
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
@ -1978,6 +1995,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        if is_live is None:
            is_live = bool_or_none(video_details.get('isLive'))
        has_live_chat_replay = False
        if not is_live:
            yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
            try:
                yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
                has_live_chat_replay = True
            except (KeyError, IndexError, TypeError):
                pass
        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
@ -2385,7 +2411,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
        # subtitles
-        video_subtitles = self.extract_subtitles(video_id, video_webpage)
+        video_subtitles = self.extract_subtitles(
            video_id, video_webpage, has_live_chat_replay)
        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
        video_duration = try_get(