[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a timed sequence of stand-alone images, such as slideshows or thumbnail streams. This can be used for implementing: https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762 https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231 https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239 https://github.com/ytdl-org/youtube-dl/issues/9868 https://github.com/ytdl-org/youtube-dl/pull/14951 Authored by: fstirlitz
This also adds extraction of storyboards from DASH manifests as mhtml.
4 years ago · cdb19aa4c2
parent 4d85fbbdbb
commit cdb19aa4c2
6 changed files with 248 additions and 16 deletions
--- a/yt_dlp/downloader/__init__.py
+++ b/yt_dlp/downloader/__init__.py
@ -22,6 +22,7 @@ from .http import HttpFD
 from .rtmp import RtmpFD
 from .rtsp import RtspFD
 from .ism import IsmFD
 from .mhtml import MhtmlFD
 from .niconico import NiconicoDmcFD
 from .youtube_live_chat import YoutubeLiveChatReplayFD
 from .external import (
@ -39,6 +40,7 @@ PROTOCOL_MAP = {
    'f4m': F4mFD,
    'http_dash_segments': DashSegmentsFD,
    'ism': IsmFD,
    'mhtml': MhtmlFD,
    'niconico_dmc': NiconicoDmcFD,
    'youtube_live_chat_replay': YoutubeLiveChatReplayFD,
 }
--- a/yt_dlp/downloader/mhtml.py
+++ b/yt_dlp/downloader/mhtml.py
@ -0,0 +1,202 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import io
 import quopri
 import re
 import uuid
 from .fragment import FragmentFD
 from ..utils import (
    escapeHTML,
    formatSeconds,
    srt_subtitles_timecode,
    urljoin,
 )
 from ..version import __version__ as YT_DLP_VERSION
 class MhtmlFD(FragmentFD):
    """Fragment downloader that packs image fragments into a single MHTML file.

    The output is a ``multipart/related`` MIME document. Its first part is a
    generated HTML page holding one ``<figure>`` per fragment; each following
    part is one downloaded image, referenced from the page via a ``cid:`` URL.
    """

    FD_NAME = 'mhtml'

    _STYLESHEET = """\
 html, body {
    margin: 0;
    padding: 0;
    height: 100vh;
 }
 html {
    overflow-y: scroll;
    scroll-snap-type: y mandatory;
 }
 body {
    scroll-snap-type: y mandatory;
    display: flex;
    flex-flow: column;
 }
 body > figure {
    max-width: 100vw;
    max-height: 100vh;
    scroll-snap-align: center;
 }
 body > figure > figcaption {
    text-align: center;
    height: 2.5em;
 }
 body > figure > img {
    display: block;
    margin: auto;
    max-width: 100%;
    max-height: calc(100vh - 5em);
 }
 """
    # Minify the stylesheet: collapse every whitespace run to one space, then
    # drop the spaces that touch punctuation. Spaces separating word/hyphen
    # tokens (e.g. inside ``calc(100vh - 5em)``) are kept.
    _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
    _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)

    @staticmethod
    def _escape_mime(s):
        """Return *s* wrapped as a ``=?utf-8?Q?...?=`` MIME encoded word.

        The text is UTF-8 encoded and passed through ``quopri`` in header
        mode; any remaining byte below 0x20 is additionally written as
        ``=XX`` so no raw control character ends up in the header.
        """
        return '=?utf-8?Q?' + (b''.join(
            bytes((b,)) if b >= 0x20 else b'=%02X' % b
            for b in quopri.encodestring(s.encode('utf-8'), header=True)
        )).decode('us-ascii') + '?='

    @staticmethod
    def _sniff_mime_type(frag_content):
        """Guess the image MIME type (as bytes) from the leading magic bytes.

        Recognizes PNG, GIF and WebP; anything else is reported as JPEG.
        """
        if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
            return b'image/png'
        if frag_content.startswith((b'GIF87a', b'GIF89a')):
            return b'image/gif'
        # The slice is bytes, so it must be compared against a bytes literal;
        # comparing with a str never matches and WebP would pass as JPEG.
        if frag_content.startswith(b'RIFF') and frag_content[8:12] == b'WEBP':
            return b'image/webp'
        return b'image/jpeg'

    def _gen_cid(self, i, fragment, frag_boundary):
        """Return the Content-ID for fragment number *i* (0-based).

        Only the index and the MIME boundary are used; *fragment* is accepted
        but not read here.
        """
        return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)

    def _gen_stub(self, *, fragments, frag_boundary, title):
        """Build the HTML page that displays every fragment as a slide.

        Each fragment becomes a ``<figure>`` whose ``<img>`` points at the
        matching ``cid:`` part. When a fragment has a usable ``duration`` the
        caption also shows its start/end time; otherwise only the slide
        number is shown.

        Returns the page as a str.
        """
        output = io.StringIO()
        output.write((
            '<!DOCTYPE html>'
            '<html>'
            '<head>'
            '<meta name="generator" content="yt-dlp {version}">'
            '<title>{title}</title>'
            '<style>{styles}</style>'
            '<body>'
        ).format(
            version=escapeHTML(YT_DLP_VERSION),
            styles=self._STYLESHEET,
            title=escapeHTML(title)
        ))

        t0 = 0
        for i, frag in enumerate(fragments):
            output.write('<figure>')
            try:
                t1 = t0 + frag['duration']
                output.write((
                    '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'
                ).format(
                    num=i + 1,
                    t0=srt_subtitles_timecode(t0),
                    t1=srt_subtitles_timecode(t1),
                    duration=formatSeconds(frag['duration'], msec=True)
                ))
            except (KeyError, ValueError, TypeError):
                # Missing/invalid duration: the running clock becomes None, so
                # every later slide also takes this branch (None + x raises
                # TypeError) instead of showing a wrong timestamp.
                t1 = None
                output.write((
                    '<figcaption>Slide #{num}</figcaption>'
                ).format(num=i + 1))
            output.write('<img src="cid:{cid}">'.format(
                cid=self._gen_cid(i, frag, frag_boundary)))
            output.write('</figure>')
            t0 = t1

        return output.getvalue()

    def real_download(self, filename, info_dict):
        """Download all fragments and write them to *filename* as MHTML.

        Reads ``fragments``, ``title`` and ``webpage_url`` (required) and
        ``fragment_base_url`` (optional) from *info_dict*. With the ``test``
        param set, only the first fragment is fetched. Fragments whose
        download fails are skipped. Returns True.
        """
        fragment_base_url = info_dict.get('fragment_base_url')
        fragments = info_dict['fragments'][:1] if self.params.get(
            'test', False) else info_dict['fragments']
        title = info_dict['title']
        origin = info_dict['webpage_url']

        ctx = {
            'filename': filename,
            'total_frags': len(fragments),
        }

        self._prepare_and_start_frag_download(ctx)

        # The boundary is kept in extra_state so that, if the helper above
        # supplied an existing state, the same boundary (and Content-IDs) is
        # reused and the header is not written twice.
        extra_state = ctx.setdefault('extra_state', {
            'header_written': False,
            'mime_boundary': str(uuid.uuid4()).replace('-', ''),
        })

        frag_boundary = extra_state['mime_boundary']

        if not extra_state['header_written']:
            stub = self._gen_stub(
                fragments=fragments,
                frag_boundary=frag_boundary,
                title=title
            )

            ctx['dest_stream'].write((
                'MIME-Version: 1.0\r\n'
                'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'Subject: {title}\r\n'
                'Content-type: multipart/related; '
                'boundary="{boundary}"; '
                'type="text/html"\r\n'
                'X.yt-dlp.Origin: {origin}\r\n'
                '\r\n'
                '--{boundary}\r\n'
                'Content-Type: text/html; charset=utf-8\r\n'
                'Content-Length: {length}\r\n'
                '\r\n'
                '{stub}\r\n'
            ).format(
                origin=origin,
                boundary=frag_boundary,
                # Content-Length counts bytes; the stub is written as UTF-8
                # and contains non-ASCII text, so len(stub) would be too small.
                length=len(stub.encode('utf-8')),
                title=self._escape_mime(title),
                stub=stub
            ).encode('utf-8'))
            extra_state['header_written'] = True

        boundary_bytes = frag_boundary.encode('us-ascii')

        for i, fragment in enumerate(fragments):
            # Skip fragments already accounted for in ctx['fragment_index'].
            if (i + 1) <= ctx['fragment_index']:
                continue

            fragment_url = urljoin(fragment_base_url, fragment['path'])
            success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
            if not success:
                continue

            header_lines = [
                b'--%b\r\n' % boundary_bytes,
                b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'),
                b'Content-type: %b\r\n' % self._sniff_mime_type(frag_content),
                b'Content-length: %u\r\n' % len(frag_content),
                b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'),
            ]
            # _gen_stub tolerates fragments without a duration, so the part
            # header must not fail on them either; just omit the field.
            duration = fragment.get('duration')
            if duration is not None:
                header_lines.append(b'X.yt-dlp.Duration: %f\r\n' % duration)
            header_lines.append(b'\r\n')

            self._append_fragment(
                ctx, b''.join(header_lines) + frag_content + b'\r\n')

        # Closing delimiter of the multipart body.
        ctx['dest_stream'].write(b'--%b--\r\n\r\n' % boundary_bytes)
        self._finish_frag_download(ctx)
        return True
--- a/yt_dlp/extractor/canvas.py
+++ b/yt_dlp/extractor/canvas.py
@ -24,7 +24,7 @@ class CanvasIE(InfoExtractor):
    _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
-        'md5': '68993eda72ef62386a15ea2cf3c93107',
+        'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9',
        'info_dict': {
            'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
            'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
@ -32,9 +32,9 @@ class CanvasIE(InfoExtractor):
            'title': 'Nachtwacht: De Greystook',
            'description': 'Nachtwacht: De Greystook',
            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 1468.04,
+            'duration': 1468.02,
        },
-        'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
+        'expected_warnings': ['is not a supported codec'],
    }, {
        'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
        'only_matching': True,
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -2126,6 +2126,7 @@ class InfoExtractor(object):
                        format_id.append(str(format_index))
                    f = {
                        'format_id': '-'.join(format_id),
                        'format_note': name,
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
@ -2637,7 +2638,7 @@ class InfoExtractor(object):
                    mime_type = representation_attrib['mimeType']
                    content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
-                    if content_type in ('video', 'audio', 'text'):
+                    if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg':
                        base_url = ''
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
@ -2654,9 +2655,15 @@ class InfoExtractor(object):
                        url_el = representation.find(_add_ns('BaseURL'))
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                        if representation_id is not None:
                            format_id = representation_id
                        else:
                            format_id = content_type
                        if mpd_id:
                            format_id = mpd_id + '-' + format_id
                        if content_type in ('video', 'audio'):
                            f = {
-                                'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
+                                'format_id': format_id,
                                'manifest_url': mpd_url,
                                'ext': mimetype2ext(mime_type),
                                'width': int_or_none(representation_attrib.get('width')),
@ -2676,6 +2683,17 @@ class InfoExtractor(object):
                                'manifest_url': mpd_url,
                                'filesize': filesize,
                            }
                        elif mime_type == 'image/jpeg':
                            # See test case in VikiIE
                            # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                            f = {
                                'format_id': format_id,
                                'ext': 'mhtml',
                                'manifest_url': mpd_url,
                                'format_note': 'DASH storyboards (jpeg)',
                                'acodec': 'none',
                                'vcodec': 'none',
                            }
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                        def prepare_template(template_name, identifiers):
@ -2694,6 +2712,7 @@ class InfoExtractor(object):
                                    t += c
                            # Next, $...$ templates are translated to their
                            # %(...) counterparts to be used with % operator
                            if representation_id is not None:
                                t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
@ -2811,7 +2830,7 @@ class InfoExtractor(object):
                                'url': mpd_url or base_url,
                                'fragment_base_url': base_url,
                                'fragments': [],
-                                'protocol': 'http_dash_segments',
+                                'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                            })
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
@ -2822,7 +2841,7 @@ class InfoExtractor(object):
                        else:
                            # Assuming direct URL to unfragmented media.
                            f['url'] = base_url
-                        if content_type in ('video', 'audio'):
+                        if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
                            formats.append(f)
                        elif content_type == 'text':
                            subtitles.setdefault(lang or 'und', []).append(f)
--- a/yt_dlp/extractor/viki.py
+++ b/yt_dlp/extractor/viki.py
@ -142,6 +142,7 @@ class VikiIE(VikiBaseIE):
    IE_NAME = 'viki'
    _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
    _TESTS = [{
        'note': 'Free non-DRM video with storyboards in MPD',
        'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1',
        'info_dict': {
            'id': '1175236v',
@ -155,7 +156,6 @@ class VikiIE(VikiBaseIE):
        'params': {
            'format': 'bestvideo',
        },
        'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
    }, {
        'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
        'info_dict': {
@ -173,7 +173,6 @@ class VikiIE(VikiBaseIE):
            'format': 'bestvideo',
        },
        'skip': 'Blocked in the US',
        'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
    }, {
        # clip
        'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
@ -225,7 +224,6 @@ class VikiIE(VikiBaseIE):
        'params': {
            'format': 'bestvideo',
        },
        'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
    }, {
        # youtube external
        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@ -264,7 +262,6 @@ class VikiIE(VikiBaseIE):
        'params': {
            'format': 'bestvideo',
        },
        'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
    }]
    def _real_extract(self, url):
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@ -2244,6 +2244,17 @@ def unescapeHTML(s):
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 def escapeHTML(text):
    """Return *text* with the HTML-special characters replaced by entities.

    Escapes ``&``, ``<``, ``>``, ``"`` and ``'``.
    """
    # '&' has to come first; otherwise the ampersands introduced by the
    # other entities would themselves be escaped again.
    entity_table = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    escaped = text
    for raw_char, entity in entity_table:
        escaped = escaped.replace(raw_char, entity)
    return escaped
 def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
@ -2323,13 +2334,14 @@ def decodeOption(optval):
    return optval
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as ``H:MM:SS``, ``M:SS`` or ``S``.

    secs  -- duration in seconds (int or float)
    delim -- separator placed between the hour/minute/second fields
    msec  -- when true, append ``.mmm`` (milliseconds, zero-padded)

    The hour field is used only above 3600 seconds and the minute field only
    above 60 seconds, so exactly 3600 renders as ``60:00`` and exactly 60 as
    ``60``.
    """
    if secs > 3600:
        ret = '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60)
    elif secs > 60:
        ret = '%d%s%02d' % (secs // 60, delim, secs % 60)
    else:
        ret = '%d' % secs
    if not msec:
        return ret
    # secs % 1 is a fraction below 1, so it must be scaled to milliseconds
    # before integer formatting (otherwise the field is always 000). Clamp to
    # 999 so rounding never spills into a fourth digit; the whole-second part
    # above is truncated, not rounded.
    millis = min(int(round((secs % 1) * 1000)), 999)
    return '%s.%03d' % (ret, millis)
 def make_HTTPS_handler(params, **kwargs):