yt-dlp/yt_dlp/extractor/common.py

# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_expanduser,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    extract_attributes,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    NO_DEFAULT,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either an url
                                 or a path. If an url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                  (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "_test_url" (optional, bool) - If true, test the URL
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (Eg: 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Set to True by initialize() once _real_initialize() has been run, so
    # that the real initialization is performed only once per instance
    _ready = False
    # Downloader object this extractor is attached to (see set_downloader())
    _downloader = None
    # Fake IP sent as X-Forwarded-For HTTP header for geo restriction bypass,
    # None as long as no fake IP has been selected
    _x_forwarded_for_ip = None
    # Geo restriction bypass configuration, see the class docstring
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Should be set to False for broken IEs, returned by working()
    _WORKING = True

    # Messages telling the user how to provide credentials, keyed by the
    # kind of authentication that is accepted.
    # NOTE(review): the code consuming these hints is not visible in this
    # part of the file - confirm the expected keys against it
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self._downloader.write_debug(
                    '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError as e:
            video_id = e.video_id or self.get_temp_id(url)
            raise ExtractorError(
                e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Name of this IE: the class name without its last two characters."""
        class_name = type(self).__name__
        return compat_str(class_name[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        """Tell whether the status code of the HTTP error err is expected.

        expected_status is either None (no status code is accepted), a
        callable that has to return True for an accepted status code, or
        the accepted status code(s) as handled by variadic.
        """
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if callable(expected_status):
            # Only a real True counts, not just any truthy value
            return expected_status(err.code) is True
        return err.code in variadic(expected_status)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.

        The dicts passed as headers and query are not modified (their
        defaults are shared between all calls, so they must never be).

        If the request fails with an HTTP error whose status code is
        accepted by expected_status, the response of the error is returned.
        For any other network error, False is returned if errnote is False
        or fatal is false (with a warning in the latter case), otherwise
        ExtractorError is raised.
        """
        # Every request but the very first one of the downloader is
        # preceded by the sleep interval requested by the user (if any)
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        # note=None prints the default message, note=False prints nothing
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            # Add the header to a copy: headers is either owned by the caller
            # or the default dict shared by all calls, and writing to the
            # latter would leak the fake IP into every later request that
            # does not pass its own headers
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            # errnote=False silences the error completely, even if fatal
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        False is returned instead when _request_webpage reports a failure by
        returning False.

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request, _, _ = url_or_request.partition('#')

        handle = self._request_webpage(
            url_or_request, video_id, note, errnote, fatal,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        if handle is False:
            assert not fatal
            return False

        page = self._webpage_read_content(
            handle, url_or_request, video_id, note, errnote, fatal,
            encoding=encoding)
        return (page, handle)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Requestobject
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractionError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as an compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as an compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        """Parse ``xml_string`` (text) into an element tree.

        ``transform_source``, if given, is applied to the text first. On a
        parse error ExtractorError is raised when ``fatal`` is true; otherwise
        a warning is reported and None is returned.
        """
        source = transform_source(xml_string) if transform_source else xml_string
        try:
            return compat_etree_fromstring(source.encode('utf-8'))
        except compat_xml_parse_error as parse_err:
            message = '%s: Failed to parse XML ' % video_id
            if not fatal:
                self.report_warning(message + str(parse_err))
                return None
            raise ExtractorError(message, cause=parse_err)

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        """Send a warning, prefixed with '[IE_NAME] ' and the video id, to the downloader.

        With ``only_once`` set, a message that is already recorded in
        ``self._printed_messages`` is dropped, and a new one is recorded there
        before being reported. Extra arguments are forwarded to the
        downloader's report_warning.
        """
        id_prefix = format_field(video_id, template='%s: ')
        msg = f'[{self.IE_NAME}] {id_prefix}{msg}'
        if only_once:
            seen_key = f'WARNING: {msg}'
            if seen_key in self._printed_messages:
                return
            self._printed_messages.add(seen_key)
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'

        The message is handed to the downloader's to_screen; extra positional
        and keyword arguments are forwarded unchanged.
        """
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        """Pass msg, prefixed with '[ie_name]', to the downloader's write_debug.

        Extra positional and keyword arguments are forwarded unchanged.
        """
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        """Report that the video is DRM protected.

        Delegates to raise_no_formats with expected=True, so whether this
        raises or only warns is decided there. The ``partial`` argument is
        accepted but not used by this implementation.
        """
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction.

        Prints '<id_or_name>: Extracting information' through to_screen.
        """
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download.

        Prints '<video_id>: Downloading webpage' through to_screen.
        """
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age.

        Prints 'Confirming age' through to_screen.
        """
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in.

        Prints 'Logging in' through to_screen.
        """
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        if method is not None:
            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        """Build a playlist result out of ``matches``.

        Each match (or ``getter(match)`` when a getter is given) is passed
        through _proto_relative_url and wrapped with url_result for extractor
        ``ie``; the entries are passed through orderedSet before the playlist
        is built.
        """
        def as_url_result(match):
            target = getter(match) if getter else match
            return self.url_result(self._proto_relative_url(target), ie)

        entries = orderedSet(as_url_result(m) for m in matches)
        return self.playlist_result(
            entries, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.

        Arguments:
        pattern -- a single pattern (string or compiled regex) or an iterable
            of patterns tried in order; an empty iterable counts as no match
        group -- None to get the first group that took part in the match, a
            list/tuple of groups to get a tuple of their values, or a single
            group name/number
        """
        # Start unmatched so that an empty iterable of patterns is handled by
        # the regular default/fatal logic below instead of leaving the name unbound.
        mobj = None
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)

        if default is not NO_DEFAULT:
            return default

        # The (possibly coloured) field name is only needed for the messages below
        if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        self.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but the match is passed through clean_html and stripped.

        A falsy search result (e.g. None or a falsy default) is returned as is.
        """
        found = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if not found:
            return found
        return clean_html(found).strip()

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        available look in the netrc file using the netrc_machine or _NETRC_MACHINE
        value.
        If there's no info available, return (None, None)
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search ``html`` for an OpenGraph property and return its unescaped value.

        ``prop`` may be a single property name or several; the regexes of all
        of them are tried in order. ``name`` is the field name used in
        messages and defaults to 'OpenGraph <first prop>'. Remaining keyword
        arguments go to _search_regex. None is returned when the search
        yields None.
        """
        props = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % props[0]
        og_regexes = [regex for p in props for regex in self._og_regexes(p)]
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        return None if escaped is None else unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Search html for the OpenGraph 'image' property, non-fatally.

        Extra keyword arguments are forwarded to _og_search_property.
        """
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Search html for the OpenGraph 'description' property, non-fatally.

        Extra keyword arguments are forwarded to _og_search_property.
        """
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Search html for the OpenGraph 'title' property.

        Extra keyword arguments are forwarded to _og_search_property.
        """
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        """Search html for the OpenGraph 'url' property.

        Extra keyword arguments are forwarded to _og_search_property.
        """
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        """Return the content of the <meta> tag identified by ``name``.

        ``name`` may be a single name or several, tried in order.
        ``display_name`` is the field name used in messages and defaults to
        the first name. The search is non-fatal unless ``fatal`` is set;
        extra keyword arguments are forwarded to _html_search_regex.
        """
        names = variadic(name)
        if display_name is None:
            display_name = names[0]
        patterns = [self._meta_regex(n) for n in names]
        return self._html_search_regex(
            patterns, html, display_name,
            fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        """Return the content of the 'dc.creator' meta tag, reported as 'uploader'."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        """Return the content of the 'twitter:player' meta tag, reported as 'twitter card player'."""
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        """Collect the JSON-LD blocks found in html and return the info extracted from them.

        Every JSON_LD_RE match is parsed; dicts are collected as is, lists and
        tuples are flattened in, anything else is ignored. The collected items
        are then handed to _json_ld together with ``expected_type``.

        Keyword arguments:
        default -- value returned when nothing could be extracted; passing it
            also makes parsing non-fatal
        fatal -- whether failure raises RegexNotFoundError (default) or only
            reports a warning and returns an empty dict
        """
        matches = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        has_default = default is not NO_DEFAULT
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = False if has_default else kwargs.get('fatal', True)

        collected = []
        for match in matches:
            item = self._parse_json(
                match.group('json_ld'), video_id, fatal=fatal)
            if not item:
                continue
            if isinstance(item, dict):
                collected.append(item)
            elif isinstance(item, (list, tuple)):
                collected.extend(item)

        info = collected
        if collected:
            info = self._json_ld(collected, video_id, fatal=fatal, expected_type=expected_type)
        if info:
            return info

        if has_default:
            return default
        if fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
        return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not input:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    class FormatSort:
        """Compute sort keys for format dicts.

        Usage (see _sort_formats): create an instance, call evaluate_params()
        once to resolve the field order and limits, then use
        calculate_preference() as the sort key of every format dict.

        The class attributes below are templates; every instance works on its
        own copy of `settings` and its own `_order` list.
        """

        # Syntax of one sort item: [+]field[(:|~)limit]
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'format_id')

        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'ignore', 'field': 'language_preference'},
            'quality': {'convert': 'float_none', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'ignore', 'field': 'source_preference'},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # Most of these exist only for compatibility reasons
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
            'format_id': {'type': 'alias', 'field': 'id'},
        }

        def __init__(self):
            # The resolved field order and the per-field data written by
            # evaluate_params() (reverse/closest/limit, cached defaults) depend
            # on the parameters of one particular evaluation. Keeping them on
            # the class would let the first evaluation leak into every later
            # instance, because add_item() skips fields already in the order.
            self._order = []
            self.settings = dict(
                (field, dict(props)) for field, props in type(self).settings.items())
            self._use_free_order = False
            self._sort_user = []
            self._sort_extractor = []

        def _get_field_setting(self, field, key):
            """Return setting `key` of `field`, computing and caching a default if unset."""
            if field not in self.settings:
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                field_type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if field_type == 'extractor' else (field,) if field_type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if field_type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            """Convert a raw string (limit text or format value) according to the field's 'convert' setting.

            A None value is returned as None unless convertNone is set.
            """
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                # Position in the order list decides the rank: earlier entries rank higher
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or  value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                if value.isnumeric():
                    return float(value)
                else:
                    # Non-numeric limit: from now on treat this field as a string field
                    self.settings[field]['convert'] = 'string'
                    return value

        def evaluate_params(self, params, sort_extractor):
            """Resolve the effective field order and limits.

            params is the downloader's parameter dict ('prefer_free_formats',
            'format_sort' and 'format_sort_force' are read); sort_extractor is
            the list of sort items requested by the extractor.
            Raises ExtractorError for an item that does not match `regex`.
            """
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                field = field.lower()
                # First occurrence wins; later mentions of the same field are ignored
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            # Forced fields first, then priority fields (unless the user forces
            # their own order), then user, extractor and finally default items
            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    field = self._get_field_setting(field, 'field')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)

        def print_verbose_info(self, write_debug):
            """Report the user/extractor sort items and the resolved order through write_debug."""
            if self._sort_user:
                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
            if self._sort_extractor:
                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
                '+' if self._get_field_setting(field, 'reverse') else '', field,
                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
                if self._get_field_setting(field, 'limit_text') is not None else '')
                for field in self._order if self._get_field_setting(field, 'visible')]))

        def _calculate_field_preference_from_value(self, format, field, type, value):
            """Turn a field value into a comparable tuple honouring reverse/closest/limit."""
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))

        def _calculate_field_preference(self, format, field):
            """Read the value(s) of `field` from the format dict and return its sort tuple."""
            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
            if type == 'multiple':
                type = 'field'  # Only 'field' is allowed in multiple for now
                actual_fields = self._get_field_setting(field, 'field')

                value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
            else:
                value = get_value(field)
            return self._calculate_field_preference_from_value(format, field, type, value)

        def calculate_preference(self, format):
            """Return the sort key (a tuple with one entry per ordered field) of a format dict.

            Missing 'protocol', 'ext', 'video_ext', 'audio_ext' and bitrate
            entries are filled into the format dict as a side effect.
            """
            # Determine missing protocol
            if not format.get('protocol'):
                format['protocol'] = determine_protocol(format)

            # Determine missing ext
            if not format.get('ext') and 'url' in format:
                format['ext'] = determine_ext(format['url'])
            if format.get('vcodec') == 'none':
                format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
                format['video_ext'] = 'none'
            else:
                format['video_ext'] = format['ext']
                format['audio_ext'] = 'none'
            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
            #    format['preference'] = -1000

            # Determine missing bitrates
            tbr = format.get('tbr')
            if tbr is None:
                if format.get('vbr') is not None and format.get('abr') is not None:
                    format['tbr'] = format['vbr'] + format['abr']
            else:
                # `or 0` (rather than a .get default) also covers keys that are
                # present with an explicit None, which would otherwise raise
                # TypeError in the subtraction
                if format.get('vcodec') != "none" and format.get('vbr') is None:
                    format['vbr'] = tbr - (format.get('abr') or 0)
                if format.get('acodec') != "none" and format.get('abr') is None:
                    format['abr'] = tbr - (format.get('vbr') or 0)

            return tuple(self._calculate_field_preference(format, field) for field in self._order)

    def _sort_formats(self, formats, field_preference=[]):
        if not formats:
            return
        format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
        format_sort.evaluate_params(self._downloader.params, field_preference)
        if self.get_param('verbose', False):
            format_sort.print_verbose_info(self._downloader.write_debug)
        formats.sort(key=lambda f: format_sort.calculate_preference(f))

    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats

    def _is_valid_url(self, url, video_id, item='video', headers={}):
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
            return True
        except ExtractorError as e:
            self.to_screen(
                '%s: %s URL is invalid, skipping: %s'
                % (video_id, item, error_to_compat_str(e.cause)))
            return False

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        if self.get_param('prefer_insecure', False):
            return 'http:'
        return 'https:'

    def _proto_relative_url(self, url, scheme=None):
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal, data=data, headers=headers, query=query)

        if manifest is False:
            return []

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Build format dicts from a parsed F4M manifest element.

        manifest_url is the URL the manifest was loaded from; relative media
        URLs are resolved against the manifest's base URL or, failing that,
        against the directory of manifest_url. Media entries pointing to
        further f4m or m3u8 manifests are extracted recursively.

        Returns a list of format dicts; the list is empty when the manifest is
        not an element (and fatal is false), carries an Akamai player
        verification challenge, or has no usable media nodes.
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        # An empty <pv-2.0/> element has text None, which must not be searched
        akamai_pv_text = akamai_pv.text if akamai_pv is not None else None
        if akamai_pv_text and ';' in akamai_pv_text:
            playerVerificationChallenge = akamai_pv_text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # URL used for this media entry. It is kept separate from
            # manifest_url so that every relative media URL is resolved against
            # the manifest itself and not against the previous entry's URL.
            media_manifest_url = manifest_url
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                media_manifest_url = (
                    media_url if media_url.startswith(('http://', 'https://'))
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(media_manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        media_manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        media_manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': media_manifest_url,
                'manifest_url': media_manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats

    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
        return {
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 100 if preference else -100,
            'quality': quality,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }

    def _extract_m3u8_formats(self, *args, **kwargs):
        fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
        if subs:
            self.report_warning(bug_reports_message(
                "Ignoring subtitle tracks found in the HLS manifest; "
                "if any subtitle tracks are missing,"
            ), only_once=True)
        return fmts

    def _extract_m3u8_formats_and_subtitles(
            self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, note=None,
            errnote=None, fatal=True, live=False, data=None, headers={},
            query={}):

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note='Downloading m3u8 information' if note is None else note,
            errnote='Failed to download m3u8 information' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)

        if res is False:
            return [], {}

        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        return self._parse_m3u8_formats_and_subtitles(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, quality=quality, m3u8_id=m3u8_id,
            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
            headers=headers, query=query, video_id=video_id)

    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        formats, subtitles = [], {}

        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return formats, subtitles

        has_drm = re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)

        def format_url(url):
            return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
                'format_index': idx,
                'url': m3u8_url,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
                    f = {
                        'format_id': '-'.join(map(str, filter(None, format_id))),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles

    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)

    def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)

        if smil is False:
            assert not fatal
            return []

        namespace = self._parse_smil_namespace(smil)

        fmts = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subs = self._parse_smil_subtitles(
            smil, namespace=namespace)

        return fmts, subs

    def _extract_smil_formats(self, *args, **kwargs):
        fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
        if subs:
            self.report_warning(bug_reports_message(
                "Ignoring subtitle tracks found in the SMIL manifest; "
                "if any subtitle tracks are missing,"
            ), only_once=True)
        return fmts

    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        if smil is False:
            return {}
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)

    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """
        Build an info dict out of a parsed SMIL document.

        Note that the 'id' of the result is derived from the basename of
        smil_url; the passed video_id is only handed to the format parser.
        Title, description and upload date come from the first matching
        ./head/meta elements, thumbnails from the image elements having a src.
        """
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        video_id = os.path.splitext(url_basename(smil_url))[0]

        title = description = upload_date = None
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name, content = meta.attrib.get('name'), meta.attrib.get('content')
            if not (name and content):
                continue
            # The first usable value of each kind wins
            if name == 'title':
                title = title or content
            elif name in ('description', 'abstract'):
                description = description or content
            elif name == 'date':
                if not upload_date:
                    upload_date = unified_strdate(content)

        thumbnails = []
        for image in smil.findall(self._xpath_ns('.//image', namespace)):
            if not image.get('src'):
                continue
            thumbnails.append({
                'id': image.get('type'),
                'url': image.get('src'),
                'width': int_or_none(image.get('width')),
                'height': int_or_none(image.get('height')),
            })

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _parse_smil_namespace(self, smil):
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)

    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """
        Parse the formats described by the video and audio elements of a
        SMIL document.

        @param smil                Root element of the parsed SMIL document
        @param smil_url            URL of the manifest; base for relative
                                   sources unless a head/meta element
                                   provides 'base' or 'httpBase'
        @param video_id            Passed on to the nested manifest
                                   extractors and to the URL check
        @param namespace           XML namespace of the document, if any
        @param f4m_params          Query parameters appended to f4m URLs
                                   (a default set is used when empty)
        @param transform_rtmp_url  Optional callable taking (streamer, src)
                                   and returning the pair to use as 'url'
                                   and 'play_path' of RTMP formats

        Returns the list of formats. Sources are deduplicated by their src.
        """
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        # Per-protocol counters, used for format ids when there is no bitrate
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    # A single format: describe it with the data of the SMIL entry
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            # Check the resolved URL, i.e. the one that is actually stored
            # in the format; the raw src may be relative or carry whitespace
            elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        urls = []
        subtitles = {}
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src or src in urls:
                continue
            urls.append(src)
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
        return subtitles

    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        xspf = self._download_xml(
            xspf_url, playlist_id, 'Downloading xpsf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        if xspf is False:
            return []
        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))

    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        """
        Parse the tracks of an XSPF document into a list of entries.

        Every entry gets playlist_id as its 'id'; the title defaults to
        playlist_id as well. The formats of a track come from its location
        elements, resolved against xspf_base_url.
        """
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        def ns(path):
            return xpath_with_ns(path, NS_MAP)

        def parse_track(track):
            title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
            description = xpath_text(track, ns('./xspf:annotation'), 'description')
            thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
            duration_text = xpath_text(track, ns('./xspf:duration'), 'duration')
            duration = float_or_none(duration_text, 1000)

            formats = []
            for location in track.findall(ns('./xspf:location')):
                location_url = urljoin(xspf_base_url, location.text)
                if not location_url:
                    continue
                formats.append({
                    'url': location_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(ns('s1:label')),
                    'width': int_or_none(location.get(ns('s1:width'))),
                    'height': int_or_none(location.get(ns('s1:height'))),
                })
            self._sort_formats(formats)

            return {
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            }

        return [
            parse_track(track)
            for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track'))]

    def _extract_mpd_formats(self, *args, **kwargs):
        fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
        if subs:
            self.report_warning(bug_reports_message(
                "Ignoring subtitle tracks found in the DASH manifest; "
                "if any subtitle tracks are missing,"
            ), only_once=True)
        return fmts

    def _extract_mpd_formats_and_subtitles(
            self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
            fatal=True, data=None, headers={}, query={}):
        res = self._download_xml_handle(
            mpd_url, video_id,
            note='Downloading MPD manifest' if note is None else note,
            errnote='Failed to download MPD manifest' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return [], {}
        mpd_doc, urlh = res
        if mpd_doc is None:
            return [], {}
        mpd_base_url = base_url(urlh.geturl())

        return self._parse_mpd_formats_and_subtitles(
            mpd_doc, mpd_id, mpd_base_url, mpd_url)

    def _parse_mpd_formats(self, *args, **kwargs):
        fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
        if subs:
            self.report_warning(bug_reports_message(
                "Ignoring subtitle tracks found in the DASH manifest; "
                "if any subtitle tracks are missing,"
            ), only_once=True)
        return fmts

    def _parse_mpd_formats_and_subtitles(
            self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        if not self.get_param('dynamic_mpd', True):
            if mpd_doc.get('type') == 'dynamic':
                return [], {}

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements.  We will only extract relevant
            # for us.
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        extract_Initialization(segment_template)
            return ms_info

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        formats, subtitles = [], {}
        stream_numbers = {'audio': 0, 'video': 0}
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
                'start_number': 1,
                'timescale': 1,
            })
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                    codecs = representation_attrib.get('codecs', '')
                    if content_type not in ('video', 'audio', 'text'):
                        if mime_type == 'image/jpeg':
                            content_type = mime_type
                        elif codecs.split('.')[0] == 'stpp':
                            content_type = 'text'
                        else:
                            self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                            continue

                    base_url = ''
                    for element in (representation, adaptation_set, period, mpd_doc):
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if base_url_e is not None:
                            base_url = base_url_e.text + base_url
                            if re.match(r'^https?://', base_url):
                                break
                    if mpd_base_url and base_url.startswith('/'):
                        base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
                    elif mpd_base_url and not re.match(r'^https?://', base_url):
                        if not mpd_base_url.endswith('/'):
                            mpd_base_url += '/'
                        base_url = mpd_base_url + base_url
                    representation_id = representation_attrib.get('id')
                    lang = representation_attrib.get('lang')
                    url_el = representation.find(_add_ns('BaseURL'))
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                    if representation_id is not None:
                        format_id = representation_id
                    else:
                        format_id = content_type
                    if mpd_id:
                        format_id = mpd_id + '-' + format_id
                    if content_type in ('video', 'audio'):
                        f = {
                            'format_id': format_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                            'manifest_stream_number': stream_numbers[content_type]
                        }
                        f.update(parse_codecs(codecs))
                        stream_numbers[content_type] += 1
                    elif content_type == 'text':
                        f = {
                            'ext': mimetype2ext(mime_type),
                            'manifest_url': mpd_url,
                            'filesize': filesize,
                        }
                    elif content_type == 'image/jpeg':
                        # See test case in VikiIE
                        # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                        f = {
                            'format_id': format_id,
                            'ext': 'mhtml',
                            'manifest_url': mpd_url,
                            'format_note': 'DASH storyboards (jpeg)',
                            'acodec': 'none',
                            'vcodec': 'none',
                        }
                    if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                        f['has_drm'] = True
                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                    def prepare_template(template_name, identifiers):
                        tmpl = representation_ms_info[template_name]
                        # First of, % characters outside $...$ templates
                        # must be escaped by doubling for proper processing
                        # by % operator string formatting used further (see
                        # https://github.com/ytdl-org/youtube-dl/issues/16867).
                        t = ''
                        in_template = False
                        for c in tmpl:
                            t += c
                            if c == '$':
                                in_template = not in_template
                            elif c == '%' and not in_template:
                                t += c
                        # Next, $...$ templates are translated to their
                        # %(...) counterparts to be used with % operator
                        if representation_id is not None:
                            t = t.replace('$RepresentationID$', representation_id)
                        t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                        t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                        t.replace('$$', '$')
                        return t

                    # @initialization is a regular template like @media one
                    # so it should be handled just the same way (see
                    # https://github.com/ytdl-org/youtube-dl/issues/11605)
                    if 'initialization' in representation_ms_info:
                        initialization_template = prepare_template(
                            'initialization',
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                            ('Bandwidth', ))
                        representation_ms_info['initialization_url'] = initialization_template % {
                            'Bandwidth': bandwidth,
                        }

                    def location_key(location):
                        return 'url' if re.match(r'^https?://', location) else 'path'

                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                        media_location_key = location_key(media_template)

                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                        # can't be used at the same time
                        if '%(Number' in media_template and 's' not in representation_ms_info:
                            segment_duration = None
                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                            representation_ms_info['fragments'] = [{
                                media_location_key: media_template % {
                                    'Number': segment_number,
                                    'Bandwidth': bandwidth,
                                },
                                'duration': segment_duration,
                            } for segment_number in range(
                                representation_ms_info['start_number'],
                                representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                        else:
                            # $Number*$ or $Time$ in media template with S list available
                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                            representation_ms_info['fragments'] = []
                            segment_time = 0
                            segment_d = None
                            segment_number = representation_ms_info['start_number']

                            def add_segment_url():
                                segment_url = media_template % {
                                    'Time': segment_time,
                                    'Bandwidth': bandwidth,
                                    'Number': segment_number,
                                }
                                representation_ms_info['fragments'].append({
                                    media_location_key: segment_url,
                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                })

                            for num, s in enumerate(representation_ms_info['s']):
                                segment_time = s.get('t') or segment_time
                                segment_d = s['d']
                                add_segment_url()
                                segment_number += 1
                                for r in range(s.get('r', 0)):
                                    segment_time += segment_d
                                    add_segment_url()
                                    segment_number += 1
                                segment_time += segment_d
                    elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                        # No media template
                        # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                        # or any YouTube dashsegments video
                        fragments = []
                        segment_index = 0
                        timescale = representation_ms_info['timescale']
                        for s in representation_ms_info['s']:
                            duration = float_or_none(s['d'], timescale)
                            for r in range(s.get('r', 0) + 1):
                                segment_uri = representation_ms_info['segment_urls'][segment_index]
                                fragments.append({
                                    location_key(segment_uri): segment_uri,
                                    'duration': duration,
                                })
                                segment_index += 1
                        representation_ms_info['fragments'] = fragments
                    elif 'segment_urls' in representation_ms_info:
                        # Segment URLs with no SegmentTimeline
                        # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                        # https://github.com/ytdl-org/youtube-dl/pull/14844
                        fragments = []
                        segment_duration = float_or_none(
                            representation_ms_info['segment_duration'],
                            representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                        for segment_url in representation_ms_info['segment_urls']:
                            fragment = {
                                location_key(segment_url): segment_url,
                            }
                            if segment_duration:
                                fragment['duration'] = segment_duration
                            fragments.append(fragment)
                        representation_ms_info['fragments'] = fragments
                    # If there is a fragments key available then we correctly recognized fragmented media.
                    # Otherwise we will assume unfragmented media with direct access. Technically, such
                    # assumption is not necessarily correct since we may simply have no support for
                    # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                    if 'fragments' in representation_ms_info:
                        f.update({
                            # NB: mpd_url may be empty when MPD manifest is parsed from a string
                            'url': mpd_url or base_url,
                            'fragment_base_url': base_url,
                            'fragments': [],
                            'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                        })
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url']
                            if not f.get('url'):
                                f['url'] = initialization_url
                            f['fragments'].append({location_key(initialization_url): initialization_url})
                        f['fragments'].extend(representation_ms_info['fragments'])
                    else:
                        # Assuming direct URL to unfragmented media.
                        f['url'] = base_url
                    if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
                        formats.append(f)
                    elif content_type == 'text':
                        subtitles.setdefault(lang or 'und', []).append(f)

        return formats, subtitles

    def _extract_ism_formats(self, *args, **kwargs):
        """
        Same as _extract_ism_formats_and_subtitles, but returns the formats
        only. A warning is reported if any subtitles had to be dropped.
        """
        formats, dropped_subtitles = self._extract_ism_formats_and_subtitles(*args, **kwargs)
        if not dropped_subtitles:
            return formats
        self.report_warning(bug_reports_message(
            'Ignoring subtitle tracks found in the ISM manifest; '
            'if any subtitle tracks are missing,'))
        return formats

    def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """
        Download the ISM manifest at ism_url and parse it.

        Returns a (formats, subtitles) pair. Both are empty when
        _download_xml_handle returns False or yields no document.
        """
        if note is None:
            note = 'Downloading ISM manifest'
        if errnote is None:
            errnote = 'Failed to download ISM manifest'
        manifest = self._download_xml_handle(
            ism_url, video_id, note=note, errnote=errnote,
            fatal=fatal, data=data, headers=headers, query=query)
        if manifest is not False:
            ism_doc, urlh = manifest
            if ism_doc is not None:
                # The final URL of the response is what gets passed on as
                # the manifest URL
                return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
        return [], {}

    def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        if ism_doc.get('IsLive') == 'TRUE':
            return [], {}

        duration = int(ism_doc.attrib['Duration'])
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        formats = []
        subtitles = {}
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio', 'text'):
                continue
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            stream_language = stream.get('Language', 'und')
            for track in stream.findall('QualityLevel'):
                fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                    continue
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

                fragments = []
                fragment_ctx = {
                    'time': 0,
                }
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        try:
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                        except IndexError:
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                        fragments.append({
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        })
                        fragment_ctx['time'] += fragment_ctx['duration']

                format_id = []
                if ism_id:
                    format_id.append(ism_id)
                if stream_name:
                    format_id.append(stream_name)
                format_id.append(compat_str(tbr))

                if stream_type == 'text':
                    subtitles.setdefault(stream_language, []).append({
                        'ext': 'ismt',
                        'protocol': 'ism',
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'fragments': fragments,
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                        }
                    })
                elif stream_type in ('video', 'audio'):
                    formats.append({
                        'format_id': '-'.join(format_id),
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'ext': 'ismv' if stream_type == 'video' else 'isma',
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'asr': sampling_rate,
                        'vcodec': 'none' if stream_type == 'audio' else fourcc,
                        'acodec': 'none' if stream_type == 'video' else fourcc,
                        'protocol': 'ism',
                        'fragments': fragments,
                        'has_drm': ism_doc.find('Protection') is not None,
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'width': width or 0,
                            'height': height or 0,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                            'sampling_rate': sampling_rate,
                            'channels': int_or_none(track.get('Channels', 2)),
                            'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                            'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                        },
                    })
        return formats, subtitles

    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries

    def _extract_akamai_formats(self, *args, **kwargs):
        """
        Same as _extract_akamai_formats_and_subtitles, but returns the formats
        only. A warning is reported if any subtitles had to be dropped.
        """
        formats, dropped_subtitles = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
        if not dropped_subtitles:
            return formats
        self.report_warning(bug_reports_message(
            'Ignoring subtitle tracks found in the manifests; '
            'if any subtitle tracks are missing,'))
        return formats

    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
        """
        Extract HDS, HLS and (optionally) plain HTTP formats from an Akamai
        manifest URL.

        manifest_url may be either the f4m or the m3u8 flavour; the other
        one is derived from it. hosts may map 'hds', 'hls' and 'http' to
        host names to be used for the respective protocol. HTTP formats are
        only derived for unsigned URLs when an 'http' host is given.

        Returns a (formats, subtitles) tuple.
        """
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []
        subtitles = {}

        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)
        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            # HTTP URLs can only be derived from a comma separated quality
            # list (.csmil) URL; any other URL simply yields no HTTP formats
            # instead of failing on a missing match
            mobj = re.match(REPL_REGEX, m3u8_url)
            qualities = mobj.group(2).split(',') if mobj else []
            qualities_length = len(qualities)
            if qualities and len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        # Qualities are matched to video formats by position
                        i += 1

        return formats, subtitles

    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """
        Build the formats for every protocol variant derived from url.

        The HLS, HDS and DASH manifests are requested unless 'm3u8', 'f4m' or
        'dash' respectively is listed in skip_protocols. For SMIL URLs the
        RTMP formats from the SMIL manifest and their RTSP twins are added
        (unless 'smil' is skipped), otherwise direct RTMP/RTSP URLs are.
        """
        query = compat_urlparse.urlparse(url).query
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)

        def manifest_url(manifest):
            # The query of the original URL is carried over to each manifest
            m_url = '%s/%s' % (http_base_url, manifest)
            return '%s?%s' % (m_url, query) if query else m_url

        formats = []
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))

        if not re.search(r'(?:/smil:|\.smil)', url_base):
            for protocol in ('rtmp', 'rtsp'):
                if protocol in skip_protocols:
                    continue
                formats.append({
                    'url': '%s:%s' % (protocol, url_base),
                    'format_id': protocol,
                    'protocol': protocol,
                })
            return formats

        if 'smil' in skip_protocols:
            return formats

        rtmp_formats = self._extract_smil_formats(
            manifest_url('jwplayer.smil'),
            video_id, fatal=False)
        for rtmp_format in rtmp_formats:
            # The RTSP twin has the play path folded into its URL
            rtsp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
            rtsp_format = rtmp_format.copy()
            del rtsp_format['play_path']
            del rtsp_format['ext']
            rtsp_format.update({
                'url': rtsp_url.replace('rtmp://', 'rtsp://'),
                'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                'protocol': 'rtsp',
            })
            formats.extend([rtmp_format, rtsp_format])
        return formats

    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        """
        Look for a jwplayer(...).setup(...) call in webpage and return its
        options parsed into a dict.

        Returns None when there is no such call, when _parse_json raises
        ExtractorError for the options or when they do not parse to a dict.
        """
        setup_call = re.search(
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
            webpage)
        if not setup_call:
            return None
        try:
            options = self._parse_json(
                setup_call.group('options'), video_id=video_id,
                transform_source=transform_source)
        except ExtractorError:
            return None
        return options if isinstance(options, dict) else None

    def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
        """
        Find the JWPlayer setup options in webpage and hand them over to
        _parse_jwplayer_data along with video_id and any extra arguments.
        """
        return self._parse_jwplayer_data(
            self._find_jwplayer_data(webpage, video_id, transform_source=js_to_json),
            video_id, *args, **kwargs)

    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)

    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """
        Build a list of format dicts from JWPlayer "sources" entries.

        @param jwplayer_sources_data  Iterable of source entries; entries that
                                      are not dicts, have no usable "file" URL
                                      or repeat an already seen URL are skipped
        @param video_id               Passed on to the manifest extractors
        @param m3u8_id                Passed to _extract_m3u8_formats for HLS sources
        @param mpd_id                 Passed to _extract_mpd_formats for DASH sources
        @param rtmp_params            Optional dict merged into every RTMP format
        @param base_url               Base the "file" of each source is joined against
        @returns                      A list of format dicts (possibly empty)
        """
        # A set gives constant-time duplicate detection; only membership is needed
        seen_urls = set()
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in seen_urls:
                continue
            seen_urls.add(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # maxsplit is passed by keyword: passing it positionally
                    # is deprecated in recent Python versions
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats

    def _live_title(self, name):
        """ Generate the title for a live video """
        now = datetime.datetime.now()
        now_str = now.strftime('%Y-%m-%d %H:%M')
        return name + ' ' + now_str

    def _int(self, v, name, fatal=False, **kwargs):
        """
        Convert v with int_or_none, reporting a failed conversion.

        @param v        The value to convert
        @param name     Description of the value, used in the error message
        @param fatal    When true, raise ExtractorError on failure;
                        otherwise only report a warning
        @param kwargs   Passed through to int_or_none
        @returns        The converted value, or None if it could not be parsed
                        and fatal is false
        """
        res = int_or_none(v, **kwargs)
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            self.report_warning(msg)
        return res

    def _float(self, v, name, fatal=False, **kwargs):
        """
        Convert v with float_or_none, reporting a failed conversion.

        @param v        The value to convert
        @param name     Description of the value, used in the error message
        @param fatal    When true, raise ExtractorError on failure;
                        otherwise only report a warning
        @param kwargs   Passed through to float_or_none
        @returns        The converted value, or None if it could not be parsed
                        and fatal is false
        """
        parsed = float_or_none(v, **kwargs)
        if parsed is not None:
            return parsed
        message = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(message)
        self.report_warning(message)
        return parsed

    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
        """
        Create a cookie and store it in the downloader's cookiejar.

        The domain and path are always marked as specified; the port is marked
        as specified only when one is given. Extra keyword arguments are
        accepted but not used.
        """
        jar_cookie = compat_cookiejar_Cookie(
            version=0,
            name=name,
            value=value,
            port=port,
            port_specified=port is not None,
            domain=domain,
            domain_specified=True,
            domain_initial_dot=domain.startswith('.'),
            path=path,
            path_specified=True,
            secure=secure,
            expires=expire_time,
            discard=discard,
            comment=None,
            comment_url=None,
            rest=rest)
        self._downloader.cookiejar.set_cookie(jar_cookie)

    def _get_cookies(self, url):
        """ Return a compat_cookies_SimpleCookie with the cookies for the url """
        # Let the cookiejar fill in the Cookie header of a throwaway request,
        # then parse that header
        request = sanitized_Request(url)
        jar = self._downloader.cookiejar
        jar.add_cookie_header(request)
        cookie_header = request.get_header('Cookie')
        return compat_cookies_SimpleCookie(cookie_header)

    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than second (new). However, as of RFC6265 the newer one cookie
        should be set into cookie store what actually happens.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            if sys.version_info[0] >= 3:
                cookies = cookies.encode('iso-8859-1')
            cookies = cookies.decode('utf-8')
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                self._set_cookie(domain, cookie, value)
                break

    def get_testcases(self, include_onlymatching=False):
        """
        Yield the test case dicts declared via _TEST or _TESTS.

        Each yielded dict gets its 'name' key set to the class name without
        the trailing 'IE'. Test cases marked 'only_matching' are skipped
        unless include_onlymatching is true.
        """
        ie_class_name = type(self).__name__
        single_test = getattr(self, '_TEST', None)
        if not single_test:
            candidates = getattr(self, '_TESTS', [])
        else:
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % ie_class_name
            candidates = [single_test]
        short_name = ie_class_name[:-len('IE')]
        for testcase in candidates:
            if not include_onlymatching and testcase.get('only_matching', False):
                continue
            testcase['name'] = short_name
            yield testcase

    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """

        saw_restricted = False
        for testcase in self.get_testcases(include_onlymatching=False):
            # For playlist test cases, judge by the first playlist entry
            if testcase.get('playlist', []):
                testcase = testcase['playlist'][0]
            info_dict = testcase.get('info_dict', {})
            if not age_restricted(info_dict.get('age_limit'), age_limit):
                # One unrestricted test case is enough
                return True
            saw_restricted = True
        # No test cases at all also counts as suitable
        return not saw_restricted

    def extract_subtitles(self, *args, **kwargs):
        """ Return the result of _get_subtitles if writing or listing
        subtitles was requested, otherwise an empty dict """
        wanted = (
            self.get_param('writesubtitles', False)
            or self.get_param('listsubtitles'))
        if not wanted:
            return {}
        return self._get_subtitles(*args, **kwargs)

    def _get_subtitles(self, *args, **kwargs):
        """ Hook called by extract_subtitles with the arguments it received.
        This base implementation always raises NotImplementedError. """
        raise NotImplementedError('This method must be implemented by subclasses')

    @staticmethod
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
        return ret

    @classmethod
    def _merge_subtitles(cls, *dicts, target=None):
        """ Merge subtitle dictionaries, language by language. """
        if target is None:
            target = {}
        for d in dicts:
            for lang, subs in d.items():
                target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
        return target

    def extract_automatic_captions(self, *args, **kwargs):
        """ Return the result of _get_automatic_captions if writing automatic
        subtitles or listing subtitles was requested, otherwise an empty dict """
        wanted = (
            self.get_param('writeautomaticsub', False)
            or self.get_param('listsubtitles'))
        if not wanted:
            return {}
        return self._get_automatic_captions(*args, **kwargs)

    def _get_automatic_captions(self, *args, **kwargs):
        """ Hook called by extract_automatic_captions with the arguments it
        received. This base implementation always raises NotImplementedError. """
        raise NotImplementedError('This method must be implemented by subclasses')

    def mark_watched(self, *args, **kwargs):
        """ Call _mark_watched if the 'mark_watched' param is set and either
        a login username or a 'cookiefile' param is available """
        if not self.get_param('mark_watched', False):
            return
        has_username = self._get_login_info()[0] is not None
        if has_username or self.get_param('cookiefile') is not None:
            self._mark_watched(*args, **kwargs)

    def _mark_watched(self, *args, **kwargs):
        """ Hook called by mark_watched with the arguments it received.
        This base implementation always raises NotImplementedError. """
        raise NotImplementedError('This method must be implemented by subclasses')

    def geo_verification_headers(self):
        """ Return a dict with the 'Ytdl-request-proxy' header set to the
        'geo_verification_proxy' param, or an empty dict if it is not set """
        proxy = self.get_param('geo_verification_proxy')
        if not proxy:
            return {}
        return {'Ytdl-request-proxy': proxy}

    def _generic_id(self, url):
        """ Return the unquoted last path segment of url (ignoring trailing
        slashes) with its extension removed """
        last_segment = url.rstrip('/').rsplit('/', 1)[-1]
        stem, _ = os.path.splitext(last_segment)
        return compat_urllib_parse_unquote(stem)

    def _generic_title(self, url):
        """ Return the unquoted url_basename of url with its extension removed """
        basename = url_basename(url)
        stem = os.path.splitext(basename)[0]
        return compat_urllib_parse_unquote(stem)

    @staticmethod
    def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
        all_known = all(map(
            lambda x: x is not None,
            (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
        return (
            'private' if is_private
            else 'premium_only' if needs_premium
            else 'subscriber_only' if needs_subscription
            else 'needs_auth' if needs_auth
            else 'unlisted' if is_unlisted
            else 'public' if all_known
            else None)

    def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
        '''
        Look up the extractor argument "key" of this extractor in the
        'extractor_args' param of the downloader.

        @param key          The name of the extractor argument
        @param default      Returned when the key is not present;
                            an empty list is returned if no default is given
        @param casesense    When false, the values are converted to lower case
        @returns            A list of the values, or the default
        '''
        params = self._downloader.params
        values = traverse_obj(
            params, ('extractor_args', self.ie_key().lower(), key))
        if values is not None:
            if casesense:
                return list(values)
            return [value.lower() for value in values]
        if default is NO_DEFAULT:
            return []
        return default


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.

    Queries have the form _SEARCH_KEY<prefix>:<query>, where the prefix is
    empty (one result), a positive number (that many results) or "all"
    (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching the search queries of this extractor"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        pattern = cls._make_valid_url()
        return re.match(pattern, url) is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results"""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            count = 1
        elif prefix == 'all':
            count = self._MAX_RESULTS
        else:
            count = int(prefix)
            if count <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (count, query))
            if count > self._MAX_RESULTS:
                # Clamp the request to what the service can return
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, count))
                count = self._MAX_RESULTS
        return self._get_n_results(query, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor"""
        return self._SEARCH_KEY
-												[extractor/common] Add coding cookie

											
										
										
											8 years ago
+								# coding: utf-8
-												[common] use float conversion instead of using division from __future__

											
										
										
											9 years ago
+								from __future__ import unicode_literals
-												[extractor/common] Modernize

											
										
										
											10 years ago
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								import base64
-												[muenchentv] Move live title generation to common

											
										
										
											10 years ago
+								import datetime
-												[extractor/common] Limit --write-pages filename to 200 chars

This avoids problems with very long URLs.

											
										
										
											11 years ago
+								import hashlib
-												[khanacademy] Add support (Fixes #2066)

											
										
										
											11 years ago
+								import json
-												[vodlocker] PEP8, generalization, and simplification (#3223)

											
										
										
											10 years ago
+								import netrc
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								import os
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											8 years ago
+								import random
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								import re
 								import sys
-												[vodlocker] PEP8, generalization, and simplification (#3223)

											
										
										
											10 years ago
+								import time
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								import math
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												[util] Move compatibility functions out of util

utils is large enough without these compatibility functions.

Everything that is present in newer versions of Python (i.e. with dev Python it's just an import) goes into compat.py .
Everything else (i.e. youtube-dl-specific helpers) goes into utils.py .

											
										
										
											10 years ago
+								from ..compat import (
-												[extractor/common] Use compat_cookiejar_Cookie for _set_cookie (closes #23256, closes #24776)

To always ensure cookie name and value are bytestrings on python 2.

											
										
										
											5 years ago
+								    compat_cookiejar_Cookie,
-												Update to ytdl-commit-4fb25ff

[maoritv] Add new extractor
https://github.com/ytdl-org/youtube-dl/commit/4fb25ff5a3be5206bb72e5c4046715b1529fb2c7

Except:
[vimeo] improve extraction https://github.com/ytdl-org/youtube-dl/commit/3ae9c0f410b1d4f63e8bada67dd62a8d2852be32
[youtube:tab] Pass innertube context... https://github.com/ytdl-org/youtube-dl/commit/1b0a13f33cfb3644cc718d35951ea85bb1905459

											
										
										
											4 years ago
+								    compat_cookies_SimpleCookie,
-												Use compat_etree_Element

											
										
										
											6 years ago
+								    compat_etree_Element,
-												[jython] Introduce compat_os_name

os.name is always 'java' on Jython

											
										
										
											9 years ago
+								    compat_etree_fromstring,
-												Add option `--netrc-location`
Closes #792, #963

											
										
										
											3 years ago
+								    compat_expanduser,
-												[extractor/common] Interactive TFA code input

											
										
										
											9 years ago
+								    compat_getpass,
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    compat_http_client,
-												[jython] Introduce compat_os_name

os.name is always 'java' on Jython

											
										
										
											9 years ago
+								    compat_os_name,
 								    compat_str,
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    compat_urllib_error,
-												[extractor/common] Add id and title helpers for generic IEs

											
										
										
											8 years ago
+								    compat_urllib_parse_unquote,
-												[compat] Add compat_urllib_parse_urlencode and eliminate encode_dict

encode_dict functionality has been improved and moved directly into compat_urllib_parse_urlencode
All occurrences of compat_urllib_parse.urlencode throughout the codebase have been replaced by compat_urllib_parse_urlencode

Closes #8974

											
										
										
											9 years ago
+								    compat_urllib_parse_urlencode,
-												[extractor/common] Improve _request_webpage

* Do not ignore data, headers and query for Requests
* Default values for headers and query switched to dicts since these are used by urllib itself

											
										
										
											9 years ago
+								    compat_urllib_request,
-												[vevo] Support 1080p videos (Fixes #3656)

											
										
										
											10 years ago
+								    compat_urlparse,
-												[extractor/common] Introduce _parse_xml

											
										
										
											7 years ago
+								    compat_xml_parse_error,
-												[util] Move compatibility functions out of util

utils is large enough without these compatibility functions.

Everything that is present in newer versions of Python (i.e. with dev Python it's just an import) goes into compat.py .
Everything else (i.e. youtube-dl-specific helpers) goes into utils.py .

											
										
										
											10 years ago
+								)
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								from ..downloader import FileDownloader
-												[f4m] Prefer baseURL for relative URLs (closes #14660)

											
										
										
											7 years ago
+								from ..downloader.f4m import (
 								    get_base_url,
 								    remove_encrypted_media,
 								)
-												[util] Move compatibility functions out of util

utils is large enough without these compatibility functions.

Everything that is present in newer versions of Python (i.e. with dev Python it's just an import) goes into compat.py .
Everything else (i.e. youtube-dl-specific helpers) goes into utils.py .

											
										
										
											10 years ago
+								from ..utils import (
-												Respect age_limit when listing extractors (Fixes #4653)

											
										
										
											10 years ago
+								    age_restricted,
-												[utils] Introduce base_url

											
										
										
											8 years ago
+								    base_url,
-												InfoExtractor._search_regex: Suggest updating when the regex is not found (suggested in #5442)

Reuse the same message from ExtractorError

											
										
										
											10 years ago
+								    bug_reports_message,
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    clean_html,
 								    compiled_regex_type,
-												[extractor/common] Recursively extract child f4m manifests

											
										
										
											9 years ago
+								    determine_ext,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											8 years ago
+								    determine_protocol,
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											6 years ago
+								    dict_get,
-												Rename error_to_str to error_to_compat_str

											
										
										
											9 years ago
+								    error_to_compat_str,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											8 years ago
+								    extract_attributes,
-												[cleanup] Misc

											
										
										
											3 years ago
+								    ExtractorError,
-												[extractor/common] Handle malformed f4m manifests

											
										
										
											9 years ago
+								    fix_xml_ampersands,
-												[golem] Simplify (#3828)

											
										
										
											10 years ago
+								    float_or_none,
-												[cleanup] Misc

											
										
										
											3 years ago
+								    format_field,
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											8 years ago
+								    GeoRestrictedError,
 								    GeoUtils,
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											10 years ago
+								    int_or_none,
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											8 years ago
+								    js_to_json,
-												[utils] Share JSON-LD regex

											
										
										
											6 years ago
+								    JSON_LD_RE,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											8 years ago
+								    mimetype2ext,
-												[utils] Add `network_exceptions`

											
										
										
											4 years ago
+								    network_exceptions,
-												[cleanup] Misc

											
										
										
											3 years ago
+								    NO_DEFAULT,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											8 years ago
+								    orderedSet,
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											6 years ago
+								    parse_bitrate,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											8 years ago
+								    parse_codecs,
 								    parse_duration,
-												[extractor/common] Add initial support for JSON-LD metadata extraction into info_dict

											
										
										
											9 years ago
+								    parse_iso8601,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											8 years ago
+								    parse_m3u8_attributes,
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											6 years ago
+								    parse_resolution,
-												[vimeo] Fix pro videos and player.vimeo.com urls

The old process can still be used for those videos.
Added RegexNotFoundError, which is raised by _search_regex if it can't extract the info.

											
										
										
											11 years ago
+								    RegexNotFoundError,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											8 years ago
+								    sanitize_filename,
-												[cleanup] Misc

											
										
										
											3 years ago
+								    sanitized_Request,
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											6 years ago
+								    str_or_none,
-												[extractor/common] Relax interaction count extraction in _json_ld

											
										
										
											4 years ago
+								    str_to_int,
-												[extractor/common] Strip src attribute for HTML5 entries code (closes #18485, closes #21169)

											
										
										
											6 years ago
+								    strip_or_none,
-												Add `--extractor-args` to pass extractor-specific arguments

											
										
										
											3 years ago
+								    traverse_obj,
-												Use unescapeHTML for OpenGraph properties

These are attribute values, so we don't need the more complex and whitespace-destroying cleanHTML - we just need to unescape quotes, that's it.

											
										
										
											11 years ago
+								    unescapeHTML,
-												[extractor/common] Extract upload date from SMIL

											
										
										
											9 years ago
+								    unified_strdate,
-												[extractor/common] Extract more metadata for VideoObject in _json_ld

											
										
										
											8 years ago
+								    unified_timestamp,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											8 years ago
+								    update_Request,
 								    update_url_query,
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								    url_basename,
-												[extractor/common] Add validation for JSON-LD URLs

											
										
										
											6 years ago
+								    url_or_none,
-												[cleanup] Misc

											
										
										
											3 years ago
+								    urljoin,
-												[utils] Add `variadic`

											
										
										
											3 years ago
+								    variadic,
-												[common] Fix <bootstrapInfo> detection in F4M manifests

Regression since 0a5685b26fae0940f14cb063a6e4fc6986f9c124

											
										
										
											9 years ago
+								    xpath_element,
-												[extractor/generic] Add generic support for xspf playist extraction

											
										
										
											9 years ago
+								    xpath_text,
 								    xpath_with_ns,
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								)
-												[extractor/common] Use NO_DEFAULT from utils

											
										
										
											9 years ago
-												Fix generic class move (add all files)

											
										
										
											12 years ago
 								class InfoExtractor(object):
 								    """Information Extractor class.
 								    Information extractors are the classes that, given a URL, extract
 								    information about the video (or videos) the URL refers to. This
 								    information includes the real video URL, the video title, author and
 								    others. The information is stored in a dictionary which is then
-												[extractor/common] Update docstring: replace FileDownloader with YoutubeDL

											
										
										
											10 years ago
+								    passed to the YoutubeDL. The YoutubeDL processes this
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    information possibly downloading the video to the file system, among
 								    other possible outcomes.
-												Typo: twice "the the" to "the"
											
										
										
											10 years ago
+								    The type field determines the type of the result.
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											10 years ago
+								    By far the most common value (and the default if _type is missing) is
 								    "video", which indicates a single video.
 								    For a video, the dictionaries must include the following fields:
-												Fix generic class move (add all files)

											
										
										
											12 years ago
 								    id:             Video identifier.
 								    title:          Video title, unescaped.
-												Reorder info_dict documentation

											
										
										
											11 years ago
-												Add a resolution field and improve general --list-formats output

											
										
										
											11 years ago
+								    Additionally, it must contain either a formats entry or a url one:
-												Reorder info_dict documentation

											
										
										
											11 years ago
-												Add a resolution field and improve general --list-formats output

											
										
										
											11 years ago
+								    formats:        A list of dictionaries for each format available, ordered
 								                    from worst to best quality.
 								                    Potential fields:
-												[extractor/common] Clarify url and manifest_url meta fields

											
										
										
											6 years ago
+								                    * url        The mandatory URL representing the media:
 								                                   for plain file media - HTTP URL of this file,
 								                                   for RTMP - RTMP URL,
 								                                   for HLS - URL of the M3U8 media playlist,
 								                                   for HDS - URL of the F4M manifest,
-												[extractor/common] Fix url meta field for unfragmented DASH formats (closes #20346)

											
										
										
											6 years ago
+								                                   for DASH
 								                                     - HTTP URL to plain file media (in case of
 								                                       unfragmented media)
 								                                     - URL of the MPD manifest or base URL
 								                                       representing the media if MPD manifest
-												[extractor/common] Fix typo

											
										
										
											6 years ago
+								                                       is parsed from a string (in case of
-												[extractor/common] Fix url meta field for unfragmented DASH formats (closes #20346)

											
										
										
											6 years ago
+								                                       fragmented media)
-												[extractor/common] Clarify url and manifest_url meta fields

											
										
										
											6 years ago
+								                                   for MSS - URL of the ISM manifest.
-												Refactor fragments interface and dash segments downloader
- Eliminate segment_urls and initialization_url
+ Introduce manifest_url (manifest may contain unfragmented data in this case url will be used for direct media URL and manifest_url for manifest itself correspondingly)
* Rewrite dashsegments downloader to use fragments data
* Improve generic mpd extraction

											
										
										
											8 years ago
+								                    * manifest_url
 								                                 The URL of the manifest file in case of
-												[extractor/common] Clarify url and manifest_url meta fields

											
										
										
											6 years ago
+								                                 fragmented media:
 								                                   for HLS - URL of the M3U8 master playlist,
 								                                   for HDS - URL of the F4M manifest,
 								                                   for DASH - URL of the MPD manifest,
 								                                   for MSS - URL of the ISM manifest.
-												[extractor/common] Consistent URL spelling

											
										
										
											9 years ago
+								                    * ext        Will be calculated from URL if missing
-												Reorder info_dict documentation

											
										
										
											11 years ago
+								                    * format     A human-readable description of the format
 								                                 ("mp4 container with h264/opus").
 								                                 Calculated from the format_id, width, height,
 								                                 and format_note fields if missing.
 								                    * format_id  A short description of the format
-												Document that format_id field should be present

											
										
										
											11 years ago
+								                                 ("mp4_h264_opus" or "19").
 								                                Technically optional, but strongly recommended.
-												Reorder info_dict documentation

											
										
										
											11 years ago
+								                    * format_note Additional info about the format
 								                                 ("3D" or "DASH video")
 								                    * width      Width of the video, if known
 								                    * height     Height of the video, if known
-												Add a resolution field and improve general --list-formats output

											
										
										
											11 years ago
+								                    * resolution Textual description of width and height
-												[yahoo] Use centralized sorting, and add tbr field

											
										
										
											11 years ago
+								                    * tbr        Average bitrate of audio and video in KBit/s
-												Reorder info_dict documentation

											
										
										
											11 years ago
+								                    * abr        Average audio bitrate in KBit/s
 								                    * acodec     Name of the audio codec in use
-												[youtube] Download DASH manifest

If given, download and parse the DASH manifest file, in order to get ultra-HQ formats.
Fixes #2166

											
										
										
											11 years ago
+								                    * asr        Audio sampling rate in Hertz
-												Reorder info_dict documentation

											
										
										
											11 years ago
+								                    * vbr        Average video bitrate in KBit/s
-												[youtube] Add formats 298, 299 (Fixes #4056)

											
										
										
											10 years ago
+								                    * fps        Frame rate
-												Reorder info_dict documentation

											
										
										
											11 years ago
+								                    * vcodec     Name of the video codec in use
-												[youtube] Add new formats (Fixes #2221)

											
										
										
											11 years ago
+								                    * container  Name of the container format
-												Reorder info_dict documentation

											
										
										
											11 years ago
+								                    * filesize   The number of bytes, if known in advance
-												[snotr] PEP8 and minor fixes (#3296)

											
										
										
											10 years ago
+								                    * filesize_approx  An estimate for the number of bytes
-												Reorder info_dict documentation

											
										
										
											11 years ago
+								                    * player_url SWF Player URL (used for rtmpdump).
-												[zdf] Use centralized sorting

											
										
										
											11 years ago
+								                    * protocol   The protocol that will be used for the actual
 								                                 download, lower-case.
-												[whowatch] Add extractor #292

closes #223

Authored by: nao20010128nao 
Modified from: https://github.com/nao20010128nao/ytdl-patched/blob/9e4a0e061a558cdb05a618e27f47ca0ac56ece94/youtube_dl/extractor/whowatch.py
											
										
										
											4 years ago
+								                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
-												[common] Document protocol http_dash_segments

											
										
										
											9 years ago
+								                                 "m3u8", "m3u8_native" or "http_dash_segments".
-												[extractor/common] Document forgotten fragment base and path interfaces

											
										
										
											8 years ago
+								                    * fragment_base_url
 								                                 Base URL for fragments. Each fragment's path
 								                                 value (if present) will be relative to
 								                                 this URL.
 								                    * fragments  A list of fragments of a fragmented media.
 								                                 Each fragment entry must contain either a URL
 								                                 or a path. If a URL is present it should be
 								                                 considered by a client. Otherwise both path and
 								                                 fragment_base_url must be present. Here is
 								                                 the list of all potential fields:
 								                                 * "url" - fragment's URL
 								                                 * "path" - fragment's path relative to
 								                                            fragment_base_url
-												[extractor/common] Introduce fragments interface

											
										
										
											8 years ago
+								                                 * "duration" (optional, int or float)
 								                                 * "filesize" (optional, int)
-												Add a resolution field and improve general --list-formats output

											
										
										
											11 years ago
+								                    * preference Order number of this format. If this field is
-												[wistia] Prefer original video format above all others

We could also set up a formula which would weigh filesize/bitrate and vcodec/acodec (say, 1GB h264 < 3 GB MPEG2 < 2 GB h264), but that would get really messy real soon.

											
										
										
											11 years ago
+								                                 present and not None, the formats get sorted
-												[extractor/common] Clarify preference key in formats

											
										
										
											11 years ago
+								                                 by this field, regardless of all other values.
-												Add a resolution field and improve general --list-formats output

											
										
										
											11 years ago
+								                                 -1 for default (order by other properties),
 								                                 -2 or smaller for less than default.
-												[youtube] Correct handling when DASH manifest is not necessary to find all formats

											
										
										
											10 years ago
+								                                 < -1000 to hide the format (if there is
 								                                    another one which is strictly better)
-												[ccc] Add language information to formats

											
										
										
											9 years ago
+								                    * language   Language code, e.g. "de" or "en-US".
 								                    * language_preference  Is this in the language mentioned in
 								                                 the URL?
-												[arte] Clean up format sorting mess

We now use our standard sorting facilities. As a side effect, it's finally possible to download German videos from French URLs and vice versa.

											
										
										
											10 years ago
+								                                 10 if it's what the URL is about,
 								                                 -1 for default (don't know),
 								                                 -10 otherwise, other values reserved for now.
-												[orf] Use new extraction method (Fixes #2057)

											
										
										
											11 years ago
+								                    * quality    Order number of the video quality of this
 								                                 format, irrespective of the file format.
 								                                 -1 for default (order by other properties),
 								                                 -2 or smaller for less than default.
-												[viddler] Use API

											
										
										
											10 years ago
+								                    * source_preference  Order number for this video source
 								                                  (quality takes higher priority)
 								                                 -1 for default (order by other properties),
 								                                 -2 or smaller for less than default.
-												[grooveshark,http] Make HTTP POST downloads work

											
										
										
											10 years ago
+								                    * http_headers  A dictionary of additional HTTP headers
 								                                 to add to the request.
-												[youtube|ffmpeg] Automatically correct video with non-square pixels (Fixes #4674)

											
										
										
											10 years ago
+								                    * stretched_ratio  If given and not 1, indicates that the
-												[rtl2] PEP8, simplify, make rtmp tests run (#470)

											
										
										
											10 years ago
+								                                 video's pixels are not square.
 								                                 width : height ratio as float.
 								                    * no_resume  The server does not support resuming the
 								                                 (HTTP or RTMP) download. Boolean.
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											3 years ago
+								                    * has_drm    The format has DRM and cannot be downloaded. Boolean
-												[downloader/http] Add ability to pass downloader options via info dict

											
										
										
											7 years ago
+								                    * downloader_options  A dictionary of downloader options as
 								                                 described in FileDownloader
-												Release 2021.06.08

											
										
										
											3 years ago
+								                    RTMP formats can also have the additional fields: page_url,
 								                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 								                    rtmp_protocol, rtmp_real_time
-												[rtl2] PEP8, simplify, make rtmp tests run (#470)

											
										
										
											10 years ago
-												Document duration field

											
										
										
											11 years ago
+								    url:            Final video URL.
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    ext:            Video filename extension.
-												Reorder info_dict documentation

											
										
										
											11 years ago
+								    format:         The video format, defaults to ext (used for --get-format)
 								    player_url:     SWF Player URL (used for rtmpdump).
-												Clarify that url and ext are optional when formats is given (#980)

											
										
										
											11 years ago
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    The following fields are optional:
-												[vine] Provide alt_title (Fixes #4448)

											
										
										
											10 years ago
+								    alt_title:      A secondary title of the video.
-												Add display_id field

											
										
										
											11 years ago
+								    display_id:     An alternative identifier for the video, not necessarily
 								                    unique, but available before title. Typically, id is
 								                    something like "4234987", title "Dancing naked mole rats",
 								                    and display_id "dancing-naked-mole-rats"
-												[spiegeltv] Simplify and PEP8

											
										
										
											11 years ago
+								    thumbnails:     A list of dictionaries, with the following entries:
-												Add --list-thumbnails

											
										
										
											10 years ago
+								                        * "id" (optional, string) - Thumbnail format ID
-												[spiegeltv] Simplify and PEP8

											
										
										
											11 years ago
+								                        * "url"
-												Add --list-thumbnails

											
										
										
											10 years ago
+								                        * "preference" (optional, int) - quality of the image
-												[spiegeltv] Simplify and PEP8

											
										
										
											11 years ago
+								                        * "width" (optional, int)
 								                        * "height" (optional, int)
-												[extractor/common] Fix typo in thumbnails resolution description (#21817)


											
										
										
											5 years ago
+								                        * "resolution" (optional, string "{width}x{height}",
-												[spiegeltv] Simplify and PEP8

											
										
										
											11 years ago
+								                                        deprecated)
-												[extractor/common] Introduce filesize metafield for thumbnails

											
										
										
											8 years ago
+								                        * "filesize" (optional, int)
-												[youtube] Extract more thumbnails

* The thumbnail URLs are hard-coded and their actual existence is tested lazily
* Added option `--no-check-formats` to not test them

Closes #340, Related: #402, #337, https://github.com/ytdl-org/youtube-dl/issues/29049

											
										
										
											3 years ago
+								                        * "_test_url" (optional, bool) - If true, test the URL
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    thumbnail:      Full URL to a video thumbnail image.
-												[vine] Provide alt_title (Fixes #4448)

											
										
										
											10 years ago
+								    description:    Full video description.
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    uploader:       Full name of the video uploader.
-												[extractor/common] Document license metafield

											
										
										
											9 years ago
+								    license:        License name the video is licensed under.
-												[extractor/common] Relax wording for creator metafield

											
										
										
											9 years ago
+								    creator:        The creator of the video.
-												Update to ytdl-commit-3be0980
https://github.com/ytdl-org/youtube-dl/commit/3be098010f667b14075e3dfad1e74e5e2becc8ea

											
										
										
											4 years ago
+								    release_timestamp: UNIX timestamp of the moment the video was released.
-												[extractor/common] Document release_date field

											
										
										
											9 years ago
+								    release_date:   The date (YYYYMMDD) when the video was released.
-												Update to ytdl-commit-3be0980
https://github.com/ytdl-org/youtube-dl/commit/3be098010f667b14075e3dfad1e74e5e2becc8ea

											
										
										
											4 years ago
+								    timestamp:      UNIX timestamp of the moment the video was uploaded
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    upload_date:    Video upload date (YYYYMMDD).
-												Rename upload_timestamp to timestamp

											
										
										
											11 years ago
+								                    If not explicitly set, calculated from timestamp.
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    uploader_id:    Nickname or id of the video uploader.
-												[extractor/common] Document uploader_url

											
										
										
											9 years ago
+								    uploader_url:   Full URL to a personal webpage of the video uploader.
-												[extractor/common] Introduce channel meta fields

											
										
										
											6 years ago
+								    channel:        Full name of the channel the video is uploaded on.
-												[extractor/common] Fix typos

											
										
										
											6 years ago
+								                    Note that channel fields may or may not repeat uploader
-												[extractor/common] Introduce channel meta fields

											
										
										
											6 years ago
+								                    fields. This depends on a particular extractor.
 								    channel_id:     Id of the channel.
 								    channel_url:    Full URL to a channel webpage.
-												[muscivault] Add extractor (Fixes #3593)

											
										
										
											10 years ago
+								    location:       Physical location where the video was filmed.
-												Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.

											
										
										
											10 years ago
+								    subtitles:      The available subtitles as a dictionary in the format
-												[extractor/common] Allow non-lang in subtitles' keys

See 264e77c406a3b14f15aafcd036524cb6fe86aa20

											
										
										
											8 years ago
+								                    {tag: subformats}. "tag" is usually a language code, and
 								                    "subformats" is a list sorted from lower to higher
 								                    preference, each element is a dictionary with the "ext"
 								                    entry and one of:
-												Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.

											
										
										
											10 years ago
+								                        * "data": The subtitles file contents
-												[extractor/common] Consistent URL spelling

											
										
										
											9 years ago
+								                        * "url": A URL pointing to the subtitles file
-												Add field `name` for subtitles

Co-authored by: pukkandan, tpikonen

Based on: #310, https://github.com/ytdl-org/youtube-dl/pull/26112

											
										
										
											4 years ago
+								                    It can optionally also have:
 								                        * "name": Name or description of the subtitles
-												[YoutubeDL] Autocalculate ext for subtitles when missing

											
										
										
											9 years ago
+								                    "ext" will be calculated from URL if missing
-												[documentation] Add deprecated options and aliases in readme

											
										
										
											4 years ago
+								    automatic_captions: Like 'subtitles'; contains automatically generated
 								                    captions instead of normal subtitles
-												[extractor/common] Clarify duration can be float

											
										
										
											9 years ago
+								    duration:       Length of the video in seconds, as an integer or float.
-												Document view_count (Closes #963)

											
										
										
											12 years ago
+								    view_count:     How many users have watched the video on the platform.
-												[9gag] Like/dislike count (#1895)

											
										
										
											11 years ago
+								    like_count:     Number of positive ratings of the video
 								    dislike_count:  Number of negative ratings of the video
-												[extractor/common] Document repost_count

											
										
										
											9 years ago
+								    repost_count:   Number of reposts of the video
-												[youtube] Extract average rating (closes #2362)

											
										
										
											10 years ago
+								    average_rating: Average rating given by users, the scale used depends on the webpage
-												[9gag] Like/dislike count (#1895)

											
										
										
											11 years ago
+								    comment_count:  Number of comments on the video
-												[netzkino] Add new extractor (Fixes #4669)

											
										
										
											10 years ago
+								    comments:       A list of comments, each with one or more of the following
 								                    properties (all but one of text or html optional):
 								                        * "author" - human-readable name of the comment author
 								                        * "author_id" - user ID of the comment author
-												[Youtube] Rewrite comment extraction (#167)

Closes #121

TODO:
* Add an option for the user to specify newest/popular and max number of comments
* Refactor the download code and generalize with TabIE
* Parse time_text to timestamp

											
										
										
											4 years ago
+								                        * "author_thumbnail" - The thumbnail of the comment author
-												[netzkino] Add new extractor (Fixes #4669)

											
										
										
											10 years ago
+								                        * "id" - Comment ID
 								                        * "html" - Comment as HTML
 								                        * "text" - Plain text of the comment
 								                        * "timestamp" - UNIX timestamp of comment
 								                        * "parent" - ID of the comment this one is replying to.
 								                                     Set to "root" to indicate that this is a
 								                                     comment to the original video.
-												[Youtube] Rewrite comment extraction (#167)

Closes #121

TODO:
* Add an option for the user to specify newest/popular and max number of comments
* Refactor the download code and generalize with TabIE
* Parse time_text to timestamp

											
										
										
											4 years ago
+								                        * "like_count" - Number of positive ratings of the comment
 								                        * "dislike_count" - Number of negative ratings of the comment
 								                        * "is_favorited" - Whether the comment is marked as
 								                                           favorite by the video uploader
 								                        * "author_is_uploader" - Whether the comment is made by
 								                                                 the video uploader
-												Allow users to specify an age limit (fixes #1545)

With these changes, users can now restrict what videos are downloaded by the intented audience, by specifying their age with --age-limit YEARS .
Add rudimentary support in youtube, pornotube, and youporn.

											
										
										
											11 years ago
+								    age_limit:      Age restriction for the video, as an integer (years)
-												Completely change project name to yt-dlp (#85)

* All modules and binary names are changed
* All documentation references changed
* yt-dlp no longer loads youtube-dlc config files
* All URLs changed to point to organization account

Co-authored-by: Pccode66
Co-authored-by: pukkandan
											
										
										
											4 years ago
+								    webpage_url:    The URL to the video webpage, if given to yt-dlp it
-												Add the 'webpage_url' field to info_dict

The url for the video page, it must allow to reproduce the result.
It's automatically set by YoutubeDL if it's missing.

											
										
										
											11 years ago
+								                    should allow getting the same result again. (It will be set
 								                    by YoutubeDL if it's missing)
-												Document and test categories (#2923)

											
										
										
											11 years ago
+								    categories:     A list of categories that the video falls in, for example
 								                    ["Sports", "Berlin"]
-												[extractor/common] Add _meta_regex and clarify tags field

											
										
										
											9 years ago
+								    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
-												[pornhub] Extract `cast`
Closes #406, https://github.com/ytdl-org/youtube-dl/pull/27384

											
										
										
											3 years ago
+								    cast:           A list of the video cast
-												[muenchentv] Add support (Fixes #3507)

											
										
										
											10 years ago
+								    is_live:        True, False, or None (=unknown). Whether this video is a
 								                    live stream that goes on instead of a fixed-length video.
-												[youtube] Show if video was a live stream in info

											
										
										
											4 years ago
+								    was_live:       True, False, or None (=unknown). Whether this video was
 								                    originally a live stream.
-												[RCTIPlus] Support events and TV (#625)

Authored by: MinePlayersPE
											
										
										
											3 years ago
+								    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
-												Add field `live_status`

											
										
										
											3 years ago
+								                    If absent, automatically set from is_live, was_live
-												[youtube] Extract start_time

From the 't=*' in the url.
Currently youtube-dl doesn't use the value, but it was requested for the mpv plugin.

											
										
										
											9 years ago
+								    start_time:     Time in seconds where the reproduction should start, as
-												[extractor/common] Consistent URL spelling

											
										
										
											9 years ago
+								                    specified in the URL.
-												[youtube] Extract end_time

											
										
										
											9 years ago
+								    end_time:       Time in seconds where the reproduction should end, as
-												[extractor/common] Consistent URL spelling

											
										
										
											9 years ago
+								                    specified in the URL.
-												[common] introduce chapters field

											
										
										
											9 years ago
+								    chapters:       A list of dictionaries, with the following entries:
 								                        * "start_time" - The start time of the chapter in seconds
 								                        * "end_time" - The end time of the chapter in seconds
 								                        * "title" (optional, string)
-												[documentaion] Document `playable_in_embed`

:ci skip all

											
										
										
											4 years ago
+								    playable_in_embed: Whether this video is allowed to play in embedded
 								                    players on other sites. Can be True (=always allowed),
 								                    False (=never allowed), None (=unknown), or a string
-												[youtube] Show if video is `private`, `unlisted` etc in new field `availability` (#188)
Closes: #185, https://github.com/ytdl-org/youtube-dl/issues/25631

Authored by: colethedj, pukkandan

											
										
										
											4 years ago
+								                    specifying the criteria for embedability (Eg: 'whitelist')
 								    availability:   Under what condition the video is available. One of
 								                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
 								                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
 								                    to set it
-												Extract comments only when needed #95 (Closes #94)


											
										
										
											4 years ago
+								    __post_extractor: A function to be called just before the metadata is
 								                    written to either disk, logger or console. The function
 								                    must return a dict which will be added to the info_dict.
 								                    This is useful for additional information that is
 								                    time-consuming to extract. Note that the fields thus
 								                    extracted will not be available to output template and
 								                    match_filter. So, only "comments" and "comment_count" are
 								                    currently allowed to be extracted via this method.
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												[extractor/common] Document chapter and series fields

											
										
										
											9 years ago
+								    The following fields should only be used when the video belongs to some logical
 								    chapter or section:
 								    chapter:        Name or title of the chapter the video belongs to.
-												[extractor/common] Introduce number fields for chapters and series

											
										
										
											9 years ago
+								    chapter_number: Number of the chapter the video belongs to, as an integer.
 								    chapter_id:     Id of the chapter the video belongs to, as a unicode string.
-												[extractor/common] Document chapter and series fields

											
										
										
											9 years ago
 								    The following fields should only be used when the video is an episode of some
-												[extractor/common] Mention podcast in series fields section

											
										
										
											8 years ago
+								    series, programme or podcast:
-												[extractor/common] Document chapter and series fields

											
										
										
											9 years ago
 								    series:         Title of the series or programme the video episode belongs to.
 								    season:         Title of the season the video episode belongs to.
-												[extractor/common] Introduce number fields for chapters and series

											
										
										
											9 years ago
+								    season_number:  Number of the season the video episode belongs to, as an integer.
 								    season_id:      Id of the season the video episode belongs to, as a unicode string.
-												[extractor/common] Document chapter and series fields

											
										
										
											9 years ago
+								    episode:        Title of the video episode. Unlike mandatory video title field,
 								                    this field should denote the exact title of the video episode
 								                    without any kind of decoration.
-												[extractor/common] Introduce number fields for chapters and series

											
										
										
											9 years ago
+								    episode_number: Number of the video episode within a season, as an integer.
 								    episode_id:     Id of the video episode, as a unicode string.
-												[extractor/common] Document chapter and series fields

											
										
										
											9 years ago
-												[extractor/common] Introduce music album metafields

											
										
										
											9 years ago
+								    The following fields should only be used when the media is a track or a part of
 								    a music album:
 								    track:          Title of the track.
 								    track_number:   Number of the track within an album or a disc, as an integer.
 								    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 								                    as a unicode string.
 								    artist:         Artist(s) of the track.
 								    genre:          Genre(s) of the track.
 								    album:          Title of the album the track belongs to.
 								    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 								    album_artist:   List of all artists that appeared on the album (e.g.
 								                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 								                    and compilations).
 								    disc_number:    Number of the disc or other physical medium the track belongs to,
 								                    as an integer.
 								    release_year:   Year (YYYY) when the album was released.
-												Document formats (for #980)

											
										
										
											11 years ago
+								    Unless mentioned otherwise, the fields should be Unicode strings.
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												[utils] Default age_limit to None

If we can't parse it, it means we don't have any information, not that the content is unrestricted.

											
										
										
											10 years ago
+								    Unless mentioned otherwise, None is equivalent to absence of information.
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											10 years ago
 								    _type "playlist" indicates multiple videos.
-												Allow iterators for playlist result entries

											
										
										
											10 years ago
+								    There must be a key "entries", which is a list, an iterable, or a PagedList
 								    object, each element of which is a valid dictionary by this specification.
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											10 years ago
-												[youtube] More metadata extraction for channels/playlists

											
										
										
											4 years ago
+								    Additionally, playlists can have "id", "title", and any other relevant
 								    attributes with the same semantics as videos (see above).
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											10 years ago
 								    _type "multi_video" indicates that there are multiple videos that
 								    form a single show, for example multiple acts of an opera or TV episode.
 								    It must have an entries key like a playlist and contain all the keys
 								    required for a video at the same time.
 								    _type "url" indicates that the video must be extracted from another
 								    location, possibly by a different extractor. Its only required key is:
 								    "url" - the next URL to extract.
-												[extractor/common] Document ie_key in url results

											
										
										
											10 years ago
+								    The key "ie_key" can be set to the class name (minus the trailing "IE",
 								    e.g. "Youtube") if the extractor class is known in advance.
 								    Additionally, the dictionary may have any properties of the resolved entity
 								    known in advance, for example "title" if the title of the referred video is
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											10 years ago
+								    known ahead of time.
 								    _type "url_transparent" entities have the same specification as "url", but
 								    indicate that the given additional information is more precise than the one
 								    associated with the resolved URL.
 								    This is useful when a site employs a video service that hosts the video and
 								    its technical metadata, but that video service does not embed a useful
 								    title, description etc.
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    Subclasses of this one should re-define the _real_initialize() and
 								    _real_extract() methods and define a _VALID_URL regexp.
 								    Probably, they should also be added to the list of extractors.
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											8 years ago
+								    _GEO_BYPASS attribute may be set to False in order to disable
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											8 years ago
+								    geo restriction bypass mechanisms for a particular extractor.
 								    Though it won't disable explicit geo restriction bypass based on
-												Remove experimental mark for some options

											
										
										
											7 years ago
+								    country code provided with geo_bypass_country.
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											8 years ago
 								    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 								    countries for this extractor. One of these countries will be used by
 								    geo restriction bypass mechanism right away in order to bypass
-												Remove experimental mark for some options

											
										
										
											7 years ago
+								    geo restriction, of course, if the mechanism is not disabled.
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											8 years ago
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
+								    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 								    IP blocks in CIDR notation for this extractor. One of these IP blocks
 								    will be used by geo restriction bypass mechanism similarly
-												Remove experimental mark for some options

											
										
										
											7 years ago
+								    to _GEO_COUNTRIES.
-												[extractor/common] Emphasize geo bypass APIs are experimental

											
										
										
											8 years ago
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    Finally, the _WORKING attribute should be set to False for broken IEs
 								    in order to warn the users and skip the tests.
 								    """
 								    _ready = False
 								    _downloader = None
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											8 years ago
+								    _x_forwarded_for_ip = None
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											8 years ago
+								    _GEO_BYPASS = True
 								    _GEO_COUNTRIES = None
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
+								    _GEO_IP_BLOCKS = None
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    _WORKING = True
-												[youtube] Better message when login required

											
										
										
											4 years ago
+								    _LOGIN_HINTS = {
 								        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
 								        'cookies': (
 								            'Use --cookies for the authentication. '
 								            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to pass cookies'),
 								        'password': 'Use --username and --password or --netrc to provide account credentials',
 								    }
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    def __init__(self, downloader=None):
 								        """Constructor. Receives an optional downloader."""
 								        self._ready = False
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											8 years ago
+								        self._x_forwarded_for_ip = None
-												[extractor] Reset non-repeating warnings per video

											
										
										
											3 years ago
+								        self._printed_messages = set()
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        self.set_downloader(downloader)
 								    @classmethod
-												[extractor] Common function `_match_valid_url`

											
										
										
											3 years ago
+								    def _match_valid_url(cls, url):
-												Cache suitable regular expressions

This speeds up TestAllURLsMatching.test_no_duplicates by about 8000% at the cost of minimal memory overhead.

											
										
										
											11 years ago
+								        # This does not use has/getattr intentionally - we want to know whether
 								        # we have cached the regexp for *this* class, whereas getattr would also
 								        # match the superclass
 								        if '_VALID_URL_RE' not in cls.__dict__:
 								            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-												[extractor] Common function `_match_valid_url`

											
										
										
											3 years ago
+								        return cls._VALID_URL_RE.match(url)
 								    @classmethod
 								    def suitable(cls, url):
 								        """Receives a URL and returns True if suitable for this IE."""
-												[lazy_extractors] Fix `suitable` and add flake8 test

											
										
										
											3 years ago
+								        # This function must import everything it needs (except other extractors),
 								        # so that lazy_extractors works correctly
-												[extractor] Common function `_match_valid_url`

											
										
										
											3 years ago
+								        return cls._match_valid_url(url) is not None
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												[common] Add new helper function _match_id

											
										
										
											10 years ago
+								    @classmethod
 								    def _match_id(cls, url):
-												[extractor] Common function `_match_valid_url`

											
										
										
											3 years ago
+								        return cls._match_valid_url(url).group('id')
-												[common] Add new helper function _match_id

											
										
										
											10 years ago
-												[extractor] Show video id in error messages if possible

											
										
										
											3 years ago
+								    @classmethod
 								    def get_temp_id(cls, url):
 								        try:
 								            return cls._match_id(url)
 								        except (IndexError, AttributeError):
 								            return None
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    @classmethod
 								    def working(cls):
 								        """Getter method for _WORKING."""
 								        return cls._WORKING
 								    def initialize(self):
 								        """Initializes an instance (authentication, etc)."""
-												[extractor] Reset non-repeating warnings per video

											
										
										
											3 years ago
+								        self._printed_messages = set()
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
+								        self._initialize_geo_bypass({
 								            'countries': self._GEO_COUNTRIES,
 								            'ip_blocks': self._GEO_IP_BLOCKS,
 								        })
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											8 years ago
+								        if not self._ready:
 								            self._real_initialize()
 								            self._ready = True
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
+								    def _initialize_geo_bypass(self, geo_bypass_context):
-												[extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970)

											
										
										
											8 years ago
+								        """
 								        Initialize geo restriction bypass mechanism.
 								        This method is used to initialize geo bypass mechanism based on faking
 								        X-Forwarded-For HTTP header. A random country from provided country list
-												[extractor/common] Fix typo

											
										
										
											8 years ago
+								        is selected and a random IP belonging to this country is generated. This
-												[extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970)

											
										
										
											8 years ago
+								        IP will be passed as X-Forwarded-For HTTP header in all subsequent
 								        HTTP requests.
 								        This method will be used for initial geo bypass mechanism initialization
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
+								        during the instance initialization with _GEO_COUNTRIES and
 								        _GEO_IP_BLOCKS.
-												[extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970)

											
										
										
											8 years ago
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
+								        You may also manually call it from extractor's code if geo bypass
-												[extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970)

											
										
										
											8 years ago
+								        information is not available beforehand (e.g. obtained during
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
+								        extraction) or due to some other reason. In this case you should pass
 								        this information in geo bypass context passed as first argument. It may
 								        contain following fields:
 								        countries:  List of geo unrestricted countries (similar
 								                    to _GEO_COUNTRIES)
 								        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 								                    (similar to _GEO_IP_BLOCKS)
-												[extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970)

											
										
										
											8 years ago
+								        """
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											8 years ago
+								        if not self._x_forwarded_for_ip:
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
 								            # Geo bypass mechanism is explicitly disabled by user
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								            if not self.get_param('geo_bypass', True):
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
+								                return
 								            if not geo_bypass_context:
 								                geo_bypass_context = {}
 								            # Backward compatibility: previously _initialize_geo_bypass
 								            # expected a list of countries, some 3rd party code may still use
 								            # it this way
 								            if isinstance(geo_bypass_context, (list, tuple)):
 								                geo_bypass_context = {
 								                    'countries': geo_bypass_context,
 								                }
 								            # The whole point of geo bypass mechanism is to fake IP
 								            # as X-Forwarded-For HTTP header based on some IP block or
 								            # country code.
 								            # Path 1: bypassing based on IP block in CIDR notation
 								            # Explicit IP block specified by user, use it right away
 								            # regardless of whether extractor is geo bypassable or not
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								            ip_block = self.get_param('geo_bypass_ip_block', None)
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
 								            # Otherwise use random IP block from geo bypass context but only
 								            # if extractor is known as geo bypassable
 								            if not ip_block:
 								                ip_blocks = geo_bypass_context.get('ip_blocks')
 								                if self._GEO_BYPASS and ip_blocks:
 								                    ip_block = random.choice(ip_blocks)
 								            if ip_block:
 								                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
-												Standardize `write_debug`

											
										
										
											4 years ago
+								                self._downloader.write_debug(
 								                    '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
+								                return
 								            # Path 2: bypassing based on country code
 								            # Explicit country code specified by user, use it right away
 								            # regardless of whether extractor is geo bypassable or not
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								            country = self.get_param('geo_bypass_country', None)
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											7 years ago
 								            # Otherwise use random country code from geo bypass context but
 								            # only if extractor is known as geo bypassable
 								            if not country:
 								                countries = geo_bypass_context.get('countries')
 								                if self._GEO_BYPASS and countries:
 								                    country = random.choice(countries)
 								            if country:
 								                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
-												Standardize `write_debug`

											
										
										
											4 years ago
+								                self._downloader.write_debug(
 								                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
-												Fix generic class move (add all files)

											
										
										
											12 years ago
 								    def extract(self, url):
 								        """Extracts URL information and returns it in list of dicts."""
-												[extractor/common] Wrap extractor errors (Fixes #1194)

For now, we just wrap some common errors. More may follow. We do not want to catch actual programming errors in the extractors, such as 1 // 0.

											
										
										
											10 years ago
+								        try:
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											8 years ago
+								            for _ in range(2):
 								                try:
 								                    self.initialize()
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								                    self.write_debug('Extracting URL: %s' % url)
-												Add faked X-Forwarded-For to formats' HTTP headers

											
										
										
											8 years ago
+								                    ie_result = self._real_extract(url)
-												[cleanup] linter, code formatting and readme

											
										
										
											4 years ago
+								                    if ie_result is None:
 								                        return None
-												Add faked X-Forwarded-For to formats' HTTP headers

											
										
										
											8 years ago
+								                    if self._x_forwarded_for_ip:
 								                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
-												Option `--compat-options` to revert some of yt-dlp's changes
* Deprecates `--list-formats-as-table`, `--list-formats-old`

											
										
										
											4 years ago
+								                    subtitles = ie_result.get('subtitles')
 								                    if (subtitles and 'live_chat' in subtitles
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								                            and 'no-live-chat' in self.get_param('compat_opts', [])):
-												Option `--compat-options` to revert some of yt-dlp's changes
* Deprecates `--list-formats-as-table`, `--list-formats-old`

											
										
										
											4 years ago
+								                        del subtitles['live_chat']
-												Add faked X-Forwarded-For to formats' HTTP headers

											
										
										
											8 years ago
+								                    return ie_result
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											8 years ago
+								                except GeoRestrictedError as e:
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											8 years ago
+								                    if self.__maybe_fake_ip_and_retry(e.countries):
 								                        continue
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											8 years ago
+								                    raise
-												[extractor] Show video id in error messages if possible

											
										
										
											3 years ago
+								        except ExtractorError as e:
 								            video_id = e.video_id or self.get_temp_id(url)
 								            raise ExtractorError(
 								                e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
-												[extractor/common] Wrap extractor errors (Fixes #1194)

For now, we just wrap some common errors. More may follow. We do not want to catch actual programming errors in the extractors, such as 1 // 0.

											
										
										
											10 years ago
+								        except compat_http_client.IncompleteRead as e:
-												[extractor] Show video id in error messages if possible

											
										
										
											3 years ago
+								            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
-												[escapist] Filter video differently (Fixes #4919)

											
										
										
											10 years ago
+								        except (KeyError, StopIteration) as e:
-												[extractor] Show video id in error messages if possible

											
										
										
											3 years ago
+								            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											8 years ago
+								    def __maybe_fake_ip_and_retry(self, countries):
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if (not self.get_param('geo_bypass_country', None)
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											6 years ago
+								                and self._GEO_BYPASS
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								                and self.get_param('geo_bypass', True)
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											6 years ago
+								                and not self._x_forwarded_for_ip
 								                and countries):
-												[extractor/common] Print origin country for fake IP

											
										
										
											8 years ago
+								            country_code = random.choice(countries)
 								            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											8 years ago
+								            if self._x_forwarded_for_ip:
 								                self.report_warning(
-												[extractor/common] Print origin country for fake IP

											
										
										
											8 years ago
+								                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 								                    % (self._x_forwarded_for_ip, country_code.upper()))
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											8 years ago
+								                return True
 								        return False
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    def set_downloader(self, downloader):
 								        """Sets the downloader for this IE."""
 								        self._downloader = downloader
 								    def _real_initialize(self):
 								        """Real initialization process. Redefine in subclasses."""
 								        pass
 								    def _real_extract(self, url):
 								        """Real extraction process. Redefine in subclasses."""
 								        pass
-												YoutubeIE: reuse instances of InfoExtractors (closes #998)

When a IE is added to the list, it's also added to a dictionary. When a IE is requested it first looks in the dictionary and if there's no instance it will create a new one.

That way _real_initialize is only called once for each IE, saving time if it needs to login for example.

											
										
										
											11 years ago
+								    @classmethod
 								    def ie_key(cls):
 								        """A string for getting the InfoExtractor with get_info_extractor"""
-												[lazy_extractors] Fix `suitable` and add flake8 test

											
										
										
											3 years ago
+								        return cls.__name__[:-2]
-												YoutubeIE: reuse instances of InfoExtractors (closes #998)

When a IE is added to the list, it's also added to a dictionary. When a IE is requested it first looks in the dictionary and if there's no instance it will create a new one.

That way _real_initialize is only called once for each IE, saving time if it needs to login for example.

											
										
										
											11 years ago
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    @property
 								    def IE_NAME(self):
-												[extractor/common] Make ie_key and IE_NAME return unicode string

											
										
										
											9 years ago
+								        return compat_str(type(self).__name__[:-2])
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								    @staticmethod
 								    def __can_accept_status_code(err, expected_status):
 								        assert isinstance(err, compat_urllib_error.HTTPError)
 								        if expected_status is None:
 								            return False
 								        elif callable(expected_status):
 								            return expected_status(err.code) is True
 								        else:
-												[utils] Add `variadic`

											
										
										
											3 years ago
+								            return err.code in variadic(expected_status)
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
 								    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 								        """
 								        Return the response handle.
 								        See _download_webpage docstring for arguments specification.
 								        """
-												Add option `--sleep-requests` to sleep b/w requests (Closes #106)

* Also fix documentation of `sleep_interval_subtitles`

Related issues:
https://github.com/blackjack4494/yt-dlc/issues/158
https://github.com/blackjack4494/youtube-dlc/issues/195
https://github.com/ytdl-org/youtube-dl/pull/28270
https://github.com/ytdl-org/youtube-dl/pull/28144
https://github.com/ytdl-org/youtube-dl/issues/27767
https://github.com/ytdl-org/youtube-dl/issues/23638
https://github.com/ytdl-org/youtube-dl/issues/26287
https://github.com/ytdl-org/youtube-dl/issues/26319

											
										
										
											4 years ago
+								        if not self._downloader._first_webpage_request:
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								            sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
-												Add option `--sleep-requests` to sleep b/w requests (Closes #106)

* Also fix documentation of `sleep_interval_subtitles`

Related issues:
https://github.com/blackjack4494/yt-dlc/issues/158
https://github.com/blackjack4494/youtube-dlc/issues/195
https://github.com/ytdl-org/youtube-dl/pull/28270
https://github.com/ytdl-org/youtube-dl/pull/28144
https://github.com/ytdl-org/youtube-dl/issues/27767
https://github.com/ytdl-org/youtube-dl/issues/23638
https://github.com/ytdl-org/youtube-dl/issues/26287
https://github.com/ytdl-org/youtube-dl/issues/26319

											
										
										
											4 years ago
+								            if sleep_interval > 0:
-												Release 2021.03.01

											
										
										
											4 years ago
+								                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
-												Add option `--sleep-requests` to sleep b/w requests (Closes #106)

* Also fix documentation of `sleep_interval_subtitles`

Related issues:
https://github.com/blackjack4494/yt-dlc/issues/158
https://github.com/blackjack4494/youtube-dlc/issues/195
https://github.com/ytdl-org/youtube-dl/pull/28270
https://github.com/ytdl-org/youtube-dl/pull/28144
https://github.com/ytdl-org/youtube-dl/issues/27767
https://github.com/ytdl-org/youtube-dl/issues/23638
https://github.com/ytdl-org/youtube-dl/issues/26287
https://github.com/ytdl-org/youtube-dl/issues/26319

											
										
										
											4 years ago
+								                time.sleep(sleep_interval)
 								        else:
 								            self._downloader._first_webpage_request = False
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        if note is None:
 								            self.report_download_webpage(video_id)
 								        elif note is not False:
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											11 years ago
+								            if video_id is None:
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								                self.to_screen('%s' % (note,))
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											11 years ago
+								            else:
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								                self.to_screen('%s: %s' % (video_id, note))
-												[extractor/common] Move X-Forwarded-For setup code into _request_webpage

											
										
										
											7 years ago
 								        # Some sites check X-Forwarded-For HTTP header in order to figure out
 								        # the origin of the client behind proxy. This allows bypassing geo
 								        # restriction by faking this header's value to IP that belongs to some
 								        # geo unrestricted country. We will do so once we encounter any
 								        # geo restriction error.
 								        if self._x_forwarded_for_ip:
 								            if 'X-Forwarded-For' not in headers:
 								                headers['X-Forwarded-For'] = self._x_forwarded_for_ip
-												[extractor/common] Improve _request_webpage

* Do not ignore data, headers and query for Requests
* Default values for headers and query switched to dicts since these are used by urllib itself

											
										
										
											9 years ago
+								        if isinstance(url_or_request, compat_urllib_request.Request):
 								            url_or_request = update_Request(
 								                url_or_request, data=data, headers=headers, query=query)
 								        else:
-												[extractor/common] add data, headers and query params to _request_webpage

											
										
										
											9 years ago
+								            if query:
 								                url_or_request = update_url_query(url_or_request, query)
-												[extractor/common] Allow empty post data

											
										
										
											9 years ago
+								            if data is not None or headers:
-												[extractor/common] Improve _request_webpage

* Do not ignore data, headers and query for Requests
* Default values for headers and query switched to dicts since these are used by urllib itself

											
										
										
											9 years ago
+								                url_or_request = sanitized_Request(url_or_request, data, headers)
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        try:
-												Move the opener to the YoutubeDL object.

This is the first step towards being able to just import youtube_dl and start using it.
Apart from removing global state, this would fix problems like #1805.

											
										
										
											11 years ago
+								            return self._downloader.urlopen(url_or_request)
-												[utils] Add `network_exceptions`

											
										
										
											4 years ago
+								        except network_exceptions as err:
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								            if isinstance(err, compat_urllib_error.HTTPError):
 								                if self.__can_accept_status_code(err, expected_status):
-												[extractor/common] Ensure response handle is not prematurely closed before it can be read if it matches expected_status (resolves #17195, closes #17846, resolves #17447)


											
										
										
											6 years ago
+								                    # Retain reference to error to prevent file object from
 								                    # being closed before it can be read. Works around the
 								                    # effects of <https://bugs.python.org/issue15002>
 								                    # introduced in Python 3.4.1.
 								                    err.fp._error = err
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								                    return err.fp
-												[aparat] Add support (Fixes #2012)

											
										
										
											11 years ago
+								            if errnote is False:
 								                return False
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								            if errnote is None:
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								                errnote = 'Unable to download webpage'
-												Properly convert errors to strings

											
										
										
											9 years ago
-												Rename error_to_str to error_to_compat_str

											
										
										
											9 years ago
+								            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											11 years ago
+								            if fatal:
 								                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 								            else:
-												Fix inconsistent use of `report_warning`

											
										
										
											4 years ago
+								                self.report_warning(errmsg)
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											11 years ago
+								                return False
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 								        """
 								        Return a tuple (page content as string, URL handle).
 								        See _download_webpage docstring for arguments specification.
 								        """
-												Strip hash info from URL when making requests (Fixes #1038)

											
										
										
											11 years ago
+								        # Strip hashes from the URL (#1038)
 								        if isinstance(url_or_request, (compat_str, str)):
 								            url_or_request = url_or_request.partition('#')[0]
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											11 years ago
+								        if urlh is False:
 								            assert not fatal
 								            return False
-												[extractor/common] Add the encoding parameter

The QQMusic info extractor need forced encoding for correct working.

											
										
										
											10 years ago
+								        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
-												[generic] Handle audio streams that do not implement HEAD (Fixes #4032)

											
										
										
											10 years ago
+								        return (content, urlh)
-												[extractor/common] Add the encoding parameter

The QQMusic info extractor need forced encoding for correct working.

											
										
										
											10 years ago
+								    @staticmethod
 								    def _guess_encoding_from_content(content_type, webpage_bytes):
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 								        if m:
 								            encoding = m.group(1)
 								        else:
-												Fix detection of the webpage charset if it's declared using ' instead of "

Like in "<meta charset='utf-8'/>"

											
										
										
											11 years ago
+								            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
-												[sohu] Handle encoding, and fix tests

											
										
										
											11 years ago
+								                          webpage_bytes[:1024])
 								            if m:
 								                encoding = m.group(1).decode('ascii')
-												Deal with implicitly UTF-16 decoded webpages

These webpages don't specify an encoding and rely on the BOM

											
										
										
											11 years ago
+								            elif webpage_bytes.startswith(b'\xff\xfe'):
 								                encoding = 'utf-16'
-												[sohu] Handle encoding, and fix tests

											
										
										
											11 years ago
+								            else:
 								                encoding = 'utf-8'
-												[extractor/common] Add the encoding parameter

The QQMusic info extractor need forced encoding for correct working.

											
										
										
											10 years ago
 								        return encoding
-												[extractor/common] Move censorship checks to a separate method and add check for just another ISP

											
										
										
											8 years ago
+								    def __check_blocked(self, content):
 								        first_block = content[:512]
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											6 years ago
+								        if ('<title>Access to this site is blocked</title>' in content
 								                and 'Websense' in first_block):
-												[extractor/common] Move censorship checks to a separate method and add check for just another ISP

											
										
										
											8 years ago
+								            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 								            blocked_iframe = self._html_search_regex(
 								                r'<iframe src="([^"]+)"', content,
 								                'Websense information URL', default=None)
 								            if blocked_iframe:
 								                msg += ' Visit %s for more details' % blocked_iframe
 								            raise ExtractorError(msg, expected=True)
 								        if '<title>The URL you requested has been blocked</title>' in first_block:
 								            msg = (
 								                'Access to this webpage has been blocked by Indian censorship. '
 								                'Use a VPN or proxy server (with --proxy) to route around it.')
 								            block_msg = self._html_search_regex(
 								                r'</h1><p>(.*?)</p>',
 								                content, 'block message', default=None)
 								            if block_msg:
 								                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 								            raise ExtractorError(msg, expected=True)
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											6 years ago
+								        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 								                and 'blocklist.rkn.gov.ru' in content):
-												[extractor/common] Move censorship checks to a separate method and add check for just another ISP

											
										
										
											8 years ago
+								            raise ExtractorError(
 								                'Access to this webpage has been blocked by decision of the Russian government. '
 								                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 								                expected=True)
-												[extractor/common] Add the encoding parameter

The QQMusic info extractor need forced encoding for correct working.

											
										
										
											10 years ago
+								    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 								        content_type = urlh.headers.get('Content-Type', '')
 								        webpage_bytes = urlh.read()
 								        if prefix is not None:
 								            webpage_bytes = prefix + webpage_bytes
 								        if not encoding:
 								            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if self.get_param('dump_intermediate_pages', False):
-												[extractor/common] Use final URL when dumping request (closes #14769)

											
										
										
											7 years ago
+								            self.to_screen('Dumping request to ' + urlh.geturl())
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								            dump = base64.b64encode(webpage_bytes).decode('ascii')
 								            self._downloader.to_screen(dump)
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if self.get_param('write_pages', False):
-												[extractor/common] Use final URL when dumping request (closes #14769)

											
										
										
											7 years ago
+								            basen = '%s_%s' % (video_id, urlh.geturl())
-												[extractor/common] Protect against long video IDs and URLs

											
										
										
											11 years ago
+								            if len(basen) > 240:
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
-												[extractor/common] Protect against long video IDs and URLs

											
										
										
											11 years ago
+								                basen = basen[:240 - len(h)] + h
 								            raw_filename = basen + '.dump'
-												New debug option --write-pages

											
										
										
											11 years ago
+								            filename = sanitize_filename(raw_filename, restricted=True)
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								            self.to_screen('Saving request to ' + filename)
-												[extractor/common] Fix dumping requests with long file abspath on Windows

											
										
										
											10 years ago
+								            # Working around MAX_PATH limitation on Windows (see
 								            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
-												[jython] Introduce compat_os_name

os.name is always 'java' on Jython

											
										
										
											9 years ago
+								            if compat_os_name == 'nt':
-												[extractor/common] Fix dumping requests with long file abspath on Windows

											
										
										
											10 years ago
+								                absfilepath = os.path.abspath(filename)
 								                if len(absfilepath) > 259:
 								                    filename = '\\\\?\\' + absfilepath
-												New debug option --write-pages

											
										
										
											11 years ago
+								            with open(filename, 'wb') as outf:
 								                outf.write(webpage_bytes)
-												[extractor/common] fallback on utf-8 when charset is not found

fixes #2721

											
										
										
											11 years ago
+								        try:
 								            content = webpage_bytes.decode(encoding, 'replace')
 								        except LookupError:
 								            content = webpage_bytes.decode('utf-8', 'replace')
-												Detect Websense censorship (Fixes #2670)

											
										
										
											11 years ago
-												[extractor/common] Move censorship checks to a separate method and add check for just another ISP

											
										
										
											8 years ago
+								        self.__check_blocked(content)
-												Detect Websense censorship (Fixes #2670)

											
										
										
											11 years ago
-												[generic] Handle audio streams that do not implement HEAD (Fixes #4032)

											
										
										
											10 years ago
+								        return content
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								    def _download_webpage(
 								            self, url_or_request, video_id, note=None, errnote=None,
 								            fatal=True, tries=1, timeout=5, encoding=None, data=None,
 								            headers={}, query={}, expected_status=None):
 								        """
 								        Return the data of the page as a string.
 								        Arguments:
 								        url_or_request -- plain text URL as a string or
 								            a compat_urllib_request.Requestobject
 								        video_id -- Video/playlist/item identifier (string)
 								        Keyword arguments:
 								        note -- note printed before downloading (string)
 								        errnote -- note printed in case of an error (string)
 								        fatal -- flag denoting whether error should be considered fatal,
 								            i.e. whether it should cause ExtractionError to be raised,
 								            otherwise a warning will be reported and extraction continued
 								        tries -- number of tries
 								        timeout -- sleep interval between tries
 								        encoding -- encoding for a page content decoding, guessed automatically
 								            when not explicitly specified
 								        data -- POST data (bytes)
 								        headers -- HTTP headers (dict)
 								        query -- URL query (dict)
 								        expected_status -- allows to accept failed HTTP requests (non 2xx
 								            status code) by explicitly specifying a set of accepted status
 								            codes. Can be any of the following entities:
 								                - an integer type specifying an exact failed status code to
 								                  accept
 								                - a list or a tuple of integer types specifying a list of
 								                  failed status codes to accept
 								                - a callable accepting an actual failed status code and
 								                  returning True if it should be accepted
 								            Note that this argument does not affect success status codes (2xx)
 								            which are always accepted.
 								        """
-												[common] Add new parameters for _download_webpage

											
										
										
											10 years ago
+								        success = False
 								        try_count = 0
 								        while success is False:
 								            try:
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								                res = self._download_webpage_handle(
 								                    url_or_request, video_id, note, errnote, fatal,
 								                    encoding=encoding, data=data, headers=headers, query=query,
 								                    expected_status=expected_status)
-												[common] Add new parameters for _download_webpage

											
										
										
											10 years ago
+								                success = True
 								            except compat_http_client.IncompleteRead as e:
 								                try_count += 1
 								                if try_count >= tries:
 								                    raise e
 								                self._sleep(timeout, video_id)
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											11 years ago
+								        if res is False:
 								            return res
 								        else:
 								            content, _ = res
 								            return content
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												[extractor/common] Add _download_xml_handle

											
										
										
											7 years ago
+								    def _download_xml_handle(
 								            self, url_or_request, video_id, note='Downloading XML',
 								            errnote='Unable to download XML', transform_source=None,
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								            fatal=True, encoding=None, data=None, headers={}, query={},
 								            expected_status=None):
 								        """
-												Use compat_etree_Element

											
										
										
											6 years ago
+								        Return a tuple (xml as an compat_etree_Element, URL handle).
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
 								        See _download_webpage docstring for arguments specification.
 								        """
-												[extractor/common] Add _download_xml_handle

											
										
										
											7 years ago
+								        res = self._download_webpage_handle(
 								            url_or_request, video_id, note, errnote, fatal=fatal,
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								            encoding=encoding, data=data, headers=headers, query=query,
 								            expected_status=expected_status)
-												[extractor/common] Add _download_xml_handle

											
										
										
											7 years ago
+								        if res is False:
 								            return res
 								        xml_string, urlh = res
 								        return self._parse_xml(
 								            xml_string, video_id, transform_source=transform_source,
 								            fatal=fatal), urlh
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								    def _download_xml(
 								            self, url_or_request, video_id,
 								            note='Downloading XML', errnote='Unable to download XML',
 								            transform_source=None, fatal=True, encoding=None,
 								            data=None, headers={}, query={}, expected_status=None):
 								        """
-												Use compat_etree_Element

											
										
										
											6 years ago
+								        Return the xml as an compat_etree_Element.
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
 								        See _download_webpage docstring for arguments specification.
 								        """
-												[extractor/common] Add _download_xml_handle

											
										
										
											7 years ago
+								        res = self._download_xml_handle(
 								            url_or_request, video_id, note=note, errnote=errnote,
 								            transform_source=transform_source, fatal=fatal, encoding=encoding,
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								            data=data, headers=headers, query=query,
 								            expected_status=expected_status)
-												[extractor/common] Add _download_xml_handle

											
										
										
											7 years ago
+								        return res if res is False else res[0]
-												[extractor/common] Introduce _parse_xml

											
										
										
											7 years ago
 								    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
-												[mtv] Fixup incorrectly encoded XML documents

											
										
										
											11 years ago
+								        if transform_source:
 								            xml_string = transform_source(xml_string)
-												[extractor/common] Introduce _parse_xml

											
										
										
											7 years ago
+								        try:
 								            return compat_etree_fromstring(xml_string.encode('utf-8'))
 								        except compat_xml_parse_error as ve:
 								            errmsg = '%s: Failed to parse XML ' % video_id
 								            if fatal:
 								                raise ExtractorError(errmsg, cause=ve)
 								            else:
 								                self.report_warning(errmsg + str(ve))
-												[collegehumor] Encode the xml before calling xml.etree.ElementTree.fromstring (fixes #1822)

Uses a new helper method in InfoExtractor: _download_xml

											
										
										
											11 years ago
-												[extractor/common] Add _download_json_handle

											
										
										
											7 years ago
+								    def _download_json_handle(
 								            self, url_or_request, video_id, note='Downloading JSON metadata',
 								            errnote='Unable to download JSON metadata', transform_source=None,
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								            fatal=True, encoding=None, data=None, headers={}, query={},
 								            expected_status=None):
 								        """
 								        Return a tuple (JSON object, URL handle).
 								        See _download_webpage docstring for arguments specification.
 								        """
-												[extractor/common] Add _download_json_handle

											
										
										
											7 years ago
+								        res = self._download_webpage_handle(
-												[extractor/common] Add the encoding parameter

The QQMusic info extractor need forced encoding for correct working.

											
										
										
											10 years ago
+								            url_or_request, video_id, note, errnote, fatal=fatal,
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								            encoding=encoding, data=data, headers=headers, query=query,
 								            expected_status=expected_status)
-												[extractor/common] Add _download_json_handle

											
										
										
											7 years ago
+								        if res is False:
 								            return res
 								        json_string, urlh = res
-												[common] Split _download_json
Add ability for extractor to use _parse_json

											
										
										
											10 years ago
+								        return self._parse_json(
-												[extractor/common] Add _download_json_handle

											
										
										
											7 years ago
+								            json_string, video_id, transform_source=transform_source,
 								            fatal=fatal), urlh
 								    def _download_json(
 								            self, url_or_request, video_id, note='Downloading JSON metadata',
 								            errnote='Unable to download JSON metadata', transform_source=None,
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								            fatal=True, encoding=None, data=None, headers={}, query={},
 								            expected_status=None):
 								        """
 								        Return the JSON object as a dict.
 								        See _download_webpage docstring for arguments specification.
 								        """
-												[extractor/common] Add _download_json_handle

											
										
										
											7 years ago
+								        res = self._download_json_handle(
 								            url_or_request, video_id, note=note, errnote=errnote,
 								            transform_source=transform_source, fatal=fatal, encoding=encoding,
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											6 years ago
+								            data=data, headers=headers, query=query,
 								            expected_status=expected_status)
-												[extractor/common] Add _download_json_handle

											
										
										
											7 years ago
+								        return res if res is False else res[0]
-												[common] Split _download_json
Add ability for extractor to use _parse_json

											
										
										
											10 years ago
 								    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
-												[youtube] Correct invalid JSON (Fixes #2353)

											
										
										
											11 years ago
+								        if transform_source:
 								            json_string = transform_source(json_string)
-												[khanacademy] Add support (Fixes #2066)

											
										
										
											11 years ago
+								        try:
 								            return json.loads(json_string)
 								        except ValueError as ve:
-												[utils] Improve and test js_to_json

											
										
										
											10 years ago
+								            errmsg = '%s: Failed to parse JSON ' % video_id
 								            if fatal:
 								                raise ExtractorError(errmsg, cause=ve)
 								            else:
 								                self.report_warning(errmsg + str(ve))
-												[khanacademy] Add support (Fixes #2066)

											
										
										
											11 years ago
-												[extractor] Functions to parse socket.io response as json

Authored by: pukkandan, llacb47

											
										
										
											3 years ago
+								    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 								        return self._parse_json(
 								            data[data.find('{'):data.rfind('}') + 1],
 								            video_id, transform_source, fatal)
 								    def _download_socket_json_handle(
 								            self, url_or_request, video_id, note='Polling socket',
 								            errnote='Unable to poll socket', transform_source=None,
 								            fatal=True, encoding=None, data=None, headers={}, query={},
 								            expected_status=None):
 								        """
 								        Return a tuple (JSON object, URL handle).
 								        See _download_webpage docstring for arguments specification.
 								        """
 								        res = self._download_webpage_handle(
 								            url_or_request, video_id, note, errnote, fatal=fatal,
 								            encoding=encoding, data=data, headers=headers, query=query,
 								            expected_status=expected_status)
 								        if res is False:
 								            return res
 								        webpage, urlh = res
 								        return self._parse_socket_response_as_json(
 								            webpage, video_id, transform_source=transform_source,
 								            fatal=fatal), urlh
 								    def _download_socket_json(
 								            self, url_or_request, video_id, note='Polling socket',
 								            errnote='Unable to poll socket', transform_source=None,
 								            fatal=True, encoding=None, data=None, headers={}, query={},
 								            expected_status=None):
 								        """
 								        Return the JSON object as a dict.
 								        See _download_webpage docstring for arguments specification.
 								        """
 								        res = self._download_socket_json_handle(
 								            url_or_request, video_id, note=note, errnote=errnote,
 								            transform_source=transform_source, fatal=fatal, encoding=encoding,
 								            data=data, headers=headers, query=query,
 								            expected_status=expected_status)
 								        return res if res is False else res[0]
-												[extractor] Reset non-repeating warnings per video

											
										
										
											3 years ago
+								    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
-												[cleanup] Misc

											
										
										
											3 years ago
+								        idstr = format_field(video_id, template='%s: ')
-												[extractor] Reset non-repeating warnings per video

											
										
										
											3 years ago
+								        msg = f'[{self.IE_NAME}] {idstr}{msg}'
 								        if only_once:
 								            if f'WARNING: {msg}' in self._printed_messages:
 								                return
 								            self._printed_messages.add(f'WARNING: {msg}')
 								        self._downloader.report_warning(msg, *args, **kwargs)
-												[myvideo] Use RTMP instead of RTMPT (Fixes #2032)

											
										
										
											11 years ago
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								    def to_screen(self, msg, *args, **kwargs):
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        """Print msg to screen, prefixing it with '[ie_name]'"""
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
 								    def write_debug(self, msg, *args, **kwargs):
 								        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
 								    def get_param(self, name, default=None, *args, **kwargs):
 								        if self._downloader:
 								            return self._downloader.params.get(name, default, *args, **kwargs)
 								        return default
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											3 years ago
+								    def report_drm(self, video_id, partial=False):
 								        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    def report_extraction(self, id_or_name):
 								        """Report information extraction."""
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								        self.to_screen('%s: Extracting information' % id_or_name)
-												Fix generic class move (add all files)

											
										
										
											12 years ago
 								    def report_download_webpage(self, video_id):
 								        """Report webpage download."""
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								        self.to_screen('%s: Downloading webpage' % video_id)
-												Fix generic class move (add all files)

											
										
										
											12 years ago
 								    def report_age_confirmation(self):
 								        """Report attempt to confirm age."""
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								        self.to_screen('Confirming age')
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info

											
										
										
											11 years ago
+								    def report_login(self):
 								        """Report attempt to log in."""
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								        self.to_screen('Logging in')
-												VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info

											
										
										
											11 years ago
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											4 years ago
+								    def raise_login_required(
-												[youtube] Better message when login required

											
										
										
											4 years ago
+								            self, msg='This video is only available for registered users',
 								            metadata_available=False, method='any'):
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if metadata_available and self.get_param('ignore_no_formats_error'):
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											4 years ago
+								            self.report_warning(msg)
-												[extractor] Minor improvements (See desc)

1. Allow removal of login hint - extractors can set their own login hint as part of `msg`
2. Cleanup `_merge_subtitles` signature

											
										
										
											3 years ago
+								        if method is not None:
 								            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
 								        raise ExtractorError(msg, expected=True)
-												[extractor/common] Add raise_login_required

											
										
										
											9 years ago
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											4 years ago
+								    def raise_geo_restricted(
 								            self, msg='This video is not available from your location due to geo restriction',
 								            countries=None, metadata_available=False):
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if metadata_available and self.get_param('ignore_no_formats_error'):
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											4 years ago
+								            self.report_warning(msg)
 								        else:
 								            raise GeoRestrictedError(msg, countries=countries)
 								    def raise_no_formats(self, msg, expected=False, video_id=None):
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if expected and self.get_param('ignore_no_formats_error'):
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											4 years ago
+								            self.report_warning(msg, video_id)
-												[CBS] Add fallback (#579)

Related: https://github.com/ytdl-org/youtube-dl/issues/29564
Authored-by: llacb47, pukkandan
											
										
										
											3 years ago
+								        elif isinstance(msg, ExtractorError):
 								            raise msg
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											4 years ago
+								        else:
 								            raise ExtractorError(msg, expected=expected, video_id=video_id)
-												[extractor/common] Add raise_geo_restricted

											
										
										
											9 years ago
-												PEP8 applied

											
										
										
											10 years ago
+								    # Methods for following #608
-												[generic] Detect ooyala videos (fixes #2013)

											
										
										
											11 years ago
+								    @staticmethod
-												[utils] Add `video_title` for `url_result`

											
										
										
											10 years ago
+								    def url_result(url, ie=None, video_id=None, video_title=None):
-												[extractor/common] Consistent URL spelling

											
										
										
											9 years ago
+								        """Returns a URL that points to a page that should be processed"""
-												PEP8 applied

											
										
										
											10 years ago
+								        # TODO: ie should be the class used for getting the info
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        video_info = {'_type': 'url',
 								                      'url': url,
 								                      'ie_key': ie}
-												Match --download-archive during playlist processing (Fixes #1745)

											
										
										
											11 years ago
+								        if video_id is not None:
 								            video_info['id'] = video_id
-												[utils] Add `video_title` for `url_result`

											
										
										
											10 years ago
+								        if video_title is not None:
 								            video_info['title'] = video_title
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        return video_info
-												PEP8 applied

											
										
										
											10 years ago
-												[extractor/common] Fix playlist_from_matches

											
										
										
											7 years ago
+								    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 								        urls = orderedSet(
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											8 years ago
+								            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 								            for m in matches)
 								        return self.playlist_result(
-												[extractor/common] Fix playlist_from_matches

											
										
										
											7 years ago
+								            urls, playlist_id=playlist_id, playlist_title=playlist_title)
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											8 years ago
-												[generic] Detect ooyala videos (fixes #2013)

											
										
										
											11 years ago
+								    @staticmethod
-												[youtube] More metadata extraction for channels/playlists

											
										
										
											4 years ago
+								    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        """Returns a playlist"""
 								        video_info = {'_type': 'playlist',
 								                      'entries': entries}
-												[youtube] More metadata extraction for channels/playlists

											
										
										
											4 years ago
+								        video_info.update(kwargs)
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        if playlist_id:
 								            video_info['id'] = playlist_id
 								        if playlist_title:
 								            video_info['title'] = playlist_title
-												[youtube] Don't show warning for empty playlist description (Closes #54)

:ci skip dl

											
										
										
											4 years ago
+								        if playlist_description is not None:
-												[extractor/common] Add description to playlist_result

											
										
										
											10 years ago
+								            video_info['description'] = playlist_description
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        return video_info
-												[extractor/common] Use NO_DEFAULT from utils

											
										
										
											9 years ago
+								    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        """
 								        Perform a regex search on the given string, using a single or a list of
 								        patterns returning the first matching group.
 								        In case of failure return a default value or raise a WARNING or a
-												[vimeo] Fix pro videos and player.vimeo.com urls

The old process can still be used for those videos.
Added RegexNotFoundError, which is raised by _search_regex if it can't extract the info.

											
										
										
											11 years ago
+								        RegexNotFoundError, depending on fatal, specifying the field name.
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        """
 								        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 								            mobj = re.search(pattern, string, flags)
 								        else:
 								            for p in pattern:
 								                mobj = re.search(p, string, flags)
-												[extractor/common] PEP8

											
										
										
											10 years ago
+								                if mobj:
 								                    break
-												Fix generic class move (add all files)

											
										
										
											12 years ago
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								            _name = '\033[0;34m%s\033[0m' % name
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        else:
 								            _name = name
 								        if mobj:
-												[heise] Fix description, thumbnail and format ID

											
										
										
											10 years ago
+								            if group is None:
 								                # return the first matching group
 								                return next(g for g in mobj.groups() if g is not None)
-												[extractor] Allow extracting multiple groups in `_search_regex`
From #497, Authored by: fstirlitz

											
										
										
											3 years ago
+								            elif isinstance(group, (list, tuple)):
 								                return tuple(mobj.group(g) for g in group)
-												[heise] Fix description, thumbnail and format ID

											
										
										
											10 years ago
+								            else:
 								                return mobj.group(group)
-												[extractor/common] Use NO_DEFAULT from utils

											
										
										
											9 years ago
+								        elif default is not NO_DEFAULT:
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								            return default
 								        elif fatal:
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								            raise RegexNotFoundError('Unable to extract %s' % _name)
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        else:
-												Fix inconsistent use of `report_warning`

											
										
										
											4 years ago
+								            self.report_warning('unable to extract %s' % _name + bug_reports_message())
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								            return None
-												[extractor/common] Use NO_DEFAULT from utils

											
										
										
											9 years ago
+								    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        """
 								        Like _search_regex, but strips HTML tags and unescapes entities.
 								        """
-												[heise] Fix description, thumbnail and format ID

											
										
										
											10 years ago
+								        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								        if res:
 								            return clean_html(res).strip()
 								        else:
 								            return res
-												[common] add separate method for getting netrc ligin info

											
										
										
											8 years ago
+								    def _get_netrc_login_info(self, netrc_machine=None):
 								        username = None
 								        password = None
 								        netrc_machine = netrc_machine or self._NETRC_MACHINE
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if self.get_param('usenetrc', False):
-												[common] add separate method for getting netrc ligin info

											
										
										
											8 years ago
+								            try:
-												Add option `--netrc-location`
Closes #792, #963

											
										
										
											3 years ago
+								                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
 								                if os.path.isdir(netrc_file):
 								                    netrc_file = os.path.join(netrc_file, '.netrc')
 								                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
-												[common] add separate method for getting netrc ligin info

											
										
										
											8 years ago
+								                if info is not None:
 								                    username = info[0]
 								                    password = info[2]
 								                else:
-												[extractor/common] Simplify _get_netrc_login_info and carry long lines

											
										
										
											8 years ago
+								                    raise netrc.NetrcParseError(
 								                        'No authenticators for %s' % netrc_machine)
-												[common] add separate method for getting netrc ligin info

											
										
										
											8 years ago
+								            except (IOError, netrc.NetrcParseError) as err:
-												Fix inconsistent use of `report_warning`

											
										
										
											4 years ago
+								                self.report_warning(
-												[extractor/common] Simplify _get_netrc_login_info and carry long lines

											
										
										
											8 years ago
+								                    'parsing .netrc: %s' % error_to_compat_str(err))
-												[common] add separate method for getting netrc ligin info

											
										
										
											8 years ago
-												[extractor/common] Simplify _get_netrc_login_info and carry long lines

											
										
										
											8 years ago
+								        return username, password
-												[common] add separate method for getting netrc ligin info

											
										
										
											8 years ago
-												[adobepass] add specific options for adobe pass authentication

- add --ap-username and --ap-password option to specify
TV provider username and password in the cmd line
- add --ap-retries option to limit the number of retries
- add --list-ap-msi-ids to list the supported TV Providers

											
										
										
											8 years ago
+								    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
-												VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info

											
										
										
											11 years ago
+								        """
-												Typo: twice "the the" to "the"
											
										
										
											10 years ago
+								        Get the login info as (username, password)
-												[extractor/common] Update _get_login_info's comment

											
										
										
											8 years ago
+								        First look for the manually specified credentials using username_option
 								        and password_option as keys in params dictionary. If no such credentials
 								        available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 								        value.
-												VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info

											
										
										
											11 years ago
+								        If there's no info available, return (None, None)
 								        """
 								        # Attempt to use provided username and password or .netrc data
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        username = self.get_param(username_option)
 								        if username is not None:
 								            password = self.get_param(password_option)
-												[common] add separate method for getting netrc ligin info

											
										
										
											8 years ago
+								        else:
-												[adobepass] add specific options for adobe pass authentication

- add --ap-username and --ap-password option to specify
TV provider username and password in the cmd line
- add --ap-retries option to limit the number of retries
- add --list-ap-msi-ids to list the supported TV Providers

											
										
										
											8 years ago
+								            username, password = self._get_netrc_login_info(netrc_machine)
-												PEP8 applied

											
										
										
											10 years ago
-												[extractor/common] Simplify _get_login_info

											
										
										
											8 years ago
+								        return username, password
-												VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info

											
										
										
											11 years ago
-												[extractor/common] Interactive TFA code input

											
										
										
											9 years ago
+								    def _get_tfa_info(self, note='two-factor verification code'):
-												[youtube] Add two-factor account signin (TOTP only)

Additional work is required to prompt the user for the SMS or phone call codes, as there is no framework currently to prompt the user during an extraction operation.

Fixes #3533

											
										
										
											10 years ago
+								        """
 								        Get the two-factor authentication info
 								        TODO - asking the user will be required for sms/phone verify
 								        currently just uses the command line option
 								        If there's no info available, return None
 								        """
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        tfa = self.get_param('twofactor')
 								        if tfa is not None:
 								            return tfa
-												[youtube] Add two-factor account signin (TOTP only)

Additional work is required to prompt the user for the SMS or phone call codes, as there is no framework currently to prompt the user during an extraction operation.

Fixes #3533

											
										
										
											10 years ago
-												[extractor/common] Interactive TFA code input

											
										
										
											9 years ago
+								        return compat_getpass('Type %s and press [Return]: ' % note)
-												[youtube] Add two-factor account signin (TOTP only)

Additional work is required to prompt the user for the SMS or phone call codes, as there is no framework currently to prompt the user during an extraction operation.

Fixes #3533

											
										
										
											10 years ago
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											11 years ago
+								    # Helper functions for extracting OpenGraph info
 								    @staticmethod
-												Improve the OpenGraph regex

* Do not accept '>' between the property and content attributes.
* Recognize the properties if the content attribute is before the property attribute using two regexes (fixes the extraction of the description for SlideshareIE).

											
										
										
											11 years ago
+								    def _og_regexes(prop):
-												[extractor/common] Allow angle brackets in attributes in _og_regexes (#7215)

											
										
										
											9 years ago
+								        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
-												[malltv] Add extractor (closes #18058)

											
										
										
											6 years ago
+								        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
-												[extractor/common] Require closing quote in _og_regexes (Closes #7174)

E.g. do not match `property='og:video:type'` when `og:video` is requested.

											
										
										
											9 years ago
+								                       % {'prop': re.escape(prop)})
-												Don't accept '>' inside the content attribute in OpenGraph regexes

											
										
										
											11 years ago
+								        template = r'<meta[^>]+?%s[^>]+?%s'
-												Improve the OpenGraph regex

* Do not accept '>' between the property and content attributes.
* Recognize the properties if the content attribute is before the property attribute using two regexes (fixes the extraction of the description for SlideshareIE).

											
										
										
											11 years ago
+								        return [
-												Don't accept '>' inside the content attribute in OpenGraph regexes

											
										
										
											11 years ago
+								            template % (property_re, content_re),
 								            template % (content_re, property_re),
-												Improve the OpenGraph regex

* Do not accept '>' between the property and content attributes.
* Recognize the properties if the content attribute is before the property attribute using two regexes (fixes the extraction of the description for SlideshareIE).

											
										
										
											11 years ago
+								        ]
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											11 years ago
-												[extractor/common] Add _meta_regex and clarify tags field

											
										
										
											9 years ago
+								    @staticmethod
 								    def _meta_regex(prop):
 								        return r'''(?isx)<meta
-												[extractor/common] Expand meta regex

											
										
										
											9 years ago
+								                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
-												[extractor/common] Add _meta_regex and clarify tags field

											
										
										
											9 years ago
+								                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
-												Improve OpenGraph property matching

											
										
										
											11 years ago
+								    def _og_search_property(self, prop, html, name=None, **kargs):
-												[utils] Add `variadic`

											
										
										
											3 years ago
+								        prop = variadic(prop)
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											11 years ago
+								        if name is None:
-												[extractor/common] Support multiple properties in _og_search_property

											
										
										
											8 years ago
+								            name = 'OpenGraph %s' % prop[0]
 								        og_regexes = []
 								        for p in prop:
 								            og_regexes.extend(self._og_regexes(p))
 								        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
-												[common] Simplify og_search_property

											
										
										
											11 years ago
+								        if escaped is None:
 								            return None
 								        return unescapeHTML(escaped)
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											11 years ago
 								    def _og_search_thumbnail(self, html, **kargs):
-												[extractor/common] Consistent URL spelling

											
										
										
											9 years ago
+								        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											11 years ago
 								    def _og_search_description(self, html, **kargs):
 								        return self._og_search_property('description', html, fatal=False, **kargs)
 								    def _og_search_title(self, html, **kargs):
 								        return self._og_search_property('title', html, **kargs)
-												[Instagram] get the non-https link, as they are serving Akamai cert from an instagram.com domain

											
										
										
											11 years ago
+								    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
-												[escapist] Add support for og:video:url (Fixes #3557)

											
										
										
											10 years ago
+								        regexes = self._og_regexes('video') + self._og_regexes('video:url')
 								        if secure:
 								            regexes = self._og_regexes('video:secure_url') + regexes
-												[Instagram] get the non-https link, as they are serving Akamai cert from a instagram.com domain

											
										
										
											11 years ago
+								        return self._html_search_regex(regexes, html, name, **kargs)
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											11 years ago
-												[livestream:original] Add support for folder urls (closes #2631)

The webpage only contains shortened links for the videos. Since the server
doesn't support HEAD requests, we use a specific extractor for them.

											
										
										
											11 years ago
+								    def _og_search_url(self, html, **kargs):
 								        return self._og_search_property('url', html, **kargs)
-												[screencast] Add support for more video types (#3236)

											
										
										
											10 years ago
+								    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
-												[utils] Add `variadic`

											
										
										
											3 years ago
+								        name = variadic(name)
-												Add support for tou.tv (Fixes #1792)

											
										
										
											11 years ago
+								        if display_name is None:
-												[utils] Add support for name list in _html_search_meta

											
										
										
											8 years ago
+								            display_name = name[0]
-												Add support for tou.tv (Fixes #1792)

											
										
										
											11 years ago
+								        return self._html_search_regex(
-												[utils] Add support for name list in _html_search_meta

											
										
										
											8 years ago
+								            [self._meta_regex(n) for n in name],
-												[heise] Fix description, thumbnail and format ID

											
										
										
											10 years ago
+								            html, display_name, fatal=fatal, group='content', **kwargs)
-												Add support for tou.tv (Fixes #1792)

											
										
										
											11 years ago
 								    def _dc_search_uploader(self, html):
 								        return self._html_search_meta('dc.creator', html, 'uploader')
-												Allow users to specify an age limit (fixes #1545)

With these changes, users can now restrict what videos are downloaded by the intended audience, by specifying their age with --age-limit YEARS.
Add rudimentary support in youtube, pornotube, and youporn.

											
										
										
											11 years ago
+								    def _rta_search(self, html):
 								        # See http://www.rtalabel.org/index.php?content=howtofaq#single
 								        if re.search(r'(?ix)<meta\s+name="rating"\s+'
 								                     r'     content="RTA-5042-1996-1400-1577-RTA"',
 								                     html):
 								            return 18
 								        return 0
-												Add support for tou.tv (Fixes #1792)

											
										
										
											11 years ago
+								    def _media_rating_search(self, html):
 								        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 								        rating = self._html_search_meta('rating', html)
 								        if not rating:
 								            return None
 								        RATING_TABLE = {
 								            'safe for kids': 0,
 								            'general': 8,
 								            '14 years': 14,
 								            'mature': 17,
 								            'restricted': 19,
 								        }
-												[refactor] Do not specify redundant None as second argument in dict.get()

											
										
										
											9 years ago
+								        return RATING_TABLE.get(rating.lower())
-												Add support for tou.tv (Fixes #1792)

											
										
										
											11 years ago
-												[extractor/common] Add new helper method _family_friendly_search

											
										
										
											10 years ago
+								    def _family_friendly_search(self, html):
-												[extractor/common] Fix link to external documentation

											
										
										
											10 years ago
+								        # See http://schema.org/VideoObject
-												[extractor/common] Make _family_friendly_search optional

											
										
										
											7 years ago
+								        family_friendly = self._html_search_meta(
 								            'isFamilyFriendly', html, default=None)
-												[extractor/common] Add new helper method _family_friendly_search

											
										
										
											10 years ago
 								        if not family_friendly:
 								            return None
 								        RATING_TABLE = {
 								            '1': 0,
 								            'true': 0,
 								            '0': 18,
 								            'false': 18,
 								        }
-												[refactor] Do not specify redundant None as second argument in dict.get()

											
										
										
											9 years ago
+								        return RATING_TABLE.get(family_friendly.lower())
-												[extractor/common] Add new helper method _family_friendly_search

											
										
										
											10 years ago
-												[bloomberg] Fix ooyala url extraction

Added a helper method to InfoExtractor for searching the ‘twitter:player’ meta property.
Now the OoyalaIE also recognizes the ‘ec’ parameter in the url as the embed code.

											
										
										
											11 years ago
+								    def _twitter_search_player(self, html):
 								        return self._html_search_meta('twitter:player', html,
-												PEP8: applied even more rules

											
										
										
											10 years ago
+								                                      'twitter card player')
-												[bloomberg] Fix ooyala url extraction

Added a helper method to InfoExtractor for searching the ‘twitter:player’ meta property.
Now the OoyalaIE also recognizes the ‘ec’ parameter in the url as the embed code.

											
										
										
											11 years ago
-												[extractor/common] Add expected_type in json ld routines

											
										
										
											8 years ago
+								    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											5 years ago
+								        json_ld_list = list(re.finditer(JSON_LD_RE, html))
-												[extractor/common] Respect default in _search_json_ld

											
										
										
											8 years ago
+								        default = kwargs.get('default', NO_DEFAULT)
 								        # JSON-LD may be malformed and thus `fatal` should be respected.
 								        # At the same time `default` may be passed that assumes `fatal=False`
 								        # for _search_regex. Let's simulate the same behavior here as well.
-												[cleanup] Refactor some code

											
										
										
											3 years ago
+								        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											5 years ago
+								        json_ld = []
 								        for mobj in json_ld_list:
 								            json_ld_item = self._parse_json(
 								                mobj.group('json_ld'), video_id, fatal=fatal)
 								            if not json_ld_item:
 								                continue
 								            if isinstance(json_ld_item, dict):
 								                json_ld.append(json_ld_item)
 								            elif isinstance(json_ld_item, (list, tuple)):
 								                json_ld.extend(json_ld_item)
 								        if json_ld:
 								            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 								        if json_ld:
 								            return json_ld
 								        if default is not NO_DEFAULT:
 								            return default
 								        elif fatal:
 								            raise RegexNotFoundError('Unable to extract JSON-LD')
 								        else:
-												Fix inconsistent use of `report_warning`

											
										
										
											4 years ago
+								            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											5 years ago
+								            return {}
-												[extractor/common] Add initial support for JSON-LD metadata extraction into info_dict

											
										
										
											9 years ago
-												[extractor/common] Add expected_type in json ld routines

											
										
										
											8 years ago
+								    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
-												[extractor/common] Add initial support for JSON-LD metadata extraction into info_dict

											
										
										
											9 years ago
+								        if isinstance(json_ld, compat_str):
 								            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 								        if not json_ld:
 								            return {}
 								        info = {}
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											8 years ago
+								        if not isinstance(json_ld, (list, tuple, dict)):
 								            return info
 								        if isinstance(json_ld, dict):
 								            json_ld = [json_ld]
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											8 years ago
-												[extractor/common] Extract interaction statistic

											
										
										
											7 years ago
+								        INTERACTION_TYPE_MAP = {
 								            'CommentAction': 'comment',
 								            'AgreeAction': 'like',
 								            'DisagreeAction': 'dislike',
 								            'LikeAction': 'like',
 								            'DislikeAction': 'dislike',
 								            'ListenAction': 'view',
 								            'WatchAction': 'view',
 								            'ViewAction': 'view',
 								        }
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								        def extract_interaction_type(e):
 								            interaction_type = e.get('interactionType')
 								            if isinstance(interaction_type, dict):
 								                interaction_type = interaction_type.get('@type')
 								            return str_or_none(interaction_type)
-												[extractor/common] Extract interaction statistic

											
										
										
											7 years ago
+								        def extract_interaction_statistic(e):
 								            interaction_statistic = e.get('interactionStatistic')
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								            if isinstance(interaction_statistic, dict):
 								                interaction_statistic = [interaction_statistic]
-												[extractor/common] Extract interaction statistic

											
										
										
											7 years ago
+								            if not isinstance(interaction_statistic, list):
 								                return
 								            for is_e in interaction_statistic:
 								                if not isinstance(is_e, dict):
 								                    continue
 								                if is_e.get('@type') != 'InteractionCounter':
 								                    continue
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								                interaction_type = extract_interaction_type(is_e)
 								                if not interaction_type:
-												[extractor/common] Extract interaction statistic

											
										
										
											7 years ago
+								                    continue
-												[extractor/common] Relax interaction count extraction in _json_ld

											
										
										
											4 years ago
+								                # For interaction count some sites provide string instead of
 								                # an integer (as per spec) with non digit characters (e.g. ",")
 								                # so extracting count with more relaxed str_to_int
 								                interaction_count = str_to_int(is_e.get('userInteractionCount'))
-												[extractor/common] Extract interaction statistic

											
										
										
											7 years ago
+								                if interaction_count is None:
 								                    continue
 								                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
 								                if not count_kind:
 								                    continue
 								                count_key = '%s_count' % count_kind
 								                if info.get(count_key) is not None:
 								                    continue
 								                info[count_key] = interaction_count
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											8 years ago
+								        def extract_video_object(e):
 								            assert e['@type'] == 'VideoObject'
-												Update to ytdl-commit-4fb25ff

[maoritv] Add new extractor
https://github.com/ytdl-org/youtube-dl/commit/4fb25ff5a3be5206bb72e5c4046715b1529fb2c7

Except:
[vimeo] improve extraction https://github.com/ytdl-org/youtube-dl/commit/3ae9c0f410b1d4f63e8bada67dd62a8d2852be32
[youtube:tab] Pass innertube context... https://github.com/ytdl-org/youtube-dl/commit/1b0a13f33cfb3644cc718d35951ea85bb1905459

											
										
										
											4 years ago
+								            author = e.get('author')
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											8 years ago
+								            info.update({
-												[extractor/common] Add validation for JSON-LD URLs

											
										
										
											6 years ago
+								                'url': url_or_none(e.get('contentUrl')),
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											8 years ago
+								                'title': unescapeHTML(e.get('name')),
 								                'description': unescapeHTML(e.get('description')),
-												[extractor/common] Add validation for JSON-LD URLs

											
										
										
											6 years ago
+								                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											8 years ago
+								                'duration': parse_duration(e.get('duration')),
 								                'timestamp': unified_timestamp(e.get('uploadDate')),
-												Update to ytdl-commit-4fb25ff

[maoritv] Add new extractor
https://github.com/ytdl-org/youtube-dl/commit/4fb25ff5a3be5206bb72e5c4046715b1529fb2c7

Except:
[vimeo] improve extraction https://github.com/ytdl-org/youtube-dl/commit/3ae9c0f410b1d4f63e8bada67dd62a8d2852be32
[youtube:tab] Pass innertube context... https://github.com/ytdl-org/youtube-dl/commit/1b0a13f33cfb3644cc718d35951ea85bb1905459

											
										
										
											4 years ago
+								                # author can be an instance of 'Organization' or 'Person' types.
 								                # both types can have 'name' property(inherited from 'Thing' type). [1]
 								                # however some websites are using 'Text' type instead.
 								                # 1. https://schema.org/VideoObject
 								                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											8 years ago
+								                'filesize': float_or_none(e.get('contentSize')),
 								                'tbr': int_or_none(e.get('bitrate')),
 								                'width': int_or_none(e.get('width')),
 								                'height': int_or_none(e.get('height')),
-												[extractor/common] Extract view count from JSON-LD

											
										
										
											8 years ago
+								                'view_count': int_or_none(e.get('interactionCount')),
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											8 years ago
+								            })
-												[extractor/common] Extract interaction statistic

											
										
										
											7 years ago
+								            extract_interaction_statistic(e)
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											8 years ago
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											8 years ago
+								        for e in json_ld:
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											5 years ago
+								            if '@context' in e:
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											8 years ago
+								                item_type = e.get('@type')
 								                if expected_type is not None and expected_type != item_type:
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											5 years ago
+								                    continue
-												[extractor/common] Improve _json_ld

											
										
										
											7 years ago
+								                if item_type in ('TVEpisode', 'Episode'):
-												[extractor/common] Use episode name as title in _json_ld

											
										
										
											6 years ago
+								                    episode_name = unescapeHTML(e.get('name'))
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											8 years ago
+								                    info.update({
-												[extractor/common] Use episode name as title in _json_ld

											
										
										
											6 years ago
+								                        'episode': episode_name,
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											8 years ago
+								                        'episode_number': int_or_none(e.get('episodeNumber')),
 								                        'description': unescapeHTML(e.get('description')),
 								                    })
-												[extractor/common] Use episode name as title in _json_ld

											
										
										
											6 years ago
+								                    if not info.get('title') and episode_name:
 								                        info['title'] = episode_name
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											8 years ago
+								                    part_of_season = e.get('partOfSeason')
-												[extractor/common] Improve _json_ld

											
										
										
											7 years ago
+								                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
-												[extractor/common] Extract season in _json_ld

											
										
										
											6 years ago
+								                        info.update({
 								                            'season': unescapeHTML(part_of_season.get('name')),
 								                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
 								                        })
-												[common] extract partOfTVSeries info in json-ld

											
										
										
											8 years ago
+								                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
-												[extractor/common] Improve _json_ld

											
										
										
											7 years ago
+								                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											8 years ago
+								                        info['series'] = unescapeHTML(part_of_series.get('name'))
-												[extractor/common] Add support for movies in _json_ld

											
										
										
											6 years ago
+								                elif item_type == 'Movie':
 								                    info.update({
 								                        'title': unescapeHTML(e.get('name')),
 								                        'description': unescapeHTML(e.get('description')),
 								                        'duration': parse_duration(e.get('duration')),
 								                        'timestamp': unified_timestamp(e.get('dateCreated')),
 								                    })
-												[extractor/common] Improve _json_ld for articles

											
										
										
											7 years ago
+								                elif item_type in ('Article', 'NewsArticle'):
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											8 years ago
+								                    info.update({
 								                        'timestamp': parse_iso8601(e.get('datePublished')),
 								                        'title': unescapeHTML(e.get('headline')),
 								                        'description': unescapeHTML(e.get('articleBody')),
 								                    })
 								                elif item_type == 'VideoObject':
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											8 years ago
+								                    extract_video_object(e)
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											5 years ago
+								                    if expected_type is None:
 								                        continue
 								                    else:
 								                        break
-												[extractor/common] Improve _json_ld

											
										
										
											7 years ago
+								                video = e.get('video')
 								                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
 								                    extract_video_object(video)
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											5 years ago
+								                if expected_type is None:
 								                    continue
 								                else:
 								                    break
-												[extractor/common] Add initial support for JSON-LD metadata extraction into info_dict

											
										
										
											9 years ago
+								        return dict((k, v) for k, v in info.items() if v is not None)
-												[extractor/common] Add method for extracting form hidden input fields as dict

											
										
										
											9 years ago
+								    @staticmethod
-												[extractor/common] Improve _form_hidden_inputs and rename to _hidden_inputs

											
										
										
											9 years ago
+								    def _hidden_inputs(html):
-												[extractor/common] Skip html comment tags (Closes #6822)

											
										
										
											9 years ago
+								        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
-												[extractor/common] Improve _hidden_inputs

											
										
										
											9 years ago
+								        hidden_inputs = {}
-												[utils] Improve _hidden_inputs

											
										
										
											8 years ago
+								        for input in re.findall(r'(?i)(<input[^>]+>)', html):
 								            attrs = extract_attributes(input)
 								            if not input:
-												[extractor/common] Improve _hidden_inputs

											
										
										
											9 years ago
+								                continue
-												[utils] Improve _hidden_inputs

											
										
										
											8 years ago
+								            if attrs.get('type') not in ('hidden', 'submit'):
-												[extractor/common] Improve _hidden_inputs

											
										
										
											9 years ago
+								                continue
-												[utils] Improve _hidden_inputs

											
										
										
											8 years ago
+								            name = attrs.get('name') or attrs.get('id')
 								            value = attrs.get('value')
 								            if name and value is not None:
 								                hidden_inputs[name] = value
-												[extractor/common] Improve _hidden_inputs

											
										
										
											9 years ago
+								        return hidden_inputs
-												[extractor/common] Add method for extracting form hidden input fields as dict

											
										
										
											9 years ago
-												[extractor/common] Add _form_hidden_inputs

											
										
										
											9 years ago
+								    def _form_hidden_inputs(self, form_id, html):
 								        form = self._search_regex(
-												[extractor/common] Case insensitive inputs extraction

											
										
										
											9 years ago
+								            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
-												[extractor/common] Add _form_hidden_inputs

											
										
										
											9 years ago
+								            html, '%s form' % form_id, group='form')
 								        return self._hidden_inputs(form)
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								    class FormatSort:
-												Fix some typos and linter

											
										
										
											4 years ago
+								        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
-												Allow `images` formats
Necessary for #343.

* They are identified by `vcodec=acodec='none'`
* These formats show as the worst in `-F`
* Any postprocessor that expects audio/video will be skipped
* `b*` and all related selectors will skip such formats
* This commit also does not add any selector for downloading such formats. They have to be explicitly requested by the `format_id`. Implementation of a selector is left for when #389 is resolved

											
										
										
											3 years ago
+								        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
-												[formatsort] Prefer vp9.2 over other vp9 codecs

vp9.2 may contain HDR while vp9.0 doesn't

											
										
										
											4 years ago
+								                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
-												Option `--compat-options` to revert some of yt-dlp's changes
* Deprecates `--list-formats-as-table`, `--list-formats-old`

											
										
										
											4 years ago
+								                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
-												[FormatSort] Remove priority of `lang`

											
										
										
											3 years ago
+								        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
-												Option `--compat-options` to revert some of yt-dlp's changes
* Deprecates `--list-formats-as-table`, `--list-formats-old`

											
										
										
											4 years ago
+								                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
 								                        'fps', 'fs_approx', 'source', 'format_id')
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
 								        settings = {
 								            'vcodec': {'type': 'ordered', 'regex': True,
-												[formatsort] Prefer vp9.2 over other vp9 codecs

vp9.2 may contain HDR while vp9.0 doesn't

											
										
										
											4 years ago
+								                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            'acodec': {'type': 'ordered', 'regex': True,
 								                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
-												Fix some fields not sorting correctly

bug introduced by: 63be1aab2f6b6a99f289663ffd935e311aff5556

											
										
										
											4 years ago
+								            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
-												[websockets] Add `WebSocketFragmentFD` (#399)
Necessary for #392

Co-authored by: nao20010128nao, pukkandan

											
										
										
											3 years ago
+								                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            'vext': {'type': 'ordered', 'field': 'video_ext',
-												Change defaults

* Enabled --ignore by default
* Disabled --video-multistreams and --audio-multistreams by default
* Changed default format selection to 'bv*+ba/b' when --audio-multistreams is disabled
* Changed default format sort order to 'res,fps,codec,size,br,asr,proto,ext,has_audio,source,format_id'
* Changed default output template to '%(title)s [%(id)s].%(ext)s'
* Enabled `--list-formats-as-table` by default

											
										
										
											4 years ago
+								                     'order': ('mp4', 'webm', 'flv', '', 'none'),
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
 								            'aext': {'type': 'ordered', 'field': 'audio_ext',
 								                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
 								                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
 								            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
-												[FormatSort] Fix bug for audio with unknown codec

											
										
										
											3 years ago
+								            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
-												Allow `images` formats
Necessary for #343.

* They are identified by `vcodec=acodec='none'`
* These formats show as the worst in `-F`
* Any postprocessor that expects audio/video will be skipped
* `b*` and all related selectors will skip such formats
* This commit also does not add any selector for downloading such formats. They have to be explicitly requested by the `format_id`. Implementation of a selector is left for when #389 is resolved

											
										
										
											3 years ago
+								                           'field': ('vcodec', 'acodec'),
 								                           'function': lambda it: int(any(v != 'none' for v in it))},
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								            'ie_pref': {'priority': True, 'type': 'extractor'},
-												Deprecate unnecessary aliases in `formatSort`

(I should never have made so many aliases in the first-place)
The aliases remain functional for backward compatability, but will be left undocumented

											
										
										
											4 years ago
+								            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
 								            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
-												[FormatSort] Remove priority of `lang`

											
										
										
											3 years ago
+								            'lang': {'convert': 'ignore', 'field': 'language_preference'},
-												[FormatSort] Fix for when some formats have quality and others don't

											
										
										
											4 years ago
+								            'quality': {'convert': 'float_none', 'default': -1},
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            'filesize': {'convert': 'bytes'},
-												Fix some fields not sorting correctly

bug introduced by: 63be1aab2f6b6a99f289663ffd935e311aff5556

											
										
										
											4 years ago
+								            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
 								            'id': {'convert': 'string', 'field': 'format_id'},
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            'height': {'convert': 'float_none'},
 								            'width': {'convert': 'float_none'},
 								            'fps': {'convert': 'float_none'},
 								            'tbr': {'convert': 'float_none'},
 								            'vbr': {'convert': 'float_none'},
 								            'abr': {'convert': 'float_none'},
 								            'asr': {'convert': 'float_none'},
-												[formatSort] Fix `quality` being ignored
Closes #172

											
										
										
											4 years ago
+								            'source': {'convert': 'ignore', 'field': 'source_preference'},
-												Deprecate unnecessary aliases in `formatSort`

(I should never have made so many aliases in the first-place)
The aliases remain functional for backward compatability, but will be left undocumented

											
										
										
											4 years ago
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
-												Deprecate unnecessary aliases in `formatSort`

(I should never have made so many aliases in the first-place)
The aliases remain functional for backward compatability, but will be left undocumented

											
										
										
											4 years ago
+								            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
 								            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
 								            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
-												[FormatSort] Fix bug for audio with unknown codec

											
										
										
											3 years ago
+								            'res': {'type': 'multiple', 'field': ('height', 'width'),
-												[cleanup] Refactor some code

											
										
										
											3 years ago
+								                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
-												Deprecate unnecessary aliases in `formatSort`

(I should never have made so many aliases in the first-place)
The aliases remain functional for backward compatability, but will be left undocumented

											
										
										
											4 years ago
 								            # Most of these exist only for compatibility reasons
 								            'dimension': {'type': 'alias', 'field': 'res'},
 								            'resolution': {'type': 'alias', 'field': 'res'},
 								            'extension': {'type': 'alias', 'field': 'ext'},
 								            'bitrate': {'type': 'alias', 'field': 'br'},
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
 								            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
 								            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
 								            'framerate': {'type': 'alias', 'field': 'fps'},
-												Deprecate unnecessary aliases in `formatSort`

(I should never have made so many aliases in the first-place)
The aliases remain functional for backward compatability, but will be left undocumented

											
										
										
											4 years ago
+								            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
 								            'protocol': {'type': 'alias', 'field': 'proto'},
 								            'source_preference': {'type': 'alias', 'field': 'source'},
 								            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
 								            'filesize_estimate': {'type': 'alias', 'field': 'size'},
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            'samplerate': {'type': 'alias', 'field': 'asr'},
 								            'video_ext': {'type': 'alias', 'field': 'vext'},
 								            'audio_ext': {'type': 'alias', 'field': 'aext'},
 								            'video_codec': {'type': 'alias', 'field': 'vcodec'},
 								            'audio_codec': {'type': 'alias', 'field': 'acodec'},
-												Deprecate unnecessary aliases in `formatSort`

(I should never have made so many aliases in the first-place)
The aliases remain functional for backward compatability, but will be left undocumented

											
										
										
											4 years ago
+								            'video': {'type': 'alias', 'field': 'hasvid'},
 								            'has_video': {'type': 'alias', 'field': 'hasvid'},
 								            'audio': {'type': 'alias', 'field': 'hasaud'},
 								            'has_audio': {'type': 'alias', 'field': 'hasaud'},
 								            'extractor': {'type': 'alias', 'field': 'ie_pref'},
 								            'preference': {'type': 'alias', 'field': 'ie_pref'},
 								            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
 								            'format_id': {'type': 'alias', 'field': 'id'},
 								        }
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
 								        _order = []
 								        def _get_field_setting(self, field, key):
 								            if field not in self.settings:
 								                self.settings[field] = {}
 								            propObj = self.settings[field]
 								            if key not in propObj:
 								                type = propObj.get('type')
 								                if key == 'field':
 								                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
 								                elif key == 'convert':
 								                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
-												Add temporary _sort_formats helper function

											
										
										
											11 years ago
+								                else:
-												[FormatSort] Fix bug for audio with unknown codec

											
										
										
											3 years ago
+								                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								                propObj[key] = default
 								            return propObj[key]
 								        def _resolve_field_value(self, field, value, convertNone=False):
 								            if value is None:
 								                if not convertNone:
 								                    return None
-												Add temporary _sort_formats helper function

											
										
										
											11 years ago
+								            else:
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								                value = value.lower()
 								            conversion = self._get_field_setting(field, 'convert')
 								            if conversion == 'ignore':
 								                return None
 								            if conversion == 'string':
 								                return value
 								            elif conversion == 'float_none':
 								                return float_or_none(value)
 								            elif conversion == 'bytes':
 								                return FileDownloader.parse_bytes(value)
 								            elif conversion == 'order':
-												[documentation] Better document `--prefer-free-formats`

Also added `--no-prefer-free-formats`

											
										
										
											4 years ago
+								                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now take priority as a string, not a number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								                use_regex = self._get_field_setting(field, 'regex')
 								                list_length = len(order_list)
 								                empty_pos = order_list.index('') if '' in order_list else list_length + 1
 								                if use_regex and value is not None:
-												[documentation] Better document `--prefer-free-formats`

Also added `--no-prefer-free-formats`

											
										
										
											4 years ago
+								                    for i, regex in enumerate(order_list):
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now take priority as a string, not a number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								                        if regex and re.match(regex, value):
 								                            return list_length - i
 								                    return list_length - empty_pos  # not in list
 								                else:  # not regex or  value = None
 								                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
 								            else:
 								                if value.isnumeric():
 								                    return float(value)
-												Add temporary _sort_formats helper function

											
										
										
											11 years ago
+								                else:
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now take priority as a string, not a number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								                    self.settings[field]['convert'] = 'string'
 								                    return value
 								        def evaluate_params(self, params, sort_extractor):
 								            self._use_free_order = params.get('prefer_free_formats', False)
 								            self._sort_user = params.get('format_sort', [])
 								            self._sort_extractor = sort_extractor
 								            def add_item(field, reverse, closest, limit_text):
 								                field = field.lower()
 								                if field in self._order:
 								                    return
 								                self._order.append(field)
 								                limit = self._resolve_field_value(field, limit_text)
 								                data = {
 								                    'reverse': reverse,
 								                    'closest': False if limit is None else closest,
 								                    'limit_text': limit_text,
 								                    'limit': limit}
 								                if field in self.settings:
 								                    self.settings[field].update(data)
 								                else:
 								                    self.settings[field] = data
 								            sort_list = (
 								                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
 								                + (tuple() if params.get('format_sort_force', False)
 								                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
 								                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
 								            for item in sort_list:
 								                match = re.match(self.regex, item)
 								                if match is None:
 								                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
 								                field = match.group('field')
 								                if field is None:
 								                    continue
 								                if self._get_field_setting(field, 'type') == 'alias':
 								                    field = self._get_field_setting(field, 'field')
 								                reverse = match.group('reverse') is not None
-												Fix some typos and linter

											
										
										
											4 years ago
+								                closest = match.group('separator') == '~'
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								                limit_text = match.group('limit')
 								                has_limit = limit_text is not None
 								                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
 								                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
 								                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
 								                limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
 								                limit_count = len(limits)
 								                for (i, f) in enumerate(fields):
 								                    add_item(f, reverse, closest,
 								                             limits[i] if i < limit_count
 								                             else limits[0] if has_limit and not has_multiple_limits
 								                             else None)
-												Standardize `write_debug`

											
										
										
											4 years ago
+								        def print_verbose_info(self, write_debug):
-												[documentation] Improvements

											
										
										
											4 years ago
+								            if self._sort_user:
-												Standardize `write_debug`

											
										
										
											4 years ago
+								                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            if self._sort_extractor:
-												Standardize `write_debug`

											
										
										
											4 years ago
+								                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
 								            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								                '+' if self._get_field_setting(field, 'reverse') else '', field,
 								                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
 								                              self._get_field_setting(field, 'limit_text'),
 								                              self._get_field_setting(field, 'limit'))
 								                if self._get_field_setting(field, 'limit_text') is not None else '')
 								                for field in self._order if self._get_field_setting(field, 'visible')]))
 								        def _calculate_field_preference_from_value(self, format, field, type, value):
 								            reverse = self._get_field_setting(field, 'reverse')
 								            closest = self._get_field_setting(field, 'closest')
 								            limit = self._get_field_setting(field, 'limit')
 								            if type == 'extractor':
 								                maximum = self._get_field_setting(field, 'max')
 								                if value is None or (maximum is not None and value >= maximum):
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								                    value = -1
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            elif type == 'boolean':
 								                in_list = self._get_field_setting(field, 'in_list')
 								                not_in_list = self._get_field_setting(field, 'not_in_list')
 								                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
 								            elif type == 'ordered':
 								                value = self._resolve_field_value(field, value, True)
 								            # try to convert to number
-												[FormatSort] Fix for when some formats have quality and others don't

											
										
										
											4 years ago
+								            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
 								            if is_num:
 								                value = val_num
 								            return ((-10, 0) if value is None
 								                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
 								                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
 								                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
 								                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
 								                    else (-1, value, 0))
 								        def _calculate_field_preference(self, format, field):
 								            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
 								            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
 								            if type == 'multiple':
 								                type = 'field'  # Only 'field' is allowed in multiple for now
 								                actual_fields = self._get_field_setting(field, 'field')
-												[FormatSort] Fix bug for audio with unknown codec

											
										
										
											3 years ago
+								                value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								            else:
 								                value = get_value(field)
 								            return self._calculate_field_preference_from_value(format, field, type, value)
 								        def calculate_preference(self, format):
 								            # Determine missing protocol
 								            if not format.get('protocol'):
 								                format['protocol'] = determine_protocol(format)
 								            # Determine missing ext
 								            if not format.get('ext') and 'url' in format:
 								                format['ext'] = determine_ext(format['url'])
 								            if format.get('vcodec') == 'none':
-												Allow `images` formats
Necessary for #343.

* They are identified by `vcodec=acodec='none'`
* These formats show as the worst in `-F`
* Any postprocessor that expects audio/video will be skipped
* `b*` and all related selectors will skip such formats
* This commit also does not add any selector for downloading such formats. They have to be explicitly requested by the `format_id`. Implementation of a selector is left for when #389 is resolved

											
										
										
											3 years ago
+								                format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								                format['video_ext'] = 'none'
 								            else:
 								                format['video_ext'] = format['ext']
 								                format['audio_ext'] = 'none'
 								            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
 								            #    format['preference'] = -1000
 								            # Determine missing bitrates
 								            if format.get('tbr') is None:
 								                if format.get('vbr') is not None and format.get('abr') is not None:
 								                    format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
 								            else:
 								                if format.get('vcodec') != "none" and format.get('vbr') is None:
 								                    format['vbr'] = format.get('tbr') - format.get('abr', 0)
 								                if format.get('acodec') != "none" and format.get('abr') is None:
 								                    format['abr'] = format.get('tbr') - format.get('vbr', 0)
 								            return tuple(self._calculate_field_preference(format, field) for field in self._order)
 								    def _sort_formats(self, formats, field_preference=[]):
 								        if not formats:
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											3 years ago
+								            return
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								        format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
 								        format_sort.evaluate_params(self._downloader.params, field_preference)
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if self.get_param('verbose', False):
-												Standardize `write_debug`

											
										
										
											4 years ago
+								            format_sort.print_verbose_info(self._downloader.write_debug)
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											4 years ago
+								        formats.sort(key=lambda f: format_sort.calculate_preference(f))
-												Add support for tou.tv (Fixes #1792)

											
										
										
											11 years ago
-												[common] Generalize URLs' HTTP errors pre-testing

											
										
										
											10 years ago
+								    def _check_formats(self, formats, video_id):
 								        if formats:
 								            formats[:] = filter(
 								                lambda f: self._is_valid_url(
 								                    f['url'], video_id,
 								                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 								                formats)
-												[extractor/common] Add _remove_duplicate_formats

											
										
										
											9 years ago
+								    @staticmethod
 								    def _remove_duplicate_formats(formats):
 								        format_urls = set()
 								        unique_formats = []
 								        for f in formats:
 								            if f['url'] not in format_urls:
 								                format_urls.add(f['url'])
 								                unique_formats.append(f)
 								        formats[:] = unique_formats
-												[infoq] Add audio only format if available (#11565)

* [infoq] Add audio only format if available

Refactor cookie code into a function.
Renamed formats to http_video, http_audio, rtmp_video
Renamed extract functions to video instead of videos as they return
one or no video.

* [infoq] Rename to _extract_cookies as it extracts more than one

* [infoq] Remove redundant determine_ext

* [infoq] Add comment about hardcoded URL

* [infoq] Use _hidden_inputs instead of messy regex

* [infoq] Probe if audio URL is valid

Make it possible to pass headers to _is_valid_url

* [infoq] Add audio only test

											
										
										
											8 years ago
+								    def _is_valid_url(self, url, video_id, item='video', headers={}):
-												[extractor/common] Assume non HTTP(S) URLs valid

											
										
										
											10 years ago
+								        url = self._proto_relative_url(url, scheme='http:')
 								        # For now assume non HTTP(S) URLs always valid
 								        if not (url.startswith('http://') or url.startswith('https://')):
 								            return True
-												[common] Generalize URLs' HTTP errors pre-testing

											
										
										
											10 years ago
+								        try:
-												[infoq] Add audio only format if available (#11565)

* [infoq] Add audio only format if available

Refactor cookie code into a function.
Renamed formats to http_video, http_audio, rtmp_video
Renamed extract functions to video instead of videos as they return
one or no video.

* [infoq] Rename to _extract_cookies as it more than one

* [infoq] Remove redundant determine_ext

* [infoq] Add comment about hardcoded URL

* [infoq] Use _hidden_inputs instead of messy regex

* [infoq] Probe if audio URL is valid

Make it possible to pass headers to _is_valid_url

* [infoq] Add audio only test

											
										
										
											8 years ago
+								            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
-												[common] Generalize URLs' HTTP errors pre-testing

											
										
										
											10 years ago
+								            return True
-												Merge 'ytdl-org/youtube-dl/master' release 2020.11.19

Old Extractors left behind:
	VLivePlaylistIE
	YoutubeSearchURLIE
	YoutubeShowIE
	YoutubeFavouritesIE

If removing old extractors, make corresponding changes in
	docs/supportedsites.md
	youtube_dlc/extractor/extractors.py

Not merged:
	.github/ISSUE_TEMPLATE/1_broken_site.md
	.github/ISSUE_TEMPLATE/2_site_support_request.md
	.github/ISSUE_TEMPLATE/3_site_feature_request.md
	.github/ISSUE_TEMPLATE/4_bug_report.md
	.github/ISSUE_TEMPLATE/5_feature_request.md
	test/test_all_urls.py
	youtube_dlc/version.py
	Changelog

											
										
										
											4 years ago
+								        except ExtractorError as e:
-												[extractor/common] Make _is_valid_url more relaxed

											
										
										
											5 years ago
+								            self.to_screen(
-												Merge 'ytdl-org/youtube-dl/master' release 2020.11.19

Old Extractors left behind:
	VLivePlaylistIE
	YoutubeSearchURLIE
	YoutubeShowIE
	YoutubeFavouritesIE

If removing old extractors, make corresponding changes in
	docs/supportedsites.md
	youtube_dlc/extractor/extractors.py

Not merged:
	.github/ISSUE_TEMPLATE/1_broken_site.md
	.github/ISSUE_TEMPLATE/2_site_support_request.md
	.github/ISSUE_TEMPLATE/3_site_feature_request.md
	.github/ISSUE_TEMPLATE/4_bug_report.md
	.github/ISSUE_TEMPLATE/5_feature_request.md
	test/test_all_urls.py
	youtube_dlc/version.py
	Changelog

											
										
										
											4 years ago
+								                '%s: %s URL is invalid, skipping: %s'
 								                % (video_id, item, error_to_compat_str(e.cause)))
-												[extractor/common] Make _is_valid_url more relaxed

											
										
										
											5 years ago
+								            return False
-												[common] Generalize URLs' HTTP errors pre-testing

											
										
										
											10 years ago
-												[soundcloud/generic] Add support for playlists

											
										
										
											11 years ago
+								    def http_scheme(self):
-												[glide] Simplify

											
										
										
											10 years ago
+								        """ Either "http:" or "https:", depending on the user's preferences """
-												[soundcloud/generic] Add support for playlists

											
										
										
											11 years ago
+								        return (
 								            'http:'
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								            if self.get_param('prefer_insecure', False)
-												[soundcloud/generic] Add support for playlists

											
										
										
											11 years ago
+								            else 'https:')
-												[mixcloud] Shed API dependency (#2904)

											
										
										
											11 years ago
+								    def _proto_relative_url(self, url, scheme=None):
 								        if url is None:
 								            return url
 								        if url.startswith('//'):
 								            if scheme is None:
 								                scheme = self.http_scheme()
 								            return scheme + url
 								        else:
 								            return url
-												[vodlocker] PEP8, generalization, and simplification (#3223)

											
										
										
											10 years ago
+								    def _sleep(self, timeout, video_id, msg_template=None):
 								        if msg_template is None:
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
-												[vodlocker] PEP8, generalization, and simplification (#3223)

											
										
										
											10 years ago
+								        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 								        self.to_screen(msg)
 								        time.sleep(timeout)
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better than ALL qualities of a lower preference, irrespective of ANY sorting order the user requests. See deezer.py for the correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent, so these disparities don't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
-												[extractor/common] Add fatal to _extract_f4m_formats

											
										
										
											9 years ago
+								                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
-												[extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use

											
										
										
											5 years ago
+								                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
-												[extractor/common] _extract_f4m_formats: Use more specific messages when downloading the manifest

											
										
										
											10 years ago
+								        manifest = self._download_xml(
 								            manifest_url, video_id, 'Downloading f4m manifest',
-												[extractor/common] Handle malformed f4m manifests

											
										
										
											9 years ago
+								            'Unable to download f4m manifest',
 								            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
-												Start moving to ytdl-org

											
										
										
											6 years ago
+								            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
-												[extractor/common] Add fatal to _extract_f4m_formats

											
										
										
											9 years ago
+								            transform_source=transform_source,
-												[extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use

											
										
										
											5 years ago
+								            fatal=fatal, data=data, headers=headers, query=query)
-												[extractor/common] Add fatal to _extract_f4m_formats

											
										
										
											9 years ago
 								        if manifest is False:
-												[common] simplify the use of _extract_m3u8_formats and _extract_f4m_formats

											
										
										
											9 years ago
+								            return []
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											10 years ago
-												[extractor/common] Add _parse_f4m_formats routine

											
										
										
											9 years ago
+								        return self._parse_f4m_formats(
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								            manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
-												[common] Fix non-bootstrapped support in f4m

											
										
										
											9 years ago
+								            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
-												[extractor/common] Add _parse_f4m_formats routine

											
										
										
											9 years ago
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better than ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities don't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
-												[extractor/common] Add _parse_f4m_formats routine

											
										
										
											9 years ago
+								                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
-												[common] Fix non-bootstrapped support in f4m

											
										
										
											9 years ago
+								                           fatal=True, m3u8_id=None):
-												Use compat_etree_Element

											
										
										
											6 years ago
+								        if not isinstance(manifest, compat_etree_Element) and not fatal:
-												[extractor/common] Do not fail on invalid data while parsing F4M manifest in non fatal mode

											
										
										
											6 years ago
+								            return []
-												Completely change project name to yt-dlp (#85)

* All modules and binary names are changed
* All documentation references changed
* yt-dlp no longer loads youtube-dlc config files
* All URLs changed to point to organization account

Co-authored-by: Pccode66
Co-authored-by: pukkandan
											
										
										
											4 years ago
+								        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
-												[extractor/common] do not process f4m manifest that contain akamai playerVerificationChallenge

											
										
										
											9 years ago
+								        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
 								        if akamai_pv is not None and ';' in akamai_pv.text:
 								            playerVerificationChallenge = akamai_pv.text.split(';')[0]
 								            if playerVerificationChallenge.strip() != '':
 								                return []
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											10 years ago
+								        formats = []
-												[extractor/common] href attribute added

											
										
										
											10 years ago
+								        manifest_version = '1.0'
-												[extractor/common] Generate better f4m format IDs

											
										
										
											10 years ago
+								        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
-												[extractor/common] Added support for f4m manifest Version 2.0

											
										
										
											10 years ago
+								        if not media_nodes:
-												[extractor/common] href attribute added

											
										
										
											10 years ago
+								            manifest_version = '2.0'
-												[extractor/common] Added support for f4m manifest Version 2.0

											
										
										
											10 years ago
+								            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
-												[extractor/common] Filter out unsupported encrypted media for f4m formats (Closes #8573)

											
										
										
											9 years ago
+								        # Remove unsupported DRM protected media from final formats
-												Start moving to ytdl-org

											
										
										
											6 years ago
+								        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
-												[extractor/common] Filter out unsupported encrypted media for f4m formats (Closes #8573)

											
										
										
											9 years ago
+								        media_nodes = remove_encrypted_media(media_nodes)
 								        if not media_nodes:
 								            return formats
-												[f4m] Prefer baseURL for relative URLs (closes #14660)

											
										
										
											7 years ago
 								        manifest_base_url = get_base_url(manifest)
-												[common] Support non-bootstraped streams in f4m manifests

Related: #9531

											
										
										
											9 years ago
-												[common] Fix <bootstrapInfo> detection in F4M manifests

Regression since 0a5685b26fae0940f14cb063a6e4fc6986f9c124

											
										
										
											9 years ago
+								        bootstrap_info = xpath_element(
-												[common] Support non-bootstraped streams in f4m manifests

Related: #9531

											
										
										
											9 years ago
+								            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
 								            'bootstrap info', default=None)
-												[extractor/common] detect f4m audio only formats

											
										
										
											8 years ago
+								        vcodec = None
 								        mime_type = xpath_text(
 								            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
 								            'base URL', default=None)
 								        if mime_type and mime_type.startswith('audio/'):
 								            vcodec = 'none'
-												[extractor/common] Generate better f4m format IDs

											
										
										
											10 years ago
+								        for i, media_el in enumerate(media_nodes):
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											9 years ago
+								            tbr = int_or_none(media_el.attrib.get('bitrate'))
 								            width = int_or_none(media_el.attrib.get('width'))
 								            height = int_or_none(media_el.attrib.get('height'))
 								            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
-												[common] Fix non-bootstrapped support in f4m

											
										
										
											9 years ago
+								            # If <bootstrapInfo> is present, the specified f4m is a
 								            # stream-level manifest, and only set-level manifests may refer to
 								            # external resources.  See section 11.4 and section 4 of F4M spec
 								            if bootstrap_info is None:
 								                media_url = None
 								                # @href is introduced in 2.0, see section 11.6 of F4M spec
 								                if manifest_version == '2.0':
 								                    media_url = media_el.attrib.get('href')
 								                if media_url is None:
 								                    media_url = media_el.attrib.get('url')
-												[extractor/common] Keep going in some media_url is missing

											
										
										
											9 years ago
+								                if not media_url:
 								                    continue
-												[extractor/common] Properly handle full URLs

											
										
										
											9 years ago
+								                manifest_url = (
 								                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
-												[f4m] Prefer baseURL for relative URLs (closes #14660)

											
										
										
											7 years ago
+								                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
-												[extractor/common] Recursively extract child f4m manifests

											
										
										
											9 years ago
+								                # If media_url is itself a f4m manifest do the recursive extraction
 								                # since bitrates in parent manifest (this one) and media_url manifest
 								                # may differ leading to inability to resolve the format by requested
 								                # bitrate in f4m downloader
-												[common] Support m3u8 in f4m manifests

Related: #9531

											
										
										
											9 years ago
+								                ext = determine_ext(manifest_url)
 								                if ext == 'f4m':
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											9 years ago
+								                    f4m_formats = self._extract_f4m_formats(
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											9 years ago
+								                        transform_source=transform_source, fatal=fatal)
 								                    # Sometimes stream-level manifest contains single media entry that
 								                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
 								                    # At the same time parent's media entry in set-level manifest may
 								                    # contain it. We will copy it from parent in such cases.
 								                    if len(f4m_formats) == 1:
 								                        f = f4m_formats[0]
 								                        f.update({
 								                            'tbr': f.get('tbr') or tbr,
 								                            'width': f.get('width') or width,
 								                            'height': f.get('height') or height,
 								                            'format_id': f.get('format_id') if not tbr else format_id,
-												[extractor/common] detect f4m audio only formats

											
										
										
											8 years ago
+								                            'vcodec': vcodec,
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											9 years ago
+								                        })
 								                    formats.extend(f4m_formats)
-												[extractor/common] Recursively extract child f4m manifests

											
										
										
											9 years ago
+								                    continue
-												[common] Support m3u8 in f4m manifests

Related: #9531

											
										
										
											9 years ago
+								                elif ext == 'm3u8':
 								                    formats.extend(self._extract_m3u8_formats(
 								                        manifest_url, video_id, 'mp4', preference=preference,
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
-												[common] Support m3u8 in f4m manifests

Related: #9531

											
										
										
											9 years ago
+								                    continue
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											10 years ago
+								            formats.append({
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											9 years ago
+								                'format_id': format_id,
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											10 years ago
+								                'url': manifest_url,
-												[extractor/common] Add manifest_url for hls and hds formats

											
										
										
											8 years ago
+								                'manifest_url': manifest_url,
-												[common] Fix <bootstrapInfo> detection in F4M manifests

Regression since 0a5685b26fae0940f14cb063a6e4fc6986f9c124

											
										
										
											9 years ago
+								                'ext': 'flv' if bootstrap_info is not None else None,
-												[extractor/common] Add protocol for f4m formats

											
										
										
											7 years ago
+								                'protocol': 'f4m',
-												[extractor/common] Generate better f4m format IDs

											
										
										
											10 years ago
+								                'tbr': tbr,
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											9 years ago
+								                'width': width,
 								                'height': height,
-												[extractor/common] detect f4m audio only formats

											
										
										
											8 years ago
+								                'vcodec': vcodec,
-												[extractor/common] Prefix f4m/m3u8 entries with identifier

											
										
										
											10 years ago
+								                'preference': preference,
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								                'quality': quality,
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											10 years ago
+								            })
 								        return formats
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better than ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities don't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
-												[common] Add _m3u8_meta_format() template

For extractors who handle m3u8 manifests by themselves. (eg., AnvatoIE)

Part of #9522

											
										
										
											9 years ago
+								        return {
-												[extractor/common] Remove 'm3u8' from quality selection URL

											
										
										
											10 years ago
+								            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
-												[sportdeutschland] add new extractor

											
										
										
											10 years ago
+								            'url': m3u8_url,
 								            'ext': ext,
 								            'protocol': 'm3u8',
-												[common] correctly lower the preference of m3u8 master manifest format

											
										
										
											8 years ago
+								            'preference': preference - 100 if preference else -100,
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								            'quality': quality,
-												[sportdeutschland] add new extractor

											
										
										
											10 years ago
+								            'resolution': 'multiple',
 								            'format_note': 'Quality selection URL',
-												[common] Add _m3u8_meta_format() template

For extractors who handle m3u8 manifests by themselves. (eg., AnvatoIE)

Part of #9522

											
										
										
											9 years ago
+								        }
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								    def _extract_m3u8_formats(self, *args, **kwargs):
 								        fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
 								        if subs:
 								            self.report_warning(bug_reports_message(
 								                "Ignoring subtitle tracks found in the HLS manifest; "
 								                "if any subtitle tracks are missing,"
-												[extractor] Reset non-repeating warnings per video

											
										
										
											3 years ago
+								            ), only_once=True)
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								        return fmts
 								    def _extract_m3u8_formats_and_subtitles(
-												[extractor] Always prefer native hls downloader by default

When the manifest is not downloadable by native downloader, it already is able to detect it and switch to `ffmpeg`. So there doesn't seem to be a reason anymore to use ffmpeg as the preferred downloader

											
										
										
											4 years ago
+								            self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								            preference=None, quality=None, m3u8_id=None, note=None,
 								            errnote=None, fatal=True, live=False, data=None, headers={},
 								            query={}):
-												[extractor/common] Fix m3u8 extraction on failure

											
										
										
											9 years ago
+								        res = self._download_webpage_handle(
-												[extractor/common] Improve m3u8 output

											
										
										
											10 years ago
+								            m3u8_url, video_id,
-												[extractor] Allow `note=False` when extracting manifests

											
										
										
											3 years ago
+								            note='Downloading m3u8 information' if note is None else note,
 								            errnote='Failed to download m3u8 information' if errnote is None else errnote,
-												[extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use

											
										
										
											5 years ago
+								            fatal=fatal, data=data, headers=headers, query=query)
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
-												[extractor/common] Fix m3u8 extraction on failure

											
										
										
											9 years ago
+								        if res is False:
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								            return [], {}
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
-												[extractor/common] Fix m3u8 extraction on failure

											
										
										
											9 years ago
+								        m3u8_doc, urlh = res
-												[extractor/common] get the redirected m3u8_url in _extract_m3u8_formats

											
										
										
											9 years ago
+								        m3u8_url = urlh.geturl()
-												[extractor/common] Clarify rationale on media playlist detection

											
										
										
											9 years ago
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								        return self._parse_m3u8_formats_and_subtitles(
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
+								            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											4 years ago
+								            preference=preference, quality=quality, m3u8_id=m3u8_id,
 								            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
 								            headers=headers, query=query, video_id=video_id)
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								    def _parse_m3u8_formats_and_subtitles(
-												[extractor] Always prefer native hls downloader by default

When the manifest is not downloadable by native downloader, it already is able to detect it and switch to `ffmpeg`. So there doesn't seem to be a reason anymore to use ffmpeg as the preferred downloader

											
										
										
											4 years ago
+								            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								            preference=None, quality=None, m3u8_id=None, live=False, note=None,
 								            errnote=None, fatal=True, data=None, headers={}, query={},
 								            video_id=None):
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											3 years ago
+								        formats, subtitles = [], {}
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
-												[extractor/common] skip m3u8 manifests protected with Adobe Flash Access

											
										
										
											8 years ago
+								        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											3 years ago
+								            return formats, subtitles
-												[extractor/common] skip m3u8 manifests protected with Adobe Flash Access

											
										
										
											8 years ago
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											3 years ago
+								        has_drm = re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											3 years ago
+								        def format_url(url):
 								            return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
 								        if self.get_param('hls_split_discontinuity', False):
 								            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
 								                if not m3u8_doc:
 								                    if not manifest_url:
 								                        return []
 								                    m3u8_doc = self._download_webpage(
 								                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
 								                        note=False, errnote='Failed to download m3u8 playlist information')
 								                    if m3u8_doc is False:
 								                        return []
 								                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
-												[internetvideoarchive] extract all formats

											
										
										
											8 years ago
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											3 years ago
+								        else:
 								            def _extract_m3u8_playlist_indices(*args, **kwargs):
 								                return [None]
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											4 years ago
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
+								        # References:
 								        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
-												Start moving to ytdl-org

											
										
										
											6 years ago
+								        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
 								        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
 								        # We should try extracting formats only from master playlists [1, 4.3.4],
 								        # i.e. playlists that describe available qualities. On the other hand
 								        # media playlists [1, 4.3.3] should be returned as is since they contain
 								        # just the media without qualities renditions.
-												[extractor/common] Clarify rationale on media playlist detection

											
										
										
											9 years ago
+								        # Fortunately, master playlist can be easily distinguished from media
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
+								        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
-												Updated to release 2020.11.21.1

											
										
										
											4 years ago
+								        # master playlist tags MUST NOT appear in a media playlist and vice versa.
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
+								        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
 								        # media playlist and MUST NOT appear in master playlist thus we can
 								        # clearly detect media playlist with this criterion.
-												[extractor/common] Clarify rationale on media playlist detection

											
										
										
											9 years ago
+								        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											3 years ago
+								            formats = [{
 								                'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
 								                'format_index': idx,
 								                'url': m3u8_url,
 								                'ext': ext,
 								                'protocol': entry_protocol,
 								                'preference': preference,
 								                'quality': quality,
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											3 years ago
+								                'has_drm': has_drm,
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											3 years ago
+								            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											4 years ago
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								            return formats, subtitles
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
 								        groups = {}
 								        last_stream_inf = {}
 								        def extract_media(x_media_line):
 								            media = parse_m3u8_attributes(x_media_line)
 								            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
 								            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
 								            if not (media_type and group_id and name):
 								                return
 								            groups.setdefault(group_id, []).append(media)
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
 								            if media_type == 'SUBTITLES':
-												[extractor] Skip subtitles without URI in m3u8 manifests
Closes #339

Authored by: hheimbuerger

											
										
										
											4 years ago
+								                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
 								                # EXT-X-MEDIA tag if the media type is SUBTITLES.
 								                # However, lack of URI has been spotted in the wild.
 								                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
 								                if not media.get('URI'):
 								                    return
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								                url = format_url(media['URI'])
 								                sub_info = {
 								                    'url': url,
 								                    'ext': determine_ext(url),
 								                }
-												[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments

											
										
										
											4 years ago
+								                if sub_info['ext'] == 'm3u8':
 								                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
 								                    # files may contain is WebVTT:
 								                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
 								                    sub_info['ext'] = 'vtt'
 								                    sub_info['protocol'] = 'm3u8_native'
-												[extractor] Allow `note=False` when extracting manifests

											
										
										
											3 years ago
+								                lang = media.get('LANGUAGE') or 'und'
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								                subtitles.setdefault(lang, []).append(sub_info)
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
+								            if media_type not in ('VIDEO', 'AUDIO'):
 								                return
 								            media_url = media.get('URI')
 								            if media_url:
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											4 years ago
+								                manifest_url = format_url(media_url)
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											3 years ago
+								                formats.extend({
 								                    'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
 								                    'format_note': name,
 								                    'format_index': idx,
 								                    'url': manifest_url,
 								                    'manifest_url': m3u8_url,
 								                    'language': media.get('LANGUAGE'),
 								                    'ext': ext,
 								                    'protocol': entry_protocol,
 								                    'preference': preference,
 								                    'quality': quality,
 								                    'vcodec': 'none' if media_type == 'AUDIO' else None,
 								                } for idx in _extract_m3u8_playlist_indices(manifest_url))
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
 								        def build_stream_name():
 								            # Despite specification does not mention NAME attribute for
-												[extractor/common] Rephrase comment

											
										
										
											8 years ago
+								            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
 								            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
-												[test_InfoExtractor] Add m3u8 parsing test for NAME attribute in EXT-X-STREAM-INF tag

											
										
										
											8 years ago
+								            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
+								            stream_name = last_stream_inf.get('NAME')
 								            if stream_name:
 								                return stream_name
 								            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
 								            # from corresponding rendition group
 								            stream_group_id = last_stream_inf.get('VIDEO')
 								            if not stream_group_id:
 								                return
 								            stream_group = groups.get(stream_group_id)
 								            if not stream_group:
 								                return stream_group_id
 								            rendition = stream_group[0]
 								            return rendition.get('NAME') or stream_group_id
-												[extractor/common] fix typo

											
										
										
											6 years ago
+								        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
-												[extractor/common] imporove HLS video only format detection(closes #18923)

											
										
										
											6 years ago
+								        # chance to detect video only formats when EXT-X-STREAM-INF tags
 								        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
 								        for line in m3u8_doc.splitlines():
 								            if line.startswith('#EXT-X-MEDIA:'):
 								                extract_media(line)
-												[sportdeutschland] add new extractor

											
										
										
											10 years ago
+								        for line in m3u8_doc.splitlines():
 								            if line.startswith('#EXT-X-STREAM-INF:'):
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
+								                last_stream_inf = parse_m3u8_attributes(line)
-												[sportdeutschland] add new extractor

											
										
										
											10 years ago
+								            elif line.startswith('#') or not line.strip():
 								                continue
 								            else:
-												[extractor/common] Use float for scaled tbr

											
										
										
											8 years ago
+								                tbr = float_or_none(
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											6 years ago
+								                    last_stream_inf.get('AVERAGE-BANDWIDTH')
 								                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
-												[extractor/common] Add manifest_url for hls and hds formats

											
										
										
											8 years ago
+								                manifest_url = format_url(line.strip())
-												[dailymotion] improve extraction

- extract http formats included in m3u8 manifest
- fix user extraction(closes #3553)(closes #21415)
- add suport for User Authentication(closes #11491)
- fix password protected videos extraction(closes #23176)
- respect age limit option and family filter cookie value(closes #18437)
- handle video url playlist query param
- report alowed countries for geo-restricted videos

											
										
										
											5 years ago
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											3 years ago
+								                for idx in _extract_m3u8_playlist_indices(manifest_url):
 								                    format_id = [m3u8_id, None, idx]
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											4 years ago
+								                    # Bandwidth of live streams may differ over time thus making
 								                    # format_id unpredictable. So it's better to keep provided
 								                    # format_id intact.
 								                    if not live:
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											3 years ago
+								                        stream_name = build_stream_name()
 								                        format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											4 years ago
+								                    f = {
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											3 years ago
+								                        'format_id': '-'.join(map(str, filter(None, format_id))),
 								                        'format_index': idx,
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											4 years ago
+								                        'url': manifest_url,
 								                        'manifest_url': m3u8_url,
 								                        'tbr': tbr,
 								                        'ext': ext,
 								                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
 								                        'protocol': entry_protocol,
 								                        'preference': preference,
 								                        'quality': quality,
 								                    }
 								                    resolution = last_stream_inf.get('RESOLUTION')
 								                    if resolution:
 								                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
 								                        if mobj:
 								                            f['width'] = int(mobj.group('width'))
 								                            f['height'] = int(mobj.group('height'))
 								                    # Unified Streaming Platform
 								                    mobj = re.search(
 								                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
 								                    if mobj:
 								                        abr, vbr = mobj.groups()
 								                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
 								                        f.update({
 								                            'vbr': vbr,
 								                            'abr': abr,
 								                        })
 								                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
 								                    f.update(codecs)
 								                    audio_group_id = last_stream_inf.get('AUDIO')
 								                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
 								                    # references a rendition group MUST have a CODECS attribute.
 								                    # However, this is not always respected, for example, [2]
 								                    # contains EXT-X-STREAM-INF tag which references AUDIO
 								                    # rendition group but does not have CODECS and despite
 								                    # referencing an audio group it represents a complete
 								                    # (with audio and video) format. So, for such cases we will
 								                    # ignore references to rendition groups and treat them
 								                    # as complete formats.
 								                    if audio_group_id and codecs and f.get('vcodec') != 'none':
 								                        audio_group = groups.get(audio_group_id)
 								                        if audio_group and audio_group[0].get('URI'):
 								                            # TODO: update acodec for audio only formats with
 								                            # the same GROUP-ID
 								                            f['acodec'] = 'none'
-												Fix some videos downloading with m3u8 extension

											
										
										
											4 years ago
+								                    if not f.get('ext'):
 								                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											4 years ago
+								                    formats.append(f)
 								                    # for DailyMotion
 								                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
 								                    if progressive_uri:
 								                        http_f = f.copy()
 								                        del http_f['manifest_url']
 								                        http_f.update({
 								                            'format_id': f['format_id'].replace('hls-', 'http-'),
 								                            'protocol': 'http',
 								                            'url': progressive_uri,
 								                        })
 								                        formats.append(http_f)
-												[dailymotion] improve extraction

- extract http formats included in m3u8 manifest
- fix user extraction(closes #3553)(closes #21415)
- add suport for User Authentication(closes #11491)
- fix password protected videos extraction(closes #23176)
- respect age limit option and family filter cookie value(closes #18437)
- handle video url playlist query param
- report alowed countries for geo-restricted videos

											
										
										
											5 years ago
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											8 years ago
+								                last_stream_inf = {}
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											8 years ago
+								        return formats, subtitles
-												[sportdeutschland] add new extractor

											
										
										
											10 years ago
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								    @staticmethod
 								    def _xpath_ns(path, namespace=None):
 								        if not namespace:
 								            return path
 								        out = []
 								        for c in path.split('/'):
 								            if not c or c == '.':
 								                out.append(c)
 								            else:
 								                out.append('{%s}%s' % (namespace, c))
 								        return '/'.join(out)
-												[generic] Extract previously missed subtitles (#515)

* [generic] Extract subtitles in cases missed previously
* [common] Detect discarded subtitles in SMIL manifests
* [generic] Extract everything in the SMIL manifest

Authored by: fstirlitz 
											
										
										
											3 years ago
+								    def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
-												[extractor/common] add transform_source to _download_smil and _extract_smil_formats

											
										
										
											9 years ago
+								        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
-												[nerdist] Add new extractor (Fixes #4851)

											
										
										
											10 years ago
+								        if smil is False:
 								            assert not fatal
 								            return []
-												[extractor/common] Add generic SMIL formats extraction routine

											
										
										
											10 years ago
-												[extractor/common] Extract namespace parse routine

											
										
										
											9 years ago
+								        namespace = self._parse_smil_namespace(smil)
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
-												[generic] Extract previously missed subtitles (#515)

* [generic] Extract subtitles in cases missed previously
* [common] Detect discarded subtitles in SMIL manifests
* [generic] Extract everything in the SMIL manifest

Authored by: fstirlitz 
											
										
										
											3 years ago
+								        fmts = self._parse_smil_formats(
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
-												[generic] Extract previously missed subtitles (#515)

* [generic] Extract subtitles in cases missed previously
* [common] Detect discarded subtitles in SMIL manifests
* [generic] Extract everything in the SMIL manifest

Authored by: fstirlitz 
											
										
										
											3 years ago
+								        subs = self._parse_smil_subtitles(
 								            smil, namespace=namespace)
 								        return fmts, subs
 								    def _extract_smil_formats(self, *args, **kwargs):
 								        fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
 								        if subs:
 								            self.report_warning(bug_reports_message(
 								                "Ignoring subtitle tracks found in the SMIL manifest; "
 								                "if any subtitle tracks are missing,"
-												[extractor] Reset non-repeating warnings per video

											
										
										
											3 years ago
+								            ), only_once=True)
-												[generic] Extract previously missed subtitles (#515)

* [generic] Extract subtitles in cases missed previously
* [common] Detect discarded subtitles in SMIL manifests
* [generic] Extract everything in the SMIL manifest

Authored by: fstirlitz 
											
										
										
											3 years ago
+								        return fmts
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
 								    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
 								        smil = self._download_smil(smil_url, video_id, fatal=fatal)
 								        if smil is False:
 								            return {}
 								        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
-												[extractor/common] add transform_source to _download_smil and _extract_smil_formats

											
										
										
											9 years ago
+								    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								        return self._download_xml(
 								            smil_url, video_id, 'Downloading SMIL file',
-												[extractor/common] add transform_source to _download_smil and _extract_smil_formats

											
										
										
											9 years ago
+								            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
 								    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
 								        """Build an info dict from a parsed SMIL document.

 								        Formats and subtitles come from _parse_smil_formats and
 								        _parse_smil_subtitles. The returned 'id' is the basename of smil_url
 								        without its extension; the video_id argument is only forwarded to
 								        the format parser. Title, description and upload date are taken from
 								        the first matching ./head/meta element that has both a name and a
 								        content attribute; the title falls back to the id.
 								        """
 								        ns = self._parse_smil_namespace(smil)

 								        formats = self._parse_smil_formats(
 								            smil, smil_url, video_id, namespace=ns, f4m_params=f4m_params)
 								        subtitles = self._parse_smil_subtitles(smil, namespace=ns)

 								        # From here on the id is derived from the manifest URL itself
 								        video_id = os.path.splitext(url_basename(smil_url))[0]

 								        title = description = upload_date = None
 								        for meta in smil.findall(self._xpath_ns('./head/meta', ns)):
 								            name = meta.attrib.get('name')
 								            content = meta.attrib.get('content')
 								            if not (name and content):
 								                continue
 								            # The first occurrence of each field wins
 								            if name == 'title':
 								                title = title or content
 								            elif name in ('description', 'abstract'):
 								                description = description or content
 								            elif name == 'date':
 								                if not upload_date:
 								                    upload_date = unified_strdate(content)

 								        thumbnails = []
 								        for image in smil.findall(self._xpath_ns('.//image', ns)):
 								            if not image.get('src'):
 								                continue
 								            thumbnails.append({
 								                'id': image.get('type'),
 								                'url': image.get('src'),
 								                'width': int_or_none(image.get('width')),
 								                'height': int_or_none(image.get('height')),
 								            })

 								        return {
 								            'id': video_id,
 								            'title': title or video_id,
 								            'description': description,
 								            'upload_date': upload_date,
 								            'thumbnails': thumbnails,
 								            'formats': formats,
 								            'subtitles': subtitles,
 								        }
-												[extractor/common] Extract namespace parse routine

											
										
										
											9 years ago
+								    def _parse_smil_namespace(self, smil):
 								        return self._search_regex(
 								            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
-												[theplatform] Use InfoExtractor._parse_smil_formats()

											
										
										
											9 years ago
+								    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								        base = smil_url
 								        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
 								            b = meta.get('base') or meta.get('httpBase')
 								            if b:
 								                base = b
 								                break
-												[extractor/common] Add generic SMIL formats extraction routine

											
										
										
											10 years ago
 								        formats = []
 								        rtmp_count = 0
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								        http_count = 0
-												[extractor/common] detect media playlist in _extract_m3u8_formats

											
										
										
											9 years ago
+								        m3u8_count = 0
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
-												[extractor/common] remove duplicate rtmp formats in smil manifest

											
										
										
											9 years ago
+								        srcs = []
-												[common] Extract audio formats in SMIL

Found in http://www.cbc.ca/player/play/2657631896

Closes #5156

											
										
										
											9 years ago
+								        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
 								        for medium in media:
 								            src = medium.get('src')
-												[extractor/common] remove duplicate rtmp formats in smil manifest

											
										
										
											9 years ago
+								            if not src or src in srcs:
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								                continue
-												[extractor/common] remove duplicate rtmp formats in smil manifest

											
										
										
											9 years ago
+								            srcs.append(src)
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
-												[common] Extract audio formats in SMIL

Found in http://www.cbc.ca/player/play/2657631896

Closes #5156

											
										
										
											9 years ago
+								            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
 								            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
 								            width = int_or_none(medium.get('width'))
 								            height = int_or_none(medium.get('height'))
 								            proto = medium.get('proto')
 								            ext = medium.get('ext')
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								            src_ext = determine_ext(src)
-												[common] Extract audio formats in SMIL

Found in http://www.cbc.ca/player/play/2657631896

Closes #5156

											
										
										
											9 years ago
+								            streamer = medium.get('streamer') or base
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
 								            if proto == 'rtmp' or streamer.startswith('rtmp'):
 								                rtmp_count += 1
 								                formats.append({
 								                    'url': streamer,
 								                    'play_path': src,
 								                    'ext': 'flv',
 								                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
 								                    'tbr': bitrate,
 								                    'filesize': filesize,
 								                    'width': width,
 								                    'height': height,
 								                })
-												[theplatform] Use InfoExtractor._parse_smil_formats()

											
										
										
											9 years ago
+								                if transform_rtmp_url:
 								                    streamer, src = transform_rtmp_url(streamer, src)
 								                    formats[-1].update({
 								                        'url': streamer,
 								                        'play_path': src,
 								                    })
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								                continue
 								            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
-												[extractor/common] strip http urls in smil manifest

											
										
										
											9 years ago
+								            src_url = src_url.strip()
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
 								            if proto == 'm3u8' or src_ext == 'm3u8':
-												[extractor/common] detect media playlist in _extract_m3u8_formats

											
										
										
											9 years ago
+								                m3u8_formats = self._extract_m3u8_formats(
 								                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
 								                if len(m3u8_formats) == 1:
 								                    m3u8_count += 1
 								                    m3u8_formats[0].update({
 								                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
 								                        'tbr': bitrate,
 								                        'width': width,
 								                        'height': height,
 								                    })
 								                formats.extend(m3u8_formats)
-												[extractor/common] add support for DASH and MSS formats extraction in SMIL manifests

											
										
										
											6 years ago
+								            elif src_ext == 'f4m':
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								                f4m_url = src_url
 								                if not f4m_params:
 								                    f4m_params = {
 								                        'hdcore': '3.2.0',
 								                        'plugin': 'flowplayer-3.2.0.1',
 								                    }
 								                f4m_url += '&' if '?' in f4m_url else '?'
-												[compat] Add compat_urllib_parse_urlencode and eliminate encode_dict

encode_dict functionality has been improved and moved directly into compat_urllib_parse_urlencode
All occurrences of compat_urllib_parse.urlencode throughout the codebase have been replaced by compat_urllib_parse_urlencode

Closes #8974

											
										
										
											9 years ago
+								                f4m_url += compat_urllib_parse_urlencode(f4m_params)
-												Simplify formats accumulation for f4m/m3u8/smil formats

Now all _extract_*_formats routines return a list

											
										
										
											9 years ago
+								                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
-												[extractor/common] add support for DASH and MSS formats extraction in SMIL manifests

											
										
										
											6 years ago
+								            elif src_ext == 'mpd':
 								                formats.extend(self._extract_mpd_formats(
 								                    src_url, video_id, mpd_id='dash', fatal=False))
 								            elif re.search(r'\.ism/[Mm]anifest', src_url):
 								                formats.extend(self._extract_ism_formats(
 								                    src_url, video_id, ism_id='mss', fatal=False))
 								            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								                http_count += 1
 								                formats.append({
 								                    'url': src_url,
 								                    'ext': ext or src_ext or 'flv',
 								                    'format_id': 'http-%d' % (bitrate or http_count),
 								                    'tbr': bitrate,
 								                    'filesize': filesize,
 								                    'width': width,
 								                    'height': height,
 								                })
-												[extractor/common] Extract the first of a seq of videos in a .smil file

											
										
										
											10 years ago
-												[extractor/common] Add generic SMIL formats extraction routine

											
										
										
											10 years ago
+								        return formats
-												[extractor/common] Add default subtitles lang

											
										
										
											9 years ago
+								    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
-												[extractor/common] remove duplicated formats and subtiles in smil manifests

											
										
										
											9 years ago
+								        urls = []
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								        subtitles = {}
 								        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
 								            src = textstream.get('src')
-												[extractor/common] remove duplicated formats and subtiles in smil manifests

											
										
										
											9 years ago
+								            if not src or src in urls:
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								                continue
-												[extractor/common] remove duplicated formats and subtiles in smil manifests

											
										
										
											9 years ago
+								            urls.append(src)
-												[common] prefer using mime type over ext for smil subtitle extraction

the subtitle ext for http://www.cnet.com/videos/download-amazon-prime-movies-and-tv/
is adb_xml while using the mime type it get tt(application/smptett+xml)

											
										
										
											9 years ago
+								            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
-												[common] _parse_smil_subtitles: accept `lang` as the subtitle language

											
										
										
											9 years ago
+								            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											9 years ago
+								            subtitles.setdefault(lang, []).append({
 								                'url': src,
 								                'ext': ext,
 								            })
 								        return subtitles
-												[extractor/common] Extract the first of a seq of videos in a .smil file

											
										
										
											10 years ago
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											7 years ago
+								    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
-												[extractor/common] Extract _parse_xspf

											
										
										
											9 years ago
+								        xspf = self._download_xml(
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											7 years ago
+								            xspf_url, playlist_id, 'Downloading xpsf playlist',
-												[extractor/common] Extract _parse_xspf

											
										
										
											9 years ago
+								            'Unable to download xspf manifest', fatal=fatal)
 								        if xspf is False:
 								            return []
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											7 years ago
+								        return self._parse_xspf(
 								            xspf, playlist_id, xspf_url=xspf_url,
 								            xspf_base_url=base_url(xspf_url))
-												[extractor/generic] Add generic support for xspf playist extraction

											
										
										
											9 years ago
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											7 years ago
+								    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
-												[extractor/generic] Add generic support for xspf playist extraction

											
										
										
											9 years ago
+								        NS_MAP = {
 								            'xspf': 'http://xspf.org/ns/0/',
 								            's1': 'http://static.streamone.nl/player/ns/0',
 								        }
 								        entries = []
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											7 years ago
+								        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
-												[extractor/generic] Add generic support for xspf playist extraction

											
										
										
											9 years ago
+								            title = xpath_text(
-												[extractor/common] Use playlist id as default title

											
										
										
											9 years ago
+								                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
-												[extractor/generic] Add generic support for xspf playist extraction

											
										
										
											9 years ago
+								            description = xpath_text(
 								                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
 								            thumbnail = xpath_text(
 								                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
 								            duration = float_or_none(
 								                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											7 years ago
+								            formats = []
 								            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
 								                format_url = urljoin(xspf_base_url, location.text)
 								                if not format_url:
 								                    continue
 								                formats.append({
 								                    'url': format_url,
 								                    'manifest_url': xspf_url,
 								                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
 								                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
 								                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
 								                })
-												[extractor/generic] Add generic support for xspf playist extraction

											
										
										
											9 years ago
+								            self._sort_formats(formats)
 								            entries.append({
 								                'id': playlist_id,
 								                'title': title,
 								                'description': description,
 								                'thumbnail': thumbnail,
 								                'duration': duration,
 								                'formats': formats,
 								            })
 								        return entries
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											4 years ago
+								    def _extract_mpd_formats(self, *args, **kwargs):
 								        fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
 								        if subs:
 								            self.report_warning(bug_reports_message(
 								                "Ignoring subtitle tracks found in the DASH manifest; "
 								                "if any subtitle tracks are missing,"
-												[extractor] Reset non-repeating warnings per video

											
										
										
											3 years ago
+								            ), only_once=True)
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											4 years ago
+								        return fmts
 								    def _extract_mpd_formats_and_subtitles(
 								            self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
 								            fatal=True, data=None, headers={}, query={}):
 								        """Download an MPD manifest and return the (formats, subtitles) parsed from it.

 								        note and errnote are replaced by default messages only when they are
 								        None, so other values are forwarded as given. ([], {}) is returned
 								        when _download_xml_handle yields False or a document that is None.
 								        The base URL handed to the parser is derived from the URL reported
 								        by the response handle, not from mpd_url.
 								        """
 								        if note is None:
 								            note = 'Downloading MPD manifest'
 								        if errnote is None:
 								            errnote = 'Failed to download MPD manifest'

 								        res = self._download_xml_handle(
 								            mpd_url, video_id, note=note, errnote=errnote,
 								            fatal=fatal, data=data, headers=headers, query=query)
 								        if res is False:
 								            return [], {}

 								        mpd_doc, urlh = res
 								        if mpd_doc is None:
 								            return [], {}

 								        final_url = urlh.geturl()
 								        return self._parse_mpd_formats_and_subtitles(
 								            mpd_doc, mpd_id, base_url(final_url), mpd_url)
-												[common] Add _extract_dash_manifest_formats

											
										
										
											9 years ago
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											4 years ago
+								    def _parse_mpd_formats(self, *args, **kwargs):
 								        fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
 								        if subs:
 								            self.report_warning(bug_reports_message(
 								                "Ignoring subtitle tracks found in the DASH manifest; "
 								                "if any subtitle tracks are missing,"
-												[extractor] Reset non-repeating warnings per video

											
										
										
											3 years ago
+								            ), only_once=True)
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											4 years ago
+								        return fmts
 								    def _parse_mpd_formats_and_subtitles(
 								            self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
-												[extractor/common] Add support for $ in SegmentTemplate in MPD manifests

											
										
										
											8 years ago
+								        """
 								        Parse formats from MPD manifest.
 								        References:
 . MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
 								            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
 . https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
 								        """
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if not self.get_param('dynamic_mpd', True):
-												[Core] hls manifests, dynamic mpd


											
										
										
											4 years ago
+								            if mpd_doc.get('type') == 'dynamic':
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											4 years ago
+								                return [], {}
-												[common] Add _extract_dash_manifest_formats

											
										
										
											9 years ago
-												rename _parse_mpd to _parse_mpd_formats and add default value for mpd namespace

											
										
										
											9 years ago
+								        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
-												[common] remove duplicate reference to namespace

											
										
										
											9 years ago
 								        def _add_ns(path):
 								            return self._xpath_ns(path, namespace)
-												[common] skip drm protected dash formats

											
										
										
											9 years ago
+								        def is_drm_protected(element):
 								            return element.find(_add_ns('ContentProtection')) is not None
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								        def extract_multisegment_info(element, ms_parent_info):
 								            ms_info = ms_parent_info.copy()
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											8 years ago
 								            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
 								            # common attributes and elements.  We will only extract relevant
 								            # for us.
 								            def extract_common(source):
 								                segment_timeline = source.find(_add_ns('SegmentTimeline'))
 								                if segment_timeline is not None:
 								                    s_e = segment_timeline.findall(_add_ns('S'))
 								                    if s_e:
 								                        ms_info['total_number'] = 0
 								                        ms_info['s'] = []
 								                        for s in s_e:
 								                            r = int(s.get('r', 0))
 								                            ms_info['total_number'] += 1 + r
 								                            ms_info['s'].append({
 								                                't': int(s.get('t', 0)),
 								                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
 								                                'd': int(s.attrib['d']),
 								                                'r': r,
 								                            })
 								                start_number = source.get('startNumber')
 								                if start_number:
 								                    ms_info['start_number'] = int(start_number)
 								                timescale = source.get('timescale')
 								                if timescale:
 								                    ms_info['timescale'] = int(timescale)
 								                segment_duration = source.get('duration')
 								                if segment_duration:
-												[extractor/common] Add support for float durations in _parse_mpd_formats (closes #13919)

											
										
										
											7 years ago
+								                    ms_info['segment_duration'] = float(segment_duration)
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											8 years ago
 								            def extract_Initialization(source):
 								                initialization = source.find(_add_ns('Initialization'))
 								                if initialization is not None:
 								                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
-												[common] remove duplicate reference to namespace

											
										
										
											9 years ago
+								            segment_list = element.find(_add_ns('SegmentList'))
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								            if segment_list is not None:
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											8 years ago
+								                extract_common(segment_list)
 								                extract_Initialization(segment_list)
-												[common] remove duplicate reference to namespace

											
										
										
											9 years ago
+								                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								                if segment_urls_e:
 								                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
 								            else:
-												[common] remove duplicate reference to namespace

											
										
										
											9 years ago
+								                segment_template = element.find(_add_ns('SegmentTemplate'))
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								                if segment_template is not None:
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											8 years ago
+								                    extract_common(segment_template)
-												[extractor/common] Fix initialization template (closes #11605, closes #11825)

											
										
										
											8 years ago
+								                    media = segment_template.get('media')
 								                    if media:
 								                        ms_info['media'] = media
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								                    initialization = segment_template.get('initialization')
 								                    if initialization:
-												[extractor/common] Fix initialization template (closes #11605, closes #11825)

											
										
										
											8 years ago
+								                        ms_info['initialization'] = initialization
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								                    else:
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											8 years ago
+								                        extract_Initialization(segment_template)
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								            return ms_info
-												[common] Modify _parse_dash_manifest for use in Facebook

											
										
										
											9 years ago
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
-												[downloader/ffmpeg] Support for DASH manifests (experimental)
Closes #159

											
										
										
											3 years ago
+								        formats, subtitles = [], {}
 								        stream_numbers = {'audio': 0, 'video': 0}
-												[common] remove duplicate reference to namespace

											
										
										
											9 years ago
+								        for period in mpd_doc.findall(_add_ns('Period')):
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								            period_duration = parse_duration(period.get('duration')) or mpd_duration
 								            period_ms_info = extract_multisegment_info(period, {
 								                'start_number': 1,
 								                'timescale': 1,
 								            })
-												[common] remove duplicate reference to namespace

											
										
										
											9 years ago
+								            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
-												[common] remove duplicate reference to namespace

											
										
										
											9 years ago
+								                for representation in adaptation_set.findall(_add_ns('Representation')):
-												[common] add a generic support for mpd manifests

											
										
										
											9 years ago
+								                    representation_attrib = adaptation_set.attrib.copy()
 								                    representation_attrib.update(representation.attrib)
-												[extractor/common] Add support for $ in SegmentTemplate in MPD manifests

											
										
										
											8 years ago
+								                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
-												[common] Use mimeType to determine file extensions (#8766)

											
										
										
											9 years ago
+								                    mime_type = representation_attrib['mimeType']
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											4 years ago
+								                    content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											3 years ago
+								                    codecs = representation_attrib.get('codecs', '')
 								                    if content_type not in ('video', 'audio', 'text'):
 								                        if mime_type == 'image/jpeg':
-												minor bugfixes
bugs due to be2fc5b212338d89d9c139cb463f785e797d1ad3, e9f4ccd19eb92621970b518fb5984b8aef52bdc8

											
										
										
											3 years ago
+								                            content_type = mime_type
 								                        elif codecs.split('.')[0] == 'stpp':
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											3 years ago
+								                            content_type = 'text'
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											4 years ago
+								                        else:
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											3 years ago
+								                            self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
 								                            continue
 								                    base_url = ''
 								                    for element in (representation, adaptation_set, period, mpd_doc):
 								                        base_url_e = element.find(_add_ns('BaseURL'))
 								                        if base_url_e is not None:
 								                            base_url = base_url_e.text + base_url
 								                            if re.match(r'^https?://', base_url):
 								                                break
-												[extractor] Fix root-relative URLs in MPD (#1006)

Authored by: DigitalDJ
											
										
										
											3 years ago
+								                    if mpd_base_url and base_url.startswith('/'):
 								                        base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
 								                    elif mpd_base_url and not re.match(r'^https?://', base_url):
 								                        if not mpd_base_url.endswith('/'):
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											3 years ago
+								                            mpd_base_url += '/'
 								                        base_url = mpd_base_url + base_url
 								                    representation_id = representation_attrib.get('id')
 								                    lang = representation_attrib.get('lang')
 								                    url_el = representation.find(_add_ns('BaseURL'))
 								                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
 								                    bandwidth = int_or_none(representation_attrib.get('bandwidth'))
 								                    if representation_id is not None:
 								                        format_id = representation_id
 								                    else:
 								                        format_id = content_type
 								                    if mpd_id:
 								                        format_id = mpd_id + '-' + format_id
 								                    if content_type in ('video', 'audio'):
 								                        f = {
 								                            'format_id': format_id,
 								                            'manifest_url': mpd_url,
 								                            'ext': mimetype2ext(mime_type),
 								                            'width': int_or_none(representation_attrib.get('width')),
 								                            'height': int_or_none(representation_attrib.get('height')),
 								                            'tbr': float_or_none(bandwidth, 1000),
 								                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
 								                            'fps': int_or_none(representation_attrib.get('frameRate')),
 								                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
 								                            'format_note': 'DASH %s' % content_type,
 								                            'filesize': filesize,
 								                            'container': mimetype2ext(mime_type) + '_dash',
-												[downloader/ffmpeg] Support for DASH manifests (experimental)
Closes #159

											
										
										
											3 years ago
+								                            'manifest_stream_number': stream_numbers[content_type]
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											3 years ago
+								                        }
 								                        f.update(parse_codecs(codecs))
-												[downloader/ffmpeg] Support for DASH manifests (experimental)
Closes #159

											
										
										
											3 years ago
+								                        stream_numbers[content_type] += 1
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											3 years ago
+								                    elif content_type == 'text':
 								                        f = {
 								                            'ext': mimetype2ext(mime_type),
 								                            'manifest_url': mpd_url,
 								                            'filesize': filesize,
 								                        }
 								                    elif content_type == 'image/jpeg':
 								                        # See test case in VikiIE
 								                        # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
 								                        f = {
 								                            'format_id': format_id,
 								                            'ext': 'mhtml',
 								                            'manifest_url': mpd_url,
 								                            'format_note': 'DASH storyboards (jpeg)',
 								                            'acodec': 'none',
 								                            'vcodec': 'none',
 								                        }
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											3 years ago
+								                    if is_drm_protected(adaptation_set) or is_drm_protected(representation):
 								                        f['has_drm'] = True
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											3 years ago
+								                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
 								                    def prepare_template(template_name, identifiers):
 								                        """Turn a DASH segment template into a %-operator format string.

 								                        template_name is the key of representation_ms_info holding the
 								                        raw template ('initialization' or 'media'); identifiers is a
 								                        sequence of $...$ identifier names (e.g. 'Number', 'Bandwidth',
 								                        'Time') to translate into %(name)... placeholders.
 								                        Returns the string to be used as `result % {name: value, ...}`.
 								                        """
 								                        tmpl = representation_ms_info[template_name]
 								                        # $RepresentationID$ is substituted up front so that any %
 								                        # character contained in the id is escaped by the loop below
 								                        # instead of being misread as a conversion specifier later on
 								                        if representation_id is not None:
 								                            tmpl = tmpl.replace('$RepresentationID$', representation_id)
 								                        # First of, % characters outside $...$ templates
 								                        # must be escaped by doubling for proper processing
 								                        # by % operator string formatting used further (see
 								                        # https://github.com/ytdl-org/youtube-dl/issues/16867).
 								                        escaped = []
 								                        in_template = False
 								                        for c in tmpl:
 								                            escaped.append(c)
 								                            if c == '$':
 								                                in_template = not in_template
 								                            elif c == '%' and not in_template:
 								                                escaped.append(c)
 								                        t = ''.join(escaped)
 								                        # Next, $...$ templates are translated to their
 								                        # %(...) counterparts to be used with % operator
 								                        names = '|'.join(identifiers)
 								                        t = re.sub(r'\$(%s)\$' % names, r'%(\1)d', t)
 								                        t = re.sub(r'\$(%s)%%([^$]+)\$' % names, r'%(\1)\2', t)
 								                        # Finally, $$ is the escape sequence for a literal $.
 								                        # str.replace returns a new string, so its result must be
 								                        # the value returned (it used to be silently discarded)
 								                        return t.replace('$$', '$')
 								                    # @initialization is a regular template like @media one
 								                    # so it should be handled just the same way (see
 								                    # https://github.com/ytdl-org/youtube-dl/issues/11605)
 								                    if 'initialization' in representation_ms_info:
 								                        initialization_template = prepare_template(
 								                            'initialization',
 								                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
 								                            # $Time$ shall not be included for @initialization thus
 								                            # only $Bandwidth$ remains
 								                            ('Bandwidth', ))
 								                        representation_ms_info['initialization_url'] = initialization_template % {
 								                            'Bandwidth': bandwidth,
 								                        }
 								                    def location_key(location):
 								                        # Absolute http(s) locations are stored under 'url', anything
 								                        # else is stored under 'path'
 								                        if re.match(r'^https?://', location):
 								                            return 'url'
 								                        return 'path'
 								                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
 								                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
 								                        media_location_key = location_key(media_template)
 								                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
 								                        # can't be used at the same time
 								                        if '%(Number' in media_template and 's' not in representation_ms_info:
 								                            segment_duration = None
 								                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
 								                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
 								                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
 								                            representation_ms_info['fragments'] = [{
 								                                media_location_key: media_template % {
 								                                    'Number': segment_number,
 								                                    'Bandwidth': bandwidth,
 								                                },
 								                                'duration': segment_duration,
 								                            } for segment_number in range(
 								                                representation_ms_info['start_number'],
 								                                representation_ms_info['total_number'] + representation_ms_info['start_number'])]
 								                        else:
 								                            # $Number*$ or $Time$ in media template with S list available
 								                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
 								                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
 								                            representation_ms_info['fragments'] = []
 								                            segment_time = 0
 								                            segment_d = None
 								                            segment_number = representation_ms_info['start_number']
 								                            def add_segment_url():
 								                                # Appends one fragment built from the enclosing scope's
 								                                # current segment_time / segment_number / segment_d values
 								                                template_values = {
 								                                    'Time': segment_time,
 								                                    'Bandwidth': bandwidth,
 								                                    'Number': segment_number,
 								                                }
 								                                fragment = {media_location_key: media_template % template_values}
 								                                fragment['duration'] = float_or_none(
 								                                    segment_d, representation_ms_info['timescale'])
 								                                representation_ms_info['fragments'].append(fragment)
 								                            for num, s in enumerate(representation_ms_info['s']):
 								                                segment_time = s.get('t') or segment_time
 								                                segment_d = s['d']
 								                                add_segment_url()
 								                                segment_number += 1
 								                                for r in range(s.get('r', 0)):
 								                                    segment_time += segment_d
-												[extractor/common] Add support for $ in SegmentTemplate in MPD manifests

											
										
										
											8 years ago
+								                                    add_segment_url()
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											8 years ago
+								                                    segment_number += 1
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											3 years ago
+								                                segment_time += segment_d
 								                    elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
 								                        # No media template
 								                        # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
 								                        # or any YouTube dashsegments video
 								                        fragments = []
 								                        segment_index = 0
 								                        timescale = representation_ms_info['timescale']
 								                        for s in representation_ms_info['s']:
 								                            duration = float_or_none(s['d'], timescale)
 								                            for r in range(s.get('r', 0) + 1):
 								                                segment_uri = representation_ms_info['segment_urls'][segment_index]
 								                                fragments.append({
 								                                    location_key(segment_uri): segment_uri,
 								                                    'duration': duration,
 								                                })
 								                                segment_index += 1
 								                        representation_ms_info['fragments'] = fragments
 								                    elif 'segment_urls' in representation_ms_info:
 								                        # Segment URLs with no SegmentTimeline
 								                        # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
 								                        # https://github.com/ytdl-org/youtube-dl/pull/14844
 								                        fragments = []
 								                        segment_duration = float_or_none(
 								                            representation_ms_info['segment_duration'],
 								                            representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
 								                        for segment_url in representation_ms_info['segment_urls']:
 								                            fragment = {
 								                                location_key(segment_url): segment_url,
 								                            }
 								                            if segment_duration:
 								                                fragment['duration'] = segment_duration
 								                            fragments.append(fragment)
 								                        representation_ms_info['fragments'] = fragments
 								                    # If there is a fragments key available then we correctly recognized fragmented media.
 								                    # Otherwise we will assume unfragmented media with direct access. Technically, such
 								                    # assumption is not necessarily correct since we may simply have no support for
 								                    # some forms of fragmented media renditions yet, but for now we'll use this fallback.
 								                    if 'fragments' in representation_ms_info:
 								                        f.update({
 								                            # NB: mpd_url may be empty when MPD manifest is parsed from a string
 								                            'url': mpd_url or base_url,
 								                            'fragment_base_url': base_url,
 								                            'fragments': [],
 								                            'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
 								                        })
 								                        if 'initialization_url' in representation_ms_info:
 								                            initialization_url = representation_ms_info['initialization_url']
 								                            if not f.get('url'):
 								                                f['url'] = initialization_url
 								                            f['fragments'].append({location_key(initialization_url): initialization_url})
 								                        f['fragments'].extend(representation_ms_info['fragments'])
-												[common] _parse_dash_manifest() from youtube.py

											
										
										
											9 years ago
+								                    else:
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											3 years ago
+								                        # Assuming direct URL to unfragmented media.
 								                        f['url'] = base_url
 								                    if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
 								                        formats.append(f)
 								                    elif content_type == 'text':
 								                        subtitles.setdefault(lang or 'und', []).append(f)
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											4 years ago
+								        return formats, subtitles
-												[common] _parse_dash_manifest() from youtube.py

											
										
										
											9 years ago
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								    def _extract_ism_formats(self, *args, **kwargs):
 								        fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
 								        if subs:
 								            self.report_warning(bug_reports_message(
 								                "Ignoring subtitle tracks found in the ISM manifest; "
 								                "if any subtitle tracks are missing,"
 								            ))
 								        return fmts
 								    def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											7 years ago
+								        res = self._download_xml_handle(
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
+								            ism_url, video_id,
-												[extractor] Allow `note=False` when extracting manifests

											
										
										
											3 years ago
+								            note='Downloading ISM manifest' if note is None else note,
 								            errnote='Failed to download ISM manifest' if errnote is None else errnote,
-												[extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use

											
										
										
											5 years ago
+								            fatal=fatal, data=data, headers=headers, query=query)
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
+								        if res is False:
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								            return [], {}
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											7 years ago
+								        ism_doc, urlh = res
-												[extractor/common] Skip malformed ISM manifest XMLs while extracting ISM formats (#24667)

											
										
										
											5 years ago
+								        if ism_doc is None:
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								            return [], {}
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								        return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								    def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
-												[extractor/common] Respect Width and Height attributes in ISM manifests

											
										
										
											8 years ago
+								        """
 								        Parse formats from ISM manifest.
 								        References:
 								         1. [MS-SSTR]: Smooth Streaming Protocol,
 								            https://msdn.microsoft.com/en-us/library/ff469518.aspx
 								        """
-												Cleanup some code and fix typos

:ci skip dl

											
										
										
											4 years ago
+								        if ism_doc.get('IsLive') == 'TRUE':
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								            return [], {}
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
 								        duration = int(ism_doc.attrib['Duration'])
 								        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
 								        formats = []
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								        subtitles = {}
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
+								        for stream in ism_doc.findall('StreamIndex'):
 								            stream_type = stream.get('Type')
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								            if stream_type not in ('video', 'audio', 'text'):
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
+								                continue
 								            url_pattern = stream.attrib['Url']
 								            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
 								            stream_name = stream.get('Name')
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								            stream_language = stream.get('Language', 'und')
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
+								            for track in stream.findall('QualityLevel'):
-												[common] Fix FourCC fallback when parsing ISM (#372)

In some DASH manifests, the FourCC attribute is actually present,
but empty.  We thus apply the same fallback to 'AACL' that we do
when the attribute is entirely absent.

Authored by: fstirlitz

											
										
										
											4 years ago
+								                fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
+								                # TODO: add support for WVC1 and WMAP
-												[downloader/ism] Support muxing TTML subtitles

											
										
										
											4 years ago
+								                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
+								                    self.report_warning('%s is not a supported codec' % fourcc)
 								                    continue
 								                tbr = int(track.attrib['Bitrate']) // 1000
-												[extractor/common] Respect Width and Height attributes in ISM manifests

											
										
										
											8 years ago
+								                # [1] does not mention Width and Height attributes. However,
 								                # they're often present while MaxWidth and MaxHeight are
 								                # missing, so should be used as fallbacks
 								                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
 								                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
+								                sampling_rate = int_or_none(track.get('SamplingRate'))
 								                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
 								                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
 								                fragments = []
 								                fragment_ctx = {
 								                    'time': 0,
 								                }
 								                stream_fragments = stream.findall('c')
 								                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
 								                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
 								                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
 								                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
 								                    if not fragment_ctx['duration']:
 								                        try:
 								                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
 								                        except IndexError:
 								                            next_fragment_time = duration
-												[extractor/common] Fix typo

											
										
										
											8 years ago
+								                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
+								                    for _ in range(fragment_repeat):
 								                        fragments.append({
-												[extractor/common] Fix typo

											
										
										
											8 years ago
+								                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
+								                            'duration': fragment_ctx['duration'] / stream_timescale,
 								                        })
 								                        fragment_ctx['time'] += fragment_ctx['duration']
 								                format_id = []
 								                if ism_id:
 								                    format_id.append(ism_id)
 								                if stream_name:
 								                    format_id.append(stream_name)
 								                format_id.append(compat_str(tbr))
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								                if stream_type == 'text':
 								                    subtitles.setdefault(stream_language, []).append({
 								                        'ext': 'ismt',
 								                        'protocol': 'ism',
 								                        'url': ism_url,
 								                        'manifest_url': ism_url,
 								                        'fragments': fragments,
 								                        '_download_params': {
 								                            'stream_type': stream_type,
 								                            'duration': duration,
 								                            'timescale': stream_timescale,
 								                            'fourcc': fourcc,
 								                            'language': stream_language,
 								                            'codec_private_data': track.get('CodecPrivateData'),
 								                        }
 								                    })
 								                elif stream_type in ('video', 'audio'):
 								                    formats.append({
 								                        'format_id': '-'.join(format_id),
 								                        'url': ism_url,
 								                        'manifest_url': ism_url,
 								                        'ext': 'ismv' if stream_type == 'video' else 'isma',
 								                        'width': width,
 								                        'height': height,
 								                        'tbr': tbr,
 								                        'asr': sampling_rate,
 								                        'vcodec': 'none' if stream_type == 'audio' else fourcc,
 								                        'acodec': 'none' if stream_type == 'video' else fourcc,
 								                        'protocol': 'ism',
 								                        'fragments': fragments,
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											3 years ago
+								                        'has_drm': ism_doc.find('Protection') is not None,
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											4 years ago
+								                        '_download_params': {
 								                            'stream_type': stream_type,
 								                            'duration': duration,
 								                            'timescale': stream_timescale,
 								                            'width': width or 0,
 								                            'height': height or 0,
 								                            'fourcc': fourcc,
 								                            'language': stream_language,
 								                            'codec_private_data': track.get('CodecPrivateData'),
 								                            'sampling_rate': sampling_rate,
 								                            'channels': int_or_none(track.get('Channels', 2)),
 								                            'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
 								                            'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
 								                        },
 								                    })
 								        return formats, subtitles
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											8 years ago
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better than ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities don't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
-												[extractor/common] Improve thumbnail extraction for HTML5 entries

											
										
										
											7 years ago
+								        def absolute_url(item_url):
 								            return urljoin(base_url, item_url)
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											9 years ago
 								        def parse_content_type(content_type):
 								            if not content_type:
 								                return {}
 								            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
 								            if ctr:
 								                mimetype, codecs = ctr.groups()
 								                f = parse_codecs(codecs)
 								                f['ext'] = mimetype2ext(mimetype)
 								                return f
 								            return {}
-												[extractor/common] Fix _media_formats

											
										
										
											7 years ago
+								        def _media_formats(src, cur_media_type, type_info={}):
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											8 years ago
+								            full_url = absolute_url(src)
-												[extractor/common] Respect source's type attribute for HTML5 media (closes #13892)

											
										
										
											7 years ago
+								            ext = type_info.get('ext') or determine_ext(full_url)
-												[extractor/common] Recognize DASH formats in html5 media entries

											
										
										
											8 years ago
+								            if ext == 'm3u8':
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											8 years ago
+								                is_plain_url = False
 								                formats = self._extract_m3u8_formats(
-												[extractor/common] Change the default m3u8 protocol in HTML5

Helper functions should have consistent default values

											
										
										
											8 years ago
+								                    full_url, video_id, ext='mp4',
-												[extractor/common] Add 'preference' to _parse_html5_media_entries

Some websites, like NJPWorld, put different qualities on different
player pages.

											
										
										
											8 years ago
+								                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											4 years ago
+								                    preference=preference, quality=quality, fatal=False)
-												[extractor/common] Recognize DASH formats in html5 media entries

											
										
										
											8 years ago
+								            elif ext == 'mpd':
 								                is_plain_url = False
 								                formats = self._extract_mpd_formats(
-												[extractor/common] Make HLS and DASH extraction non fatal in _parse_html5_media_entries (closes #13970)

											
										
										
											7 years ago
+								                    full_url, video_id, mpd_id=mpd_id, fatal=False)
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											8 years ago
+								            else:
 								                is_plain_url = True
 								                formats = [{
 								                    'url': full_url,
 								                    'vcodec': 'none' if cur_media_type == 'audio' else None,
 								                }]
 								            return is_plain_url, formats
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											9 years ago
+								        entries = []
-												[extractor/common] Add support for AMP tags in _parse_html5_media_entries

											
										
										
											7 years ago
+								        # amp-video and amp-audio are very similar to their HTML5 counterparts
 								        # so we wll include them right here (see
 								        # https://www.ampproject.org/docs/reference/components/amp-video)
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
 								        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
 								        media_tags = [(media_tag, media_tag_name, media_type, '')
 								                      for media_tag, media_tag_name, media_type
 								                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
-												[extractor/common] Speed-up media tags regex (closes #11979)

											
										
										
											8 years ago
+								        media_tags.extend(re.findall(
 								            # We only allow video|audio followed by a whitespace or '>'.
 								            # Allowing more characters may end up in significant slow down (see
-												Start moving to ytdl-org

											
										
										
											6 years ago
+								            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
-												[extractor/common] Speed-up media tags regex (closes #11979)

											
										
										
											8 years ago
+								            # http://www.porntrex.com/maps/videositemap.xml).
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
 								        for media_tag, _, media_type, media_content in media_tags:
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											9 years ago
+								            media_info = {
 								                'formats': [],
 								                'subtitles': {},
 								            }
 								            media_attributes = extract_attributes(media_tag)
-												[extractor/common] Strip src attribute for HTML5 entries code (closes #18485, closes #21169)

											
										
										
											6 years ago
+								            src = strip_or_none(media_attributes.get('src'))
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											9 years ago
+								            if src:
-												Fix parsing of HTML5 media elements

This fixes an error in _parse_html5_media_entries in case
an audio or video tag directly uses a src attribute insted
of <source> elements in it's body.

											
										
										
											8 years ago
+								                _, formats = _media_formats(src, media_type)
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											8 years ago
+								                media_info['formats'].extend(formats)
-												[extractor/common] Improve thumbnail extraction for HTML5 entries

											
										
										
											7 years ago
+								            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											9 years ago
+								            if media_content:
 								                for source_tag in re.findall(r'<source[^>]+>', media_content):
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											6 years ago
+								                    s_attr = extract_attributes(source_tag)
 								                    # data-video-src and data-src are non standard but seen
 								                    # several times in the wild
-												[extractor/common] Strip src attribute for HTML5 entries code (closes #18485, closes #21169)

											
										
										
											6 years ago
+								                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											9 years ago
+								                    if not src:
 								                        continue
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											6 years ago
+								                    f = parse_content_type(s_attr.get('type'))
-												[extractor/common] Fix _media_formats

											
										
										
											7 years ago
+								                    is_plain_url, formats = _media_formats(src, media_type, f)
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											8 years ago
+								                    if is_plain_url:
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											6 years ago
+								                        # width, height, res, label and title attributes are
 								                        # all not standard but seen several times in the wild
 								                        labels = [
 								                            s_attr.get(lbl)
 								                            for lbl in ('label', 'title')
 								                            if str_or_none(s_attr.get(lbl))
 								                        ]
 								                        width = int_or_none(s_attr.get('width'))
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											6 years ago
+								                        height = (int_or_none(s_attr.get('height'))
 								                                  or int_or_none(s_attr.get('res')))
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											6 years ago
+								                        if not width or not height:
 								                            for lbl in labels:
 								                                resolution = parse_resolution(lbl)
 								                                if not resolution:
 								                                    continue
 								                                width = width or resolution.get('width')
 								                                height = height or resolution.get('height')
 								                        for lbl in labels:
 								                            tbr = parse_bitrate(lbl)
 								                            if tbr:
 								                                break
 								                        else:
 								                            tbr = None
-												[extractor/common] Extract format id from label attribute of source tag for HTML5 videos (#14034)

											
										
										
											7 years ago
+								                        f.update({
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											6 years ago
+								                            'width': width,
 								                            'height': height,
 								                            'tbr': tbr,
 								                            'format_id': s_attr.get('label') or s_attr.get('title'),
-												[extractor/common] Extract format id from label attribute of source tag for HTML5 videos (#14034)

											
										
										
											7 years ago
+								                        })
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											8 years ago
+								                        f.update(formats[0])
 								                        media_info['formats'].append(f)
 								                    else:
 								                        media_info['formats'].extend(formats)
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											9 years ago
+								                for track_tag in re.findall(r'<track[^>]+>', media_content):
 								                    track_attributes = extract_attributes(track_tag)
 								                    kind = track_attributes.get('kind')
-												[extractor/common] Improved support for HTML5 subtitles

Ref: #10625

In a strict sense, <track>s with kind=captions are not subtitles. [1]
openload misuses this attribute, and I guess there will be more
examples, so I add it to common.py.

Also allow extracting information for subtitles-only <video> or <audio>
tags, which is the case of openload.

[1] https://www.w3.org/TR/html5/embedded-content-0.html#attr-track-kind

											
										
										
											8 years ago
+								                    if not kind or kind in ('subtitles', 'captions'):
-												[extractor/common] Strip src attribute for HTML5 entries code (closes #18485, closes #21169)

											
										
										
											6 years ago
+								                        src = strip_or_none(track_attributes.get('src'))
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											9 years ago
+								                        if not src:
 								                            continue
 								                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
 								                        media_info['subtitles'].setdefault(lang, []).append({
 								                            'url': absolute_url(src),
 								                        })
-												[extractor/common] Use source URL as Referer for HTML5 entries (closes #16849)

											
										
										
											6 years ago
+								            for f in media_info['formats']:
 								                f.setdefault('http_headers', {})['Referer'] = base_url
-												[extractor/common] Improved support for HTML5 subtitles

Ref: #10625

In a strict sense, <track>s with kind=captions are not subtitles. [1]
openload misuses this attribute, and I guess there will be more
examples, so I add it to common.py.

Also allow extracting information for subtitles-only <video> or <audio>
tags, which is the case of openload.

[1] https://www.w3.org/TR/html5/embedded-content-0.html#attr-track-kind

											
										
										
											8 years ago
+								            if media_info['formats'] or media_info['subtitles']:
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											9 years ago
+								                entries.append(media_info)
 								        return entries
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											4 years ago
+								    def _extract_akamai_formats(self, *args, **kwargs):
 								        fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
 								        if subs:
 								            self.report_warning(bug_reports_message(
 								                "Ignoring subtitle tracks found in the manifests; "
 								                "if any subtitle tracks are missing,"
 								            ))
 								        return fmts
 								    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								        signed = 'hdnea=' in manifest_url
 								        if not signed:
 								            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
 								            manifest_url = re.sub(
 								                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
 								                '', manifest_url).strip('?')
-												[common] add helper method to extract akamai m3u8 and f4m formats

											
										
										
											8 years ago
+								        formats = []
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											4 years ago
+								        subtitles = {}
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											4 years ago
-												[common] add hdcore sign to akamai f4m formats

											
										
										
											8 years ago
+								        hdcore_sign = 'hdcore=3.7.0'
-												[extractor/common] fix typo in _extract_akamai_formats

											
										
										
											8 years ago
+								        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
-												[common] add possibility to customize akamai manifest host

											
										
										
											8 years ago
+								        hds_host = hosts.get('hds')
 								        if hds_host:
 								            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
-												[common] add hdcore sign to akamai f4m formats

											
										
										
											8 years ago
+								        if 'hdcore=' not in f4m_url:
 								            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
 								        f4m_formats = self._extract_f4m_formats(
 								            f4m_url, video_id, f4m_id='hds', fatal=False)
 								        for entry in f4m_formats:
 								            entry.update({'extra_param_to_segment_url': hdcore_sign})
 								        formats.extend(f4m_formats)
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											4 years ago
-												[common] add possibility to customize akamai manifest host

											
										
										
											8 years ago
+								        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
 								        hls_host = hosts.get('hls')
 								        if hls_host:
 								            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											4 years ago
+								        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
-												[common] add helper method to extract akamai m3u8 and f4m formats

											
										
										
											8 years ago
+								            m3u8_url, video_id, 'mp4', 'm3u8_native',
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								            m3u8_id='hls', fatal=False)
 								        formats.extend(m3u8_formats)
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											4 years ago
+								        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											4 years ago
 								        http_host = hosts.get('http')
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								        if http_host and m3u8_formats and not signed:
 								            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											4 years ago
+								            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
 								            qualities_length = len(qualities)
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											4 years ago
+								                i = 0
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								                for f in m3u8_formats:
 								                    if f['vcodec'] != 'none':
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											4 years ago
+								                        for protocol in ('http', 'https'):
 								                            http_f = f.copy()
 								                            del http_f['manifest_url']
 								                            http_url = re.sub(
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											4 years ago
+								                            http_f.update({
 								                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
 								                                'url': http_url,
 								                                'protocol': protocol,
 								                            })
-												Update to ytdl-2021.01.03

											
										
										
											4 years ago
+								                            formats.append(http_f)
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											4 years ago
+								                        i += 1
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											4 years ago
+								        return formats, subtitles
-												[common] add helper method to extract akamai m3u8 and f4m formats

											
										
										
											8 years ago
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											8 years ago
+								    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											7 years ago
+								        query = compat_urlparse.urlparse(url).query
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											8 years ago
+								        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
-												[extractor/common] Respect secure schemes in _extract_wowza_formats

											
										
										
											7 years ago
+								        mobj = re.search(
 								            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
 								        url_base = mobj.group('url')
 								        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											8 years ago
+								        formats = []
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											7 years ago
 								        def manifest_url(manifest):
 								            m_url = '%s/%s' % (http_base_url, manifest)
 								            if query:
 								                m_url += '?%s' % query
 								            return m_url
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											8 years ago
+								        if 'm3u8' not in skip_protocols:
 								            formats.extend(self._extract_m3u8_formats(
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											7 years ago
+								                manifest_url('playlist.m3u8'), video_id, 'mp4',
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											8 years ago
+								                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
 								        if 'f4m' not in skip_protocols:
 								            formats.extend(self._extract_f4m_formats(
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											7 years ago
+								                manifest_url('manifest.f4m'),
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											8 years ago
+								                video_id, f4m_id='hds', fatal=False))
-												[extractor/common] try to extract non smil wowza mpd manifests

											
										
										
											8 years ago
+								        if 'dash' not in skip_protocols:
 								            formats.extend(self._extract_mpd_formats(
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											7 years ago
+								                manifest_url('manifest.mpd'),
-												[extractor/common] try to extract non smil wowza mpd manifests

											
										
										
											8 years ago
+								                video_id, mpd_id='dash', fatal=False))
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											8 years ago
+								        if re.search(r'(?:/smil:|\.smil)', url_base):
 								            if 'smil' not in skip_protocols:
 								                rtmp_formats = self._extract_smil_formats(
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											7 years ago
+								                    manifest_url('jwplayer.smil'),
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											8 years ago
+								                    video_id, fatal=False)
 								                for rtmp_format in rtmp_formats:
 								                    rtsp_format = rtmp_format.copy()
 								                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
 								                    del rtsp_format['play_path']
 								                    del rtsp_format['ext']
 								                    rtsp_format.update({
 								                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
 								                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
 								                        'protocol': 'rtsp',
 								                    })
 								                    formats.extend([rtmp_format, rtsp_format])
 								        else:
 								            for protocol in ('rtmp', 'rtsp'):
 								                if protocol not in skip_protocols:
 								                    formats.append({
-												[extractor/common] Fix rtmp and rtsp formats' URLs in _extract_wowza_formats

											
										
										
											8 years ago
+								                        'url': '%s:%s' % (protocol, url_base),
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											8 years ago
+								                        'format_id': protocol,
 								                        'protocol': protocol,
 								                    })
 								        return formats
-												_find_jwplayer_data() returns dict or None

This simplifies code for callers of `_find_jwplayer_data()` which no longer have
to run `_parse_json()` on the return value.

It also makes sure that `_find_jwplayer_data()` returns either a `dict` or
`None` and nothing else.

											
										
										
											8 years ago
+								    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											8 years ago
+								        mobj = re.search(
-												[extractor/common] Improve jwplayer regex

											
										
										
											8 years ago
+								            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											8 years ago
+								            webpage)
 								        if mobj:
-												_find_jwplayer_data() returns dict or None

This simplifies code for callers of `_find_jwplayer_data()` which no longer have
to run `_parse_json()` on the return value.

It also makes sure that `_find_jwplayer_data()` returns either a `dict` or
`None` and nothing else.

											
										
										
											8 years ago
+								            try:
 								                jwplayer_data = self._parse_json(mobj.group('options'),
 								                                                 video_id=video_id,
 								                                                 transform_source=transform_source)
 								            except ExtractorError:
 								                pass
 								            else:
 								                if isinstance(jwplayer_data, dict):
 								                    return jwplayer_data
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											8 years ago
 								    def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
-												_find_jwplayer_data() returns dict or None

This simplifies code for callers of `_find_jwplayer_data()` which no longer have
to run `_parse_json()` on the return value.

It also makes sure that `_find_jwplayer_data()` returns either a `dict` or
`None` and nothing else.

											
										
										
											8 years ago
+								        jwplayer_data = self._find_jwplayer_data(
 								            webpage, video_id, transform_source=js_to_json)
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											8 years ago
+								        return self._parse_jwplayer_data(
 								            jwplayer_data, video_id, *args, **kwargs)
 								    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
 								                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
 								        # JWPlayer backward compatibility: flattened playlists
 								        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
 								        if 'playlist' not in jwplayer_data:
 								            jwplayer_data = {'playlist': [jwplayer_data]}
 								        entries = []
 								        # JWPlayer backward compatibility: single playlist item
 								        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
 								        if not isinstance(jwplayer_data['playlist'], list):
 								            jwplayer_data['playlist'] = [jwplayer_data['playlist']]
 								        for video_data in jwplayer_data['playlist']:
 								            # JWPlayer backward compatibility: flattened sources
 								            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
 								            if 'sources' not in video_data:
 								                video_data['sources'] = [video_data]
 								            this_video_id = video_id or video_data['mediaid']
-												[extractor/common] Pass arguments to _parse_jwplayer_formats and PEP8

											
										
										
											8 years ago
+								            formats = self._parse_jwplayer_formats(
 								                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
 								                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											8 years ago
 								            subtitles = {}
 								            tracks = video_data.get('tracks')
 								            if tracks and isinstance(tracks, list):
 								                for track in tracks:
-												[extractor/common] Improve jwplayer subtitles extraction

											
										
										
											7 years ago
+								                    if not isinstance(track, dict):
 								                        continue
-												[extractor/common] Improve jwplayer subtitles extraction (closes #15695)

											
										
										
											7 years ago
+								                    track_kind = track.get('kind')
 								                    if not track_kind or not isinstance(track_kind, compat_str):
 								                        continue
 								                    if track_kind.lower() not in ('captions', 'subtitles'):
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											8 years ago
+								                        continue
 								                    track_url = urljoin(base_url, track.get('file'))
 								                    if not track_url:
 								                        continue
 								                    subtitles.setdefault(track.get('label') or 'en', []).append({
 								                        'url': self._proto_relative_url(track_url)
 								                    })
-												[common] add support for jwplayer youtube embeds

											
										
										
											7 years ago
+								            entry = {
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											8 years ago
+								                'id': this_video_id,
-												[common] add support for jwplayer youtube embeds

											
										
										
											7 years ago
+								                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
-												[extractor/common] clean jwplayer description HTML tags

											
										
										
											5 years ago
+								                'description': clean_html(video_data.get('description')),
-												[extractor/common] improve jwplayer relative url handling(closes #18892)

											
										
										
											6 years ago
+								                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											8 years ago
+								                'timestamp': int_or_none(video_data.get('pubdate')),
 								                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
 								                'subtitles': subtitles,
-												[common] add support for jwplayer youtube embeds

											
										
										
											7 years ago
+								            }
 								            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
 								            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
 								                entry.update({
 								                    '_type': 'url_transparent',
 								                    'url': formats[0]['url'],
 								                })
 								            else:
 								                self._sort_formats(formats)
 								                entry['formats'] = formats
 								            entries.append(entry)
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											8 years ago
+								        if len(entries) == 1:
 								            return entries[0]
 								        else:
 								            return self.playlist_result(entries)
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											8 years ago
+								    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
 								                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
-												[common] Relax JWPlayer regex and remove duplicate urls(#12768)

											
										
										
											8 years ago
+								        urls = []
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											8 years ago
+								        formats = []
-												[extractor/common] Pass arguments to _parse_jwplayer_formats and PEP8

											
										
										
											8 years ago
+								        for source in jwplayer_sources_data:
-												[extractor/common] Improve jwplayer formats extraction (closes #13379)

											
										
										
											8 years ago
+								            if not isinstance(source, dict):
 								                continue
-												[extractor/common] improve jwplayer relative url handling(closes #18892)

											
										
										
											6 years ago
+								            source_url = urljoin(
 								                base_url, self._proto_relative_url(source.get('file')))
 								            if not source_url or source_url in urls:
-												[common] Relax JWPlayer regex and remove duplicate urls(#12768)

											
										
										
											8 years ago
+								                continue
 								            urls.append(source_url)
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											8 years ago
+								            source_type = source.get('type') or ''
 								            ext = mimetype2ext(source_type) or determine_ext(source_url)
 								            if source_type == 'hls' or ext == 'm3u8':
 								                formats.extend(self._extract_m3u8_formats(
-												[extractor/common] Improve height extraction and extract bitrate

											
										
										
											8 years ago
+								                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
 								                    m3u8_id=m3u8_id, fatal=False))
-												[extractor/common] Improve DASH formats extraction for jwplayer (#9242, #15187)

											
										
										
											7 years ago
+								            elif source_type == 'dash' or ext == 'mpd':
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											8 years ago
+								                formats.extend(self._extract_mpd_formats(
 								                    source_url, video_id, mpd_id=mpd_id, fatal=False))
-												[extractor/common] Extract SMIL formats from jwplayer

											
										
										
											8 years ago
+								            elif ext == 'smil':
 								                formats.extend(self._extract_smil_formats(
 								                    source_url, video_id, fatal=False))
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											8 years ago
+								            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
-												[extractor/common] Improve height extraction and extract bitrate

											
										
										
											8 years ago
+								            elif source_type.startswith('audio') or ext in (
 								                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											8 years ago
+								                formats.append({
 								                    'url': source_url,
 								                    'vcodec': 'none',
 								                    'ext': ext,
 								                })
 								            else:
 								                height = int_or_none(source.get('height'))
 								                if height is None:
 								                    # Often no height is provided but there is a label in
-												[extractor/common] Improve height extraction and extract bitrate

											
										
										
											8 years ago
+								                    # format like "1080p", "720p SD", or 1080.
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											8 years ago
+								                    height = int_or_none(self._search_regex(
-												[extractor/common] Improve height extraction and extract bitrate

											
										
										
											8 years ago
+								                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											8 years ago
+								                        'height', default=None))
 								                a_format = {
 								                    'url': source_url,
 								                    'width': int_or_none(source.get('width')),
 								                    'height': height,
-												[extractor/common] Improve height extraction and extract bitrate

											
										
										
											8 years ago
+								                    'tbr': int_or_none(source.get('bitrate')),
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											8 years ago
+								                    'ext': ext,
 								                }
 								                if source_url.startswith('rtmp'):
 								                    a_format['ext'] = 'flv'
 								                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
 								                    # of jwplayer.flash.swf
 								                    rtmp_url_parts = re.split(
 								                        r'((?:mp4|mp3|flv):)', source_url, 1)
 								                    if len(rtmp_url_parts) == 3:
 								                        rtmp_url, prefix, play_path = rtmp_url_parts
 								                        a_format.update({
 								                            'url': rtmp_url,
 								                            'play_path': prefix + play_path,
 								                        })
 								                    if rtmp_params:
 								                        a_format.update(rtmp_params)
 								                formats.append(a_format)
 								        return formats
-												[muenchentv] Move live title generation to common

											
										
										
											10 years ago
+								    def _live_title(self, name):
 								        """ Generate the title for a live video """
 								        now = datetime.datetime.now()
-												[refactor] Single quotes consistency

											
										
										
											9 years ago
+								        now_str = now.strftime('%Y-%m-%d %H:%M')
-												[muenchentv] Move live title generation to common

											
										
										
											10 years ago
+								        return name + ' ' + now_str
-												[golem] Simplify (#3828)

											
										
										
											10 years ago
+								    def _int(self, v, name, fatal=False, **kwargs):
 								        res = int_or_none(v, **kwargs)
 								        if 'get_attr' in kwargs:
 								            print(getattr(v, kwargs['get_attr']))
 								        if res is None:
 								            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 								            if fatal:
 								                raise ExtractorError(msg)
 								            else:
-												Fix inconsistent use of `report_warning`

											
										
										
											4 years ago
+								                self.report_warning(msg)
-												[golem] Simplify (#3828)

											
										
										
											10 years ago
+								        return res
 								    def _float(self, v, name, fatal=False, **kwargs):
 								        res = float_or_none(v, **kwargs)
 								        if res is None:
 								            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 								            if fatal:
 								                raise ExtractorError(msg)
 								            else:
-												Fix inconsistent use of `report_warning`

											
										
										
											4 years ago
+								                self.report_warning(msg)
-												[golem] Simplify (#3828)

											
										
										
											10 years ago
+								        return res
-												[phantomjs] add cookie support

											
										
										
											8 years ago
+								    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
 								                    path='/', secure=False, discard=False, rest={}, **kwargs):
-												[extractor/common] Use compat_cookiejar_Cookie for _set_cookie (closes #23256, closes #24776)

To always ensure cookie name and value are bytestrings on python 2.

											
										
										
											5 years ago
+								        cookie = compat_cookiejar_Cookie(
-												Fix flake8 issues after #14225

											
										
										
											7 years ago
+, name, value, port, port is not None, domain, True,
-												[phantomjs] add cookie support

											
										
										
											8 years ago
+								            domain.startswith('.'), path, True, secure, expire_time,
 								            discard, None, None, rest)
-												[youtube] Use a cookie for seeting the language

This way, we don't have to do an aditional request

											
										
										
											10 years ago
+								        self._downloader.cookiejar.set_cookie(cookie)
-												[viewster] extract the api auth token

Closes #6406.

											
										
										
											9 years ago
+								    def _get_cookies(self, url):
-												Update to ytdl-commit-4fb25ff

[maoritv] Add new extractor
https://github.com/ytdl-org/youtube-dl/commit/4fb25ff5a3be5206bb72e5c4046715b1529fb2c7

Except:
[vimeo] improve extraction https://github.com/ytdl-org/youtube-dl/commit/3ae9c0f410b1d4f63e8bada67dd62a8d2852be32
[youtube:tab] Pass innertube context... https://github.com/ytdl-org/youtube-dl/commit/1b0a13f33cfb3644cc718d35951ea85bb1905459

											
										
										
											4 years ago
+								        """ Return a compat_cookies_SimpleCookie with the cookies for the url """
-												Switch codebase to use sanitized_Request instead of
compat_urllib_request.Request

[downloader/dash] Use sanitized_Request

[downloader/http] Use sanitized_Request

[atresplayer] Use sanitized_Request

[bambuser] Use sanitized_Request

[bliptv] Use sanitized_Request

[brightcove] Use sanitized_Request

[cbs] Use sanitized_Request

[ceskatelevize] Use sanitized_Request

[collegerama] Use sanitized_Request

[extractor/common] Use sanitized_Request

[crunchyroll] Use sanitized_Request

[dailymotion] Use sanitized_Request

[dcn] Use sanitized_Request

[dramafever] Use sanitized_Request

[dumpert] Use sanitized_Request

[eitb] Use sanitized_Request

[escapist] Use sanitized_Request

[everyonesmixtape] Use sanitized_Request

[extremetube] Use sanitized_Request

[facebook] Use sanitized_Request

[fc2] Use sanitized_Request

[flickr] Use sanitized_Request

[4tube] Use sanitized_Request

[gdcvault] Use sanitized_Request

[extractor/generic] Use sanitized_Request

[hearthisat] Use sanitized_Request

[hotnewhiphop] Use sanitized_Request

[hypem] Use sanitized_Request

[iprima] Use sanitized_Request

[ivi] Use sanitized_Request

[keezmovies] Use sanitized_Request

[letv] Use sanitized_Request

[lynda] Use sanitized_Request

[metacafe] Use sanitized_Request

[minhateca] Use sanitized_Request

[miomio] Use sanitized_Request

[meovideo] Use sanitized_Request

[mofosex] Use sanitized_Request

[moniker] Use sanitized_Request

[mooshare] Use sanitized_Request

[movieclips] Use sanitized_Request

[mtv] Use sanitized_Request

[myvideo] Use sanitized_Request

[neteasemusic] Use sanitized_Request

[nfb] Use sanitized_Request

[niconico] Use sanitized_Request

[noco] Use sanitized_Request

[nosvideo] Use sanitized_Request

[novamov] Use sanitized_Request

[nowness] Use sanitized_Request

[nuvid] Use sanitized_Request

[played] Use sanitized_Request

[pluralsight] Use sanitized_Request

[pornhub] Use sanitized_Request

[pornotube] Use sanitized_Request

[primesharetv] Use sanitized_Request

[promptfile] Use sanitized_Request

[qqmusic] Use sanitized_Request

[rtve] Use sanitized_Request

[safari] Use sanitized_Request

[sandia] Use sanitized_Request

[shared] Use sanitized_Request

[sharesix] Use sanitized_Request

[sina] Use sanitized_Request

[smotri] Use sanitized_Request

[sohu] Use sanitized_Request

[spankwire] Use sanitized_Request

[sportdeutschland] Use sanitized_Request

[streamcloud] Use sanitized_Request

[streamcz] Use sanitized_Request

[tapely] Use sanitized_Request

[tube8] Use sanitized_Request

[tubitv] Use sanitized_Request

[twitch] Use sanitized_Request

[twitter] Use sanitized_Request

[udemy] Use sanitized_Request

[vbox7] Use sanitized_Request

[veoh] Use sanitized_Request

[vessel] Use sanitized_Request

[vevo] Use sanitized_Request

[viddler] Use sanitized_Request

[videomega] Use sanitized_Request

[viewvster] Use sanitized_Request

[viki] Use sanitized_Request

[vk] Use sanitized_Request

[vodlocker] Use sanitized_Request

[voicerepublic] Use sanitized_Request

[wistia] Use sanitized_Request

[xfileshare] Use sanitized_Request

[xtube] Use sanitized_Request

[xvideos] Use sanitized_Request

[yandexmusic] Use sanitized_Request

[youku] Use sanitized_Request

[youporn] Use sanitized_Request

[youtube] Use sanitized_Request

[patreon] Use sanitized_Request

[extractor/common] Remove unused import

[nfb] PEP 8

											
										
										
											9 years ago
+								        req = sanitized_Request(url)
-												[viewster] extract the api auth token

Closes #6406.

											
										
										
											9 years ago
+								        self._downloader.cookiejar.add_cookie_header(req)
-												Update to ytdl-commit-4fb25ff

[maoritv] Add new extractor
https://github.com/ytdl-org/youtube-dl/commit/4fb25ff5a3be5206bb72e5c4046715b1529fb2c7

Except:
[vimeo] improve extraction https://github.com/ytdl-org/youtube-dl/commit/3ae9c0f410b1d4f63e8bada67dd62a8d2852be32
[youtube:tab] Pass innertube context... https://github.com/ytdl-org/youtube-dl/commit/1b0a13f33cfb3644cc718d35951ea85bb1905459

											
										
										
											4 years ago
+								        return compat_cookies_SimpleCookie(req.get_header('Cookie'))
-												[viewster] extract the api auth token

Closes #6406.

											
										
										
											9 years ago
-												[extractor/common] Move workaround for applying first Set-Cookie header into a separate method

											
										
										
											6 years ago
+								    def _apply_first_set_cookie_header(self, url_handle, cookie):
-												[extractor/common] Add doc string for _apply_first_set_cookie_header

											
										
										
											6 years ago
+								        """
 								        Apply first Set-Cookie header instead of the last. Experimental.
 								        Some sites (e.g. [1-3]) may serve two cookies under the same name
 								        in Set-Cookie header and expect the first (old) one to be set rather
 								        than second (new). However, as of RFC6265 the newer one cookie
 								        should be set into cookie store what actually happens.
 								        We will workaround this issue by resetting the cookie to
 								        the first one manually.
 . https://new.vk.com/
 . https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
 . https://learning.oreilly.com/
 								        """
-												[extractor/common] Move workaround for applying first Set-Cookie header into a separate method

											
										
										
											6 years ago
+								        for header, cookies in url_handle.headers.items():
 								            if header.lower() != 'set-cookie':
 								                continue
 								            if sys.version_info[0] >= 3:
 								                cookies = cookies.encode('iso-8859-1')
 								            cookies = cookies.decode('utf-8')
 								            cookie_value = re.search(
 								                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
 								            if cookie_value:
 								                value, domain = cookie_value.groups()
 								                self._set_cookie(domain, cookie, value)
 								                break
-												Respect age_limit when listing extractors (Fixes #4653)

											
										
										
											10 years ago
+								    def get_testcases(self, include_onlymatching=False):
 								        t = getattr(self, '_TEST', None)
 								        if t:
 								            assert not hasattr(self, '_TESTS'), \
 								                '%s has _TEST and _TESTS' % type(self).__name__
 								            tests = [t]
 								        else:
 								            tests = getattr(self, '_TESTS', [])
 								        for t in tests:
 								            if not include_onlymatching and t.get('only_matching', False):
 								                continue
 								            t['name'] = type(self).__name__[:-len('IE')]
 								            yield t
 								    def is_suitable(self, age_limit):
 								        """ Test whether the extractor is generally suitable for the given
 								        age limit (i.e. pornographic sites are not, all others usually are) """
 								        any_restricted = False
 								        for tc in self.get_testcases(include_onlymatching=False):
-												[extractor/common] Improve is_suitable

In order to fix breakage introduced by a3aa814b774a413d9e7f4fbfadf06fe6dcc59b25

											
										
										
											8 years ago
+								            if tc.get('playlist', []):
-												Respect age_limit when listing extractors (Fixes #4653)

											
										
										
											10 years ago
+								                tc = tc['playlist'][0]
 								            is_restricted = age_restricted(
 								                tc.get('info_dict', {}).get('age_limit'), age_limit)
 								            if not is_restricted:
 								                return True
 								            any_restricted = any_restricted or is_restricted
 								        return not any_restricted
-												Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.

											
										
										
											10 years ago
+								    def extract_subtitles(self, *args, **kwargs):
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if (self.get_param('writesubtitles', False)
 								                or self.get_param('listsubtitles')):
-												[extractor/common] Simplify subtitles handling methods

Initially I was going to use a single method for handling both subtitles and automatic captions, that's why I used the 'list_subtitles' and the 'subtitles' variables.

											
										
										
											10 years ago
+								            return self._get_subtitles(*args, **kwargs)
 								        return {}
-												Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.

											
										
										
											10 years ago
 								    def _get_subtitles(self, *args, **kwargs):
-												[refactor] Single quotes consistency

											
										
										
											9 years ago
+								        raise NotImplementedError('This method must be implemented by subclasses')
-												Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.

											
										
										
											10 years ago
-												[common] Add _merge_subtitles()

											
										
										
											9 years ago
+								    @staticmethod
 								    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
 								        """ Merge subtitle items for one language. Items with duplicated URLs
 								        will be dropped. """
 								        list1_urls = set([item['url'] for item in subtitle_list1])
 								        ret = list(subtitle_list1)
 								        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
 								        return ret
 								    @classmethod
-												[extractor] Minor improvements (See desc)

1. Allow removal of login hint - extractors can set their own login hint as part of `msg`
2. Cleanup `_merge_subtitles` signature

											
										
										
											3 years ago
+								    def _merge_subtitles(cls, *dicts, target=None):
-												[extractor/common] Generalise _merge_subtitles

This allows modifying a subtitles dictionary in-place.

											
										
										
											4 years ago
+								        """ Merge subtitle dictionaries, language by language. """
 								        if target is None:
 								            target = {}
 								        for d in dicts:
 								            for lang, subs in d.items():
 								                target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
 								        return target
-												[common] Add _merge_subtitles()

											
										
										
											9 years ago
-												[youtube] Convert to new subtitles system

The automatic captions are stored in the 'automatic_captions' field, which is used if no normal subtitles are found for a specific language.

											
										
										
											10 years ago
+								    def extract_automatic_captions(self, *args, **kwargs):
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if (self.get_param('writeautomaticsub', False)
 								                or self.get_param('listsubtitles')):
-												[extractor/common] Simplify subtitles handling methods

Initially I was going to use a single method for handling both subtitles and automatic captions, that's why I used the 'list_subtitles' and the 'subtitles' variables.

											
										
										
											10 years ago
+								            return self._get_automatic_captions(*args, **kwargs)
 								        return {}
-												[youtube] Convert to new subtitles system

The automatic captions are stored in the 'automatic_captions' field, which is used if no normal subtitles are found for a specific language.

											
										
										
											10 years ago
 								    def _get_automatic_captions(self, *args, **kwargs):
-												[refactor] Single quotes consistency

											
										
										
											9 years ago
+								        raise NotImplementedError('This method must be implemented by subclasses')
-												[youtube] Convert to new subtitles system

The automatic captions are stored in the 'automatic_captions' field, which is used if no normal subtitles are found for a specific language.

											
										
										
											10 years ago
-												Add --mark-watched feature (Closes #5054)

											
										
										
											9 years ago
+								    def mark_watched(self, *args, **kwargs):
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        if (self.get_param('mark_watched', False)
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											6 years ago
+								                and (self._get_login_info()[0] is not None
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								                     or self.get_param('cookiefile') is not None)):
-												Add --mark-watched feature (Closes #5054)

											
										
										
											9 years ago
+								            self._mark_watched(*args, **kwargs)
 								    def _mark_watched(self, *args, **kwargs):
 								        raise NotImplementedError('This method must be implemented by subclasses')
-												Rename --cn-verfication-proxy to --geo-verification-proxy

And deprecate the former one

Since commit f1388739002a7fd1e8e9c41b642734786fc6c391, this option is
not limited to China websites, so rename it.

											
										
										
											8 years ago
+								    def geo_verification_headers(self):
 								        headers = {}
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											4 years ago
+								        geo_verification_proxy = self.get_param('geo_verification_proxy')
-												Rename --cn-verfication-proxy to --geo-verification-proxy

And deprecate the former one

Since commit f1388739002a7fd1e8e9c41b642734786fc6c391, this option is
not limited to China websites, so rename it.

											
										
										
											8 years ago
+								        if geo_verification_proxy:
 								            headers['Ytdl-request-proxy'] = geo_verification_proxy
 								        return headers
-												[extractor/common] Add id and title helpers for generic IEs

											
										
										
											8 years ago
+								    def _generic_id(self, url):
 								        return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 								    def _generic_title(self, url):
 								        """ Derive a title from url: url_basename(url) with its extension
 								        dropped, passed through compat_urllib_parse_unquote. """
 								        stem = os.path.splitext(url_basename(url))[0]
 								        return compat_urllib_parse_unquote(stem)
-												[youtube] Show if video is `private`, `unlisted` etc in new field `availability` (#188)
Closes: #185, https://github.com/ytdl-org/youtube-dl/issues/25631

Authored by: colethedj, pukkandan

											
										
										
											4 years ago
+								    @staticmethod
-												[fancode] Add extractor (#316,#354)
Closes #269, #363

Authored by: rmsmachine

											
										
										
											3 years ago
+								    def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
-												[youtube] Show if video is `private`, `unlisted` etc in new field `availability` (#188)
Closes: #185, https://github.com/ytdl-org/youtube-dl/issues/25631

Authored by: colethedj, pukkandan

											
										
										
											4 years ago
+								        all_known = all(map(
 								            lambda x: x is not None,
 								            (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
 								        return (
 								            'private' if is_private
 								            else 'premium_only' if needs_premium
 								            else 'subscriber_only' if needs_subscription
 								            else 'needs_auth' if needs_auth
 								            else 'unlisted' if is_unlisted
 								            else 'public' if all_known
 								            else None)
-												Improve `extractor_args` parsing

											
										
										
											3 years ago
+								    def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
 								        '''
 								        @returns            A list of values for the extractor argument given by "key"
 								                            or "default" if no such key is present
 								        @param default      The default value to return when the key is not present (default: [])
 								        @param casesense    When false, the values are converted to lower case
 								        '''
 								        val = traverse_obj(
-												Add `--extractor-args` to pass extractor-specific arguments

											
										
										
											3 years ago
+								            self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
-												Improve `extractor_args` parsing

											
										
										
											3 years ago
+								        if val is None:
 								            return [] if default is NO_DEFAULT else default
 								        return list(val) if casesense else [x.lower() for x in val]
-												Add `--extractor-args` to pass extractor-specific arguments

											
										
										
											3 years ago
-												Allow users to specify an age limit (fixes #1545)

With these changes, users can now restrict what videos are downloaded by the intended audience, by specifying their age with --age-limit YEARS.
Add rudimentary support in youtube, pornotube, and youporn.

											
										
										
											11 years ago
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								class SearchInfoExtractor(InfoExtractor):
 								    """
 								    Base class for paged search queries extractors.
-												[extractor/common] Consistent URL spelling

											
										
										
											9 years ago
+								    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								    Instances should define _SEARCH_KEY and _MAX_RESULTS.
 								    """
 								    @classmethod
 								    def _make_valid_url(cls):
 								        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 								    @classmethod
 								    def suitable(cls, url):
 								        return re.match(cls._make_valid_url(), url) is not None
 								    def _real_extract(self, query):
 								        mobj = re.match(self._make_valid_url(), query)
 								        if mobj is None:
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								            raise ExtractorError('Invalid search query "%s"' % query)
-												Fix generic class move (add all files)

											
										
										
											12 years ago
 								        prefix = mobj.group('prefix')
 								        query = mobj.group('query')
 								        if prefix == '':
 								            return self._get_n_results(query, 1)
 								        elif prefix == 'all':
 								            return self._get_n_results(query, self._MAX_RESULTS)
 								        else:
 								            n = int(prefix)
 								            if n <= 0:
-												[extractor/common] Modernize

											
										
										
											10 years ago
+								                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								            elif n > self._MAX_RESULTS:
-												Fix inconsistent use of `report_warning`

											
										
										
											4 years ago
+								                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
-												Fix generic class move (add all files)

											
										
										
											12 years ago
+								                n = self._MAX_RESULTS
 								            return self._get_n_results(query, n)
 								    def _get_n_results(self, query, n):
 								        """Get a specified number of results for a query"""
-												[refactor] Single quotes consistency

											
										
										
											9 years ago
+								        raise NotImplementedError('This method must be implemented by subclasses')
-												Add --list-extractor-descriptions (human-readable list of IEs)

											
										
										
											12 years ago
 								    @property
 								    def SEARCH_KEY(self):
 								        return self._SEARCH_KEY