Add option `--use-extractors`

Deprecates `--force-generic-extractor`
Closes #3234, Closes #2044
Related: #4307, #1791
2 years ago · fe7866d0ed
parent 5314b52192
commit fe7866d0ed
5 changed files with 58 additions and 18 deletions
--- a/README.md
+++ b/README.md
@@ -375,7 +375,13 @@ You can also fork the project on github and run your fork's [build workflow](.gi
    --list-extractors               List all supported extractors and exit
    --extractor-descriptions        Output descriptions of all supported
                                    extractors and exit
-    --force-generic-extractor       Force extraction to use the generic extractor
+    --use-extractors, --ies NAMES   Extractor names to use separated by commas.
+                                    You can also use regexes, "all", "default"
+                                    and "end" (end URL matching); e.g. --ies
+                                    "holodex.*,end,youtube". Prefix the name
+                                    with a "-" to exclude it, e.g. --ies
+                                    default,-generic. Use --list-extractors for
+                                    a list of available extractor names
    --default-search PREFIX         Use this prefix for unqualified URLs. E.g.
                                    "gvsearch2:python" downloads two videos from
                                    google videos for the search term "python".
@@ -2058,6 +2064,7 @@ While these options are redundant, they are still expected to be used due to the
 #### Not recommended
 While these options still work, their use is not recommended since there are other alternatives to achieve the same

+    --force-generic-extractor        --ies generic,default
    --exec-before-download CMD       --exec "before_dl:CMD"
    --no-exec-before-download        --no-exec
    --all-formats                    -f all
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -29,6 +29,7 @@ from .cookies import load_cookies
 from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
 from .downloader.rtmp import rtmpdump_version
 from .extractor import gen_extractor_classes, get_info_extractor
+from .extractor.common import UnsupportedURLIE
 from .extractor.openload import PhantomJSwrapper
 from .minicurses import format_text
 from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
@@ -237,7 +238,7 @@ class YoutubeDL:
                       Default is 'only_download' for CLI, but False for API
    skip_playlist_after_errors: Number of allowed failures until the rest of
                       the playlist is skipped
-    force_generic_extractor: Force downloader to use the generic extractor
+    allowed_extractors:  List of regexes to match against extractor names that are allowed
    overwrites:        Overwrite all video and metadata files if True,
                       overwrite only non-video files if None
                       and don't overwrite any file if False
@@ -477,6 +478,8 @@ class YoutubeDL:

    The following options are deprecated and may be removed in the future:

+    force_generic_extractor: Force downloader to use the generic extractor
+                       - Use allowed_extractors = ['generic', 'default']
    playliststart:     - Use playlist_items
                       Playlist item to start at.
    playlistend:       - Use playlist_items
@@ -758,13 +761,6 @@ class YoutubeDL:
            self._ies_instances[ie_key] = ie
            ie.set_downloader(self)

-    def _get_info_extractor_class(self, ie_key):
-        ie = self._ies.get(ie_key)
-        if ie is None:
-            ie = get_info_extractor(ie_key)
-            self.add_info_extractor(ie)
-        return ie
-
    def get_info_extractor(self, ie_key):
        """
        Get an instance of an IE with name ie_key, it will try to get one from
@@ -781,8 +777,19 @@ class YoutubeDL:
        """
        Add the InfoExtractors returned by gen_extractors to the end of the list
        """
-        for ie in gen_extractor_classes():
-            self.add_info_extractor(ie)
+        all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()}
+        all_ies['end'] = UnsupportedURLIE()
+        try:
+            ie_names = orderedSet_from_options(
+                self.params.get('allowed_extractors', ['default']), {
+                    'all': list(all_ies),
+                    'default': [name for name, ie in all_ies.items() if ie._ENABLED],
+                }, use_regex=True)
+        except re.error as e:
+            raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}')
+        for name in ie_names:
+            self.add_info_extractor(all_ies[name])
+        self.write_debug(f'Loaded {len(ie_names)} extractors')

    def add_post_processor(self, pp, when='post_process'):
        """Add a PostProcessor object to the end of the chain."""
@@ -1413,11 +1420,11 @@ class YoutubeDL:
            ie_key = 'Generic'

        if ie_key:
-            ies = {ie_key: self._get_info_extractor_class(ie_key)}
+            ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {}
        else:
            ies = self._ies

-        for ie_key, ie in ies.items():
+        for key, ie in ies.items():
            if not ie.suitable(url):
                continue

@@ -1426,14 +1433,16 @@ class YoutubeDL:
                                    'and will probably not work.')

            temp_id = ie.get_temp_id(url)
-            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
-                self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive')
+            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}):
+                self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive')
                if self.params.get('break_on_existing', False):
                    raise ExistingVideoReached()
                break
-            return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
+            return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process)
        else:
-            self.report_error('no suitable InfoExtractor for URL %s' % url)
+            extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default'])
+            self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}',
+                              tb=False if extractors_restricted else None)

    def _handle_extraction_exceptions(func):
        @functools.wraps(func)
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -766,6 +766,7 @@ def parse_options(argv=None):
        'windowsfilenames': opts.windowsfilenames,
        'ignoreerrors': opts.ignoreerrors,
        'force_generic_extractor': opts.force_generic_extractor,
+        'allowed_extractors': opts.allowed_extractors or ['default'],
        'ratelimit': opts.ratelimit,
        'throttledratelimit': opts.throttledratelimit,
        'overwrites': opts.overwrites,
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -480,6 +480,9 @@ class InfoExtractor:
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

+    The _ENABLED attribute should be set to False for IEs that
+    are disabled by default and must be explicitly enabled.
+
    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
@@ -491,6 +494,7 @@ class InfoExtractor:
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
+    _ENABLED = True
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
@@ -3941,3 +3945,12 @@ class SearchInfoExtractor(InfoExtractor):
    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
+
+
+class UnsupportedURLIE(InfoExtractor):
+    _VALID_URL = '.*'
+    _ENABLED = False
+    IE_DESC = False
+
+    def _real_extract(self, url):
+        raise UnsupportedError(url)
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -353,10 +353,20 @@ def create_parser():
        '--extractor-descriptions',
        action='store_true', dest='list_extractor_descriptions', default=False,
        help='Output descriptions of all supported extractors and exit')
+    general.add_option(
+        '--use-extractors', '--ies',
+        action='callback', dest='allowed_extractors', metavar='NAMES', type='str',
+        default=[], callback=_list_from_options_callback,
+        help=(
+            'Extractor names to use separated by commas. '
+            'You can also use regexes, "all", "default" and "end" (end URL matching); '
+            'e.g. --ies "holodex.*,end,youtube". '
+            'Prefix the name with a "-" to exclude it, e.g. --ies default,-generic. '
+            'Use --list-extractors for a list of available extractor names'))
    general.add_option(
        '--force-generic-extractor',
        action='store_true', dest='force_generic_extractor', default=False,
-        help='Force extraction to use the generic extractor')
+        help=optparse.SUPPRESS_HELP)
    general.add_option(
        '--default-search',
        dest='default_search', metavar='PREFIX',