[InfoExtractor] Add `_match_valid_url()` class method and refactor

* API compatible with yt-dlp
* also support Sequence of patterns in _VALID_URL
* one place to compile _VALID_URL
* TODO: remove existing extractor shims
pull/29318/head
dirkf 10 months ago
parent a190b55964
commit b2ba24bb02
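
To make the bullet points above concrete, here is a minimal sketch (not part of the commit) of how an extractor could look after this change; the site, URL shapes and field values are invented. _VALID_URL may now be a single regexp or a sequence of alternatives, and the yt-dlp-compatible _match_valid_url() returns the match object of the first pattern that matches.

    # Hypothetical extractor, for illustration only
    from youtube_dl.extractor.common import InfoExtractor


    class ExampleIE(InfoExtractor):
        # two alternative URL shapes for the same (made-up) site
        _VALID_URL = (
            r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)',
            r'https?://(?:www\.)?example\.com/embed/(?P<id>\d+)',
        )

        def _real_extract(self, url):
            # same call as in yt-dlp: the match of the first pattern that hits
            mobj = self._match_valid_url(url)
            video_id = mobj.group('id')
            return {
                'id': video_id,
                'title': 'stub title',
                'url': url,
            }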

devscripts/make_lazy_extractors.py
@@ -4,6 +4,7 @@ from inspect import getsource
 import io
 import os
 from os.path import dirname as dirn
+import re
 import sys
 
 print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr)
@@ -29,11 +30,18 @@ from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
 with open('devscripts/lazy_load_template.py', 'rt') as f:
     module_template = f.read()
 
+
+def get_source(m):
+    return re.sub(r'(?m)^\s*#.*\n', '', getsource(m))
+
+
 module_contents = [
-    module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
+    module_template,
+    get_source(InfoExtractor.suitable),
+    get_source(InfoExtractor._match_valid_url) + '\n',
     'class LazyLoadSearchExtractor(LazyLoadExtractor):\n    pass\n',
     # needed for suitable() methods of Youtube extractor (see #28780)
-    'from youtube_dl.utils import parse_qs\n',
+    'from youtube_dl.utils import parse_qs, variadic\n',
 ]
 
 ie_template = '''
@@ -66,7 +74,7 @@ def build_lazy_ie(ie, name):
         valid_url=valid_url,
         module=ie.__module__)
     if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
-        s += '\n' + getsource(ie.suitable)
+        s += '\n' + get_source(ie.suitable)
     if hasattr(ie, '_make_valid_url'):
         # search extractors
         s += make_valid_template.format(valid_url=ie._make_valid_url())
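
For context on the new get_source() helper: it strips whole-line comments from the copied method source before it is written into lazy_extractors.py, so the explanatory comments added to common.py below do not bloat the generated file. A rough, stand-alone illustration of that regexp (the sample string is made up):

    import re

    SAMPLE = (
        '    # a whole-line comment: removed by get_source()\n'
        '    return cls._match_valid_url(url) is not None  # trailing comment: kept\n'
    )
    print(re.sub(r'(?m)^\s*#.*\n', '', SAMPLE))
    # prints only the `return ...` line; lines consisting solely of a comment are dropped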

youtube_dl/extractor/common.py
@@ -83,6 +83,7 @@ from ..utils import (
     urljoin,
     url_basename,
     url_or_none,
+    variadic,
     xpath_element,
     xpath_text,
     xpath_with_ns,
@@ -371,9 +372,22 @@ class InfoExtractor(object):
     title, description etc.
 
-    Subclasses of this one should re-define the _real_initialize() and
-    _real_extract() methods and define a _VALID_URL regexp.
-    Probably, they should also be added to the list of extractors.
+    A subclass of InfoExtractor must be defined to handle each specific site (or
+    several sites). Such a concrete subclass should be added to the list of
+    extractors. It should also:
+    * define its _VALID_URL attribute as a regexp, or a Sequence of alternative
+      regexps (but see below)
+    * re-define the _real_extract() method
+    * optionally re-define the _real_initialize() method.
+
+    An extractor subclass may also override suitable() if necessary, but the
+    function signature must be preserved and the function must import everything
+    it needs (except other extractors), so that lazy_extractors works correctly.
+
+    If the subclass's suitable() and _real_extract() functions avoid using
+    _VALID_URL, the subclass need not set that class attribute.
+
+    An abstract subclass of InfoExtractor may be used to simplify implementation
+    within an extractor module; it should not be added to the list of extractors.
 
     _GEO_BYPASS attribute may be set to False in order to disable
     geo restriction bypass mechanisms for a particular extractor.
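
As an aside on the last paragraph of the new docstring, this is a hedged sketch (all names hypothetical, not part of the commit) of an abstract base class shared inside one extractor module: only the concrete class would be added to the extractors list, and the base class needs no _VALID_URL of its own.

    from youtube_dl.extractor.common import InfoExtractor


    class ExampleSiteBaseIE(InfoExtractor):
        # abstract helper base: no _VALID_URL, never listed in extractors.py
        def _call_api(self, path, video_id):
            return self._download_json('https://api.example.com/' + path, video_id)


    class ExampleSiteVideoIE(ExampleSiteBaseIE):
        _VALID_URL = r'https?://(?:www\.)?example\.com/video/(?P<id>\d+)'

        def _real_extract(self, url):
            video_id = self._match_id(url)
            data = self._call_api('video/%s' % video_id, video_id)
            return {
                'id': video_id,
                'title': data.get('title') or video_id,
                'url': data['stream_url'],
            }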
@@ -409,21 +423,32 @@ class InfoExtractor(object):
         self.set_downloader(downloader)
 
     @classmethod
-    def suitable(cls, url):
-        """Receives a URL and returns True if suitable for this IE."""
-
+    def __match_valid_url(cls, url):
         # This does not use has/getattr intentionally - we want to know whether
-        # we have cached the regexp for *this* class, whereas getattr would also
-        # match the superclass
+        # we have cached the regexp for cls, whereas getattr would also
+        # match its superclass
         if '_VALID_URL_RE' not in cls.__dict__:
-            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        return cls._VALID_URL_RE.match(url) is not None
+            # _VALID_URL can now be a list/tuple of patterns
+            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+        # 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in 2.7
+        for p in cls._VALID_URL_RE:
+            p = p.match(url)
+            if p:
+                return p
+
+    # The public alias can safely be overridden, as in some back-ports
+    _match_valid_url = __match_valid_url
+
+    @classmethod
+    def suitable(cls, url):
+        """Receives a URL and returns True if suitable for this IE."""
+        # This function must import everything it needs (except other extractors),
+        # so that lazy_extractors works correctly
+        return cls.__match_valid_url(url) is not None
 
     @classmethod
     def _match_id(cls, url):
-        if '_VALID_URL_RE' not in cls.__dict__:
-            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        m = cls._VALID_URL_RE.match(url)
+        m = cls.__match_valid_url(url)
         assert m
         return compat_str(m.group('id'))
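
The docstring above also allows overriding suitable(). A hedged sketch of how such an override could look under the new rules, with the signature preserved and imports done inside the function so the source copied by make_lazy_extractors.py keeps working (the extractor and URL shape are invented):

    from youtube_dl.extractor.common import InfoExtractor


    class ExampleUserIE(InfoExtractor):
        _VALID_URL = r'https?://(?:www\.)?example\.com/user/(?P<id>[^/?#]+)'

        @classmethod
        def suitable(cls, url):
            # reject links carrying a playlist query parameter; parse_qs is
            # imported here so the copied source still runs in lazy_extractors.py
            from youtube_dl.utils import parse_qs
            if parse_qs(url).get('list'):
                return False
            return super(ExampleUserIE, cls).suitable(url)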

youtube_dl/extractor/globalplayer.py
@@ -18,12 +18,6 @@ from ..utils import (
 
 class GlobalPlayerBaseIE(InfoExtractor):
 
-    import re
-
-    @classmethod
-    def _match_valid_url(cls, url):
-        return cls.re.match(cls._VALID_URL, url)
-
     def _get_page_props(self, url, video_id):
         webpage = self._download_webpage(url, video_id)
         return self._search_nextjs_data(webpage, video_id)['props']['pageProps']
