From 5e51f4a8ad25aab0f78000d8812f4f1af490953e Mon Sep 17 00:00:00 2001
From: pukkandan
Date: Thu, 3 Feb 2022 13:16:47 +0530
Subject: [PATCH] [glomex] Simplify embed detection (#2600)

Closes #2512
---
 yt_dlp/extractor/glomex.py | 72 ++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 41 deletions(-)

diff --git a/yt_dlp/extractor/glomex.py b/yt_dlp/extractor/glomex.py
index a6477faab..d9ef4338f 100644
--- a/yt_dlp/extractor/glomex.py
+++ b/yt_dlp/extractor/glomex.py
@@ -7,6 +7,7 @@ import urllib.parse
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
+    extract_attributes,
     ExtractorError,
     int_or_none,
     parse_qs,
@@ -177,49 +178,38 @@ class GlomexEmbedIE(GlomexBaseIE):
 
     @classmethod
     def _extract_urls(cls, webpage, origin_url):
-        VALID_SRC = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
-
         # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
-        EMBED_RE = r'''(?x)(?:
-            <iframe[^>]+?src=(?P<_q1>%(quot_re)s)(?P<url>%(url_re)s)(?P=_q1)|
-            <(?P<html_tag>glomex-player|div)(?:
-                data-integration-id=(?P<_q2>%(quot_re)s)(?P<integration_html>(?:(?!(?P=_q2)).)+)(?P=_q2)|
-                data-playlist-id=(?P<_q3>%(quot_re)s)(?P<id_html>(?:(?!(?P=_q3)).)+)(?P=_q3)|
-                data-glomex-player=(?P<_q4>%(quot_re)s)(?P<glomex_player>true)(?P=_q4)|
-                [^>]*?
-            )+>|
-            # naive parsing of inline scripts for hard-coded integration parameters
-            <(?P<script_tag>script)[^<]*?>(?:
-                (?P<_stjs1>dataset\.)?integrationId\s*(?(_stjs1)=|:)\s*
-                (?P<_q5>%(quot_re)s)(?P<integration_js>(?:(?!(?P=_q5)).)+)(?P=_q5)\s*(?(_stjs1);|,)?|
-                (?P<_stjs2>dataset\.)?playlistId\s*(?(_stjs2)=|:)\s*
-                (?P<_q6>%(quot_re)s)(?P<id_js>(?:(?!(?P=_q6)).)+)(?P=_q6)\s*(?(_stjs2);|,)?|
-                (?:\s|.)*?
-            )+</script>
-        )''' % {'quot_re': r'["\']', 'url_re': VALID_SRC}
-
-        for mtup in re.findall(EMBED_RE, webpage):
-            # re.finditer causes a memory spike. See https://github.com/yt-dlp/yt-dlp/issues/2512
-            mdict = dict(zip((
-                'url', '_',
-                'html_tag', '_', 'integration_html', '_', 'id_html', '_', 'glomex_player',
-                'script_tag', '_', '_', 'integration_js', '_', 'id_js',
-            ), mtup))
-            if mdict.get('url'):
-                url = unescapeHTML(mdict['url'])
-                if not cls.suitable(url):
-                    continue
+        quot_re = r'["\']'
+
+        regex = fr'''(?x)
+            <iframe[^>]+?src=(?P<q>{quot_re})(?P<url>
+                (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
+            )(?P=q)'''
+        for mobj in re.finditer(regex, webpage):
+            url = unescapeHTML(mobj.group('url'))
+            if cls.suitable(url):
                 yield cls._smuggle_origin_url(url, origin_url)
-            elif mdict.get('html_tag'):
-                if mdict['html_tag'] == 'div' and not mdict.get('glomex_player'):
-                    continue
-                if not mdict.get('video_id_html') or not mdict.get('integration_html'):
-                    continue
-                yield cls.build_player_url(mdict['video_id_html'], mdict['integration_html'], origin_url)
-            elif mdict.get('script_tag'):
-                if not mdict.get('video_id_js') or not mdict.get('integration_js'):
-                    continue
-                yield cls.build_player_url(mdict['video_id_js'], mdict['integration_js'], origin_url)
+
+        regex = fr'''(?x)
+            <glomex-player [^>]+?>|
+            <div[^>]* data-glomex-player=(?P<q>{quot_re})true(?P=q)[^>]*>'''
+        for mobj in re.finditer(regex, webpage):
+            attrs = extract_attributes(mobj.group(0))
+            if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
+                yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url)
+
+        # naive parsing of inline scripts for hard-coded integration parameters
+        regex = fr'''(?x)
+            (?P<is_js>dataset\.)?%s\s*(?(is_js)=|:)\s*
+            (?P<q>{quot_re})(?P<id>(?:(?!(?P=q)).)+)(?P=q)\s'''
+        for mobj in re.finditer(r'(?x)<script[^<]*>.+?</script>', webpage):
+            script = mobj.group(0)
+            integration_id = re.search(regex % 'integrationId', script)
+            if not integration_id:
+                continue
+            playlist_id = re.search(regex % 'playlistId', script)
+            if playlist_id:
+                yield cls.build_player_url(playlist_id, integration_id, origin_url)
 
     def _real_extract(self, url):
         url, origin_url = self._unsmuggle_origin_url(url)