[utils] Improve parsing for nested HTML elements (#2129)

and add functions to return the HTML of elements

Authored by: zmousm
3 years ago · 6f32a0b5b7
parent e8736539f3
commit 6f32a0b5b7
2 changed files with 216 additions and 28 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -44,6 +44,12 @@ from yt_dlp.utils import (
    get_element_by_attribute,
    get_elements_by_class,
    get_elements_by_attribute,
    get_element_html_by_class,
    get_element_html_by_attribute,
    get_elements_html_by_class,
    get_elements_html_by_attribute,
    get_elements_text_and_html_by_attribute,
    get_element_text_and_html_by_tag,
    InAdvancePagedList,
    int_or_none,
    intlist_to_bytes,
@ -118,6 +124,7 @@ from yt_dlp.compat import (
    compat_chr,
    compat_etree_fromstring,
    compat_getenv,
    compat_HTMLParseError,
    compat_os_name,
    compat_setenv,
 )
@ -1575,46 +1582,116 @@ Line 1
        self.assertEqual(urshift(3, 1), 1)
        self.assertEqual(urshift(-3, 1), 2147483646)
    # Shared fixture: a single <span> carrying two classes ("foo" and "bar").
    # Tests compare against html.strip(), so the surrounding whitespace is
    # not significant but the element text itself is.
    GET_ELEMENT_BY_CLASS_TEST_STRING = '''
        <span class="foo bar">nice</span>
    '''
    def test_get_element_by_class(self):
-        html = '''
+        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
            <span class="foo bar">nice</span>
        '''
        self.assertEqual(get_element_by_class('foo', html), 'nice')
        self.assertEqual(get_element_by_class('no-such-class', html), None)
    def test_get_element_html_by_class(self):
        # A matching class returns the whole element (opening tag, content and
        # closing tag); a class that is absent from the document returns None.
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
        self.assertEqual(get_element_html_by_class('foo', html), html.strip())
        # The no-match case must exercise the *_html_* function under test,
        # not its text-returning sibling get_element_by_class.
        self.assertEqual(get_element_html_by_class('no-such-class', html), None)
    # Shared fixture: a <div> with a quoted attribute (itemprop) followed by a
    # valueless attribute (itemscope).
    GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
        <div itemprop="author" itemscope>foo</div>
    '''
    def test_get_element_by_attribute(self):
-        html = '''
+        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
            <span class="foo bar">nice</span>
        '''
        self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
        self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
        self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
-        html = '''
+        html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
            <div itemprop="author" itemscope>foo</div>
        '''
        self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
    def test_get_element_html_by_attribute(self):
        # The full attribute value matches and yields the whole element;
        # a partial value or an unknown value yields None.
        span_html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
        self.assertEqual(
            get_element_html_by_attribute('class', 'foo bar', span_html),
            span_html.strip())
        for unmatched_value in ('foo', 'no-such-foo'):
            self.assertEqual(
                get_element_html_by_attribute('class', unmatched_value, span_html), None)
        # Second fixture: the element also carries a valueless attribute
        div_html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
        self.assertEqual(
            get_element_html_by_attribute('itemprop', 'author', div_html),
            div_html.strip())
    # Shared fixture: two adjacent <span> elements with identical classes.
    GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
        <span class="foo bar">nice</span><span class="foo bar">also nice</span>
    '''
    # Expected whole-element HTML for each <span> in the fixture above, in document order.
    GET_ELEMENTS_BY_CLASS_RES = ['<span class="foo bar">nice</span>', '<span class="foo bar">also nice</span>']
    def test_get_elements_by_class(self):
-        html = '''
+        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
            <span class="foo bar">nice</span><span class="foo bar">also nice</span>
        '''
        self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
        self.assertEqual(get_elements_by_class('no-such-class', html), [])
    def test_get_elements_html_by_class(self):
        # Every element carrying the class is returned as whole-element HTML;
        # an unknown class produces an empty list.
        markup = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        found = get_elements_html_by_class('foo', markup)
        self.assertEqual(found, self.GET_ELEMENTS_BY_CLASS_RES)
        missing = get_elements_html_by_class('no-such-class', markup)
        self.assertEqual(missing, [])
    def test_get_elements_by_attribute(self):
-        html = '''
+        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
            <span class="foo bar">nice</span><span class="foo bar">also nice</span>
        '''
        self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
        self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
    def test_get_elements_html_by_attribute(self):
        # The exact attribute value returns the HTML of both elements;
        # a partial value or an unknown value returns an empty list.
        markup = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(
            get_elements_html_by_attribute('class', 'foo bar', markup),
            self.GET_ELEMENTS_BY_CLASS_RES)
        for unmatched_value in ('foo', 'no-such-foo'):
            self.assertEqual(
                get_elements_html_by_attribute('class', unmatched_value, markup), [])
    def test_get_elements_text_and_html_by_attribute(self):
        # Each match is a (text, whole element html) pair, in document order
        markup = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        expected_pairs = list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES))
        self.assertEqual(
            get_elements_text_and_html_by_attribute('class', 'foo bar', markup),
            expected_pairs)
        # A partial value or an unknown value matches nothing
        for unmatched_value in ('foo', 'no-such-foo'):
            self.assertEqual(
                get_elements_text_and_html_by_attribute('class', unmatched_value, markup), [])
    # Fixture for tag-based extraction: a stray closing </p>, then an outer <div>
    # that contains a <span> and a nested <div>, then trailing text after the
    # outer </div>.
    GET_ELEMENT_BY_TAG_TEST_STRING = '''
    random text lorem ipsum</p>
    <div>
        this should be returned
        <span>this should also be returned</span>
        <div>
            this should also be returned
        </div>
        closing tag above should not trick, so this should also be returned
    </div>
    but this text should not be returned
    '''
    # NOTE: the slice offsets below are hard-coded character positions into the
    # stripped fixture above, so any change to its text or whitespace requires
    # recomputing them.
    GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276]
    # [5:-6] presumably drops the leading '<div>' (5 chars) and trailing '</div>' (6 chars)
    GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6]
    GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119]
    # [6:-7] presumably drops the leading '<span>' (6 chars) and trailing '</span>' (7 chars)
    GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7]
    def test_get_element_text_and_html_by_tag(self):
        markup = self.GET_ELEMENT_BY_TAG_TEST_STRING
        # Outer <div>: the nested <div>...</div> must not terminate the match early
        expected_div = (
            self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT,
            self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML)
        self.assertEqual(get_element_text_and_html_by_tag('div', markup), expected_div)
        # Inner <span>
        expected_span = (
            self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT,
            self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML)
        self.assertEqual(get_element_text_and_html_by_tag('span', markup), expected_span)
        # A tag that does not occur in the document raises a parse error
        with self.assertRaises(compat_HTMLParseError):
            get_element_text_and_html_by_tag('article', markup)
    def test_iri_to_uri(self):
        self.assertEqual(
            iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@ -416,17 +416,33 @@ def get_element_by_id(id, html):
    return get_element_by_attribute('id', id, html)
 def get_element_html_by_id(id, html):
    """Return the whole element (tags included) whose id attribute equals the given ID, or None if there is none"""
    element_html = get_element_html_by_attribute('id', id, html)
    return element_html
 def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given class, or None if no tag matches"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
 def get_element_html_by_class(class_name, html):
    """Return the whole html of the first tag carrying the given class, or None if no tag matches"""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
 def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose attribute matches value, or None if no tag matches"""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
 def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    """Return the whole html of the first tag whose attribute matches value, or None if no tag matches"""
    matches = get_elements_html_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
 def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
@ -434,31 +450,126 @@ def get_elements_by_class(class_name, html):
        html, escape_value=False)
-def get_elements_by_attribute(attribute, value, html, escape_value=True):
+def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
 def get_elements_by_attribute(*args, **kwargs):
    """Return a list with the content of every tag that has the specified attribute in the passed HTML document"""
    # Arguments are forwarded unchanged; only the text half of each (text, html) pair is kept
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [text for text, _whole in pairs]
 def get_elements_html_by_attribute(*args, **kwargs):
    """Return a list with the whole html of every tag that has the specified attribute in the passed HTML document"""
    # Arguments are forwarded unchanged; only the html half of each (text, html) pair is kept
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [element_html for _text, element_html in pairs]
 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    value = re.escape(value) if escape_value else value
    retlist = []
    for m in re.finditer(r'''(?xs)
-        <([a-zA-Z0-9:._-]+)
+        <(?P<tag>[a-zA-Z0-9:._-]+)
-         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+         (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
-         \s+%s=['"]?%s['"]?
+         \s+%(attribute)s(?:=%(value)s|\s*=\s*(?P<_q>['"]?)%(value)s(?P=_q))
-         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+         (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
        \s*>
-        (?P<content>.*?)
+    ''' % {'attribute': re.escape(attribute), 'value': value}, html):
-        </\1>
+        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')
-        if res.startswith('"') or res.startswith("'"):
+        retlist.append((
-            res = res[1:-1]
+            unescapeHTML(re.sub(r'(?s)^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content)),
-
+            whole,
-        retlist.append(unescapeHTML(res))
+        ))
    return retlist
 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser that keeps a stack of the tags opened so far and raises
    HTMLBreakOnClosingTagException as soon as a closing tag leaves that
    stack empty. It can be used as a context manager.
    """

    class HTMLBreakOnClosingTagException(Exception):
        """Raised when a closing tag empties the stack of open tags"""

    def __init__(self):
        # Names of the currently open tags, outermost first
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        """
        Do nothing. The break exception interrupts handle_endtag, so input
        is still buffered at that point; that data is of no further interest
        and is dropped rather than processed here.
        """

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        open_tags = self.tagstack
        if not open_tags:
            raise compat_HTMLParseError('no tags in the stack')
        # Unwind the stack until the tag being closed is found; anything
        # opened after it and never closed is discarded along the way
        matched = False
        while open_tags and not matched:
            matched = open_tags.pop() == tag
        if not matched:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not open_tags:
            raise self.HTMLBreakOnClosingTagException()
 def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return a tuple of its content (text) and the whole element (html).

    Nested elements are tracked with HTMLBreakOnClosingTagParser, so a closing
    tag that belongs to an inner element of the same name does not end the
    match. Raises compat_HTMLParseError when the opening tag is missing or
    malformed, or when no closing tag that ends the element is found.
    """
    def index_or_raise(text, needle, start, error):
        try:
            return text.index(needle, start)
        except ValueError:
            raise error

    end_marker = f'</{tag}>'
    element_start = index_or_raise(
        html, f'<{tag}', 0, compat_HTMLParseError(f'opening {tag} tag not found'))
    # The content begins right after the first '>' following the opening '<tag'
    inner_start = 1 + index_or_raise(
        html, '>', element_start, compat_HTMLParseError(f'malformed opening {tag} tag'))
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[element_start:inner_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        cursor = inner_start
        while cursor < len(html):
            # Feed the parser up to and including the next candidate closing
            # tag; it raises the break exception once the element is balanced
            close_start = index_or_raise(
                html, end_marker, cursor,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            close_end = close_start + len(end_marker)
            try:
                parser.feed(html[cursor:close_end])
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[inner_start:close_start], html[element_start:close_end]
            cursor = close_end
        raise compat_HTMLParseError('unexpected end of html')
 class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""