From 6f32a0b5b70fe0f8b14c2946b40840b795044662 Mon Sep 17 00:00:00 2001 From: Zenon Mousmoulas Date: Wed, 5 Jan 2022 20:37:49 +0200 Subject: [PATCH] [utils] Improve parsing for nested HTML elements (#2129) and add functions to return the HTML of elements Authored by: zmousm --- test/test_utils.py | 107 ++++++++++++++++++++++++++++++----- yt_dlp/utils.py | 137 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 216 insertions(+), 28 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 1a9f71947..c3ec798dc 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -44,6 +44,12 @@ from yt_dlp.utils import ( get_element_by_attribute, get_elements_by_class, get_elements_by_attribute, + get_element_html_by_class, + get_element_html_by_attribute, + get_elements_html_by_class, + get_elements_html_by_attribute, + get_elements_text_and_html_by_attribute, + get_element_text_and_html_by_tag, InAdvancePagedList, int_or_none, intlist_to_bytes, @@ -118,6 +124,7 @@ from yt_dlp.compat import ( compat_chr, compat_etree_fromstring, compat_getenv, + compat_HTMLParseError, compat_os_name, compat_setenv, ) @@ -1575,46 +1582,116 @@ Line 1 self.assertEqual(urshift(3, 1), 1) self.assertEqual(urshift(-3, 1), 2147483646) + GET_ELEMENT_BY_CLASS_TEST_STRING = ''' + nice + ''' + def test_get_element_by_class(self): - html = ''' - nice - ''' + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING self.assertEqual(get_element_by_class('foo', html), 'nice') self.assertEqual(get_element_by_class('no-such-class', html), None) + def test_get_element_html_by_class(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_html_by_class('foo', html), html.strip()) + self.assertEqual(get_element_by_class('no-such-class', html), None) + + GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = ''' + + ''' + def test_get_element_by_attribute(self): - html = ''' - nice - ''' + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice') self.assertEqual(get_element_by_attribute('class', 'foo', html), None) self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None) - html = ''' - - ''' + html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo') + def test_get_element_html_by_attribute(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip()) + self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None) + self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None) + + html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING + + self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip()) + + GET_ELEMENTS_BY_CLASS_TEST_STRING = ''' + nicealso nice + ''' + GET_ELEMENTS_BY_CLASS_RES = ['nice', 'also nice'] + def test_get_elements_by_class(self): - html = ''' - nicealso nice - ''' + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice']) self.assertEqual(get_elements_by_class('no-such-class', html), []) + def test_get_elements_html_by_class(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES) + self.assertEqual(get_elements_html_by_class('no-such-class', html), []) + def test_get_elements_by_attribute(self): - html = ''' - nicealso nice - ''' + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice']) self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + def test_get_elements_html_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html), self.GET_ELEMENTS_BY_CLASS_RES) + self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), []) + + def test_get_elements_text_and_html_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual( + get_elements_text_and_html_by_attribute('class', 'foo bar', html), + list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES))) + self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), []) + + GET_ELEMENT_BY_TAG_TEST_STRING = ''' + random text lorem ipsum

+
+ this should be returned + this should also be returned +
+ this should also be returned +
+ closing tag above should not trick, so this should also be returned +
+ but this text should not be returned + ''' + GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276] + GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6] + GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119] + GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7] + + def test_get_element_text_and_html_by_tag(self): + html = self.GET_ELEMENT_BY_TAG_TEST_STRING + + self.assertEqual( + get_element_text_and_html_by_tag('div', html), + (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML)) + self.assertEqual( + get_element_text_and_html_by_tag('span', html), + (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML)) + self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) + def test_iri_to_uri(self): self.assertEqual( iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index d6f1ff708..826ab5d29 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -416,17 +416,33 @@ def get_element_by_id(id, html): return get_element_by_attribute('id', id, html) +def get_element_html_by_id(id, html): + """Return the html of the tag with the specified ID in the passed HTML document""" + return get_element_html_by_attribute('id', id, html) + + def get_element_by_class(class_name, html): """Return the content of the first tag with the specified class in the passed HTML document""" retval = get_elements_by_class(class_name, html) return retval[0] if retval else None +def get_element_html_by_class(class_name, html): + """Return the html of the first tag with the specified class in the passed HTML document""" + retval = get_elements_html_by_class(class_name, html) + return retval[0] if retval else None + + def get_element_by_attribute(attribute, value, html, escape_value=True): retval = get_elements_by_attribute(attribute, value, html, escape_value) return retval[0] if retval else None +def get_element_html_by_attribute(attribute, value, html, escape_value=True): + retval = get_elements_html_by_attribute(attribute, value, html, escape_value) + return retval[0] if retval else None + + def get_elements_by_class(class_name, html): """Return the content of all tags with the specified class in the passed HTML document as a list""" return get_elements_by_attribute( @@ -434,31 +450,126 @@ def get_elements_by_class(class_name, html): html, escape_value=False) -def get_elements_by_attribute(attribute, value, html, escape_value=True): +def get_elements_html_by_class(class_name, html): + """Return the html of all tags with the specified class in the passed HTML document as a list""" + return get_elements_html_by_attribute( + 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + html, escape_value=False) + + +def get_elements_by_attribute(*args, **kwargs): """Return the content of the tag with the specified attribute in the passed HTML document""" + return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)] + + +def get_elements_html_by_attribute(*args, **kwargs): + """Return the html of the tag with the specified attribute in the passed HTML document""" + return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)] + + +def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True): + """ + Return the text (content) and the html (whole) of the tag with the specified + attribute in the passed HTML document + """ value = re.escape(value) if escape_value else value retlist = [] for m in re.finditer(r'''(?xs) - <([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + <(?P[a-zA-Z0-9:._-]+) + (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*? + \s+%(attribute)s(?:=%(value)s|\s*=\s*(?P<_q>['"]?)%(value)s(?P=_q)) + (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*? \s*> - (?P.*?) - - ''' % (re.escape(attribute), value), html): - res = m.group('content') + ''' % {'attribute': re.escape(attribute), 'value': value}, html): + content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) - if res.startswith('"') or res.startswith("'"): - res = res[1:-1] - - retlist.append(unescapeHTML(res)) + retlist.append(( + unescapeHTML(re.sub(r'(?s)^(?P["\'])(?P.*)(?P=q)$', r'\g', content)), + whole, + )) return retlist +class HTMLBreakOnClosingTagParser(compat_HTMLParser): + """ + HTML parser which raises HTMLBreakOnClosingTagException upon reaching the + closing tag for the first opening tag it has encountered, and can be used + as a context manager + """ + + class HTMLBreakOnClosingTagException(Exception): + pass + + def __init__(self): + self.tagstack = collections.deque() + compat_HTMLParser.__init__(self) + + def __enter__(self): + return self + + def __exit__(self, *_): + self.close() + + def close(self): + # handle_endtag does not return upon raising HTMLBreakOnClosingTagException, + # so data remains buffered; we no longer have any interest in it, thus + # override this method to discard it + pass + + def handle_starttag(self, tag, _): + self.tagstack.append(tag) + + def handle_endtag(self, tag): + if not self.tagstack: + raise compat_HTMLParseError('no tags in the stack') + while self.tagstack: + inner_tag = self.tagstack.pop() + if inner_tag == tag: + break + else: + raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found') + if not self.tagstack: + raise self.HTMLBreakOnClosingTagException() + + +def get_element_text_and_html_by_tag(tag, html): + """ + For the first element with the specified tag in the passed HTML document + return its' content (text) and the whole element (html) + """ + def find_or_raise(haystack, needle, exc): + try: + return haystack.index(needle) + except ValueError: + raise exc + closing_tag = f'' + whole_start = find_or_raise( + html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found')) + content_start = find_or_raise( + html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag')) + content_start += whole_start + 1 + with HTMLBreakOnClosingTagParser() as parser: + parser.feed(html[whole_start:content_start]) + if not parser.tagstack or parser.tagstack[0] != tag: + raise compat_HTMLParseError(f'parser did not match opening {tag} tag') + offset = content_start + while offset < len(html): + next_closing_tag_start = find_or_raise( + html[offset:], closing_tag, + compat_HTMLParseError(f'closing {tag} tag not found')) + next_closing_tag_end = next_closing_tag_start + len(closing_tag) + try: + parser.feed(html[offset:offset + next_closing_tag_end]) + offset += next_closing_tag_end + except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException: + return html[content_start:offset + next_closing_tag_start], \ + html[whole_start:offset + next_closing_tag_end] + raise compat_HTMLParseError('unexpected end of html') + + class HTMLAttributeParser(compat_HTMLParser): """Trivial HTML parser to gather the attributes for a single element"""