[YouTube] Fix incorrect chapter extraction

* align `_get_text()` with yt-dlp (thx, passim) at last
pull/29686/merge
dirkf 4 weeks ago
parent 6f5d4c3289
commit 39378f7b5c

@ -533,6 +533,27 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'uploader': uploader, 'uploader': uploader,
} }
@staticmethod
def _get_text(data, *path_list, **kw_max_runs):
    """Return the first non-empty text found in `data`, or None.

    data:       the object to search
    path_list:  zero or more traversal paths into `data`; they are tried
                in order, and with no path `data` itself is examined
    max_runs:   (keyword only) optional upper limit on the number of runs
                joined to form the text; by default all runs of a
                candidate are used

    For each path, candidate run lists are collected with traverse_obj()
    from a 'simpleText' value, from a 'runs' value, or from a list whose
    items have a string 'text'. The 'text' values of the (possibly
    truncated) runs are concatenated, and the first non-empty result is
    returned. If nothing matches, the function falls through and
    returns None.
    """
    max_runs = kw_max_runs.get('max_runs')
    for path in path_list or [None]:
        if path is None:
            obj = [data]  # shortcut
        else:
            # convert the path to a tuple *before* appending: if variadic()
            # passes a list path through unchanged, `list + tuple` would
            # raise TypeError
            obj = traverse_obj(data, tuple(variadic(path)) + (all,))
        for runs in traverse_obj(
                obj, ('simpleText', {'text': T(compat_str)}, all, filter),
                ('runs', lambda _, r: isinstance(r.get('text'), compat_str), all, filter),
                (T(list), lambda _, r: isinstance(r.get('text'), compat_str)),
                default=[]):
            # compute the limit per candidate; overwriting `max_runs` here
            # would make the length of the first candidate cap all later
            # ones when the caller gave no limit
            limit = int_or_none(max_runs, default=len(runs))
            if limit < len(runs):
                runs = runs[:limit]
            text = ''.join(traverse_obj(runs, (Ellipsis, 'text')))
            if text:
                return text
@staticmethod @staticmethod
def _extract_thumbnails(data, *path_list, **kw_final_key): def _extract_thumbnails(data, *path_list, **kw_final_key):
""" """
@ -2493,10 +2514,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self.url_result( return self.url_result(
trailer_video_id, self.ie_key(), trailer_video_id) trailer_video_id, self.ie_key(), trailer_video_id)
def get_text(x): get_text = lambda x: self._get_text(x) or ''
return ''.join(traverse_obj(
x, (('simpleText',),), ('runs', Ellipsis, 'text'),
expected_type=compat_str))
search_meta = ( search_meta = (
(lambda x: self._html_search_meta(x, webpage, default=None)) (lambda x: self._html_search_meta(x, webpage, default=None))
@ -2960,24 +2978,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
chapters = self._extract_chapters_from_json( chapters = self._extract_chapters_from_json(
initial_data, video_id, duration) initial_data, video_id, duration)
if not chapters: if not chapters:
for engagment_pannel in (initial_data.get('engagementPanels') or []):
contents = try_get(
engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
list)
if not contents:
continue
def chapter_time(mmlir): def chapter_time(mmlir):
return parse_duration( return parse_duration(
get_text(mmlir.get('timeDescription'))) get_text(mmlir.get('timeDescription')))
for markers in traverse_obj(initial_data, (
'engagementPanels', Ellipsis, 'engagementPanelSectionListRenderer',
'content', 'macroMarkersListRenderer', 'contents', T(list))):
chapters = [] chapters = []
for next_num, content in enumerate(contents, start=1): for next_num, content in enumerate(markers, start=1):
mmlir = content.get('macroMarkersListItemRenderer') or {} mmlir = content.get('macroMarkersListItemRenderer') or {}
start_time = chapter_time(mmlir) start_time = chapter_time(mmlir)
end_time = (traverse_obj( end_time = (traverse_obj(markers, (
contents, (next_num, 'macroMarkersListItemRenderer', T(chapter_time))) next_num, 'macroMarkersListItemRenderer', T(chapter_time)))
if next_num < len(contents) else duration) if next_num < len(markers) else duration)
if start_time is None or end_time is None: if start_time is None or end_time is None:
continue continue
chapters.append({ chapters.append({
@ -3536,12 +3551,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
T(dict.items), lambda _, k_v: k_v[0].startswith('grid') and k_v[0].endswith('Renderer'), T(dict.items), lambda _, k_v: k_v[0].startswith('grid') and k_v[0].endswith('Renderer'),
1, T(dict)), get_all=False) 1, T(dict)), get_all=False)
@staticmethod
def _get_text(r, k):
    """Look up the text stored under key `k` of `r`.

    Two alternative paths are handed to traverse_obj(), in this order:
    the 'text' of the first item of `r[k]['runs']`, then
    `r[k]['simpleText']`; `txt_or_none` is passed as the expected type.
    """
    first_run_text = (k, 'runs', 0, 'text')
    simple_text = (k, 'simpleText')
    return traverse_obj(
        r, first_run_text, simple_text, expected_type=txt_or_none)
def _grid_entries(self, grid_renderer): def _grid_entries(self, grid_renderer):
for item in traverse_obj(grid_renderer, ('items', Ellipsis, T(dict))): for item in traverse_obj(grid_renderer, ('items', Ellipsis, T(dict))):
lockup_view_model = traverse_obj(item, ('lockupViewModel', T(dict))) lockup_view_model = traverse_obj(item, ('lockupViewModel', T(dict)))

Loading…
Cancel
Save