From 22ccd5420b3eb0782776071f12cccd1fedaa1fd0 Mon Sep 17 00:00:00 2001
From: mushbite
Date: Sat, 4 Mar 2023 15:33:17 +0200
Subject: [PATCH] [extractor/rutube] Extract chapters from description (#6345)

Authored by: mushbite
---
 yt_dlp/extractor/common.py  | 45 ++++++++++++++++++++++++++++++++++++++
 yt_dlp/extractor/rutube.py  | 34 +++++++++++++++++++++++-----
 yt_dlp/extractor/youtube.py | 42 +++++------------------------------
 3 files changed, 79 insertions(+), 42 deletions(-)

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 8ad63b411..2091df7fa 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -3649,6 +3649,51 @@ class InfoExtractor:
                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
                 or default)
 
+    def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
+        """Convert raw chapter entries into a validated list of chapter dicts
+
+        @param chapter_list    Iterable of raw chapter entries (None is treated as empty)
+        @param start_function  Maps a raw entry to its start time (same unit as duration), or None
+        @param title_function  Maps a raw entry to its title
+        @param duration        Media duration; without it no chapters are extracted
+        @param strict          If False, entries are sorted by start time before validation
+        @returns               List of {'start_time', 'title'} dicts, or None if duration is falsy
+        """
+        if not duration:
+            return
+        chapter_list = [{
+            'start_time': start_function(chapter),
+            'title': title_function(chapter),
+        } for chapter in chapter_list or []]
+        if not strict:
+            chapter_list.sort(key=lambda c: c['start_time'] or 0)
+
+        # Sentinel entry so that the first real chapter has something to be compared against
+        chapters = [{'start_time': 0}]
+        for idx, chapter in enumerate(chapter_list):
+            if chapter['start_time'] is None:
+                self.report_warning(f'Incomplete chapter {idx}')
+            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
+                chapters.append(chapter)
+            elif chapter not in chapters:
+                # Either bound may have been violated; report the one that actually was
+                issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
+                         else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
+                self.report_warning(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
+        return chapters[1:]
+
+    def _extract_chapters_from_description(self, description, duration):
+        """Parse "<timestamp> <title>" lines of a description into chapters, falling back to "<title> <timestamp>\""""
+        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
+        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
+        return self._extract_chapters_helper(
+            re.findall(sep_re % (duration_re, r'.+?'), description or ''),
+            start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
+            duration=duration, strict=False) or self._extract_chapters_helper(
+                re.findall(sep_re % (r'.+?', duration_re), description or ''),
+                start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
+                duration=duration, strict=False)
+
     @staticmethod
     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
         all_known = all(map(
diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py
index 97e6354b4..08d9b9257 100644
--- a/yt_dlp/extractor/rutube.py
+++ b/yt_dlp/extractor/rutube.py
@@ -25,8 +25,7 @@ class RutubeBaseIE(InfoExtractor):
             video_id, 'Downloading video JSON',
             'Unable to download video JSON', query=query)
 
-    @staticmethod
-    def _extract_info(video, video_id=None, require_title=True):
+    def _extract_info(self, video, video_id=None, require_title=True):
         title = video['title'] if require_title else video.get('title')
 
         age_limit = video.get('is_adult')
@@ -35,13 +34,15 @@ class RutubeBaseIE(InfoExtractor):
 
         uploader_id = try_get(video, lambda x: x['author']['id'])
         category = try_get(video, lambda x: x['category']['name'])
+        description = video.get('description')
+        duration = int_or_none(video.get('duration'))
 
         return {
             'id': video.get('id') or video_id if video_id else video['id'],
             'title': title,
-            'description': video.get('description'),
+            'description': description,
             'thumbnail': video.get('thumbnail_url'),
-            'duration': int_or_none(video.get('duration')),
+            'duration': duration,
             'uploader': try_get(video, lambda x: x['author']['name']),
             'uploader_id': compat_str(uploader_id) if uploader_id else None,
             'timestamp': unified_timestamp(video.get('created_ts')),
@@ -50,6 +51,7 @@ class RutubeBaseIE(InfoExtractor):
             'view_count': int_or_none(video.get('hits')),
             'comment_count': int_or_none(video.get('comments_count')),
             'is_live': bool_or_none(video.get('is_livestream')),
+            'chapters': self._extract_chapters_from_description(description, duration),
         }
 
     def _download_and_extract_info(self, video_id, query=None):
@@ -111,8 +113,9 @@ class RutubeIE(RutubeBaseIE):
             'view_count': int,
             'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg',
             'category': ['Новости и СМИ'],
-
+            'chapters': [],
         },
+        'expected_warnings': ['Unable to download f4m'],
     }, {
         'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
         'only_matching': True,
@@ -142,7 +145,28 @@ class RutubeIE(RutubeBaseIE):
             'view_count': int,
             'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg',
             'category': ['Видеоигры'],
+            'chapters': [],
+        },
+        'expected_warnings': ['Unable to download f4m'],
+    }, {
+        'url': 'https://rutube.ru/video/c65b465ad0c98c89f3b25cb03dcc87c6/',
+        'info_dict': {
+            'id': 'c65b465ad0c98c89f3b25cb03dcc87c6',
+            'ext': 'mp4',
+            'chapters': 'count:4',
+            'category': ['Бизнес и предпринимательство'],
+            'description': 'md5:252feac1305257d8c1bab215cedde75d',
+            'thumbnail': 'http://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png',
+            'duration': 782,
+            'age_limit': 0,
+            'uploader_id': '23491359',
+            'timestamp': 1677153329,
+            'view_count': int,
+            'upload_date': '20230223',
+            'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании',
+            'uploader': 'Стас Быков',
         },
+        'expected_warnings': ['Unable to download f4m'],
     }]
 
     @classmethod
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index b02e0153a..b8bb980f3 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3205,11 +3205,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
             ), expected_type=list)
 
-        return self._extract_chapters(
+        return self._extract_chapters_helper(
             chapter_list,
-            chapter_time=lambda chapter: float_or_none(
+            start_function=lambda chapter: float_or_none(
                 traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
-            chapter_title=lambda chapter: traverse_obj(
+            title_function=lambda chapter: traverse_obj(
                 chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
             duration=duration)
 
@@ -3222,42 +3222,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         chapter_title = lambda chapter: self._get_text(chapter, 'title')
 
         return next(filter(None, (
-            self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
-                                   chapter_time, chapter_title, duration)
+            self._extract_chapters_helper(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
+                                          chapter_time, chapter_title, duration)
             for contents in content_list)), [])
 
-    def _extract_chapters_from_description(self, description, duration):
-        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
-        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
-        return self._extract_chapters(
-            re.findall(sep_re % (duration_re, r'.+?'), description or ''),
-            chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1],
-            duration=duration, strict=False) or self._extract_chapters(
-                re.findall(sep_re % (r'.+?', duration_re), description or ''),
-                chapter_time=lambda x: parse_duration(x[1]), chapter_title=lambda x: x[0],
-                duration=duration, strict=False)
-
-    def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True):
-        if not duration:
-            return
-        chapter_list = [{
-            'start_time': chapter_time(chapter),
-            'title': chapter_title(chapter),
-        } for chapter in chapter_list or []]
-        if not strict:
-            chapter_list.sort(key=lambda c: c['start_time'] or 0)
-
-        chapters = [{'start_time': 0}]
-        for idx, chapter in enumerate(chapter_list):
-            if chapter['start_time'] is None:
-                self.report_warning(f'Incomplete chapter {idx}')
-            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
-                chapters.append(chapter)
-            elif chapter not in chapters:
-                self.report_warning(
-                    f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"')
-        return chapters[1:]
-
     def _extract_comment(self, comment_renderer, parent=None):
         comment_id = comment_renderer.get('commentId')
         if not comment_id: