From 5ec1b6b71689d2f0cbdcd2b6c4dd861fb2fcf911 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 7 Jun 2022 01:43:50 +0530 Subject: [PATCH] Add option `--download-sections` to download video partially Closes #52, Closes #3932 --- README.md | 28 +++++++++++------ yt_dlp/YoutubeDL.py | 57 +++++++++++++++++++++++++++-------- yt_dlp/__init__.py | 34 +++++++++++++-------- yt_dlp/downloader/__init__.py | 4 +-- yt_dlp/downloader/external.py | 21 ++++++------- yt_dlp/options.py | 17 +++++++---- yt_dlp/utils.py | 17 +++++++++++ 7 files changed, 123 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index c14f4b365..9424f67a0 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,8 @@ yt-dlp is a [youtube-dl](https://github.com/ytdl-org/youtube-dl) fork based on t * **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE]` +* **Download time range**: Videos can be downloaded partially based on either timestamps or chapters using `--download-sections` + * **Split video by chapters**: Videos can be split into multiple files based on chapters using `--split-chapters` * **Multi-threaded fragment downloads**: Download multiple fragments of m3u8/mpd videos in parallel. Use `--concurrent-fragments` (`-N`) option to set the number of threads used @@ -555,6 +557,14 @@ You can also fork the project on github and run your fork's [build workflow](.gi --no-hls-use-mpegts Do not use the mpegts container for HLS videos. This is default when not downloading live streams + --download-sections REGEX Download only chapters whose title matches + the given regular expression. Time ranges + prefixed by a "*" can also be used in place + of chapters to download the specified range. + Eg: --download-sections "*10:15-15:00" + --download-sections "intro". Needs ffmpeg. 
+ This option can be used multiple times to + download multiple sections --downloader [PROTO:]NAME Name or path of the external downloader to use (optionally) prefixed by the protocols (http, ftp, m3u8, dash, rstp, rtmp, mms) to @@ -997,18 +1007,16 @@ You can also fork the project on github and run your fork's [build workflow](.gi --no-split-chapters Do not split video based on chapters (default) --remove-chapters REGEX Remove chapters whose title matches the - given regular expression. Time ranges - prefixed by a "*" can also be used in place - of chapters to remove the specified range. - Eg: --remove-chapters "*10:15-15:00" - --remove-chapters "intro". This option can + given regular expression. The syntax is the + same as --download-sections. This option can be used multiple times --no-remove-chapters Do not remove any chapters from the file (default) - --force-keyframes-at-cuts Force keyframes around chapters when - removing/splitting them. This is slow due to - needing a re-encode, but the resulting video - may have fewer artifacts around the cuts + --force-keyframes-at-cuts Force keyframes at cuts when + downloading/splitting/removing sections. 
+ This is slow due to needing a re-encode, but + the resulting video may have fewer artifacts + around the cuts --no-force-keyframes-at-cuts Do not force keyframes around the chapters when cutting/splitting (default) --use-postprocessor NAME[:ARGS] @@ -1286,7 +1294,7 @@ Available for the media that is a track or a part of a music album: - `disc_number` (numeric): Number of the disc or other physical medium the track belongs to - `release_year` (numeric): Year (YYYY) when the album was released -Available for `chapter:` prefix when using `--split-chapters` for videos with internal chapters: +Available only when using `--download-sections` and for `chapter:` prefix when using `--split-chapters` for videos with internal chapters: - `section_title` (string): Title of the chapter - `section_number` (numeric): Number of the chapter within the file diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index e71e85d2e..8fff9ddc0 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -417,8 +417,6 @@ class YoutubeDL: geo_bypass_ip_block: IP range in CIDR notation that will be used similarly to geo_bypass_country - - The following options determine which downloader is picked: external_downloader: A dictionary of protocol keys and the executable of the external downloader to use for it. The allowed protocols are default|http|ftp|m3u8|dash|rtsp|rtmp|mms. @@ -435,6 +433,13 @@ class YoutubeDL: retry_sleep_functions: Dictionary of functions that takes the number of attempts as argument and returns the time to sleep in seconds. Allowed keys are 'http', 'fragment', 'file_access' + download_ranges: A function that gets called for every video with the signature + (info_dict, ydl) -> Iterable[Section]. + Only the returned sections will be downloaded. 
Each Section contains: + * start_time: Start time of the section in seconds + * end_time: End time of the section in seconds + * title: Section title (Optional) + * index: Section number (Optional) The following parameters are not used by YoutubeDL itself, they are used by the downloader (see yt_dlp/downloader/common.py): @@ -2653,16 +2658,34 @@ class YoutubeDL: # Process what we can, even without any available formats. formats_to_download = [{}] - best_format = formats_to_download[-1] + requested_ranges = self.params.get('download_ranges') + if requested_ranges: + requested_ranges = tuple(requested_ranges(info_dict, self)) + + best_format, downloaded_formats = formats_to_download[-1], [] if download: if best_format: - self.to_screen( - f'[info] {info_dict["id"]}: Downloading {len(formats_to_download)} format(s): ' - + ', '.join([f['format_id'] for f in formats_to_download])) + def to_screen(*msg): + self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}') + + to_screen(f'Downloading {len(formats_to_download)} format(s):', + (f['format_id'] for f in formats_to_download)) + if requested_ranges: + to_screen(f'Downloading {len(requested_ranges)} time ranges:', + (f'{int(c["start_time"])}-{int(c["end_time"])}' for c in requested_ranges)) max_downloads_reached = False - for i, fmt in enumerate(formats_to_download): - formats_to_download[i] = new_info = self._copy_infodict(info_dict) + + for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]): + new_info = self._copy_infodict(info_dict) new_info.update(fmt) + if chapter: + new_info.update({ + 'section_start': chapter.get('start_time'), + 'section_end': chapter.get('end_time', 0), + 'section_title': chapter.get('title'), + 'section_number': chapter.get('index'), + }) + downloaded_formats.append(new_info) try: self.process_info(new_info) except MaxDownloadsReached: @@ -2675,12 +2698,12 @@ class YoutubeDL: if max_downloads_reached: break - write_archive = 
{f.get('__write_download_archive', False) for f in formats_to_download} + write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats} assert write_archive.issubset({True, False, 'ignore'}) if True in write_archive and False not in write_archive: self.record_download_archive(info_dict) - info_dict['requested_downloads'] = formats_to_download + info_dict['requested_downloads'] = downloaded_formats info_dict = self.run_all_pps('after_video', info_dict) if max_downloads_reached: raise MaxDownloadsReached() @@ -3036,6 +3059,17 @@ class YoutubeDL: return file success = True + merger = FFmpegMergerPP(self) + fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') + if fd is not FFmpegFD and ( + info_dict.get('section_start') or info_dict.get('section_end')): + msg = ('This format cannot be partially downloaded' if merger.available + else 'You have requested downloading the video partially, but ffmpeg is not installed') + if not self.params.get('ignoreerrors'): + self.report_error(f'{msg}. Aborting due to --abort-on-error') + return + self.report_warning(f'{msg}. The entire video will be downloaded') + if info_dict.get('requested_formats') is not None: def compatible_formats(formats): @@ -3091,9 +3125,6 @@ class YoutubeDL: info_dict['__real_download'] = False downloaded = [] - merger = FFmpegMergerPP(self) - - fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') if dl_filename is not None: self.report_file_already_downloaded(dl_filename) elif fd: diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index d42a3f0d3..4217601bf 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -35,6 +35,7 @@ from .utils import ( GeoUtils, SameFileError, decodeOption, + download_range_func, expand_path, float_or_none, int_or_none, @@ -305,20 +306,25 @@ def validate_options(opts): 'Cannot download a video and extract audio into the same file! 
' f'Use "{outtmpl_default}.%(ext)s" instead of "{outtmpl_default}" as the output template') - # Remove chapters - remove_chapters_patterns, opts.remove_ranges = [], [] - for regex in opts.remove_chapters or []: - if regex.startswith('*'): - dur = list(map(parse_duration, regex[1:].split('-'))) - if len(dur) == 2 and all(t is not None for t in dur): - opts.remove_ranges.append(tuple(dur)) + def parse_chapters(name, value): + chapters, ranges = [], [] + for regex in value or []: + if regex.startswith('*'): + for range in regex[1:].split(','): + dur = tuple(map(parse_duration, range.strip().split('-'))) + if len(dur) == 2 and all(t is not None for t in dur): + ranges.append(dur) + else: + raise ValueError(f'invalid {name} time range "{regex}". Must be of the form *start-end') continue - raise ValueError(f'invalid --remove-chapters time range "{regex}". Must be of the form *start-end') - try: - remove_chapters_patterns.append(re.compile(regex)) - except re.error as err: - raise ValueError(f'invalid --remove-chapters regex "{regex}" - {err}') - opts.remove_chapters = remove_chapters_patterns + try: + chapters.append(re.compile(regex)) + except re.error as err: + raise ValueError(f'invalid {name} regex "{regex}" - {err}') + return chapters, ranges + + opts.remove_chapters, opts.remove_ranges = parse_chapters('--remove-chapters', opts.remove_chapters) + opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges)) # Cookies from browser if opts.cookiesfrombrowser: @@ -803,6 +809,8 @@ def parse_options(argv=None): 'max_sleep_interval': opts.max_sleep_interval, 'sleep_interval_subtitles': opts.sleep_interval_subtitles, 'external_downloader': opts.external_downloader, + 'download_ranges': opts.download_ranges, + 'force_keyframes_at_cuts': opts.force_keyframes_at_cuts, 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, 'xattr_set_filesize': opts.xattr_set_filesize, diff --git 
a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index 3b4a82635..a7dc6c9d0 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -84,8 +84,8 @@ def _get_suitable_downloader(info_dict, protocol, params, default): if default is NO_DEFAULT: default = HttpFD - # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): - # return FFmpegFD + if (info_dict.get('section_start') or info_dict.get('section_end')) and FFmpegFD.can_download(info_dict): + return FFmpegFD info_dict['protocol'] = protocol downloaders = params.get('external_downloader') diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 66eced1b3..3ef7fd4dc 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -384,13 +384,6 @@ class FFmpegFD(ExternalFD): # http://trac.ffmpeg.org/ticket/6125#comment:10 args += ['-seekable', '1' if seekable else '0'] - # start_time = info_dict.get('start_time') or 0 - # if start_time: - # args += ['-ss', str(start_time)] - # end_time = info_dict.get('end_time') - # if end_time: - # args += ['-t', str(end_time - start_time)] - http_headers = None if info_dict.get('http_headers'): youtubedl_headers = handle_youtubedl_headers(info_dict['http_headers']) @@ -451,15 +444,21 @@ class FFmpegFD(ExternalFD): elif isinstance(conn, str): args += ['-rtmp_conn', conn] + start_time, end_time = info_dict.get('section_start') or 0, info_dict.get('section_end') + for i, url in enumerate(urls): - # We need to specify headers for each http input stream - # otherwise, it will only be applied to the first. 
- # https://github.com/yt-dlp/yt-dlp/issues/2696 if http_headers is not None and re.match(r'^https?://', url): args += http_headers + if start_time: + args += ['-ss', str(start_time)] + if end_time: + args += ['-t', str(end_time - start_time)] + args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', url] - args += ['-c', 'copy'] + if not (start_time or end_time) or not self.params.get('force_keyframes_at_cuts'): + args += ['-c', 'copy'] + if info_dict.get('requested_formats') or protocol == 'http_dash_segments': for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]): stream_number = fmt.get('manifest_stream_number', 0) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index a9a2ba45f..9e36e1c52 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -916,6 +916,14 @@ def create_parser(): help=( 'Do not use the mpegts container for HLS videos. ' 'This is default when not downloading live streams')) + downloader.add_option( + '--download-sections', + metavar='REGEX', dest='download_ranges', action='append', + help=( + 'Download only chapters whose title matches the given regular expression. ' + 'Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. ' + 'Eg: --download-sections "*10:15-15:00" --download-sections "intro". ' + 'Needs ffmpeg. This option can be used multiple times to download multiple sections')) downloader.add_option( '--downloader', '--external-downloader', dest='external_downloader', metavar='[PROTO:]NAME', default={}, type='str', @@ -1631,9 +1639,7 @@ def create_parser(): metavar='REGEX', dest='remove_chapters', action='append', help=( 'Remove chapters whose title matches the given regular expression. ' - 'Time ranges prefixed by a "*" can also be used in place of chapters to remove the specified range. ' - 'Eg: --remove-chapters "*10:15-15:00" --remove-chapters "intro". ' - 'This option can be used multiple times')) + 'The syntax is the same as --download-sections. 
This option can be used multiple times')) postproc.add_option( '--no-remove-chapters', dest='remove_chapters', action='store_const', const=None, help='Do not remove any chapters from the file (default)') @@ -1641,9 +1647,8 @@ def create_parser(): '--force-keyframes-at-cuts', action='store_true', dest='force_keyframes_at_cuts', default=False, help=( - 'Force keyframes around chapters when removing/splitting them. ' - 'This is slow due to needing a re-encode, but ' - 'the resulting video may have fewer artifacts around the cuts')) + 'Force keyframes at cuts when downloading/splitting/removing sections. ' + 'This is slow due to needing a re-encode, but the resulting video may have fewer artifacts around the cuts')) postproc.add_option( '--no-force-keyframes-at-cuts', action='store_false', dest='force_keyframes_at_cuts', diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 777b8b3ea..45af4ec61 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3495,6 +3495,23 @@ def match_filter_func(filters): return _match_func +def download_range_func(chapters, ranges): + def inner(info_dict, ydl): + warning = ('There are no chapters matching the regex' if info_dict.get('chapters') + else 'Chapter information is unavailable') + for regex in chapters or []: + for i, chapter in enumerate(info_dict.get('chapters') or []): + if re.search(regex, chapter['title']): + warning = None + yield {**chapter, 'index': i} + if warning: + ydl.to_screen(f'[info] {info_dict["id"]}: {warning}') + + yield from ({'start_time': start, 'end_time': end} for start, end in ranges or []) + + return inner + + def parse_dfxp_time_expr(time_expr): if not time_expr: return