From 5ec1b6b71689d2f0cbdcd2b6c4dd861fb2fcf911 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Tue, 7 Jun 2022 01:43:50 +0530
Subject: [PATCH] Add option `--download-sections` to download video partially

Closes #52, Closes #3932
---
 README.md                     | 28 +++++++++++------
 yt_dlp/YoutubeDL.py           | 57 +++++++++++++++++++++++++++--------
 yt_dlp/__init__.py            | 34 +++++++++++++--------
 yt_dlp/downloader/__init__.py |  4 +--
 yt_dlp/downloader/external.py | 21 ++++++-------
 yt_dlp/options.py             | 17 +++++++----
 yt_dlp/utils.py               | 17 +++++++++++
 7 files changed, 123 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index c14f4b365..9424f67a0 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,8 @@ yt-dlp is a [youtube-dl](https://github.com/ytdl-org/youtube-dl) fork based on t
 
 * **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE]`
 
+* **Download time range**: Videos can be downloaded partially based on either timestamps or chapters using `--download-sections`
+
 * **Split video by chapters**: Videos can be split into multiple files based on chapters using `--split-chapters`
 
 * **Multi-threaded fragment downloads**: Download multiple fragments of m3u8/mpd videos in parallel. Use `--concurrent-fragments` (`-N`) option to set the number of threads used
@@ -555,6 +557,14 @@ You can also fork the project on github and run your fork's [build workflow](.gi
     --no-hls-use-mpegts             Do not use the mpegts container for HLS
                                     videos. This is default when not downloading
                                     live streams
+    --download-sections REGEX       Download only chapters whose title matches
+                                    the given regular expression. Time ranges
+                                    prefixed by a "*" can also be used in place
+                                    of chapters to download the specified range.
+                                    Eg: --download-sections "*10:15-15:00"
+                                    --download-sections "intro". Needs ffmpeg.
+                                    This option can be used multiple times to
+                                    download multiple sections
     --downloader [PROTO:]NAME       Name or path of the external downloader to
                                     use (optionally) prefixed by the protocols
                                     (http, ftp, m3u8, dash, rstp, rtmp, mms) to
@@ -997,18 +1007,16 @@ You can also fork the project on github and run your fork's [build workflow](.gi
     --no-split-chapters             Do not split video based on chapters
                                     (default)
     --remove-chapters REGEX         Remove chapters whose title matches the
-                                    given regular expression. Time ranges
-                                    prefixed by a "*" can also be used in place
-                                    of chapters to remove the specified range.
-                                    Eg: --remove-chapters "*10:15-15:00"
-                                    --remove-chapters "intro". This option can
+                                    given regular expression. The syntax is the
+                                    same as --download-sections. This option can
                                     be used multiple times
     --no-remove-chapters            Do not remove any chapters from the file
                                     (default)
-    --force-keyframes-at-cuts       Force keyframes around chapters when
-                                    removing/splitting them. This is slow due to
-                                    needing a re-encode, but the resulting video
-                                    may have fewer artifacts around the cuts
+    --force-keyframes-at-cuts       Force keyframes at cuts when
+                                    downloading/splitting/removing sections.
+                                    This is slow due to needing a re-encode, but
+                                    the resulting video may have fewer artifacts
+                                    around the cuts
     --no-force-keyframes-at-cuts    Do not force keyframes around the chapters
                                     when cutting/splitting (default)
     --use-postprocessor NAME[:ARGS]
@@ -1286,7 +1294,7 @@ Available for the media that is a track or a part of a music album:
  - `disc_number` (numeric): Number of the disc or other physical medium the track belongs to
  - `release_year` (numeric): Year (YYYY) when the album was released
 
-Available for `chapter:` prefix when using `--split-chapters` for videos with internal chapters:
+Available only when using `--download-sections` and for `chapter:` prefix when using `--split-chapters` for videos with internal chapters:
 
  - `section_title` (string): Title of the chapter
  - `section_number` (numeric): Number of the chapter within the file
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index e71e85d2e..8fff9ddc0 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -417,8 +417,6 @@ class YoutubeDL:
     geo_bypass_ip_block:
                        IP range in CIDR notation that will be used similarly to
                        geo_bypass_country
-
-    The following options determine which downloader is picked:
     external_downloader: A dictionary of protocol keys and the executable of the
                        external downloader to use for it. The allowed protocols
                        are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
@@ -435,6 +433,13 @@ class YoutubeDL:
     retry_sleep_functions: Dictionary of functions that takes the number of attempts
                        as argument and returns the time to sleep in seconds.
                        Allowed keys are 'http', 'fragment', 'file_access'
+    download_ranges:   A function that gets called for every video with the signature
+                       (info_dict, ydl) -> Iterable[Section].
+                       Only the returned sections will be downloaded. Each Section contains:
+                       * start_time: Start time of the section in seconds
+                       * end_time: End time of the section in seconds
+                       * title: Section title (Optional)
+                       * index: Section number (Optional)
 
     The following parameters are not used by YoutubeDL itself, they are used by
     the downloader (see yt_dlp/downloader/common.py):
@@ -2653,16 +2658,34 @@ class YoutubeDL:
             # Process what we can, even without any available formats.
             formats_to_download = [{}]
 
-        best_format = formats_to_download[-1]
+        requested_ranges = self.params.get('download_ranges')
+        if requested_ranges:
+            requested_ranges = tuple(requested_ranges(info_dict, self))
+
+        best_format, downloaded_formats = formats_to_download[-1], []
         if download:
             if best_format:
-                self.to_screen(
-                    f'[info] {info_dict["id"]}: Downloading {len(formats_to_download)} format(s): '
-                    + ', '.join([f['format_id'] for f in formats_to_download]))
+                def to_screen(*msg):
+                    self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}')
+
+                to_screen(f'Downloading {len(formats_to_download)} format(s):',
+                          (f['format_id'] for f in formats_to_download))
+                if requested_ranges:
+                    to_screen(f'Downloading {len(requested_ranges)} time ranges:',
+                              (f'{int(c["start_time"])}-{int(c["end_time"])}' for c in requested_ranges))
             max_downloads_reached = False
-            for i, fmt in enumerate(formats_to_download):
-                formats_to_download[i] = new_info = self._copy_infodict(info_dict)
+
+            for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]):
+                new_info = self._copy_infodict(info_dict)
                 new_info.update(fmt)
+                if chapter:
+                    new_info.update({
+                        'section_start': chapter.get('start_time'),
+                        'section_end': chapter.get('end_time', 0),
+                        'section_title': chapter.get('title'),
+                        'section_number': chapter.get('index'),
+                    })
+                downloaded_formats.append(new_info)
                 try:
                     self.process_info(new_info)
                 except MaxDownloadsReached:
@@ -2675,12 +2698,12 @@ class YoutubeDL:
                 if max_downloads_reached:
                     break
 
-            write_archive = {f.get('__write_download_archive', False) for f in formats_to_download}
+            write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats}
             assert write_archive.issubset({True, False, 'ignore'})
             if True in write_archive and False not in write_archive:
                 self.record_download_archive(info_dict)
 
-            info_dict['requested_downloads'] = formats_to_download
+            info_dict['requested_downloads'] = downloaded_formats
             info_dict = self.run_all_pps('after_video', info_dict)
             if max_downloads_reached:
                 raise MaxDownloadsReached()
@@ -3036,6 +3059,17 @@ class YoutubeDL:
                     return file
 
                 success = True
+                merger = FFmpegMergerPP(self)
+                fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
+                if fd is not FFmpegFD and (
+                        info_dict.get('section_start') or info_dict.get('section_end')):
+                    msg = ('This format cannot be partially downloaded' if merger.available
+                           else 'You have requested downloading the video partially, but ffmpeg is not installed')
+                    if not self.params.get('ignoreerrors'):
+                        self.report_error(f'{msg}. Aborting due to --abort-on-error')
+                        return
+                    self.report_warning(f'{msg}. The entire video will be downloaded')
+
                 if info_dict.get('requested_formats') is not None:
 
                     def compatible_formats(formats):
@@ -3091,9 +3125,6 @@ class YoutubeDL:
                     info_dict['__real_download'] = False
 
                     downloaded = []
-                    merger = FFmpegMergerPP(self)
-
-                    fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
                     if dl_filename is not None:
                         self.report_file_already_downloaded(dl_filename)
                     elif fd:
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
index d42a3f0d3..4217601bf 100644
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -35,6 +35,7 @@ from .utils import (
     GeoUtils,
     SameFileError,
     decodeOption,
+    download_range_func,
     expand_path,
     float_or_none,
     int_or_none,
@@ -305,20 +306,25 @@ def validate_options(opts):
             'Cannot download a video and extract audio into the same file! '
             f'Use "{outtmpl_default}.%(ext)s" instead of "{outtmpl_default}" as the output template')
 
-    # Remove chapters
-    remove_chapters_patterns, opts.remove_ranges = [], []
-    for regex in opts.remove_chapters or []:
-        if regex.startswith('*'):
-            dur = list(map(parse_duration, regex[1:].split('-')))
-            if len(dur) == 2 and all(t is not None for t in dur):
-                opts.remove_ranges.append(tuple(dur))
+    def parse_chapters(name, value):
+        chapters, ranges = [], []
+        for regex in value or []:
+            if regex.startswith('*'):
+                for range in regex[1:].split(','):
+                    dur = tuple(map(parse_duration, range.strip().split('-')))
+                    if len(dur) == 2 and all(t is not None for t in dur):
+                        ranges.append(dur)
+                    else:
+                        raise ValueError(f'invalid {name} time range "{regex}". Must be of the form *start-end')
                 continue
-            raise ValueError(f'invalid --remove-chapters time range "{regex}". Must be of the form *start-end')
-        try:
-            remove_chapters_patterns.append(re.compile(regex))
-        except re.error as err:
-            raise ValueError(f'invalid --remove-chapters regex "{regex}" - {err}')
-    opts.remove_chapters = remove_chapters_patterns
+            try:
+                chapters.append(re.compile(regex))
+            except re.error as err:
+                raise ValueError(f'invalid {name} regex "{regex}" - {err}')
+        return chapters, ranges
+
+    opts.remove_chapters, opts.remove_ranges = parse_chapters('--remove-chapters', opts.remove_chapters)
+    opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges))
 
     # Cookies from browser
     if opts.cookiesfrombrowser:
@@ -803,6 +809,8 @@ def parse_options(argv=None):
         'max_sleep_interval': opts.max_sleep_interval,
         'sleep_interval_subtitles': opts.sleep_interval_subtitles,
         'external_downloader': opts.external_downloader,
+        'download_ranges': opts.download_ranges,
+        'force_keyframes_at_cuts': opts.force_keyframes_at_cuts,
         'list_thumbnails': opts.list_thumbnails,
         'playlist_items': opts.playlist_items,
         'xattr_set_filesize': opts.xattr_set_filesize,
diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py
index 3b4a82635..a7dc6c9d0 100644
--- a/yt_dlp/downloader/__init__.py
+++ b/yt_dlp/downloader/__init__.py
@@ -84,8 +84,8 @@ def _get_suitable_downloader(info_dict, protocol, params, default):
     if default is NO_DEFAULT:
         default = HttpFD
 
-    # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict):
-    #     return FFmpegFD
+    if (info_dict.get('section_start') or info_dict.get('section_end')) and FFmpegFD.can_download(info_dict):
+        return FFmpegFD
 
     info_dict['protocol'] = protocol
     downloaders = params.get('external_downloader')
diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py
index 66eced1b3..3ef7fd4dc 100644
--- a/yt_dlp/downloader/external.py
+++ b/yt_dlp/downloader/external.py
@@ -384,13 +384,6 @@ class FFmpegFD(ExternalFD):
             # http://trac.ffmpeg.org/ticket/6125#comment:10
             args += ['-seekable', '1' if seekable else '0']
 
-        # start_time = info_dict.get('start_time') or 0
-        # if start_time:
-        #     args += ['-ss', str(start_time)]
-        # end_time = info_dict.get('end_time')
-        # if end_time:
-        #     args += ['-t', str(end_time - start_time)]
-
         http_headers = None
         if info_dict.get('http_headers'):
             youtubedl_headers = handle_youtubedl_headers(info_dict['http_headers'])
@@ -451,15 +444,21 @@ class FFmpegFD(ExternalFD):
             elif isinstance(conn, str):
                 args += ['-rtmp_conn', conn]
 
+        start_time, end_time = info_dict.get('section_start') or 0, info_dict.get('section_end')
+
         for i, url in enumerate(urls):
-            # We need to specify headers for each http input stream
-            # otherwise, it will only be applied to the first.
-            # https://github.com/yt-dlp/yt-dlp/issues/2696
             if http_headers is not None and re.match(r'^https?://', url):
                 args += http_headers
+            if start_time:
+                args += ['-ss', str(start_time)]
+            if end_time:
+                args += ['-t', str(end_time - start_time)]
+
             args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', url]
 
-        args += ['-c', 'copy']
+        if not (start_time or end_time) or not self.params.get('force_keyframes_at_cuts'):
+            args += ['-c', 'copy']
+
         if info_dict.get('requested_formats') or protocol == 'http_dash_segments':
             for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]):
                 stream_number = fmt.get('manifest_stream_number', 0)
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index a9a2ba45f..9e36e1c52 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -916,6 +916,14 @@ def create_parser():
         help=(
             'Do not use the mpegts container for HLS videos. '
             'This is default when not downloading live streams'))
+    downloader.add_option(
+        '--download-sections',
+        metavar='REGEX', dest='download_ranges', action='append',
+        help=(
+            'Download only chapters whose title matches the given regular expression. '
+            'Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. '
+            'Eg: --download-sections "*10:15-15:00" --download-sections "intro". '
+            'Needs ffmpeg. This option can be used multiple times to download multiple sections'))
     downloader.add_option(
         '--downloader', '--external-downloader',
         dest='external_downloader', metavar='[PROTO:]NAME', default={}, type='str',
@@ -1631,9 +1639,7 @@ def create_parser():
         metavar='REGEX', dest='remove_chapters', action='append',
         help=(
             'Remove chapters whose title matches the given regular expression. '
-            'Time ranges prefixed by a "*" can also be used in place of chapters to remove the specified range. '
-            'Eg: --remove-chapters "*10:15-15:00" --remove-chapters "intro". '
-            'This option can be used multiple times'))
+            'The syntax is the same as --download-sections. This option can be used multiple times'))
     postproc.add_option(
         '--no-remove-chapters', dest='remove_chapters', action='store_const', const=None,
         help='Do not remove any chapters from the file (default)')
@@ -1641,9 +1647,8 @@ def create_parser():
         '--force-keyframes-at-cuts',
         action='store_true', dest='force_keyframes_at_cuts', default=False,
         help=(
-            'Force keyframes around chapters when removing/splitting them. '
-            'This is slow due to needing a re-encode, but '
-            'the resulting video may have fewer artifacts around the cuts'))
+            'Force keyframes at cuts when downloading/splitting/removing sections. '
+            'This is slow due to needing a re-encode, but the resulting video may have fewer artifacts around the cuts'))
     postproc.add_option(
         '--no-force-keyframes-at-cuts',
         action='store_false', dest='force_keyframes_at_cuts',
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 777b8b3ea..45af4ec61 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3495,6 +3495,23 @@ def match_filter_func(filters):
     return _match_func
 
 
+def download_range_func(chapters, ranges):
+    """Build the `download_ranges` callback, which yields each chapter whose title matches a regex in
+    `chapters` (adding its 'index') and then a {'start_time', 'end_time'} section per pair in `ranges`"""
+    def inner(info_dict, ydl):
+        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
+                   else 'Chapter information is unavailable')
+        for regex in chapters or []:
+            for i, chapter in enumerate(info_dict.get('chapters') or []):
+                if re.search(regex, chapter['title']):
+                    warning = None
+                    yield {**chapter, 'index': i}
+        if chapters and warning:  # Stay quiet unless chapter regexes were actually requested
+            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
+        yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
+    return inner
+
+
 def parse_dfxp_time_expr(time_expr):
     if not time_expr:
         return