From 7e9a61258543f64113e779f2f82fe7a29827489d Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 17 Jun 2022 13:35:04 +0530 Subject: [PATCH] Add option `--lazy-playlist` to process entries as they are received --- README.md | 5 +++ test/test_YoutubeDL.py | 2 +- yt_dlp/YoutubeDL.py | 59 ++++++++++++++++++++---------- yt_dlp/__init__.py | 4 +++ yt_dlp/options.py | 12 +++++-- yt_dlp/utils.py | 81 +++++++++++++++++++----------------------- 6 files changed, 97 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index 16f02787a..2e1ae9c11 100644 --- a/README.md +++ b/README.md @@ -540,6 +540,11 @@ You can also fork the project on github and run your fork's [build workflow](.gi bandwidth throttling imposed by a webserver (experimental) --playlist-random Download playlist videos in random order + --lazy-playlist Process entries in the playlist as they are + received. This disables n_entries, + --playlist-random and --playlist-reverse + --no-lazy-playlist Process videos in the playlist only after + the entire playlist is parsed (default) --xattr-set-filesize Set file xattribute ytdl.filesize with expected file size --hls-use-mpegts Use the mpegts container for HLS videos; diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 3aafc3c4f..03a2c36a1 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -1046,7 +1046,7 @@ class TestYoutubeDL(unittest.TestCase): for name, func, expected_eval in ( ('list', list_entries, INDICES), ('Generator', generator_entries, generator_eval), - ('LazyList', lazylist_entries, generator_eval), + # ('LazyList', lazylist_entries, generator_eval), # Generator and LazyList follow the exact same code path ('PagedList', pagedlist_entries, pagedlist_eval), ): evaluated = [] diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 4162727c4..fb3f9337f 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -242,11 +242,9 @@ class YoutubeDL: and don't overwrite any file if False For compatibility with 
youtube-dl, "nooverwrites" may also be used instead - playliststart: Playlist item to start at. - playlistend: Playlist item to end at. playlist_items: Specific indices of playlist to download. - playlistreverse: Download playlist items in reverse order. playlistrandom: Download playlist items in random order. + lazy_playlist: Process playlist entries as they are received. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. @@ -469,6 +467,12 @@ class YoutubeDL: The following options are deprecated and may be removed in the future: + playliststart: - Use playlist_items + Playlist item to start at. + playlistend: - Use playlist_items + Playlist item to end at. + playlistreverse: - Use playlist_items + Download playlist items in reverse order. forceurl: - Use forceprint Force printing final URL. forcetitle: - Use forceprint @@ -1671,16 +1675,26 @@ class YoutubeDL: self.to_screen(f'[download] Downloading playlist: {title}') all_entries = PlaylistEntries(self, ie_result) - entries = orderedSet(all_entries.get_requested_items()) - ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*entries)) or ([], []) - n_entries, ie_result['playlist_count'] = len(entries), all_entries.full_count + entries = orderedSet(all_entries.get_requested_items(), lazy=True) + + lazy = self.params.get('lazy_playlist') + if lazy: + resolved_entries, n_entries = [], 'N/A' + ie_result['requested_entries'], ie_result['entries'] = None, None + else: + entries = resolved_entries = list(entries) + n_entries = len(resolved_entries) + ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], []) + if not ie_result.get('playlist_count'): + # Better to do this after potentially exhausting entries + ie_result['playlist_count'] = all_entries.get_full_count() _infojson_written = False write_playlist_files = self.params.get('allow_playlist_files', True) if 
write_playlist_files and self.params.get('list_thumbnails'): self.list_thumbnails(ie_result) if write_playlist_files and not self.params.get('simulate'): - ie_copy = self._playlist_infodict(ie_result, n_entries=n_entries) + ie_copy = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries)) _infojson_written = self._write_info_json( 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson')) if _infojson_written is None: @@ -1691,9 +1705,12 @@ class YoutubeDL: # TODO: This should be passed to ThumbnailsConvertor if necessary self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail')) - if self.params.get('playlistreverse', False): - entries = entries[::-1] - if self.params.get('playlistrandom', False): + if lazy: + if self.params.get('playlistreverse') or self.params.get('playlistrandom'): + self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True) + elif self.params.get('playlistreverse'): + entries.reverse() + elif self.params.get('playlistrandom'): random.shuffle(entries) self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} videos' @@ -1701,23 +1718,27 @@ class YoutubeDL: failures = 0 max_failures = self.params.get('skip_playlist_after_errors') or float('inf') - for i, (playlist_index, entry) in enumerate(entries, 1): + for i, (playlist_index, entry) in enumerate(entries): + if lazy: + resolved_entries.append((playlist_index, entry)) + # TODO: Add auto-generated fields if self._match_entry(entry, incomplete=True) is not None: continue - if 'playlist-index' in self.params.get('compat_opts', []): - playlist_index = ie_result['requested_entries'][i - 1] self.to_screen('[download] Downloading video %s of %s' % ( - self._format_screen(i, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS))) + self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS))) 
entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip') + if not lazy and 'playlist-index' in self.params.get('compat_opts', []): + playlist_index = ie_result['requested_entries'][i] + entry_result = self.__process_iterable_entry(entry, download, { - 'n_entries': n_entries, - '__last_playlist_index': max(ie_result['requested_entries']), + 'n_entries': int_or_none(n_entries), + '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)), 'playlist_count': ie_result.get('playlist_count'), 'playlist_index': playlist_index, - 'playlist_autonumber': i, + 'playlist_autonumber': i + 1, 'playlist': title, 'playlist_id': ie_result.get('id'), 'playlist_title': ie_result.get('title'), @@ -1735,10 +1756,10 @@ class YoutubeDL: self.report_error( f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction') break - entries[i - 1] = (playlist_index, entry_result) + resolved_entries[i] = (playlist_index, entry_result) # Update with processed data - ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*entries)) or ([], []) + ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], []) # Write the updated info to json if _infojson_written is True and self._write_info_json( diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 1538a7e89..db34fe12a 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -434,6 +434,9 @@ def validate_options(opts): setattr(opts, opt1, default) # Conflicting options + report_conflict('--playlist-reverse', 'playlist_reverse', '--playlist-random', 'playlist_random') + report_conflict('--playlist-reverse', 'playlist_reverse', '--lazy-playlist', 'lazy_playlist') + report_conflict('--playlist-random', 'playlist_random', '--lazy-playlist', 'lazy_playlist') report_conflict('--dateafter', 'dateafter', '--date', 'date', default=None) report_conflict('--datebefore', 'datebefore', '--date', 'date', default=None) 
report_conflict('--exec-before-download', 'exec_before_dl_cmd', @@ -740,6 +743,7 @@ def parse_options(argv=None): 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, 'playlistrandom': opts.playlist_random, + 'lazy_playlist': opts.lazy_playlist, 'noplaylist': opts.noplaylist, 'logtostderr': opts.outtmpl.get('default') == '-', 'consoletitle': opts.consoletitle, diff --git a/yt_dlp/options.py b/yt_dlp/options.py index bc646ab4a..900b5c8b1 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -888,7 +888,7 @@ def create_parser(): help=optparse.SUPPRESS_HELP) downloader.add_option( '--playlist-reverse', - action='store_true', + action='store_true', dest='playlist_reverse', help=optparse.SUPPRESS_HELP) downloader.add_option( '--no-playlist-reverse', @@ -896,8 +896,16 @@ def create_parser(): help=optparse.SUPPRESS_HELP) downloader.add_option( '--playlist-random', - action='store_true', + action='store_true', dest='playlist_random', help='Download playlist videos in random order') + downloader.add_option( + '--lazy-playlist', + action='store_true', dest='lazy_playlist', + help='Process entries in the playlist as they are received. 
This disables n_entries, --playlist-random and --playlist-reverse') + downloader.add_option( + '--no-lazy-playlist', + action='store_false', dest='lazy_playlist', + help='Process videos in the playlist only after the entire playlist is parsed (default)') downloader.add_option( '--xattr-set-filesize', dest='xattr_set_filesize', action='store_true', diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index f21d70672..8dda5e931 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -770,13 +770,16 @@ def expand_path(s): return os.path.expandvars(compat_expanduser(s)) -def orderedSet(iterable): - """ Remove all duplicates from the input iterable """ - res = [] - for el in iterable: - if el not in res: - res.append(el) - return res +def orderedSet(iterable, *, lazy=False): + """Remove all duplicates from the input iterable""" + def _iter(): + seen = [] # Do not use set since the items can be unhashable + for x in iterable: + if x not in seen: + seen.append(x) + yield x + + return _iter() if lazy else list(_iter()) def _htmlentity_transform(entity_with_semicolon): @@ -2820,7 +2823,26 @@ class PlaylistEntries: is_exhausted = False def __init__(self, ydl, info_dict): - self.ydl, self.info_dict = ydl, info_dict + self.ydl = ydl + + # _entries must be assigned now since infodict can change during iteration + entries = info_dict.get('entries') + if entries is None: + raise EntryNotInPlaylist('There are no entries') + elif isinstance(entries, list): + self.is_exhausted = True + + requested_entries = info_dict.get('requested_entries') + self.is_incomplete = bool(requested_entries) + if self.is_incomplete: + assert self.is_exhausted + self._entries = [self.MissingEntry] * max(requested_entries) + for i, entry in zip(requested_entries, entries): + self._entries[i - 1] = entry + elif isinstance(entries, (list, PagedList, LazyList)): + self._entries = entries + else: + self._entries = LazyList(entries) PLAYLIST_ITEMS_RE = re.compile(r'''(?x) (?P<start>[+-]?\d+)? 
@@ -2863,37 +2885,13 @@ class PlaylistEntries: except (ExistingVideoReached, RejectedVideoReached): return - @property - def full_count(self): - if self.info_dict.get('playlist_count'): - return self.info_dict['playlist_count'] - elif self.is_exhausted and not self.is_incomplete: + def get_full_count(self): + if self.is_exhausted and not self.is_incomplete: return len(self) elif isinstance(self._entries, InAdvancePagedList): if self._entries._pagesize == 1: return self._entries._pagecount - @functools.cached_property - def _entries(self): - entries = self.info_dict.get('entries') - if entries is None: - raise EntryNotInPlaylist('There are no entries') - elif isinstance(entries, list): - self.is_exhausted = True - - indices = self.info_dict.get('requested_entries') - self.is_incomplete = bool(indices) - if self.is_incomplete: - assert self.is_exhausted - ret = [self.MissingEntry] * max(indices) - for i, entry in zip(indices, entries): - ret[i - 1] = entry - return ret - - if isinstance(entries, (list, PagedList, LazyList)): - return entries - return LazyList(entries) - @functools.cached_property def _getter(self): if isinstance(self._entries, list): @@ -2937,17 +2935,12 @@ class PlaylistEntries: if i < 0: continue try: - try: - entry = self._getter(i) - except self.IndexError: - self.is_exhausted = True - if step > 0: - break - continue - except IndexError: - if self.is_exhausted: + entry = self._getter(i) + except self.IndexError: + self.is_exhausted = True + if step > 0: break - raise + continue yield i + 1, entry def __len__(self):