From fc08bdd6ab2abb92d7814d035b34c15cb7006597 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 24 Jan 2022 21:01:17 +0530 Subject: [PATCH] [extractor] Allow non-fatal `title` extraction --- CONTRIBUTING.md | 4 +++- yt_dlp/YoutubeDL.py | 7 ++++++- yt_dlp/extractor/common.py | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ed4bf69d9..c25d6a2a5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -252,7 +252,9 @@ For extraction to work yt-dlp relies on metadata your extractor extracts and pro - `title` (media title) - `url` (media download URL) or `formats` -The aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. While, in fact, only `id` is technically mandatory, due to compatibility reasons, yt-dlp also treats `title` as mandatory. The extractor is allowed to return the info dict without url or formats in some special cases if it allows the user to extract usefull information with `--ignore-no-formats-error` - Eg: when the video is a live stream that has not started yet. +The aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. While all extractors must return a `title`, they must also allow its extraction to be non-fatal. + +The extractor is allowed to return the info dict without url or formats in some special cases if it allows the user to extract useful information with `--ignore-no-formats-error` - Eg: when the video is a live stream that has not started yet. [Any field](yt_dlp/extractor/common.py#219-L426) apart from the aforementioned ones are considered **optional**. 
That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 4af77cae2..24843c775 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2299,10 +2299,15 @@ class YoutubeDL(object): self._num_videos += 1 if 'id' not in info_dict: - raise ExtractorError('Missing "id" field in extractor result') + raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor']) + elif not info_dict.get('id'): + raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor']) if 'title' not in info_dict: raise ExtractorError('Missing "title" field in extractor result', video_id=info_dict['id'], ie=info_dict['extractor']) + elif not info_dict.get('title'): + self.report_warning('Extractor failed to obtain "title". Creating a generic title instead') + info_dict['title'] = f'{info_dict["extractor"]} video #{info_dict["id"]}' def report_force_conversion(field, field_not, conversion): self.report_warning( diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index bd9362827..a2f160a82 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1291,7 +1291,7 @@ class InfoExtractor(object): return self._og_search_property('description', html, fatal=False, **kargs) def _og_search_title(self, html, **kargs): - return self._og_search_property('title', html, **kargs) + return self._og_search_property('title', html, fatal=False, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): regexes = self._og_regexes('video') + self._og_regexes('video:url')