From 96b9e9cf62c81b005242da418f092e45709a5123 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 6 Nov 2022 19:05:09 +0000 Subject: [PATCH] [extractor/telegram] Add playlist support and more metadata (#5358) Authored by: bashonly, bsun0000 --- yt_dlp/extractor/telegram.py | 146 +++++++++++++++++++++++++++++------ yt_dlp/utils.py | 4 +- 2 files changed, 123 insertions(+), 27 deletions(-) diff --git a/yt_dlp/extractor/telegram.py b/yt_dlp/extractor/telegram.py index bb9ca8c45..39f1a628a 100644 --- a/yt_dlp/extractor/telegram.py +++ b/yt_dlp/extractor/telegram.py @@ -1,41 +1,137 @@ +import re + from .common import InfoExtractor -from ..utils import clean_html, get_element_by_class +from ..utils import ( + clean_html, + format_field, + get_element_by_class, + parse_duration, + parse_qs, + traverse_obj, + unified_timestamp, + update_url_query, + url_basename, +) class TelegramEmbedIE(InfoExtractor): IE_NAME = 'telegram:embed' - _VALID_URL = r'https?://t\.me/(?P[^/]+)/(?P\d+)' + _VALID_URL = r'https?://t\.me/(?P<channel_id>[^/]+)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://t.me/europa_press/613', + 'md5': 'dd707708aea958c11a590e8068825f22', 'info_dict': { 'id': '613', 'ext': 'mp4', - 'title': 'Europa Press', - 'description': '6ce2d7e8d56eda16d80607b23db7b252', - 'thumbnail': r're:^https?:\/\/cdn.*?telesco\.pe\/file\/\w+', + 'title': 'md5:6ce2d7e8d56eda16d80607b23db7b252', + 'description': 'md5:6ce2d7e8d56eda16d80607b23db7b252', + 'channel_id': 'europa_press', + 'channel': 'Europa Press ✔', + 'thumbnail': r're:^https?://.+', + 'timestamp': 1635631203, + 'upload_date': '20211030', + 'duration': 61, + }, + }, { + # 2-video post + 'url': 'https://t.me/vorposte/29342', + 'info_dict': { + 'id': 'vorposte-29342', + 'title': 'Форпост 29342', + 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc', + }, + 'playlist_count': 2, + 'params': { + 'skip_download': True, + }, + }, { + # 2-video post with --no-playlist + 'url': 'https://t.me/vorposte/29343', + 
'md5': '1724e96053c18e788c8464038876e245', + 'info_dict': { + 'id': '29343', + 'ext': 'mp4', + 'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc', + 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc', + 'channel_id': 'vorposte', + 'channel': 'Форпост', + 'thumbnail': r're:^https?://.+', + 'timestamp': 1666384480, + 'upload_date': '20221021', + 'duration': 35, + }, + 'params': { + 'noplaylist': True, + } + }, { + # 2-video post with 'single' query param + 'url': 'https://t.me/vorposte/29342?single', + 'md5': 'd20b202f1e41400a9f43201428add18f', + 'info_dict': { + 'id': '29342', + 'ext': 'mp4', + 'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc', + 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc', + 'channel_id': 'vorposte', + 'channel': 'Форпост', + 'thumbnail': r're:^https?://.+', + 'timestamp': 1666384480, + 'upload_date': '20221021', + 'duration': 33, }, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, query={'embed': 0}) - webpage_embed = self._download_webpage(url, video_id, query={'embed': 1}, note='Downloading ermbed page') + channel_id, msg_id = self._match_valid_url(url).group('channel_id', 'id') + embed = self._download_webpage( + url, msg_id, query={'embed': '1', 'single': []}, note='Downloading embed frame') - formats = [{ - 'url': self._proto_relative_url(self._search_regex( - ']+src="([^"]+)"', webpage_embed, 'source')), - 'ext': 'mp4', - }] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None), - 'description': self._html_search_meta( - ['og:description', 'twitter:description'], webpage, - default=clean_html(get_element_by_class('tgme_widget_message_text', webpage_embed))), - 'thumbnail': self._search_regex( - r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)', - webpage_embed, 'thumbnail'), - 'formats': formats, + def clean_text(html_class, html): + text = 
clean_html(get_element_by_class(html_class, html)) + return text.replace('\n', ' ') if text else None + + description = clean_text('tgme_widget_message_text', embed) + message = { + 'title': description or '', + 'description': description, + 'channel': clean_text('tgme_widget_message_author', embed), + 'channel_id': channel_id, + 'timestamp': unified_timestamp(self._search_regex( + r']*datetime="([^"]*)"', embed, 'timestamp', fatal=False)), } + + videos = [] + for video in re.findall(r']+duration[^>]*>([\d:]+)', video, 'duration', fatal=False)), + 'thumbnail': self._search_regex( + r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)', + video, 'thumbnail', fatal=False), + 'formats': formats, + **message, + }) + + playlist_id = None + if len(videos) > 1 and 'single' not in parse_qs(url, keep_blank_values=True): + playlist_id = f'{channel_id}-{msg_id}' + + if self._yes_playlist(playlist_id, msg_id): + return self.playlist_result( + videos, playlist_id, format_field(message, 'channel', f'%s {msg_id}'), description) + else: + return traverse_obj(videos, lambda _, x: x['id'] == msg_id, get_all=False) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index cfc7ba63a..84a8ecd6e 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3092,8 +3092,8 @@ def escape_url(url): ).geturl() -def parse_qs(url): - return urllib.parse.parse_qs(urllib.parse.urlparse(url).query) +def parse_qs(url, **kwargs): + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs) def read_batch_urls(batch_fd):