From 3416c1a0e88c599ac053946ae22a385381409158 Mon Sep 17 00:00:00 2001 From: DmitryScaletta Date: Wed, 14 Feb 2024 00:46:01 +0300 Subject: [PATCH] [PromoDJ] Add user pages and blogs extractors --- yt_dlp/extractor/promodj.py | 66 ++++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/promodj.py b/yt_dlp/extractor/promodj.py index da9db44ad..e9820b60d 100644 --- a/yt_dlp/extractor/promodj.py +++ b/yt_dlp/extractor/promodj.py @@ -4,17 +4,16 @@ import re import urllib.parse from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( OnDemandPagedList, clean_html, - dict_get, extract_attributes, - float_or_none, + ExtractorError, get_element_by_class, get_elements_by_class, int_or_none, parse_duration, - str_or_none, traverse_obj, urlencode_postdata, url_or_none, @@ -106,6 +105,14 @@ class PromoDJBaseIE(InfoExtractor): return playlist_links + def _parse_page_content(self, html): + for id in re.findall(r'CORE\.Player\(\'[^\']+\', \'(?:standalone|cover)\.big\', (\d+),', html): + yield self.url_result(f'https://promodj.com/embed/{id}/big', PromoDJEmbedIE, id) + + for iframe_url in re.findall(r']+src=\"([^\"]+)\"', html): + if YoutubeIE.suitable(iframe_url): + yield self.url_result(iframe_url, YoutubeIE) + def _get_playlist_page_size(self, url): is_default_playlist = '/groups/' not in url return 30 if is_default_playlist else 20 @@ -210,7 +217,7 @@ class PromoDJUserMediaIE(PromoDJBaseIE): class PromoDJUserPagesIE(PromoDJBaseIE): - _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P{PromoDJBaseIE._LOGIN_RE})/(?P(pages|blog))$' + _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P{PromoDJBaseIE._LOGIN_RE})/(?Ppages|blog)$' _TESTS = [{ 'url': 'https://promodj.com/djperetse/pages', 'only_matching': True, @@ -219,8 +226,38 @@ class PromoDJUserPagesIE(PromoDJBaseIE): 'only_matching': True, }] + _PAGE_SIZE = 10 + + def _parse_pages(self, url, playlist_id): + html = self._download_webpage(url, playlist_id) + content_html = get_element_by_class('dj_universal', get_element_by_class('dj_bblock', html)) + print(re.findall(r'([^<]+)', content_html)) + for page_url, page_title in re.findall(r'([^<]+)', content_html): + yield self.url_result(page_url, PromoDJUserPageIE, video_title=page_title) + + def _fetch_blog_page(self, url, playlist_id, page): + page_url = self._set_url_page(url, page + 1) + html = self._download_webpage(page_url, f'{playlist_id}-page-{page + 1}') + current_page = int(clean_html(get_element_by_class('NavigatorCurrentPage', html)) or '1') + if current_page != page + 1: + return + + for a in get_elements_by_class('post_title', html): + if not a: + continue + if url := traverse_obj(extract_attributes(a), ('href', {url_or_none})): + yield self.url_result(url, PromoDJBlogPageIE) + def _real_extract(self, url): login, type = self._match_valid_url(url).groups() + playlist_id = f'{login}-{type}' + if type == 'pages': + entries = self._parse_pages(url, playlist_id) + elif type == 'blog': + entries = OnDemandPagedList( + functools.partial(self._fetch_blog_page, url, playlist_id), + self._PAGE_SIZE) + return self.playlist_result(entries, playlist_id) class PromoDJUserPageIE(PromoDJBaseIE): @@ -246,6 +283,11 @@ class PromoDJUserPageIE(PromoDJBaseIE): def _real_extract(self, url): login, slug = self._match_valid_url(url).groups() + page_id = f'{login}-{slug}' + html = self._download_webpage(url, page_id) + content_html = get_element_by_class('perfect', html) + return self.playlist_result( + self._parse_page_content(content_html), playlist_id=page_id) class PromoDJBlogPageIE(PromoDJBaseIE): @@ -266,6 +308,11 @@ class PromoDJBlogPageIE(PromoDJBaseIE): def _real_extract(self, url): login, id, slug = self._match_valid_url(url).groups() + page_id = f'{login}-blog-{id}-{slug}' + html = self._download_webpage(url, page_id) + content_html = get_element_by_class('post_body', html) + return self.playlist_result( + self._parse_page_content(content_html), playlist_id=page_id) class PromoDJPlaylistIE(PromoDJBaseIE): @@ -439,9 +486,20 @@ class PromoDJEmbedIE(PromoDJBaseIE): # video (can be only big) 'url': 'https://promodj.com/embed/3922099/big', 'only_matching': True, + }, { + # blocked + 'url': 'https://promodj.com/embed/5586967/big', + 'only_matching': True, + }, { + # deleted + 'url': 'https://promodj.com/embed/5606804/big', + 'only_matching': True, }] def _get_full_url(self, media_data, id): + if player_error := media_data.get('player_error'): + raise ExtractorError(player_error, expected=True) + if media_data.get('video'): video_config = self._parse_json(media_data['config'], id) video = traverse_obj(video_config, ('playlist', 'item', 0))