From 0bea4fd8072c1421ab3a94f0601ddef9df14f133 Mon Sep 17 00:00:00 2001
From: Lesmiscore
Date: Sun, 5 Jun 2022 14:37:05 +0900
Subject: [PATCH] [extractor/0000studio] Add extractors (#3959)

Authored by: Lesmiscore
---
 yt_dlp/extractor/common.py         |   7 +-
 yt_dlp/extractor/extractors.py     |   4 ++
 yt_dlp/extractor/fourzerostudio.py | 110 +++++++++++++++++++++++++++++
 3 files changed, 119 insertions(+), 2 deletions(-)
 create mode 100644 yt_dlp/extractor/fourzerostudio.py

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 2e62660c7..c0b1fa9e0 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1568,7 +1568,7 @@ class InfoExtractor:
                 webpage, 'next.js data', fatal=fatal, **kw),
             video_id, transform_source=transform_source, fatal=fatal)
 
-    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
+    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', return_full_data=False):
         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
         # not all website do this, but it can be changed
         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
@@ -1584,7 +1584,10 @@ class InfoExtractor:
             if val in ('undefined', 'void 0'):
                 args[key] = 'null'
 
-        return self._parse_json(js_to_json(js, args), video_id)['data'][0]
+        ret = self._parse_json(js_to_json(js, args), video_id)
+        if return_full_data:
+            return ret
+        return ret['data'][0]
 
     @staticmethod
     def _hidden_inputs(html):
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index fa147ad2a..430c08eb4 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -517,6 +517,10 @@ from .fourtube import (
     PornerBrosIE,
     FuxIE,
 )
+from .fourzerostudio import (
+    FourZeroStudioArchiveIE,
+    FourZeroStudioClipIE,
+)
 from .fox import FOXIE
 from .fox9 import (
     FOX9IE,
diff --git a/yt_dlp/extractor/fourzerostudio.py b/yt_dlp/extractor/fourzerostudio.py
new file mode 100644
index 000000000..3fa159987
--- /dev/null
+++ b/yt_dlp/extractor/fourzerostudio.py
@@ -0,0 +1,110 @@
+from .common import InfoExtractor
+from ..utils import (
+    traverse_obj,
+    unified_timestamp,
+)
+
+
+class FourZeroStudioArchiveIE(InfoExtractor):
+    _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/broadcasts/(?P<id>[^/]+)/archive'
+    IE_NAME = '0000studio:archive'
+    _TESTS = [{
+        'url': 'https://0000.studio/mumeijiten/broadcasts/1290f433-fce0-4909-a24a-5f7df09665dc/archive',
+        'info_dict': {
+            'id': '1290f433-fce0-4909-a24a-5f7df09665dc',
+            'title': 'noteで『canape』様へのファンレターを執筆します。(数秘術その2)',
+            'timestamp': 1653802534,
+            'release_timestamp': 1653796604,
+            'thumbnails': 'count:1',
+            'comments': 'count:7',
+            'uploader': '『中崎雄心』の執務室。',
+            'uploader_id': 'mumeijiten',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
+        webpage = self._download_webpage(url, video_id)
+        nuxt_data = self._search_nuxt_data(webpage, video_id, return_full_data=True)
+
+        pcb = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False)
+        uploader_internal_id = traverse_obj(nuxt_data, (
+            'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'id'), get_all=False)
+
+        formats, subs = self._extract_m3u8_formats_and_subtitles(pcb['archiveUrl'], video_id, ext='mp4')
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': pcb.get('title'),
+            'age_limit': 18 if pcb.get('isAdult') else None,
+            'timestamp': unified_timestamp(pcb.get('finishTime')),
+            'release_timestamp': unified_timestamp(pcb.get('createdAt')),
+            'thumbnails': [{
+                'url': pcb['thumbnailUrl'],
+                'ext': 'png',
+            }] if pcb.get('thumbnailUrl') else None,
+            'formats': formats,
+            'subtitles': subs,
+            'comments': [{
+                'author': c.get('username'),
+                'author_id': c.get('postedUserId'),
+                'author_thumbnail': c.get('userThumbnailUrl'),
+                'id': c.get('id'),
+                'text': c.get('body'),
+                'timestamp': unified_timestamp(c.get('createdAt')),
+                'like_count': c.get('likeCount'),
+                'is_favorited': c.get('isLikedByOwner'),
+                'author_is_uploader': c.get('postedUserId') == uploader_internal_id,
+            } for c in traverse_obj(nuxt_data, (
+                'ssrRefs', ..., lambda _, v: v['__typename'] == 'PublicCreatorBroadcastComment')) or []],
+            'uploader_id': uploader_id,
+            'uploader': traverse_obj(nuxt_data, (
+                'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False),
+        }
+
+
+class FourZeroStudioClipIE(InfoExtractor):
+    _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/archive-clip/(?P<id>[^/]+)'
+    IE_NAME = '0000studio:clip'
+    _TESTS = [{
+        'url': 'https://0000.studio/soeji/archive-clip/e46b0278-24cd-40a8-92e1-b8fc2b21f34f',
+        'info_dict': {
+            'id': 'e46b0278-24cd-40a8-92e1-b8fc2b21f34f',
+            'title': 'わたベーさんからイラスト差し入れいただきました。ありがとうございました!',
+            'timestamp': 1652109105,
+            'like_count': 1,
+            'uploader': 'ソエジマケイタ',
+            'uploader_id': 'soeji',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
+        webpage = self._download_webpage(url, video_id)
+        nuxt_data = self._search_nuxt_data(webpage, video_id, return_full_data=True)
+
+        clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False)
+
+        info = next((
+            m for m in self._parse_html5_media_entries(url, webpage, video_id)
+            if 'mp4' in traverse_obj(m, ('formats', ..., 'ext'))
+        ), None)
+        if not info:
+            self.report_warning('Failed to find a desired media element. Falling back to using NUXT data.')
+            info = {
+                'formats': [{
+                    'ext': 'mp4',
+                    'url': url,
+                } for url in clip_info.get('mediaFiles') or [] if url],
+            }
+        return {
+            **info,
+            'id': video_id,
+            'title': clip_info.get('clipComment'),
+            'timestamp': unified_timestamp(clip_info.get('createdAt')),
+            'like_count': clip_info.get('likeCount'),
+            'uploader_id': uploader_id,
+            'uploader': traverse_obj(nuxt_data, (
+                'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False),
+        }