[extractor/bilibili] Add space.bilibili extractors (#4468)

Authored by: lockmatrix
Locke 2 years ago committed by GitHub
parent 2314b4d89f
commit 2b9d02167f

@@ -190,7 +190,9 @@ from .bilibili import (
     BilibiliAudioIE,
     BilibiliAudioAlbumIE,
     BiliBiliPlayerIE,
-    BilibiliChannelIE,
+    BilibiliSpaceVideoIE,
+    BilibiliSpaceAudioIE,
+    BilibiliSpacePlaylistIE,
     BiliIntlIE,
     BiliIntlSeriesIE,
     BiliLiveIE,

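For orientation (not part of the patch), here is a rough sketch of how the three newly registered extractors can be driven through yt-dlp's Python API. The URLs come from the _TESTS added further down in this commit; the extract_flat and quiet options are assumed only to keep the run metadata-only.

import yt_dlp

# Space URLs matching the new extractors (taken from the _TESTS in this commit)
urls = [
    'https://space.bilibili.com/3985676/video',   # BilibiliSpaceVideoIE
    'https://space.bilibili.com/3985676/audio',   # BilibiliSpaceAudioIE
    'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445',  # BilibiliSpacePlaylistIE
]

# extract_flat=True lists playlist entries without resolving each video/audio
with yt_dlp.YoutubeDL({'extract_flat': True, 'quiet': True}) as ydl:
    for url in urls:
        info = ydl.extract_info(url, download=False)
        print(info['id'], info.get('title'), sum(1 for _ in info['entries']))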
@@ -2,8 +2,8 @@ import base64
 import hashlib
 import itertools
 import functools
-import re
 import math
+import re
 
 from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
@@ -13,23 +13,24 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    InAdvancePagedList,
+    OnDemandPagedList,
     filter_dict,
-    int_or_none,
     float_or_none,
+    int_or_none,
     mimetype2ext,
+    parse_count,
     parse_iso8601,
     qualities,
-    traverse_obj,
-    parse_count,
     smuggle_url,
     srt_subtitles_timecode,
     str_or_none,
     strip_jsonp,
+    traverse_obj,
     unified_timestamp,
     unsmuggle_url,
     urlencode_postdata,
     url_or_none,
-    OnDemandPagedList
 )
@@ -505,39 +506,126 @@ class BiliBiliBangumiIE(InfoExtractor):
             season_info.get('bangumi_title'), season_info.get('evaluate'))
 
 
-class BilibiliChannelIE(InfoExtractor):
-    _VALID_URL = r'https?://space.bilibili\.com/(?P<id>\d+)'
-    _API_URL = "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%d&jsonp=jsonp"
+class BilibiliSpaceBaseIE(InfoExtractor):
+    def _extract_playlist(self, fetch_page, get_metadata, get_entries):
+        first_page = fetch_page(1)
+        metadata = get_metadata(first_page)
+
+        paged_list = InAdvancePagedList(
+            lambda idx: get_entries(fetch_page(idx) if idx > 1 else first_page),
+            metadata['page_count'], metadata['page_size'])
+
+        return metadata, paged_list
+
+
+class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE):
+    _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)(?P<video>/video)?/?(?:[?#]|$)'
     _TESTS = [{
         'url': 'https://space.bilibili.com/3985676/video',
-        'info_dict': {},
-        'playlist_mincount': 112,
+        'info_dict': {
+            'id': '3985676',
+        },
+        'playlist_mincount': 178,
     }]
 
-    def _entries(self, list_id):
-        count, max_count = 0, None
+    def _real_extract(self, url):
+        playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video')
+        if not is_video_url:
+            self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. '
+                           'To download audios, add a "/audio" to the URL')
+
+        def fetch_page(page_idx):
+            return self._download_json(
+                'https://api.bilibili.com/x/space/arc/search', playlist_id,
+                note=f'Downloading page {page_idx}',
+                query={'mid': playlist_id, 'pn': page_idx, 'jsonp': 'jsonp'})['data']
+
+        def get_metadata(page_data):
+            page_size = page_data['page']['ps']
+            entry_count = page_data['page']['count']
+            return {
+                'page_count': math.ceil(entry_count / page_size),
+                'page_size': page_size,
+            }
 
-        for page_num in itertools.count(1):
-            data = self._download_json(
-                self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data']
-
-            max_count = max_count or traverse_obj(data, ('page', 'count'))
-
-            entries = traverse_obj(data, ('list', 'vlist'))
-            if not entries:
-                return
-            for entry in entries:
-                yield self.url_result(
-                    'https://www.bilibili.com/video/%s' % entry['bvid'],
-                    BiliBiliIE.ie_key(), entry['bvid'])
-
-            count += len(entries)
-            if max_count and count >= max_count:
-                return
+        def get_entries(page_data):
+            for entry in traverse_obj(page_data, ('list', 'vlist')) or []:
+                yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}', BiliBiliIE, entry['bvid'])
+
+        metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
+        return self.playlist_result(paged_list, playlist_id)
+
+
+class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE):
+    _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio'
+    _TESTS = [{
+        'url': 'https://space.bilibili.com/3985676/audio',
+        'info_dict': {
+            'id': '3985676',
+        },
+        'playlist_mincount': 1,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        def fetch_page(page_idx):
+            return self._download_json(
+                'https://api.bilibili.com/audio/music-service/web/song/upper', playlist_id,
+                note=f'Downloading page {page_idx}',
+                query={'uid': playlist_id, 'pn': page_idx, 'ps': 30, 'order': 1, 'jsonp': 'jsonp'})['data']
+
+        def get_metadata(page_data):
+            return {
+                'page_count': page_data['pageCount'],
+                'page_size': page_data['pageSize'],
+            }
+
+        def get_entries(page_data):
+            for entry in page_data.get('data', []):
+                yield self.url_result(f'https://www.bilibili.com/audio/au{entry["id"]}', BilibiliAudioIE, entry['id'])
+
+        metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
+        return self.playlist_result(paged_list, playlist_id)
+
+
+class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE):
+    _VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)'
+    _TESTS = [{
+        'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445',
+        'info_dict': {
+            'id': '2142762_57445',
+            'title': '《底特律 变人》'
+        },
+        'playlist_mincount': 31,
+    }]
 
     def _real_extract(self, url):
-        list_id = self._match_id(url)
-        return self.playlist_result(self._entries(list_id), list_id)
+        mid, sid = self._match_valid_url(url).group('mid', 'sid')
+        playlist_id = f'{mid}_{sid}'
+
+        def fetch_page(page_idx):
+            return self._download_json(
+                'https://api.bilibili.com/x/polymer/space/seasons_archives_list',
+                playlist_id, note=f'Downloading page {page_idx}',
+                query={'mid': mid, 'season_id': sid, 'page_num': page_idx, 'page_size': 30})['data']
+
+        def get_metadata(page_data):
+            page_size = page_data['page']['page_size']
+            entry_count = page_data['page']['total']
+            return {
+                'page_count': math.ceil(entry_count / page_size),
+                'page_size': page_size,
+                'title': traverse_obj(page_data, ('meta', 'name'))
+            }
+
+        def get_entries(page_data):
+            for entry in page_data.get('archives', []):
+                yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}',
+                                      BiliBiliIE, entry['bvid'])
+
+        metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
+        return self.playlist_result(paged_list, playlist_id, metadata['title'])
 
 
 class BilibiliCategoryIE(InfoExtractor):

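To make the shared helper easier to follow, here is a minimal, self-contained sketch (not part of the patch) of the pagination pattern behind BilibiliSpaceBaseIE._extract_playlist: page 1 is fetched eagerly to derive page_size and page_count, and InAdvancePagedList then resolves the remaining pages lazily while re-using the already-fetched first page. The FAKE_PAGES dict below is a hypothetical stand-in for the space.bilibili.com JSON responses, and a plain loop stands in for InAdvancePagedList.

import math

# Hypothetical in-memory stand-in for the space.bilibili.com JSON API
# ('ps' = page size, 'count' = total entries), used only for illustration.
FAKE_PAGES = {
    1: {'page': {'ps': 2, 'count': 5}, 'list': {'vlist': [{'bvid': 'BV1aaa'}, {'bvid': 'BV1bbb'}]}},
    2: {'page': {'ps': 2, 'count': 5}, 'list': {'vlist': [{'bvid': 'BV1ccc'}, {'bvid': 'BV1ddd'}]}},
    3: {'page': {'ps': 2, 'count': 5}, 'list': {'vlist': [{'bvid': 'BV1eee'}]}},
}

def fetch_page(page_idx):
    # In the extractor this is self._download_json(...) against the real API
    return FAKE_PAGES[page_idx]

def get_metadata(page_data):
    page_size = page_data['page']['ps']
    return {
        'page_size': page_size,
        'page_count': math.ceil(page_data['page']['count'] / page_size),
    }

def get_entries(page_data):
    for entry in page_data['list']['vlist']:
        yield f'https://www.bilibili.com/video/{entry["bvid"]}'

# _extract_playlist fetches page 1 eagerly for the metadata, then lets
# InAdvancePagedList re-use it and fetch pages 2..page_count on demand.
first_page = fetch_page(1)
metadata = get_metadata(first_page)
for idx in range(1, metadata['page_count'] + 1):
    for entry_url in get_entries(fetch_page(idx) if idx > 1 else first_page):
        print(entry_url)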