[extractor/vk] Fix playlists for new API (#6122)

Authored by: the-marenga
Closes #6219
pull/6227/head
Marenga 2 years ago committed by GitHub
parent c154302c58
commit a9c685453f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -9,20 +9,22 @@ from .pladform import PladformIE
from .sibnet import SibnetEmbedIE from .sibnet import SibnetEmbedIE
from .vimeo import VimeoIE from .vimeo import VimeoIE
from .youtube import YoutubeIE from .youtube import YoutubeIE
from ..compat import compat_urlparse
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
clean_html, clean_html,
get_element_by_class, get_element_by_class,
get_element_html_by_id,
int_or_none, int_or_none,
orderedSet, join_nonempty,
str_or_none, str_or_none,
str_to_int, str_to_int,
try_call,
unescapeHTML, unescapeHTML,
unified_timestamp, unified_timestamp,
update_url_query, update_url_query,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
urljoin,
) )
@ -117,7 +119,7 @@ class VKIE(VKBaseIE):
'upload_date': '20120212', 'upload_date': '20120212',
'comment_count': int, 'comment_count': int,
'like_count': int, 'like_count': int,
'thumbnail': r're:https?://.+\.jpg$', 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
}, },
@ -134,7 +136,7 @@ class VKIE(VKBaseIE):
'upload_date': '20130720', 'upload_date': '20130720',
'comment_count': int, 'comment_count': int,
'like_count': int, 'like_count': int,
'thumbnail': r're:https?://.+\.jpg$', 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
} }
}, },
{ {
@ -149,55 +151,10 @@ class VKIE(VKBaseIE):
'upload_date': '20120212', 'upload_date': '20120212',
'timestamp': 1329049880, 'timestamp': 1329049880,
'uploader_id': '39545378', 'uploader_id': '39545378',
'thumbnail': r're:https?://.+\.jpg$', 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
}, },
{
# VIDEO NOW REMOVED
# please update if you find a video whose URL follows the same pattern
'url': 'http://vk.com/video-8871596_164049491',
'md5': 'a590bcaf3d543576c9bd162812387666',
'note': 'Only available for registered users',
'info_dict': {
'id': '-8871596_164049491',
'ext': 'mp4',
'uploader': 'Триллеры',
'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
'duration': 8352,
'upload_date': '20121218',
'view_count': int,
},
'skip': 'Removed',
},
{
'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d',
'info_dict': {
'id': '-43215063_168067957',
'ext': 'mp4',
'uploader': 'Bro Mazter',
'title': ' ',
'duration': 7291,
'upload_date': '20140328',
'uploader_id': '223413403',
'timestamp': 1396018030,
},
'skip': 'Requires vk account credentials',
},
{
'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540',
'md5': '0c45586baa71b7cb1d0784ee3f4e00a6',
'note': 'ivi.ru embed',
'info_dict': {
'id': '-43215063_169084319',
'ext': 'mp4',
'title': 'Книга Илая',
'duration': 6771,
'upload_date': '20140626',
'view_count': int,
},
'skip': 'Removed',
},
{ {
'url': 'https://vk.com/video-93049196_456239755?list=ln-cBjJ7S4jYYx3ADnmDT', 'url': 'https://vk.com/video-93049196_456239755?list=ln-cBjJ7S4jYYx3ADnmDT',
'info_dict': { 'info_dict': {
@ -211,26 +168,11 @@ class VKIE(VKBaseIE):
'timestamp': 1640162189, 'timestamp': 1640162189,
'upload_date': '20211222', 'upload_date': '20211222',
'uploader_id': '-93049196', 'uploader_id': '-93049196',
'thumbnail': r're:https?://.+\.jpg$', 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
}, },
}, },
{ {
# video (removed?) only available with list id 'note': 'youtube embed',
'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
'md5': '091287af5402239a1051c37ec7b92913',
'info_dict': {
'id': '30481095_171201961',
'ext': 'mp4',
'title': 'ТюменцевВВ_09.07.2015',
'uploader': 'Anton Ivanov',
'duration': 109,
'upload_date': '20150709',
'view_count': int,
},
'skip': 'Removed',
},
{
# youtube embed
'url': 'https://vk.com/video276849682_170681728', 'url': 'https://vk.com/video276849682_170681728',
'info_dict': { 'info_dict': {
'id': 'V3K4mi0SYkc', 'id': 'V3K4mi0SYkc',
@ -254,23 +196,45 @@ class VKIE(VKBaseIE):
'start_time': 0.0, 'start_time': 0.0,
'categories': ['Nonprofits & Activism'], 'categories': ['Nonprofits & Activism'],
'channel_url': 'https://www.youtube.com/channel/UCgzCNQ11TmR9V97ECnhi3gw', 'channel_url': 'https://www.youtube.com/channel/UCgzCNQ11TmR9V97ECnhi3gw',
'channel_follower_count': int,
'age_limit': 0,
},
},
{
'note': 'dailymotion embed',
'url': 'https://vk.com/video-95168827_456239103?list=cca524a0f0d5557e16',
'info_dict': {
'id': 'x8gfli0',
'ext': 'mp4',
'title': 'md5:45410f60ccd4b2760da98cb5fc777d70',
'description': 'md5:2e71c5c9413735cfa06cf1a166f16c84',
'uploader': 'Movies and cinema.',
'upload_date': '20221218',
'uploader_id': 'x1jdavv',
'timestamp': 1671387617,
'age_limit': 0, 'age_limit': 0,
'duration': 2918,
'like_count': int,
'view_count': int,
'thumbnail': r're:https?://.+x1080$',
'tags': list
}, },
}, },
{ {
# dailymotion embed 'url': 'https://vk.com/clips-74006511?z=clip-74006511_456247211',
'url': 'https://vk.com/video-37468416_456239855',
'info_dict': { 'info_dict': {
'id': 'k3lz2cmXyRuJQSjGHUv', 'id': '-74006511_456247211',
'ext': 'mp4', 'ext': 'mp4',
'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', 'comment_count': int,
'description': 'md5:424b8e88cc873217f520e582ba28bb36', 'duration': 9,
'uploader': 'AniLibria.Tv', 'like_count': int,
'upload_date': '20160914', 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
'uploader_id': 'x1p5vl5', 'timestamp': 1664995597,
'timestamp': 1473877246, 'title': 'Clip by @madempress',
'upload_date': '20221005',
'uploader': 'Шальная императрица',
'uploader_id': '-74006511',
}, },
'skip': 'Removed'
}, },
{ {
# video key is extra_data not url\d+ # video key is extra_data not url\d+
@ -288,7 +252,7 @@ class VKIE(VKBaseIE):
'skip': 'Removed', 'skip': 'Removed',
}, },
{ {
# finished live stream, postlive_mp4 'note': 'finished live stream, postlive_mp4',
'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', 'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2',
'info_dict': { 'info_dict': {
'id': '-387766_456242764', 'id': '-387766_456242764',
@ -552,7 +516,7 @@ class VKUserVideosIE(VKBaseIE):
}, { }, {
'url': 'https://vk.com/video/playlist/-174476437_2', 'url': 'https://vk.com/video/playlist/-174476437_2',
'info_dict': { 'info_dict': {
'id': '-174476437_2', 'id': '-174476437_playlist_2',
'title': 'Анонсы' 'title': 'Анонсы'
}, },
'playlist_mincount': 108, 'playlist_mincount': 108,
@ -595,6 +559,7 @@ class VKUserVideosIE(VKBaseIE):
page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id') page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id')
elif '_' in u_id: elif '_' in u_id:
page_id, section = u_id.split('_', 1) page_id, section = u_id.split('_', 1)
section = f'playlist_{section}'
else: else:
raise ExtractorError('Invalid URL', expected=True) raise ExtractorError('Invalid URL', expected=True)
@ -614,13 +579,13 @@ class VKWallPostIE(VKBaseIE):
'info_dict': { 'info_dict': {
'id': '-23538238_35', 'id': '-23538238_35',
'title': 'Black Shadow - Wall post -23538238_35', 'title': 'Black Shadow - Wall post -23538238_35',
'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', 'description': 'md5:190c78f905a53e0de793d83933c6e67f',
}, },
'playlist': [{ 'playlist': [{
'md5': '5ba93864ec5b85f7ce19a9af4af080f6', 'md5': '5ba93864ec5b85f7ce19a9af4af080f6',
'info_dict': { 'info_dict': {
'id': '135220665_111806521', 'id': '135220665_111806521',
'ext': 'mp4', 'ext': 'm4a',
'title': 'Black Shadow - Слепое Верование', 'title': 'Black Shadow - Слепое Верование',
'duration': 370, 'duration': 370,
'uploader': 'Black Shadow', 'uploader': 'Black Shadow',
@ -631,7 +596,7 @@ class VKWallPostIE(VKBaseIE):
'md5': '4cc7e804579122b17ea95af7834c9233', 'md5': '4cc7e804579122b17ea95af7834c9233',
'info_dict': { 'info_dict': {
'id': '135220665_111802303', 'id': '135220665_111802303',
'ext': 'mp4', 'ext': 'm4a',
'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!',
'duration': 423, 'duration': 423,
'uploader': 'Black Shadow', 'uploader': 'Black Shadow',
@ -642,16 +607,15 @@ class VKWallPostIE(VKBaseIE):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
'skip': 'Requires vk account credentials',
}, { }, {
# single YouTube embed, no leading - # single YouTube embed with irrelevant reaction videos
'url': 'https://vk.com/wall85155021_6319', 'url': 'https://vk.com/wall-32370614_7173954',
'info_dict': { 'info_dict': {
'id': '85155021_6319', 'id': '-32370614_7173954',
'title': 'Сергей Горбунов - Wall post 85155021_6319', 'title': 'md5:9f93c405bbc00061d34007d78c75e3bc',
'description': 'md5:953b811f26fa9f21ee5856e2ea8e68fc',
}, },
'playlist_count': 1, 'playlist_count': 1,
'skip': 'Requires vk account credentials',
}, { }, {
# wall page URL # wall page URL
'url': 'https://vk.com/wall-23538238_35', 'url': 'https://vk.com/wall-23538238_35',
@ -703,39 +667,37 @@ class VKWallPostIE(VKBaseIE):
'w': 'wall' + post_id, 'w': 'wall' + post_id,
})[1] })[1]
description = clean_html(get_element_by_class('wall_post_text', webpage)) uploader = clean_html(get_element_by_class('PostHeaderTitle__authorName', webpage))
uploader = clean_html(get_element_by_class('author', webpage))
entries = [] entries = []
for audio in re.findall(r'data-audio="([^"]+)', webpage): for audio in re.findall(r'data-audio="([^"]+)', webpage):
audio = self._parse_json(unescapeHTML(audio), post_id) audio = self._parse_json(unescapeHTML(audio), post_id)
a = self._AUDIO._make(audio[:16]) if not audio['url']:
if not a.url:
continue continue
title = unescapeHTML(a.title) title = unescapeHTML(audio.get('title'))
performer = unescapeHTML(a.performer) artist = unescapeHTML(audio.get('artist'))
entries.append({ entries.append({
'id': '%s_%s' % (a.owner_id, a.id), 'id': f'{audio["owner_id"]}_{audio["id"]}',
'url': self._unmask_url(a.url, a.ads['vk_id']), 'title': join_nonempty(artist, title, delim=' - '),
'title': '%s - %s' % (performer, title) if performer else title, 'thumbnails': try_call(lambda: [{'url': u} for u in audio['coverUrl'].split(',')]),
'thumbnails': [{'url': c_url} for c_url in a.cover_url.split(',')] if a.cover_url else None, 'duration': int_or_none(audio.get('duration')),
'duration': int_or_none(a.duration),
'uploader': uploader, 'uploader': uploader,
'artist': performer, 'artist': artist,
'track': title, 'track': title,
'ext': 'mp4', 'formats': [{
'protocol': 'm3u8_native', 'url': audio['url'],
'ext': 'm4a',
'vcodec': 'none',
'acodec': 'mp3',
'container': 'm4a_dash',
}],
}) })
for video in re.finditer( entries.extend(self.url_result(urljoin(url, entry), VKIE) for entry in set(re.findall(
r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): r'<a[^>]+href=(?:["\'])(/video(?:-?[\d_]+)[^"\']*)',
entries.append(self.url_result( get_element_html_by_id('wl_post_body', webpage))))
compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key()))
title = 'Wall post %s' % post_id
return self.playlist_result( return self.playlist_result(
orderedSet(entries), post_id, entries, post_id, join_nonempty(uploader, f'Wall post {post_id}', delim=' - '),
'%s - %s' % (uploader, title) if uploader else title, clean_html(get_element_by_class('wall_post_text', webpage)))
description)

Loading…
Cancel
Save