You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yt-dlp/yt_dlp/extractor/imgur.py

385 lines
14 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import functools
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
determine_ext,
float_or_none,
int_or_none,
js_to_json,
mimetype2ext,
parse_iso8601,
str_or_none,
strip_or_none,
traverse_obj,
url_or_none,
)
class ImgurBaseIE(InfoExtractor):
_CLIENT_ID = '546c25a59c58ad7'
@classmethod
def _imgur_result(cls, item_id):
return cls.url_result(f'https://imgur.com/{item_id}', ImgurIE, item_id)
def _call_api(self, endpoint, video_id, **kwargs):
return self._download_json(
f'https://api.imgur.com/post/v1/{endpoint}/{video_id}?client_id={self._CLIENT_ID}&include=media,account',
video_id, **kwargs)
@staticmethod
def get_description(s):
if 'Discover the magic of the internet at Imgur' in s:
return None
return s or None
class ImgurIE(ImgurBaseIE):
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://imgur.com/A61SaA1',
'info_dict': {
'id': 'A61SaA1',
'ext': 'mp4',
'title': 'MRW gifv is up and running without any bugs',
'timestamp': 1416446068,
'upload_date': '20141120',
'dislike_count': int,
'comment_count': int,
'release_timestamp': 1416446068,
'release_date': '20141120',
'like_count': int,
'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg',
},
}, {
'url': 'https://i.imgur.com/A61SaA1.gifv',
'only_matching': True,
}, {
'url': 'https://i.imgur.com/crGpqCV.mp4',
'only_matching': True,
}, {
'url': 'https://i.imgur.com/jxBXAMC.gifv',
'info_dict': {
'id': 'jxBXAMC',
'ext': 'mp4',
'title': 'Fahaka puffer feeding',
'timestamp': 1533835503,
'upload_date': '20180809',
'release_date': '20180809',
'like_count': int,
'duration': 30.0,
'comment_count': int,
'release_timestamp': 1533835503,
'thumbnail': 'https://i.imgur.com/jxBXAMCh.jpg',
'dislike_count': int,
},
}, {
# needs Accept header, ref: https://github.com/yt-dlp/yt-dlp/issues/9458
'url': 'https://imgur.com/zV03bd5',
'md5': '59df97884e8ba76143ff6b640a0e2904',
'info_dict': {
'id': 'zV03bd5',
'ext': 'mp4',
'title': 'Ive - Liz',
'timestamp': 1710491255,
'upload_date': '20240315',
'like_count': int,
'dislike_count': int,
'duration': 56.92,
'comment_count': int,
'release_timestamp': 1710491255,
'release_date': '20240315',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._call_api('media', video_id)
if not traverse_obj(data, ('media', 0, (
('type', {lambda t: t == 'video' or None}),
('metadata', 'is_animated'))), get_all=False):
raise ExtractorError(f'{video_id} is not a video or animated image', expected=True)
webpage = self._download_webpage(
f'https://i.imgur.com/{video_id}.gifv', video_id, fatal=False) or ''
formats = []
media_fmt = traverse_obj(data, ('media', 0, {
'url': ('url', {url_or_none}),
'ext': ('ext', {str}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'filesize': ('size', {int_or_none}),
'acodec': ('metadata', 'has_sound', {lambda b: None if b else 'none'}),
}))
media_url = media_fmt.get('url')
if media_url:
if not media_fmt.get('ext'):
media_fmt['ext'] = mimetype2ext(traverse_obj(
data, ('media', 0, 'mime_type'))) or determine_ext(media_url)
if traverse_obj(data, ('media', 0, 'type')) == 'image':
media_fmt['acodec'] = 'none'
media_fmt.setdefault('preference', -10)
formats.append(media_fmt)
video_elements = self._search_regex(
r'(?s)<div class="video-elements">(.*?)</div>',
webpage, 'video elements', default=None)
if video_elements:
def og_get_size(media_type):
return {
p: int_or_none(self._og_search_property(f'{media_type}:{p}', webpage, default=None))
for p in ('width', 'height')
}
size = og_get_size('video')
if not any(size.values()):
size = og_get_size('image')
formats = traverse_obj(
re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements),
(..., {
'format_id': ('type', {lambda s: s.partition('/')[2]}),
'url': ('src', {self._proto_relative_url}),
'ext': ('type', {mimetype2ext}),
}))
for f in formats:
f.update(size)
# We can get the original gif format from the webpage as well
gif_json = traverse_obj(self._search_json(
r'var\s+videoItem\s*=', webpage, 'GIF info', video_id,
transform_source=js_to_json, fatal=False), {
'url': ('gifUrl', {self._proto_relative_url}),
'filesize': ('size', {int_or_none}),
})
if gif_json:
gif_json.update(size)
gif_json.update({
'format_id': 'gif',
'preference': -10, # gifs < videos
'ext': 'gif',
'acodec': 'none',
'vcodec': 'gif',
'container': 'gif',
})
formats.append(gif_json)
search = functools.partial(self._html_search_meta, html=webpage, default=None)
twitter_fmt = {
'format_id': 'twitter',
'url': url_or_none(search('twitter:player:stream')),
'ext': mimetype2ext(search('twitter:player:stream:content_type')),
'width': int_or_none(search('twitter:width')),
'height': int_or_none(search('twitter:height')),
}
if twitter_fmt['url']:
formats.append(twitter_fmt)
if not formats:
self.raise_no_formats(
f'No sources found for video {video_id}. Maybe a plain image?', expected=True)
self._remove_duplicate_formats(formats)
return {
'title': self._og_search_title(webpage, default=None),
'description': self.get_description(self._og_search_description(webpage, default='')),
**traverse_obj(data, {
'uploader_id': ('account_id', {lambda a: str(a) if int_or_none(a) else None}),
'uploader': ('account', 'username', {lambda x: strip_or_none(x) or None}),
'uploader_url': ('account', 'avatar_url', {url_or_none}),
'like_count': ('upvote_count', {int_or_none}),
'dislike_count': ('downvote_count', {int_or_none}),
'comment_count': ('comment_count', {int_or_none}),
'age_limit': ('is_mature', {lambda x: 18 if x else None}),
'timestamp': (('updated_at', 'created_at'), {parse_iso8601}),
'release_timestamp': ('created_at', {parse_iso8601}),
}, get_all=False),
**traverse_obj(data, ('media', 0, 'metadata', {
'title': ('title', {lambda x: strip_or_none(x) or None}),
'description': ('description', {self.get_description}),
'duration': ('duration', {float_or_none}),
'timestamp': (('updated_at', 'created_at'), {parse_iso8601}),
'release_timestamp': ('created_at', {parse_iso8601}),
}), get_all=False),
'id': video_id,
'formats': formats,
'thumbnail': url_or_none(search('thumbnailUrl')),
'http_headers': {'Accept': '*/*'},
}
class ImgurGalleryBaseIE(ImgurBaseIE):
_GALLERY = True
def _real_extract(self, url):
gallery_id = self._match_id(url)
data = self._call_api('albums', gallery_id, fatal=False, expected_status=404)
info = traverse_obj(data, {
'title': ('title', {lambda x: strip_or_none(x) or None}),
'description': ('description', {self.get_description}),
})
if traverse_obj(data, 'is_album'):
def yield_media_ids():
for m_id in traverse_obj(data, (
'media', lambda _, v: v.get('type') == 'video' or v['metadata']['is_animated'],
'id', {lambda x: str_or_none(x) or None})):
yield m_id
# if a gallery with exactly one video, apply album metadata to video
media_id = (
self._GALLERY
and traverse_obj(data, ('image_count', {lambda c: c == 1}))
and next(yield_media_ids(), None))
if not media_id:
result = self.playlist_result(
map(self._imgur_result, yield_media_ids()), gallery_id)
result.update(info)
return result
gallery_id = media_id
result = self._imgur_result(gallery_id)
info['_type'] = 'url_transparent'
result.update(info)
return result
class ImgurGalleryIE(ImgurGalleryBaseIE):
IE_NAME = 'imgur:gallery'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'http://imgur.com/gallery/Q95ko',
'info_dict': {
'id': 'Q95ko',
'title': 'Adding faces make every GIF better',
},
'playlist_count': 25,
'skip': 'Zoinks! You\'ve taken a wrong turn.',
}, {
# TODO: static images - replace with animated/video gallery
'url': 'http://imgur.com/topic/Aww/ll5Vk',
'only_matching': True,
}, {
'url': 'https://imgur.com/gallery/YcAQlkx',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'YcAQlkx',
'ext': 'mp4',
'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....',
'timestamp': 1358554297,
'upload_date': '20130119',
'uploader_id': '1648642',
'uploader': 'wittyusernamehere',
'release_timestamp': 1358554297,
'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg',
'release_date': '20130119',
'uploader_url': 'https://i.imgur.com/u3R4I2S_d.png?maxwidth=290&fidelity=grand',
'comment_count': int,
'dislike_count': int,
'like_count': int,
},
}, {
# TODO: static image - replace with animated/video gallery
'url': 'http://imgur.com/topic/Funny/N8rOudd',
'only_matching': True,
}, {
'url': 'http://imgur.com/r/aww/VQcQPhM',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'VQcQPhM',
'ext': 'mp4',
'title': 'The boss is here',
'timestamp': 1476494751,
'upload_date': '20161015',
'uploader_id': '19138530',
'uploader': 'thematrixcam',
'comment_count': int,
'dislike_count': int,
'uploader_url': 'https://i.imgur.com/qCjr5Pi_d.png?maxwidth=290&fidelity=grand',
'release_timestamp': 1476494751,
'like_count': int,
'release_date': '20161015',
'thumbnail': 'https://i.imgur.com/VQcQPhMh.jpg',
},
},
# from https://github.com/ytdl-org/youtube-dl/pull/16674
{
'url': 'https://imgur.com/t/unmuted/6lAn9VQ',
'info_dict': {
'id': '6lAn9VQ',
'title': 'Penguins !',
},
'playlist_count': 3,
}, {
'url': 'https://imgur.com/t/unmuted/kx2uD3C',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'ZVMv45i',
'ext': 'mp4',
'title': 'Intruder',
'timestamp': 1528129683,
'upload_date': '20180604',
'release_timestamp': 1528129683,
'release_date': '20180604',
'like_count': int,
'dislike_count': int,
'comment_count': int,
'duration': 30.03,
'thumbnail': 'https://i.imgur.com/ZVMv45ih.jpg',
},
}, {
'url': 'https://imgur.com/t/unmuted/wXSK0YH',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'JCAP4io',
'ext': 'mp4',
'title': 're:I got the blues$',
'description': 'Lukas vocal stylings.\n\nFP edit: dont encourage me. Ill never stop posting Luka and friends.',
'timestamp': 1527809525,
'upload_date': '20180531',
'like_count': int,
'dislike_count': int,
'duration': 30.03,
'comment_count': int,
'release_timestamp': 1527809525,
'thumbnail': 'https://i.imgur.com/JCAP4ioh.jpg',
'release_date': '20180531',
},
}]
class ImgurAlbumIE(ImgurGalleryBaseIE):
IE_NAME = 'imgur:album'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)'
_GALLERY = False
_TESTS = [{
# TODO: only static images - replace with animated/video gallery
'url': 'http://imgur.com/a/j6Orj',
'only_matching': True,
},
# from https://github.com/ytdl-org/youtube-dl/pull/21693
{
'url': 'https://imgur.com/a/iX265HX',
'info_dict': {
'id': 'iX265HX',
'title': 'enen-no-shouboutai'
},
'playlist_count': 2,
}, {
'url': 'https://imgur.com/a/8pih2Ed',
'info_dict': {
'id': '8pih2Ed'
},
'playlist_mincount': 1,
}]