[ie/pr0gramm] Rewrite extractor (#8151)

Authored by: Grub4K
pull/5782/head
Simon Sawicki 9 months ago committed by GitHub
parent cf11b40ac4
commit b532556d0a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1524,7 +1524,7 @@ from .puhutv import (
PuhuTVIE, PuhuTVIE,
PuhuTVSerieIE, PuhuTVSerieIE,
) )
from .pr0gramm import Pr0grammStaticIE, Pr0grammIE from .pr0gramm import Pr0grammIE
from .prankcast import PrankCastIE from .prankcast import PrankCastIE
from .premiershiprugby import PremiershipRugbyIE from .premiershiprugby import PremiershipRugbyIE
from .presstv import PressTVIE from .presstv import PressTVIE

@ -1,97 +1,155 @@
import re import json
from datetime import date
from urllib.parse import unquote
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import merge_dicts from ..compat import functools
from ..utils import ExtractorError, make_archive_id, urljoin
from ..utils.traversal import traverse_obj
class Pr0grammStaticIE(InfoExtractor): class Pr0grammIE(InfoExtractor):
# Possible urls: _VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P<id>[\d]+)(?:[/?#:]|$)'
# https://pr0gramm.com/static/5466437 _TESTS = [{
_VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)' # Tags require account
_TEST = { 'url': 'https://pr0gramm.com/new/video/5466437',
'url': 'https://pr0gramm.com/static/5466437',
'md5': '52fa540d70d3edc286846f8ca85938aa',
'info_dict': { 'info_dict': {
'id': '5466437', 'id': '5466437',
'ext': 'mp4', 'ext': 'mp4',
'title': 'pr0gramm-5466437 by g11st', 'title': 'pr0gramm-5466437 by g11st',
'tags': ['Neon Genesis Evangelion', 'Touhou Project', 'Fly me to the Moon', 'Marisad', 'Marisa Kirisame', 'video', 'sound', 'Marisa', 'Anime'],
'uploader': 'g11st', 'uploader': 'g11st',
'uploader_id': 394718,
'upload_timestamp': 1671590240,
'upload_date': '20221221', 'upload_date': '20221221',
} 'like_count': int,
} 'dislike_count': int,
'age_limit': 0,
def _real_extract(self, url): 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
video_id = self._match_id(url) },
webpage = self._download_webpage(url, video_id) }, {
# Tags require account
'url': 'https://pr0gramm.com/new/3052805:comment28391322',
'info_dict': {
'id': '3052805',
'ext': 'mp4',
'title': 'pr0gramm-3052805 by Hansking1',
'tags': 'count:15',
'uploader': 'Hansking1',
'uploader_id': 385563,
'upload_timestamp': 1552930408,
'upload_date': '20190318',
'like_count': int,
'dislike_count': int,
'age_limit': 0,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
},
}, {
# Requires verified account
'url': 'https://pr0gramm.com/new/Gianna%20Michaels/5848332',
'info_dict': {
'id': '5848332',
'ext': 'mp4',
'title': 'pr0gramm-5848332 by erd0pfel',
'tags': 'count:18',
'uploader': 'erd0pfel',
'uploader_id': 349094,
'upload_timestamp': 1694489652,
'upload_date': '20230912',
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
},
}, {
'url': 'https://pr0gramm.com/static/5466437',
'only_matching': True,
}, {
'url': 'https://pr0gramm.com/new/rowan%20atkinson%20herr%20bohne/3052805',
'only_matching': True,
}, {
'url': 'https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290',
'only_matching': True,
}]
# Fetch media sources BASE_URL = 'https://pr0gramm.com'
entries = self._parse_html5_media_entries(url, webpage, video_id)
media_info = entries[0]
@functools.cached_property
def _is_logged_in(self):
    """Tell whether a ``pp`` cookie is present for ``BASE_URL``.

    Evaluated on first access and cached on the instance afterwards.
    """
    site_cookies = self._get_cookies(self.BASE_URL)
    return 'pp' in site_cookies
@functools.cached_property
def _maximum_flags(self):
    """Return the content-filter bitmask to send with item API requests.

    Anonymous sessions get only the lowest bit. A logged-in session also gets
    the highest bit, and the two middle bits are added when the JSON stored in
    the ``me`` cookie has a truthy ``verified`` entry. Evaluated once per
    instance and then cached.
    """
    # We need to guess the flags for the content, otherwise the API will raise an error.
    # We can guess the maximum allowed flags for the account from the cookies.
    # Bitflags are (msbf): nsfp, nsfl, nsfw, sfw
    flags = 0b0001
    if not self._is_logged_in:
        return flags

    flags |= 0b1000
    cookies = self._get_cookies(self.BASE_URL)
    if 'me' not in cookies:
        self._download_webpage(self.BASE_URL, None, 'Refreshing verification information')
        # Read the cookies again: the object fetched before the request cannot be
        # relied upon to contain a `me` cookie set by the response just received
        cookies = self._get_cookies(self.BASE_URL)

    # The `me` cookie value is URL-quoted JSON; only its `verified` entry matters here
    if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')):
        flags |= 0b0110

    return flags
# Up to a day of accuracy should suffice...
uploadDay = m.groupdict().get('day')
uploadMon = m.groupdict().get('mon')
uploadYear = m.groupdict().get('year')
uploadTimestr = uploadYear + uploadMon + uploadDay
def _call_api(self, endpoint, video_id, query=None, note='Downloading API json'):
    """Fetch ``<BASE_URL>/api/items/<endpoint>`` as JSON and raise on API-level errors.

    :param endpoint: last path segment of the items API
    :param video_id: id handed through to the JSON downloader
    :param query: optional mapping of query parameters (``None`` means no parameters)
    :param note: progress note handed through to the JSON downloader
    :returns: the decoded JSON response when it carries no ``error`` string
    :raises ExtractorError: when the response names an error; for the content-flag and
        verification errors a login requirement is reported first if no session is present
    """
    # 403 is passed as an expected status so the response body can be inspected below
    # NOTE(review): presumably the downloader would otherwise raise on 403 - confirm
    data = self._download_json(
        f'{self.BASE_URL}/api/items/{endpoint}',
        video_id, note, query=query or {}, expected_status=403)

    error = traverse_obj(data, ('error', {str}))
    if error in ('nsfwRequired', 'nsflRequired', 'nsfpRequired', 'verificationRequired'):
        if not self._is_logged_in:
            self.raise_login_required()
        # Reached when a session exists but the API still refuses the requested flags
        raise ExtractorError(f'Unverified account cannot access NSFW/NSFL ({error})', expected=True)
    if error:
        # Prefer the API's human-readable message, fall back to the error code
        message = traverse_obj(data, ('msg', {str})) or error
        raise ExtractorError(f'API returned error: {message}', expected=True)

    return data
# This extractor is for the primary url (used for sharing, and appears in the
# location bar). Since this page loads the DOM via JS, yt-dl can't find any
# video information here. So let's redirect to a compatibility version of
# the site, which contains the <video> element itself, without requiring
# JS to be run.
class Pr0grammIE(InfoExtractor):
# Possible urls:
# https://pr0gramm.com/new/546637
# https://pr0gramm.com/new/video/546637
# https://pr0gramm.com/top/546637
# https://pr0gramm.com/top/video/546637
# https://pr0gramm.com/user/g11st/uploads/5466437
# https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290
# https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030
# https://pr0gramm.com/user/froschler/1elf/5232030
# https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id!
# https://pr0gramm.com/top/fruher war alles damals/5498175
_VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)'
_TEST = {
'url': 'https://pr0gramm.com/new/video/5466437',
'info_dict': {
'id': '5466437',
'ext': 'mp4',
'title': 'pr0gramm-5466437 by g11st',
'uploader': 'g11st',
'upload_date': '20221221',
}
}
# Stub that takes no arguments and always returns the constant string "oof".
# NOTE(review): it sits among the methods of the pre-rewrite Pr0grammIE but declares
# no `self`, so calling it through an instance would presumably fail - confirm.
def _generic_title():
return "oof"
def _real_extract(self, url):
    """Extract a single pr0gramm post through the items API.

    Requests the item with the maximum content flags available to the session,
    builds one mp4 format from the item's ``image`` path and, when logged in,
    adds the post's tags ordered by descending confidence.

    :param url: a URL matched by ``_VALID_URL``
    :returns: an info dict for the post
    """
    # Local import: the module only imports `date`, which cannot convert in UTC
    from datetime import datetime, timezone

    video_id = self._match_id(url)
    # Fall back to an empty dict when the response has no usable first item, so the
    # lookups below end in the "no formats" report instead of an AttributeError
    video_info = traverse_obj(
        self._call_api('get', video_id, {'id': video_id, 'flags': self._maximum_flags}),
        ('items', 0, {dict})) or {}

    source = urljoin('https://img.pr0gramm.com', video_info.get('image'))
    if not source or not source.endswith('mp4'):
        self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id)

    tags = None
    if self._is_logged_in:
        metadata = self._call_api('info', video_id, {'itemId': video_id})
        # Keep each tag together with its own confidence; extracting the two as
        # separate lists misaligns them as soon as one entry lacks a confidence
        tag_entries = [
            entry for entry in traverse_obj(metadata, ('tags', ..., {dict})) or []
            if isinstance(entry.get('tag'), str)]

        def _rank(entry):
            confidence = entry.get('confidence')
            if not isinstance(confidence, (int, float)):
                confidence = float('-inf')  # entries without a confidence sort last
            return confidence, entry['tag']

        # Sorted by "confidence", higher confidence = earlier in list.
        # Without any confidence values the API order is kept.
        if any(isinstance(entry.get('confidence'), (int, float)) for entry in tag_entries):
            tag_entries.sort(key=_rank, reverse=True)
        tags = [entry['tag'] for entry in tag_entries]

    uploader = traverse_obj(video_info, ('user', {str}))
    title = f'pr0gramm-{video_id}'
    if uploader:
        title = f'{title} by {uploader}'

    # Only reached without a source if the no-formats report above did not abort;
    # never emit a format without a URL in that case
    formats = []
    if source:
        formats.append({
            'url': source,
            'ext': 'mp4',
            **traverse_obj(video_info, {
                'width': ('width', {int}),
                'height': ('height', {int}),
            }),
        })

    return {
        'id': video_id,
        'title': title,
        'formats': formats,
        'tags': tags,
        # Either of the two middle flag bits marks the post as adult content
        'age_limit': 18 if traverse_obj(video_info, ('flags', {int}, {0b110.__and__})) else 0,
        '_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)],
        **traverse_obj(video_info, {
            'uploader': ('user', {str}),
            'uploader_id': ('userId', {int}),
            'like_count': ('up', {int}),
            'dislike_count': ('down', {int}),
            'upload_timestamp': ('created', {int}),
            # Convert in UTC so the date does not depend on the local time zone
            'upload_date': ('created', {int}, {
                lambda x: datetime.fromtimestamp(x, timezone.utc).strftime('%Y%m%d')}),
            'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)}),
        }),
    }

Loading…
Cancel
Save