[ie/pr0gramm] Rewrite extractor (#8151)

Authored by: Grub4K
pull/5782/head
Simon Sawicki 9 months ago committed by GitHub
parent cf11b40ac4
commit b532556d0a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1524,7 +1524,7 @@ from .puhutv import (
PuhuTVIE, PuhuTVIE,
PuhuTVSerieIE, PuhuTVSerieIE,
) )
from .pr0gramm import Pr0grammStaticIE, Pr0grammIE from .pr0gramm import Pr0grammIE
from .prankcast import PrankCastIE from .prankcast import PrankCastIE
from .premiershiprugby import PremiershipRugbyIE from .premiershiprugby import PremiershipRugbyIE
from .presstv import PressTVIE from .presstv import PressTVIE

@ -1,97 +1,155 @@
import re import json
from datetime import date
from urllib.parse import unquote
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import merge_dicts from ..compat import functools
from ..utils import ExtractorError, make_archive_id, urljoin
from ..utils.traversal import traverse_obj
class Pr0grammIE(InfoExtractor):
    """Extractor for pr0gramm.com video posts, backed by the site's JSON API.

    The numeric post id is the last all-digit path segment of the URL. The
    tests below cover these URL shapes:
        /new/video/<id>
        /new/<id>:comment<n>            (the comment suffix is not part of the id)
        /new/<search term>/<id>
        /static/<id>
        /user/<name>/<collection>/<id>
    """

    _VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P<id>[\d]+)(?:[/?#:]|$)'
    _TESTS = [{
        # Tags require account
        'url': 'https://pr0gramm.com/new/video/5466437',
        'info_dict': {
            'id': '5466437',
            'ext': 'mp4',
            'title': 'pr0gramm-5466437 by g11st',
            'tags': ['Neon Genesis Evangelion', 'Touhou Project', 'Fly me to the Moon', 'Marisad', 'Marisa Kirisame', 'video', 'sound', 'Marisa', 'Anime'],
            'uploader': 'g11st',
            'uploader_id': 394718,
            'upload_timestamp': 1671590240,
            'upload_date': '20221221',
            'like_count': int,
            'dislike_count': int,
            'age_limit': 0,
            'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
        },
    }, {
        # Tags require account
        'url': 'https://pr0gramm.com/new/3052805:comment28391322',
        'info_dict': {
            'id': '3052805',
            'ext': 'mp4',
            'title': 'pr0gramm-3052805 by Hansking1',
            'tags': 'count:15',
            'uploader': 'Hansking1',
            'uploader_id': 385563,
            'upload_timestamp': 1552930408,
            'upload_date': '20190318',
            'like_count': int,
            'dislike_count': int,
            'age_limit': 0,
            'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
        },
    }, {
        # Requires verified account
        'url': 'https://pr0gramm.com/new/Gianna%20Michaels/5848332',
        'info_dict': {
            'id': '5848332',
            'ext': 'mp4',
            'title': 'pr0gramm-5848332 by erd0pfel',
            'tags': 'count:18',
            'uploader': 'erd0pfel',
            'uploader_id': 349094,
            'upload_timestamp': 1694489652,
            'upload_date': '20230912',
            'like_count': int,
            'dislike_count': int,
            'age_limit': 18,
            'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
        },
    }, {
        'url': 'https://pr0gramm.com/static/5466437',
        'only_matching': True,
    }, {
        'url': 'https://pr0gramm.com/new/rowan%20atkinson%20herr%20bohne/3052805',
        'only_matching': True,
    }, {
        'url': 'https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290',
        'only_matching': True,
    }]

    BASE_URL = 'https://pr0gramm.com'

    @functools.cached_property
    def _is_logged_in(self):
        """Whether a 'pp' cookie exists for the site (treated as the login marker)."""
        return 'pp' in self._get_cookies(self.BASE_URL)

    @functools.cached_property
    def _maximum_flags(self):
        """Bitmask of content flags to request from the API (computed once, then cached).

        sfw is always requested; a logged-in session additionally requests
        nsfp, and a session whose 'me' cookie reports `verified` additionally
        requests nsfw and nsfl.
        """
        # We need to guess the flags for the content otherwise the api will raise an error
        # We can guess the maximum allowed flags for the account from the cookies
        # Bitflags are (msbf): nsfp, nsfl, nsfw, sfw
        flags = 0b0001
        if not self._is_logged_in:
            return flags

        flags |= 0b1000
        cookies = self._get_cookies(self.BASE_URL)
        if 'me' not in cookies:
            self._download_webpage(self.BASE_URL, None, 'Refreshing verification information')
            # NOTE(review): re-read after the refresh request in case _get_cookies
            # returns a snapshot rather than a live view of the cookie jar - confirm
            cookies = self._get_cookies(self.BASE_URL)
        # The 'me' cookie value is URL-quoted JSON
        if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')):
            flags |= 0b0110

        return flags

    def _call_api(self, endpoint, video_id, query=None, note='Downloading API json'):
        """Request `api/items/<endpoint>` and return the decoded JSON response.

        @param endpoint  Path component below `api/items/` (e.g. 'get', 'info')
        @param video_id  Id used for progress/error reporting
        @param query     Optional dict of query parameters
        @param note      Progress message shown while downloading

        HTTP 403 is passed as an expected status; the response body is then
        inspected for an 'error' key.

        Raises ExtractorError (expected) when the response carries an error.
        For the "...Required" content-flag errors, login is requested first
        if the session is not logged in.
        """
        data = self._download_json(
            f'{self.BASE_URL}/api/items/{endpoint}',
            video_id, note, query=query or {}, expected_status=403)

        error = traverse_obj(data, ('error', {str}))
        if error in ('nsfwRequired', 'nsflRequired', 'nsfpRequired', 'verificationRequired'):
            if not self._is_logged_in:
                self.raise_login_required()
            raise ExtractorError(f'Unverified account cannot access NSFW/NSFL ({error})', expected=True)
        elif error:
            message = traverse_obj(data, ('msg', {str})) or error
            raise ExtractorError(f'API returned error: {message}', expected=True)

        return data

    @staticmethod
    def _utc_upload_date(timestamp):
        """Format a unix timestamp as YYYYMMDD in UTC.

        UTC is used so the result does not depend on the timezone of the
        machine running the extractor.
        """
        # Local import: the module only imports `date` from datetime
        from datetime import datetime, timezone

        return datetime.fromtimestamp(timestamp, timezone.utc).strftime('%Y%m%d')

    def _real_extract(self, url):
        """Build the info dict for a single post from the 'get' (and, if logged in, 'info') API."""
        video_id = self._match_id(url)
        # Fall back to an empty dict so a response without a usable first item
        # is reported through raise_no_formats below instead of an AttributeError
        video_info = traverse_obj(
            self._call_api('get', video_id, {'id': video_id, 'flags': self._maximum_flags}),
            ('items', 0, {dict})) or {}

        source = urljoin('https://img.pr0gramm.com', video_info.get('image'))
        formats = []
        if source and source.endswith('mp4'):
            formats.append({
                'url': source,
                'ext': 'mp4',
                **traverse_obj(video_info, {
                    'width': ('width', {int}),
                    'height': ('height', {int}),
                }),
            })
        else:
            # A present but non-mp4 source (e.g. an image post) is an expected failure.
            # NOTE(review): if raise_no_formats returns instead of raising,
            # `formats` stays empty rather than pointing at a non-video URL
            self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id)

        tags = None
        if self._is_logged_in:
            metadata = self._call_api('info', video_id, {'itemId': video_id})
            tags = traverse_obj(metadata, ('tags', ..., 'tag', {str}))
            # Sorted by "confidence", higher confidence = earlier in list
            confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float})))
            # Only sort when every tag has a confidence; zipping lists of different
            # lengths would silently drop tags and pair them with the wrong values
            if confidences and len(confidences) == len(tags):
                tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)]

        uploader = traverse_obj(video_info, ('user', {str}))
        title = f'pr0gramm-{video_id}'
        if uploader:
            title += f' by {uploader}'

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'tags': tags,
            # Bits 0b110 are the nsfw and nsfl flags
            'age_limit': 18 if traverse_obj(video_info, ('flags', {0b110.__and__})) else 0,
            # Keep download archives written under the former Pr0grammStatic extractor valid
            '_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)],
            **traverse_obj(video_info, {
                'uploader': ('user', {str}),
                'uploader_id': ('userId', {int}),
                'like_count': ('up', {int}),
                'dislike_count': ('down', {int}),
                'upload_timestamp': ('created', {int}),
                'upload_date': ('created', {int}, {self._utc_upload_date}),
                'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)}),
            }),
        }

Loading…
Cancel
Save