[extractor/bitchute] Improve `BitChuteChannelIE` (#5066)

Authored by: flashdagger, pukkandan
pull/5474/head
MMM 2 years ago committed by GitHub
parent 8fddc232bf
commit c61473c1d6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -1,14 +1,18 @@
import itertools import functools
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
HEADRequest, HEADRequest,
OnDemandPagedList,
clean_html, clean_html,
get_element_by_class, get_element_by_class,
get_elements_html_by_class,
int_or_none, int_or_none,
orderedSet, orderedSet,
parse_count,
parse_duration,
traverse_obj, traverse_obj,
unified_strdate, unified_strdate,
urlencode_postdata, urlencode_postdata,
@@ -109,51 +113,103 @@ class BitChuteIE(InfoExtractor):
class BitChuteChannelIE(InfoExtractor): class BitChuteChannelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)'
_TEST = { _TESTS = [{
'url': 'https://www.bitchute.com/channel/victoriaxrave/', 'url': 'https://www.bitchute.com/channel/bitchute/',
'playlist_mincount': 185,
'info_dict': { 'info_dict': {
'id': 'victoriaxrave', 'id': 'bitchute',
'title': 'BitChute',
'description': 'md5:5329fb3866125afa9446835594a9b138',
}, },
} 'playlist': [
{
'md5': '7e427d7ed7af5a75b5855705ec750e2b',
'info_dict': {
'id': 'UGlrF9o9b-Q',
'ext': 'mp4',
'filesize': None,
'title': 'This is the first video on #BitChute !',
'description': 'md5:a0337e7b1fe39e32336974af8173a034',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'BitChute',
'upload_date': '20170103',
'duration': 16,
'view_count': int,
},
}
],
'params': {
'skip_download': True,
'playlist_items': '-1',
},
}, {
'url': 'https://www.bitchute.com/playlist/wV9Imujxasw9/',
'playlist_mincount': 20,
'info_dict': {
'id': 'wV9Imujxasw9',
'title': 'Bruce MacDonald and "The Light of Darkness"',
'description': 'md5:04913227d2714af1d36d804aa2ab6b1e',
}
}]
_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
PAGE_SIZE = 25
HTML_CLASS_NAMES = {
'channel': {
'container': 'channel-videos-container',
'title': 'channel-videos-title',
'description': 'channel-videos-text',
},
'playlist': {
'container': 'playlist-video',
'title': 'title',
'description': 'description',
}
}
def _entries(self, channel_id): @staticmethod
channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id def _make_url(playlist_id, playlist_type):
offset = 0 return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/'
for page_num in itertools.count(1):
data = self._download_json( def _fetch_page(self, playlist_id, playlist_type, page_num):
'%sextend/' % channel_url, channel_id, playlist_url = self._make_url(playlist_id, playlist_type)
'Downloading channel page %d' % page_num, data = self._download_json(
data=urlencode_postdata({ f'{playlist_url}extend/', playlist_id, f'Downloading page {page_num}',
'csrfmiddlewaretoken': self._TOKEN, data=urlencode_postdata({
'name': '', 'csrfmiddlewaretoken': self._TOKEN,
'offset': offset, 'name': '',
}), headers={ 'offset': page_num * self.PAGE_SIZE,
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }), headers={
'Referer': channel_url, 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With': 'XMLHttpRequest', 'Referer': playlist_url,
'Cookie': 'csrftoken=%s' % self._TOKEN, 'X-Requested-With': 'XMLHttpRequest',
}) 'Cookie': f'csrftoken={self._TOKEN}',
if data.get('success') is False: })
break if not data.get('success'):
html = data.get('html') return
if not html: classes = self.HTML_CLASS_NAMES[playlist_type]
break for video_html in get_elements_html_by_class(classes['container'], data.get('html')):
video_ids = re.findall( video_id = self._search_regex(
r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', r'<a\s[^>]*\bhref=["\']/video/([^"\'/]+)', video_html, 'video id', default=None)
html) if not video_id:
if not video_ids: continue
break yield self.url_result(
offset += len(video_ids) f'https://www.bitchute.com/video/{video_id}', BitChuteIE, video_id, url_transparent=True,
for video_id in video_ids: title=clean_html(get_element_by_class(classes['title'], video_html)),
yield self.url_result( description=clean_html(get_element_by_class(classes['description'], video_html)),
'https://www.bitchute.com/video/%s' % video_id, duration=parse_duration(get_element_by_class('video-duration', video_html)),
ie=BitChuteIE.ie_key(), video_id=video_id) view_count=parse_count(clean_html(get_element_by_class('video-views', video_html))))
def _real_extract(self, url): def _real_extract(self, url):
channel_id = self._match_id(url) playlist_type, playlist_id = self._match_valid_url(url).group('type', 'id')
webpage = self._download_webpage(self._make_url(playlist_id, playlist_type), playlist_id)
page_func = functools.partial(self._fetch_page, playlist_id, playlist_type)
return self.playlist_result( return self.playlist_result(
self._entries(channel_id), playlist_id=channel_id) OnDemandPagedList(page_func, self.PAGE_SIZE), playlist_id,
title=self._html_extract_title(webpage, default=None),
description=self._html_search_meta(
('description', 'og:description', 'twitter:description'), webpage, default=None),
playlist_count=int_or_none(self._html_search_regex(
r'<span>(\d+)\s+videos?</span>', webpage, 'playlist count', default=None)))

@@ -418,6 +418,8 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w
Return the text (content) and the html (whole) of the tag with the specified Return the text (content) and the html (whole) of the tag with the specified
attribute in the passed HTML document attribute in the passed HTML document
""" """
if not value:
return
quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?' quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

Loading…
Cancel
Save