import os import re import xml.etree.ElementTree from .ant1newsgr import Ant1NewsGrEmbedIE from .anvato import AnvatoIE from .apa import APAIE from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE from .arte import ArteTVEmbedIE from .bitchute import BitChuteIE from .blogger import BloggerIE from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE from .channel9 import Channel9IE from .cloudflarestream import CloudflareStreamIE from .common import InfoExtractor from .commonprotocols import RtmpIE from .condenast import CondeNastIE from .dailymail import DailyMailIE from .dailymotion import DailymotionIE from .dbtv import DBTVIE from .digiteka import DigitekaIE from .drtuber import DrTuberIE from .eagleplatform import EaglePlatformIE from .ertgr import ERTWebtvEmbedIE from .expressen import ExpressenIE from .facebook import FacebookIE from .foxnews import FoxNewsIE from .gedidigital import GediDigitalIE from .gfycat import GfycatIE from .glomex import GlomexEmbedIE from .googledrive import GoogleDriveIE from .indavideo import IndavideoEmbedIE from .instagram import InstagramIE from .joj import JojIE from .jwplatform import JWPlatformIE from .kaltura import KalturaIE from .kinja import KinjaEmbedIE from .limelight import LimelightBaseIE from .mainstreaming import MainStreamingIE from .medialaan import MedialaanIE from .mediaset import MediasetIE from .mediasite import MediasiteIE from .megaphone import MegaphoneIE from .megatvcom import MegaTVComEmbedIE from .mofosex import MofosexEmbedIE from .mtv import MTVServicesEmbeddedIE from .myvi import MyviIE from .nbc import NBCSportsVPlayerIE from .nexx import NexxEmbedIE, NexxIE from .odnoklassniki import OdnoklassnikiIE from .onionstudios import OnionStudiosIE from .ooyala import OoyalaIE from .panopto import PanoptoBaseIE from .peertube import PeerTubeIE from .piksel import PikselIE from .pladform import PladformIE from .pornhub import PornHubIE from .rcs import RCSEmbedsIE from .redtube import RedTubeIE from .rumble import RumbleEmbedIE from .rutube import RutubeIE from .rutv import RUTVIE from .ruutu import RuutuIE from .senategov import SenateISVPIE from .simplecast import SimplecastIE from .soundcloud import SoundcloudEmbedIE from .spankwire import SpankwireIE from .sportbox import SportBoxIE from .springboardplatform import SpringboardPlatformIE from .svt import SVTIE from .teachable import TeachableIE from .ted import TedEmbedIE from .theplatform import ThePlatformIE from .threeqsdn import ThreeQSDNIE from .tnaflix import TNAFlixNetworkEmbedIE from .tube8 import Tube8IE from .tunein import TuneInBaseIE from .tvc import TVCIE from .tvopengr import TVOpenGrEmbedIE from .tvp import TVPEmbedIE from .twentymin import TwentyMinutenIE from .udn import UDNEmbedIE from .ustream import UstreamIE from .vbox7 import Vbox7IE from .vice import ViceIE from .videa import VideaIE from .videomore import VideomoreIE from .videopress import VideoPressIE from .viewlift import ViewLiftEmbedIE from .vimeo import VHXEmbedIE, VimeoIE from .viqeo import ViqeoIE from .vk import VKIE from .vshare import VShareIE from .vzaar import VzaarIE from .washingtonpost import WashingtonPostIE from .webcaster import WebcasterFeedIE from .wimtv import WimTVIE from .wistia import WistiaIE from .xfileshare import XFileShareIE from .xhamster import XHamsterEmbedIE from .yapfiles import YapFilesIE from .youporn import YouPornIE from .youtube import YoutubeIE from .zype import ZypeIE from ..compat import ( compat_etree_fromstring, compat_str, compat_urllib_parse_unquote, compat_urlparse, ) from ..utils import ( KNOWN_EXTENSIONS, ExtractorError, HEADRequest, UnsupportedError, determine_ext, dict_get, float_or_none, int_or_none, is_html, js_to_json, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_resolution, sanitized_Request, smuggle_url, str_or_none, unescapeHTML, unified_timestamp, unsmuggle_url, url_or_none, xpath_attr, xpath_text, xpath_with_ns, ) class GenericIE(InfoExtractor): IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = 'generic' _NETRC_MACHINE = False # Supress username warning _TESTS = [ # Direct link to a video { 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', 'md5': '67d406c2bcb6af27fa886f31aa934bbe', 'info_dict': { 'id': 'trailer', 'ext': 'mp4', 'title': 'trailer', 'upload_date': '20100513', } }, # Direct link to media delivered compressed (until Accept-Encoding is *) { 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', 'md5': '128c42e68b13950268b648275386fc74', 'info_dict': { 'id': 'FictionJunction-Parallel_Hearts', 'ext': 'flac', 'title': 'FictionJunction-Parallel_Hearts', 'upload_date': '20140522', }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' ], 'skip': 'URL invalid', }, # Direct download with broken HEAD { 'url': 'http://ai-radio.org:8000/radio.opus', 'info_dict': { 'id': 'radio', 'ext': 'opus', 'title': 'radio', }, 'params': { 'skip_download': True, # infinite live stream }, 'expected_warnings': [ r'501.*Not Implemented', r'400.*Bad Request', ], }, # Direct link with incorrect MIME type { 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', 'md5': '4ccbebe5f36706d85221f204d7eb5913', 'info_dict': { 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', 'id': '5_Lennart_Poettering_-_Systemd', 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'upload_date': '20141120', }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' ] }, # RSS feed { 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', 'info_dict': { 'id': 'https://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', 'description': 're:.*groundbreaking video review series.*' }, 'playlist_mincount': 11, }, # RSS feed with enclosure { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'info_dict': { 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'title': 'MSNBC Rachel Maddow (video)', 'description': 're:.*her unique approach to storytelling.*', }, 'playlist': [{ 'info_dict': { 'ext': 'mov', 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', 'description': 're:.*her unique approach to storytelling.*', 'upload_date': '20201204', }, }], }, # RSS feed with item with description and thumbnails { 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', 'info_dict': { 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', 'title': 're:.*100% Hydrogen.*', 'description': 're:.*In this episode.*', }, 'playlist': [{ 'info_dict': { 'ext': 'm4a', 'id': 'c1c879525ce2cb640b344507e682c36d', 'title': 're:Hydrogen!', 'description': 're:.*In this episode we are going.*', 'timestamp': 1567977776, 'upload_date': '20190908', 'duration': 459, 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 1, 'season_number': 1, 'age_limit': 0, 'season': 'Season 1', 'direct': True, 'episode': 'Episode 1', }, }], 'params': { 'skip_download': True, }, }, # RSS feed with enclosures and unsupported link URLs { 'url': 'http://www.hellointernet.fm/podcast?format=rss', 'info_dict': { 'id': 'http://www.hellointernet.fm/podcast?format=rss', 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', 'title': 'Hello Internet', }, 'playlist_mincount': 100, }, # RSS feed with guid { 'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', 'info_dict': { 'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', 'description': 'md5:be809a44b63b0c56fb485caf68685520', 'title': 'The Little Red Podcast', }, 'playlist_mincount': 76, }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', 'info_dict': { 'id': 'smil', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', 'upload_date': '20130627', 'formats': 'mincount:16', 'subtitles': 'mincount:1', }, 'params': { 'force_generic_extractor': True, 'skip_download': True, }, }, # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html { 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', 'info_dict': { 'id': 'hds', 'ext': 'flv', 'title': 'hds', 'formats': 'mincount:1', }, 'params': { 'skip_download': True, }, }, # SMIL from https://www.restudy.dk/video/play/id/1637 { 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', 'info_dict': { 'id': 'video_1637', 'ext': 'flv', 'title': 'video_1637', 'formats': 'mincount:3', }, 'params': { 'skip_download': True, }, }, # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm { 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', 'info_dict': { 'id': 'smil-service', 'ext': 'flv', 'title': 'smil-service', 'formats': 'mincount:1', }, 'params': { 'skip_download': True, }, }, # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 { 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', 'info_dict': { 'id': '4719370', 'ext': 'mp4', 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', 'formats': 'mincount:3', }, 'params': { 'skip_download': True, }, }, # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html { 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', 'info_dict': { 'id': 'mZlp2ctYIUEB', 'ext': 'mp4', 'title': 'Tikibad ontruimd wegens brand', 'description': 'md5:05ca046ff47b931f9b04855015e163a4', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 33, }, 'params': { 'skip_download': True, }, }, # MPD from http://dash-mse-test.appspot.com/media.html { 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', 'info_dict': { 'id': 'car-20120827-manifest', 'ext': 'mp4', 'title': 'car-20120827-manifest', 'formats': 'mincount:9', 'upload_date': '20130904', }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 { 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', 'info_dict': { 'id': 'content', 'ext': 'mp4', 'title': 'content', 'formats': 'mincount:8', }, 'params': { # m3u8 downloads 'skip_download': True, }, 'skip': 'video gone', }, # m3u8 served with Content-Type: text/plain { 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', 'info_dict': { 'id': 'index', 'ext': 'mp4', 'title': 'index', 'upload_date': '20140720', 'formats': 'mincount:11', }, 'params': { # m3u8 downloads 'skip_download': True, }, 'skip': 'video gone', }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', 'info_dict': { 'id': 'cmQHVoWB5FY', 'ext': 'mp4', 'upload_date': '20130224', 'uploader_id': 'TheVerge', 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, 'params': { 'skip_download': False, } }, { # redirect in Refresh HTTP header 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', 'info_dict': { 'id': 'pO8h3EaFRdo', 'ext': 'mp4', 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', 'upload_date': '20150917', 'uploader_id': 'brtvofficial', 'uploader': 'Boiler Room', }, 'params': { 'skip_download': False, }, }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', 'info_dict': { 'id': '13601338388002', 'ext': 'mp4', 'uploader': 'www.hodiho.fr', 'title': 'R\u00e9gis plante sa Jeep', } }, # bandcamp page with custom domain { 'add_ie': ['Bandcamp'], 'url': 'http://bronyrock.com/track/the-pony-mash', 'info_dict': { 'id': '3235767654', 'ext': 'mp3', 'title': 'The Pony Mash', 'uploader': 'M_Pallante', }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, { # embedded brightcove video # it also tests brightcove videos that need to set the 'Referer' # in the http requests 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 'info_dict': { 'id': '2765128793001', 'ext': 'mp4', 'title': 'Le cours de bourse : l’analyse technique', 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', 'uploader': 'BFM BUSINESS', }, 'params': { 'skip_download': True, }, }, { # embedded with itemprop embedURL and video id spelled as `idVideo` 'add_id': ['BrightcoveLegacy'], 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', 'info_dict': { 'id': '5255628253001', 'ext': 'mp4', 'title': 'md5:37c519b1128915607601e75a87995fc0', 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', 'uploader': 'BFM BUSINESS', 'uploader_id': '876450612001', 'timestamp': 1482255315, 'upload_date': '20161220', }, 'params': { 'skip_download': True, }, }, { # https://github.com/ytdl-org/youtube-dl/issues/2253 'url': 'http://bcove.me/i6nfkrc3', 'md5': '0ba9446db037002366bab3b3eb30c88c', 'info_dict': { 'id': '3101154703001', 'ext': 'mp4', 'title': 'Still no power', 'uploader': 'thestar.com', 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', }, 'add_ie': ['BrightcoveLegacy'], 'skip': 'video gone', }, { 'url': 'http://www.championat.com/video/football/v/87/87499.html', 'md5': 'fb973ecf6e4a78a67453647444222983', 'info_dict': { 'id': '3414141473001', 'ext': 'mp4', 'title': 'Видео. Удаление Дзагоева (ЦСКА)', 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', 'uploader': 'Championat', }, }, { # https://github.com/ytdl-org/youtube-dl/issues/3541 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', 'info_dict': { 'id': '3866516442001', 'ext': 'mp4', 'title': 'Leer mij vrouwen kennen: Aflevering 1', 'description': 'Leer mij vrouwen kennen: Aflevering 1', 'uploader': 'SBS Broadcasting', }, 'skip': 'Restricted to Netherlands', 'params': { 'skip_download': True, # m3u8 download }, }, { # Brightcove video in