From c76c96677ff6a056f5844a568ef05ee22c46d6f4 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 11 Nov 2023 14:03:50 -0600 Subject: [PATCH] [ie/thisoldhouse] Add login support (#8561) Closes #8257 Authored by: bashonly --- yt_dlp/extractor/thisoldhouse.py | 71 +++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/thisoldhouse.py b/yt_dlp/extractor/thisoldhouse.py index cc7beeea5..15f8380d3 100644 --- a/yt_dlp/extractor/thisoldhouse.py +++ b/yt_dlp/extractor/thisoldhouse.py @@ -1,11 +1,23 @@ +import json + from .common import InfoExtractor +from .zype import ZypeIE from ..networking import HEADRequest +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + filter_dict, + parse_qs, + try_call, + urlencode_postdata, +) class ThisOldHouseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/]+/)?\d+)/(?P<id>[^/?#]+)' + _NETRC_MACHINE = 'thisoldhouse' + _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/?#]+/)?\d+)/(?P<id>[^/?#]+)' _TESTS = [{ - 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', + 'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench', 'info_dict': { 'id': '5dcdddf673c3f956ef5db202', 'ext': 'mp4', @@ -23,13 +35,16 @@ class ThisOldHouseIE(InfoExtractor): 'skip_download': True, }, }, { + # Page no longer has video 'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins', 'only_matching': True, }, { + # 404 Not Found 'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric', 'only_matching': True, }, { - 'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench', + # 404 Not Found + 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', 'only_matching': True, }, { 'url': 
'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost', @@ -39,17 +54,51 @@ class ThisOldHouseIE(InfoExtractor): 'url': 'https://www.thisoldhouse.com/21083431/seaside-transformation-the-westerly-project', 'only_matching': True, }] - _ZYPE_TMPL = 'https://player.zype.com/embed/%s.html?api_key=hsOk_yMSPYNrT22e9pu8hihLXjaZf0JW5jsOWv4ZqyHJFvkJn6rtToHl09tbbsbe' + + _LOGIN_URL = 'https://login.thisoldhouse.com/usernamepassword/login' + + def _perform_login(self, username, password): + self._request_webpage( + HEADRequest('https://www.thisoldhouse.com/insider'), None, 'Requesting session cookies') + urlh = self._request_webpage( + 'https://www.thisoldhouse.com/wp-login.php', None, 'Requesting login info', + errnote='Unable to login', query={'redirect_to': 'https://www.thisoldhouse.com/insider'}) + + try: + auth_form = self._download_webpage( + self._LOGIN_URL, None, 'Submitting credentials', headers={ + 'Content-Type': 'application/json', + 'Referer': urlh.url, + }, data=json.dumps(filter_dict({ + **{('client_id' if k == 'client' else k): v[0] for k, v in parse_qs(urlh.url).items()}, + 'tenant': 'thisoldhouse', + 'username': username, + 'password': password, + 'popup_options': {}, + 'sso': True, + '_csrf': try_call(lambda: self._get_cookies(self._LOGIN_URL)['_csrf'].value), + '_intstate': 'deprecated', + }), separators=(',', ':')).encode()) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + raise ExtractorError('Invalid username or password', expected=True) + raise + + self._request_webpage( + 'https://login.thisoldhouse.com/login/callback', None, 'Completing login', + data=urlencode_postdata(self._hidden_inputs(auth_form))) def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) if 'To Unlock This content' in webpage: - self.raise_login_required(method='cookies') - video_url = self._search_regex( + self.raise_login_required( + 'This video is only available for 
subscribers. ' + 'Note that --cookies-from-browser may not work due to this site using session cookies') + + video_url, video_id = self._search_regex( r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]', - webpage, 'video url') - if 'subscription_required=true' in video_url or 'c-entry-group-labels__image' in webpage: - return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).url, 'Zype', display_id) - video_id = self._search_regex(r'(?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', video_url, 'video id') - return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id) + webpage, 'video url', group=(1, 2)) + video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url + + return self.url_result(video_url, ZypeIE, video_id)