[LinkedIn] Add extractor for public post videos

- Register LinkedInIE in yt_dlp/extractor/extractors.py.
- Introduce LinkedInBaseIE and move the login flow (_real_initialize) into it
  from LinkedInLearningBaseIE, which now subclasses it and only supplies
  _LOGIN_URL.
- Add LinkedInIE, which reads title, description, reaction count, author and
  the <video> tag's data-sources JSON from a linkedin.com/posts/ page.
- Drop the `video_data = None` initialisation in
  LinkedInLearningIE._real_extract.

NOTE(review): LinkedInIE inherits _real_initialize, which reads
self._LOGIN_URL, but within this patch only LinkedInLearningBaseIE defines
that attribute - confirm whether LinkedInIE needs its own _LOGIN_URL before
credentials are used with it.

diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 200c59bbe..106006671 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -704,6 +704,7 @@ from .line import (
     LineLiveChannelIE,
 )
 from .linkedin import (
+    LinkedInIE,
     LinkedInLearningIE,
     LinkedInLearningCourseIE,
 )
diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py
index c2d347efd..9255b3301 100644
--- a/yt_dlp/extractor/linkedin.py
+++ b/yt_dlp/extractor/linkedin.py
@@ -6,21 +6,56 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    clean_html,
+    extract_attributes,
     ExtractorError,
     float_or_none,
+    get_element_by_class,
     int_or_none,
     srt_subtitles_timecode,
+    strip_or_none,
+    mimetype2ext,
     try_get,
     urlencode_postdata,
     urljoin,
 )
 
 
-class LinkedInLearningBaseIE(InfoExtractor):
+class LinkedInBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'linkedin'
-    _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning'
     _logged_in = False
 
+    def _real_initialize(self):
+        if self._logged_in:
+            return
+        email, password = self._get_login_info()
+        if email is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+        action_url = urljoin(self._LOGIN_URL, self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
+            default='https://www.linkedin.com/uas/login-submit', group='url'))
+        data = self._hidden_inputs(login_page)
+        data.update({
+            'session_key': email,
+            'session_password': password,
+        })
+        login_submit_page = self._download_webpage(
+            action_url, None, 'Logging in',
+            data=urlencode_postdata(data))
+        error = self._search_regex(
+            r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>',
+            login_submit_page, 'error', default=None)
+        if error:
+            raise ExtractorError(error, expected=True)
+        LinkedInBaseIE._logged_in = True
+
+
+class LinkedInLearningBaseIE(LinkedInBaseIE):
+    _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning'
+
     def _call_api(self, course_slug, fields, video_slug=None, resolution=None):
         query = {
             'courseSlug': course_slug,
@@ -52,32 +87,47 @@ class LinkedInLearningBaseIE(InfoExtractor):
     def _get_video_id(self, video_data, course_slug, video_slug):
         return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)
 
-    def _real_initialize(self):
-        if self._logged_in:
-            return
-        email, password = self._get_login_info()
-        if email is None:
-            return
 
-        login_page = self._download_webpage(
-            self._LOGIN_URL, None, 'Downloading login page')
-        action_url = urljoin(self._LOGIN_URL, self._search_regex(
-            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
-            default='https://www.linkedin.com/uas/login-submit', group='url'))
-        data = self._hidden_inputs(login_page)
-        data.update({
-            'session_key': email,
-            'session_password': password,
-        })
-        login_submit_page = self._download_webpage(
-            action_url, None, 'Logging in',
-            data=urlencode_postdata(data))
-        error = self._search_regex(
-            r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>',
-            login_submit_page, 'error', default=None)
-        if error:
-            raise ExtractorError(error, expected=True)
-        LinkedInLearningBaseIE._logged_in = True
+class LinkedInIE(LinkedInBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/.+?(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20',
+        'info_dict': {
+            'id': '6850898786781339649',
+            'ext': 'mp4',
+            'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing',
+            'description': 'md5:be125430bab1c574f16aeb186a4d5b19',
+            'creator': 'Mishal K.'
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+        description = clean_html(get_element_by_class('share-update-card__update-text', webpage))
+        like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage))
+        creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage)))
+
+        sources = self._parse_json(extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video'))['data-sources'], video_id)
+        formats = [{
+            'url': source['src'],
+            'ext': mimetype2ext(source.get('type')),
+            'tbr': float_or_none(source.get('data-bitrate'), scale=1000),
+        } for source in sources]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'like_count': like_count,
+            'creator': creator,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': description,
+        }
 
 
 class LinkedInLearningIE(LinkedInLearningBaseIE):
@@ -108,7 +158,6 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
     def _real_extract(self, url):
         course_slug, video_slug = self._match_valid_url(url).groups()
 
-        video_data = None
         formats = []
         for width, height in ((640, 360), (960, 540), (1280, 720)):
             video_data = self._call_api(