diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 5df13ad9b..3b0ef323d 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -77,3 +77,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 644c87a7e..c8702c356 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -89,3 +89,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 59d0474c2..5a6d2b0fb 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -85,3 +85,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index e20739673..a17770f61 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -70,3 +70,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index e06db9ccf..c600a9dcb 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -64,3 +64,11 @@ body: [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 571223a9c..57bc9daf5 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -70,3 +70,11 @@ body: [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 12ec5b0d8..bd2e42d9a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -266,7 +266,7 @@ jobs: # We need to ignore wheels otherwise we break universal2 builds python3 -m pip install -U --no-binary :all: -r requirements.txt # We need to fuse our own universal2 wheels for curl_cffi - python3 -m pip install -U delocate + python3 -m pip install -U 'delocate==0.11.0' mkdir curl_cffi_whls curl_cffi_universal2 python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do @@ -409,7 +409,7 @@ jobs: run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build python devscripts/install_deps.py --include curl-cffi - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.7.0-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.10.0-py3-none-any.whl" - name: Prepare run: | @@ -469,7 +469,7 @@ jobs: run: | python devscripts/install_deps.py -o --include build python devscripts/install_deps.py - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.7.0-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.10.0-py3-none-any.whl" - name: Prepare run: | diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index fdfdebc65..21a64efa9 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -55,6 +55,7 @@ jobs: - name: Install test requirements run: python3 ./devscripts/install_deps.py --include test --include curl-cffi - name: Run tests + timeout-minutes: 15 continue-on-error: False run: | python3 -m yt_dlp -v || true # Print debug head diff --git a/.github/workflows/issue-lockdown.yml 
b/.github/workflows/issue-lockdown.yml new file mode 100644 index 000000000..4b973e2e6 --- /dev/null +++ b/.github/workflows/issue-lockdown.yml @@ -0,0 +1,21 @@ +name: Issue Lockdown +on: + issues: + types: [opened] + +permissions: + issues: write + +jobs: + lockdown: + name: Issue Lockdown + if: vars.ISSUE_LOCKDOWN + runs-on: ubuntu-latest + steps: + - name: "Lock new issue" + env: + GH_TOKEN: ${{ github.token }} + ISSUE_NUMBER: ${{ github.event.issue.number }} + REPOSITORY: ${{ github.repository }} + run: | + gh issue lock "${ISSUE_NUMBER}" -R "${REPOSITORY}" diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 3afb51a30..1571d3cab 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -15,8 +15,9 @@ jobs: with: python-version: '3.8' - name: Install test requirements - run: python3 ./devscripts/install_deps.py --include test + run: python3 ./devscripts/install_deps.py -o --include test - name: Run tests + timeout-minutes: 15 run: | python3 -m yt_dlp -v || true python3 ./devscripts/run_tests.py core diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fa5ad7e51..8d0bc4026 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -204,7 +204,7 @@ jobs: git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" git add -u git commit -m "Release ${{ env.version }}" \ - -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all :ci run dl" + -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all" git push origin --force ${{ github.event.ref }}:release - name: Get target commitish @@ -325,7 +325,7 @@ jobs: "(https://github.com/yt-dlp/yt-dlp-master-builds/releases/latest \"Master builds\")"' || '' }} > ./RELEASE_NOTES printf '\n\n' >> ./RELEASE_NOTES cat >> ./RELEASE_NOTES << EOF - #### A description of the various files are in the [README](https://github.com/${{ github.repository }}#release-files) + #### 
A description of the various files is in the [README](https://github.com/${{ github.repository }}#release-files) --- $(python ./devscripts/make_changelog.py -vv --collapsible) EOF diff --git a/.github/workflows/sanitize-comment.yml b/.github/workflows/sanitize-comment.yml new file mode 100644 index 000000000..45c87cdd4 --- /dev/null +++ b/.github/workflows/sanitize-comment.yml @@ -0,0 +1,17 @@ +name: Sanitize comment + +on: + issue_comment: + types: [created, edited] + +permissions: + issues: write + +jobs: + sanitize-comment: + name: Sanitize comment + if: vars.SANITIZE_COMMENT && !github.event.issue.pull_request + runs-on: ubuntu-latest + steps: + - name: Sanitize comment + uses: yt-dlp/sanitize-comment@v1 diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 489ab7da8..c80f71405 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -657,3 +657,19 @@ luvyana szantnerb hugepower scribblemaniac +Codenade +Demon000 +Deukhoofd +grqz +hibes +Khaoklong51 +kieraneglin +lengzuo +naglis +ndyanx +otovalek +quad +rakslice +sahilsinghss73 +tony-hn +xingchensong diff --git a/Changelog.md b/Changelog.md index 0b96ab29c..2ef28fa07 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,96 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.09.27 + +#### Important changes +- **The minimum *recommended* Python version has been raised to 3.9** +Since Python 3.8 will reach end-of-life in October 2024, support for it will be dropped soon. 
[Read more](https://github.com/yt-dlp/yt-dlp/issues/10086) + +#### Core changes +- [Allow `none` arg to negate `--convert-subs` and `--convert-thumbnails`](https://github.com/yt-dlp/yt-dlp/commit/c08e0b20b5edd8957b8318716bc14e896d1b96f4) ([#11066](https://github.com/yt-dlp/yt-dlp/issues/11066)) by [kieraneglin](https://github.com/kieraneglin) +- [Fix format sorting bug with vp9.2 vcodec](https://github.com/yt-dlp/yt-dlp/commit/8f4ea14680c7865d8ffac10a9174205d1d84ada7) ([#10884](https://github.com/yt-dlp/yt-dlp/issues/10884)) by [rakslice](https://github.com/rakslice) +- [Raise minimum recommended Python version to 3.9](https://github.com/yt-dlp/yt-dlp/commit/cca534cd9e6850c70244f225a4a1895ef4bcdbec) ([#11098](https://github.com/yt-dlp/yt-dlp/issues/11098)) by [bashonly](https://github.com/bashonly) +- **cookies**: [Improve error message for Windows `--cookies-from-browser chrome` issue](https://github.com/yt-dlp/yt-dlp/commit/b397a64691421ace5df09457c2a764821a2dc6f2) ([#11090](https://github.com/yt-dlp/yt-dlp/issues/11090)) by [seproDev](https://github.com/seproDev) +- **utils**: `mimetype2ext`: [Recognize `aacp` as `aac`](https://github.com/yt-dlp/yt-dlp/commit/cc85596d5b59f0c14e9381b3675f619c1e12e597) ([#10860](https://github.com/yt-dlp/yt-dlp/issues/10860)) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- [Fix JW Player format parsing](https://github.com/yt-dlp/yt-dlp/commit/409f8e9e3b4bde81ef76fc563256f876d2ff8099) ([#10956](https://github.com/yt-dlp/yt-dlp/issues/10956)) by [seproDev](https://github.com/seproDev) +- [Handle decode errors when reading responses](https://github.com/yt-dlp/yt-dlp/commit/325001317d97f4545d66fac44c4ba772c6f45f22) ([#10868](https://github.com/yt-dlp/yt-dlp/issues/10868)) by [bashonly](https://github.com/bashonly) +- **abc.net.au**: iview, showseries: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/7f909046f4dc0fba472b4963145aef6e0d42491b) ([#11101](https://github.com/yt-dlp/yt-dlp/issues/11101)) by 
[bashonly](https://github.com/bashonly) +- **adn**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/cc88a54bb1ef285154775f8a6a413335ce4c71ce) ([#10749](https://github.com/yt-dlp/yt-dlp/issues/10749)) by [infanf](https://github.com/infanf) +- **asobistage**: [Support redirected URLs](https://github.com/yt-dlp/yt-dlp/commit/a7d3235c84dac57a127cbe0ff38f7f7c2fdd8fa0) ([#10768](https://github.com/yt-dlp/yt-dlp/issues/10768)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **bandcamp**: user: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/5d0176547f16a3642cd71627126e9dfc24981e20) ([#10328](https://github.com/yt-dlp/yt-dlp/issues/10328)) by [bashonly](https://github.com/bashonly), [quad](https://github.com/quad) +- **beacon**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b4760c778d0c92c6e3f2bc8346cd72c8f08595ae) ([#9901](https://github.com/yt-dlp/yt-dlp/issues/9901)) by [Deukhoofd](https://github.com/Deukhoofd) +- **bilibili** + - [Fix chapters and subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/a2000bc85730c950351d78bb818493dc39dca3cb) ([#11099](https://github.com/yt-dlp/yt-dlp/issues/11099)) by [bashonly](https://github.com/bashonly) + - [Fix festival URL support](https://github.com/yt-dlp/yt-dlp/commit/b43bd864851f2862e26caa85461c5d825d49d463) ([#10740](https://github.com/yt-dlp/yt-dlp/issues/10740)) by [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz) +- **biliintl**: [Fix referer header](https://github.com/yt-dlp/yt-dlp/commit/a06bb586795ebab87a2356923acfc674d6f0e152) ([#11003](https://github.com/yt-dlp/yt-dlp/issues/11003)) by [Khaoklong51](https://github.com/Khaoklong51) +- **dropbox**: [Fix password-protected video support](https://github.com/yt-dlp/yt-dlp/commit/63da31b3b29af90062d8a72a905ffe4b5e499042) ([#10735](https://github.com/yt-dlp/yt-dlp/issues/10735)) by [ndyanx](https://github.com/ndyanx) +- **ertgr**: [Fix video 
extraction](https://github.com/yt-dlp/yt-dlp/commit/416686ed0cf792ec44ab059f3b229dd776077e14) ([#11091](https://github.com/yt-dlp/yt-dlp/issues/11091)) by [seproDev](https://github.com/seproDev) +- **eurosport**: [Support local URL variants](https://github.com/yt-dlp/yt-dlp/commit/f0bb28504c8c2b75ee3e5796aed50de2a7f90a1b) ([#10785](https://github.com/yt-dlp/yt-dlp/issues/10785)) by [seproDev](https://github.com/seproDev) +- **facebook** + - ads: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d62fef7e07d454c0d2ba2d69fb96d691dba1ded0) ([#10704](https://github.com/yt-dlp/yt-dlp/issues/10704)) by [kclauhk](https://github.com/kclauhk) + - reel: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/0e1b941c6b2caa688b0d3332e723d16dbafa4311) by [lengzuo](https://github.com/lengzuo) +- **germanupa**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/124f058b546d652a359c67025bb479789bfbef0b) ([#10538](https://github.com/yt-dlp/yt-dlp/issues/10538)) by [grqz](https://github.com/grqz) +- **hgtvde**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/a555389c9bb32e589e00b4664974423fb7b04dcd) ([#10992](https://github.com/yt-dlp/yt-dlp/issues/10992)) by [bashonly](https://github.com/bashonly), [rdamas](https://github.com/rdamas) +- **huya**: video: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/25c1cdaa2650563494d3bf00a38f72d0d9486bff) ([#10686](https://github.com/yt-dlp/yt-dlp/issues/10686)) by [hugepower](https://github.com/hugepower) +- **iprima**: [Fix zoom URL support](https://github.com/yt-dlp/yt-dlp/commit/4a27b8f092f7f7c10b7a334d3535c97c2af02f0a) ([#10959](https://github.com/yt-dlp/yt-dlp/issues/10959)) by [otovalek](https://github.com/otovalek) +- **khanacademy**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0fba08485b6445b72b5b63ae23ca2a73fa5d967f) ([#10913](https://github.com/yt-dlp/yt-dlp/issues/10913)) by [seproDev](https://github.com/seproDev) +- **kick** + - clips: [Support new URL 
format](https://github.com/yt-dlp/yt-dlp/commit/0aa4426e9a35f7f8e184f1f2082b3b313c1448f7) ([#11107](https://github.com/yt-dlp/yt-dlp/issues/11107)) by [bashonly](https://github.com/bashonly) + - vod: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/173d54c151b987409e3eb09552d8d89ed8fc50f7) ([#10988](https://github.com/yt-dlp/yt-dlp/issues/10988)) by [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz) +- **kika**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e6f48ca80821939c1fd11ec2a0cdbf2fba9b258a) ([#5788](https://github.com/yt-dlp/yt-dlp/issues/5788)) by [1100101](https://github.com/1100101) +- **lnkgo**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/fa83d0b36bc43d30fe9241c1e923f4614864b758) ([#10904](https://github.com/yt-dlp/yt-dlp/issues/10904)) by [naglis](https://github.com/naglis) +- **loom**: [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/7509d692b37a7ec6230ea75bfe1e44a8de5eefce) ([#10760](https://github.com/yt-dlp/yt-dlp/issues/10760)) by [kclauhk](https://github.com/kclauhk) +- **mediaklikk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/e2b3634e299be9c16a247ece3b1858d83889c324) ([#11083](https://github.com/yt-dlp/yt-dlp/issues/11083)) by [szantnerb](https://github.com/szantnerb) +- **mojevideo**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/28b0ecba2af5b4919f198474b3d00a76ef322c31) ([#11019](https://github.com/yt-dlp/yt-dlp/issues/11019)) by [04-pasha-04](https://github.com/04-pasha-04), [pzhlkj6612](https://github.com/pzhlkj6612) +- **niconico**: [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/eabb4680fdb09ba1f48d174a700a2e3b43f82add) ([#11103](https://github.com/yt-dlp/yt-dlp/issues/11103)) by [bashonly](https://github.com/bashonly) +- **nzz**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4a9bc8c3630378bc29f0266126b503f6190c0430) ([#10461](https://github.com/yt-dlp/yt-dlp/issues/10461)) by 
[1-Byte](https://github.com/1-Byte) +- **patreoncampaign**: [Support API URLs](https://github.com/yt-dlp/yt-dlp/commit/232e6db30c474d1b387e405342f34173ceeaf832) ([#10734](https://github.com/yt-dlp/yt-dlp/issues/10734)) by [bashonly](https://github.com/bashonly), [hibes](https://github.com/hibes) +- **pinterest**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/c8c078fe28b0ffc15ef9646346c00c592fe71a78) ([#10867](https://github.com/yt-dlp/yt-dlp/issues/10867)) by [bashonly](https://github.com/bashonly), [sahilsinghss73](https://github.com/sahilsinghss73) +- **radiko**: [Extract unique `id` values](https://github.com/yt-dlp/yt-dlp/commit/c8d096c5ce111411fbdbe2abb8fed54f317a6182) ([#10726](https://github.com/yt-dlp/yt-dlp/issues/10726)) by [garret1317](https://github.com/garret1317) +- **rtp**: [Support more subpages](https://github.com/yt-dlp/yt-dlp/commit/d02df303d8e49390599db9f34482697e4d1cf5b2) ([#10787](https://github.com/yt-dlp/yt-dlp/issues/10787)) by [Demon000](https://github.com/Demon000) +- **rumblechannel**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ad0b857f459a6d390fbf124183916218c52f223a) ([#11049](https://github.com/yt-dlp/yt-dlp/issues/11049)) by [tony-hn](https://github.com/tony-hn) +- **rutube**: [Support livestreams](https://github.com/yt-dlp/yt-dlp/commit/41be32e78c3845000dbac188ffb90ea3ea7c4dfa) ([#10844](https://github.com/yt-dlp/yt-dlp/issues/10844)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **samplefocus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/46f4c80bc363ee8116c33d37f65202e6c3470954) ([#10947](https://github.com/yt-dlp/yt-dlp/issues/10947)) by [seproDev](https://github.com/seproDev) +- **screenrec**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/36f9e602ad55679764bc75a4f67f7562b1d6adcf) ([#10917](https://github.com/yt-dlp/yt-dlp/issues/10917)) by [naglis](https://github.com/naglis) +- **sen**: [Add 
extractor](https://github.com/yt-dlp/yt-dlp/commit/41a241ca6ffb95b3d9aaf4f42106ca8cba9af1a6) ([#10952](https://github.com/yt-dlp/yt-dlp/issues/10952)) by [seproDev](https://github.com/seproDev) +- **servus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/300c91274f7ea5b1b0528fc5ee11cf1a61d4079e) ([#10944](https://github.com/yt-dlp/yt-dlp/issues/10944)) by [seproDev](https://github.com/seproDev) +- **snapchatspotlight**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b37417e4f934fd8909788b493d017777155b0ae5) ([#11030](https://github.com/yt-dlp/yt-dlp/issues/11030)) by [seproDev](https://github.com/seproDev) +- **svtpage**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5a8a05aebb49693e78e1123015837ed5e961ff76) ([#11010](https://github.com/yt-dlp/yt-dlp/issues/11010)) by [diman8](https://github.com/diman8) +- **tenplay**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d8d473002b654ab0e7b97ead869f58b4361eeae1) ([#10928](https://github.com/yt-dlp/yt-dlp/issues/10928)) by [aarubui](https://github.com/aarubui) +- **tiktok**: [Fix web formats extraction](https://github.com/yt-dlp/yt-dlp/commit/3ad0b7f422d547204df687b6d0b2d9110fff3990) ([#11074](https://github.com/yt-dlp/yt-dlp/issues/11074)) by [bashonly](https://github.com/bashonly) +- **twitter**: spaces: [Support video spaces](https://github.com/yt-dlp/yt-dlp/commit/bef1d4d6fc9493fda7f75e2289c07c507d10092f) ([#10789](https://github.com/yt-dlp/yt-dlp/issues/10789)) by [bashonly](https://github.com/bashonly) +- **vidflex**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e978c312d6550a6ae4c9df18001afb1b420cb72f) ([#10002](https://github.com/yt-dlp/yt-dlp/issues/10002)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **vimeo** + - [Always try to extract original format](https://github.com/yt-dlp/yt-dlp/commit/4115c24d157c5b5f63089d75c4e0f51d1f8b4489) ([#10721](https://github.com/yt-dlp/yt-dlp/issues/10721)) by [bashonly](https://github.com/bashonly) (With fixes in 
[e8e6a98](https://github.com/yt-dlp/yt-dlp/commit/e8e6a982a1b659eed434d225d7922f632bac6568) by [seproDev](https://github.com/seproDev)) + - [Fix HLS audio format sorting](https://github.com/yt-dlp/yt-dlp/commit/a1b4ac2b8ed8e6eaa56044d439f1e0d00c2ba218) ([#11082](https://github.com/yt-dlp/yt-dlp/issues/11082)) by [fireattack](https://github.com/fireattack) +- **watchespn**: [Improve auth support](https://github.com/yt-dlp/yt-dlp/commit/7adff8caf152dcf96d03aff69ed8545c0a63567c) ([#10910](https://github.com/yt-dlp/yt-dlp/issues/10910)) by [ischmidt20](https://github.com/ischmidt20) +- **wistia**: [Support password-protected videos](https://github.com/yt-dlp/yt-dlp/commit/9f5c9a90898c5a1e672922d9cd799716c73cee34) ([#11100](https://github.com/yt-dlp/yt-dlp/issues/11100)) by [bashonly](https://github.com/bashonly) +- **ximalaya**: [Add VIP support](https://github.com/yt-dlp/yt-dlp/commit/3dfd720d098b4d49d69cfc77e6376f22bcd90934) ([#10832](https://github.com/yt-dlp/yt-dlp/issues/10832)) by [seproDev](https://github.com/seproDev), [xingchensong](https://github.com/xingchensong) +- **xinpianchang**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/3aa0156e05662923d130ddbc1c82596e38c01a00) ([#10950](https://github.com/yt-dlp/yt-dlp/issues/10950)) by [seproDev](https://github.com/seproDev) +- **yleareena**: [Support podcasts](https://github.com/yt-dlp/yt-dlp/commit/48d629d461e05b1b19f5e53dc959bb9ebe95da42) ([#11104](https://github.com/yt-dlp/yt-dlp/issues/11104)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Add `po_token`, `visitor_data`, `data_sync_id` extractor args](https://github.com/yt-dlp/yt-dlp/commit/3a3bd00037e9908e87da4fa9f2ad772aa34dc60e) ([#10648](https://github.com/yt-dlp/yt-dlp/issues/10648)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [seproDev](https://github.com/seproDev) (With fixes in [fa2be9a](https://github.com/yt-dlp/yt-dlp/commit/fa2be9a7c63babede07480151363e54eee5702bd) by 
[bashonly](https://github.com/bashonly)) + - [Support excluding `player_client`s in extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/49f3741a820ed142f6866317c2e7d247b130960e) ([#10710](https://github.com/yt-dlp/yt-dlp/issues/10710)) by [bashonly](https://github.com/bashonly) + - clip: [Prioritize `https` formats](https://github.com/yt-dlp/yt-dlp/commit/1d84b780cf33a1d84756825ac23f990a905703df) ([#11102](https://github.com/yt-dlp/yt-dlp/issues/11102)) by [bashonly](https://github.com/bashonly) + - tab: [Fix shorts tab extraction](https://github.com/yt-dlp/yt-dlp/commit/9431777b4c37129a6093080c77ca59960afbb9d7) ([#10938](https://github.com/yt-dlp/yt-dlp/issues/10938)) by [seproDev](https://github.com/seproDev) + +#### Networking changes +- [Fix handler not being added to RequestError](https://github.com/yt-dlp/yt-dlp/commit/d1c4d88b2d912e8da5e76db455562ca63b1af690) ([#10955](https://github.com/yt-dlp/yt-dlp/issues/10955)) by [coletdjnz](https://github.com/coletdjnz) +- [Pin `curl-cffi` version to < 0.7.2](https://github.com/yt-dlp/yt-dlp/commit/5bb1aa04dafce13ba9de707ea53169fab58b5207) ([#11092](https://github.com/yt-dlp/yt-dlp/issues/11092)) by [bashonly](https://github.com/bashonly) +- **Request Handler**: websockets: [Upgrade websockets to 13.0](https://github.com/yt-dlp/yt-dlp/commit/6f9e6537434562d513d0c9b68ced8a61ade94a64) ([#10815](https://github.com/yt-dlp/yt-dlp/issues/10815)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. 
changes +- **build** + - [Bump PyInstaller version pin to `>=6.10.0`](https://github.com/yt-dlp/yt-dlp/commit/fb8b7f226d251e521a89b23c415e249e5b788e5c) ([#10709](https://github.com/yt-dlp/yt-dlp/issues/10709)) by [bashonly](https://github.com/bashonly) + - [Pin `delocate` version for `macos`](https://github.com/yt-dlp/yt-dlp/commit/7e41628ff523b3fe373b0981a5db441358980dab) ([#10901](https://github.com/yt-dlp/yt-dlp/issues/10901)) by [bashonly](https://github.com/bashonly) +- **ci** + - [Add comment sanitization workflow](https://github.com/yt-dlp/yt-dlp/commit/b6200bdcf3a9415ae36859188f9a57e3e461c696) ([#10915](https://github.com/yt-dlp/yt-dlp/issues/10915)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Add issue tracker anti-spam protection](https://github.com/yt-dlp/yt-dlp/commit/ad9a8115aa29a1a95c961b16fcf129a228d98f50) ([#10861](https://github.com/yt-dlp/yt-dlp/issues/10861)) by [bashonly](https://github.com/bashonly) +- **cleanup**: Miscellaneous: [c6387ab](https://github.com/yt-dlp/yt-dlp/commit/c6387abc1af9842bb0541288a5610abba9b1ab51) by [bashonly](https://github.com/bashonly), [Codenade](https://github.com/Codenade), [coletdjnz](https://github.com/coletdjnz), [grqz](https://github.com/grqz), [Grub4K](https://github.com/Grub4K), [pzhlkj6612](https://github.com/pzhlkj6612), [seproDev](https://github.com/seproDev) + ### 2024.08.06 #### Core changes diff --git a/README.md b/README.md index ca32e09bf..3e76a4efb 100644 --- a/README.md +++ b/README.md @@ -200,7 +200,7 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly The following provide support for impersonating browser requests. This may be required for some sites that employ TLS fingerprinting. -* [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. 
Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE) +* [**curl_cffi**](https://github.com/lexiforest/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lexiforest/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/lexiforest/curl_cffi/blob/main/LICENSE) * Can be installed with the `curl-cffi` group, e.g. `pip install "yt-dlp[default,curl-cffi]"` * Currently included in `yt-dlp.exe`, `yt-dlp_linux` and `yt-dlp_macos` builds @@ -459,17 +459,17 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git conditions. Use a "\" to escape "&" or quotes if needed. If used multiple times, the filter matches if at least one of the - conditions is met. E.g. --match-filter - !is_live --match-filter "like_count>?100 & + conditions is met. E.g. --match-filters + !is_live --match-filters "like_count>?100 & description~='(?i)\bcats \& dogs\b'" matches only videos that are not live OR those that have a like count more than 100 (or the like field is not available) and also has a description that contains the phrase "cats & - dogs" (caseless). Use "--match-filter -" to + dogs" (caseless). 
Use "--match-filters -" to interactively ask whether to download each video - --no-match-filters Do not use any --match-filter (default) + --no-match-filters Do not use any --match-filters (default) --break-match-filters FILTER Same as "--match-filters" but stops the download process when a video is rejected --no-break-match-filters Do not use any --break-match-filters (default) @@ -490,7 +490,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git encountering a file that is in the archive (default) --break-per-input Alters --max-downloads, --break-on-existing, - --break-match-filter, and autonumber to + --break-match-filters, and autonumber to reset per input URL --no-break-per-input --break-on-existing and similar options terminates the entire download queue @@ -999,12 +999,16 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git be used multiple times --no-exec Remove any previously defined --exec --convert-subs FORMAT Convert the subtitles to another format - (currently supported: ass, lrc, srt, vtt) - (Alias: --convert-subtitles) + (currently supported: ass, lrc, srt, vtt). + Use "--convert-subs none" to disable + conversion (default) (Alias: --convert- + subtitles) --convert-thumbnails FORMAT Convert the thumbnails to another format (currently supported: jpg, png, webp). You can specify multiple rules using similar - syntax as --remux-video + syntax as "--remux-video". Use "--convert- + thumbnails none" to disable conversion + (default) --split-chapters Split video into multiple files based on internal chapters. The "chapter:" prefix can be used with "--paths" and "--output" to set @@ -1767,7 +1771,7 @@ The following extractors use this feature: #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. 
See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,web_creator` is used, and `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,web_creator` is used, and `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` * `player_skip`: Skip some network requests that are generally needed for robust extraction. 
One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) @@ -1777,6 +1781,9 @@ The following extractors use this feature: * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning +* `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` +* `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) +* `po_token`: Proof of Origin (PO) Token(s) to use for requesting video playback. Comma separated list of PO Tokens in the format `CLIENT+PO_TOKEN`, e.g. `youtube:po_token=web+XXX,android+YYY` #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. 
This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) @@ -2177,9 +2184,9 @@ with yt_dlp.YoutubeDL(ydl_opts) as ydl: * **Output template improvements**: Output templates can now have date-time formatting, numeric offsets, object traversal etc. See [output template](#output-template) for details. Even more advanced operations can also be done with the help of `--parse-metadata` and `--replace-in-metadata` -* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-match-filter` etc +* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-match-filters` etc -* **Improvements**: Regex and other operators in `--format`/`--match-filter`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc +* **Improvements**: Regex and other operators in `--format`/`--match-filters`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc * **Plugins**: Extractors and PostProcessors can be loaded from an external file. See [plugins](#plugins) for details @@ -2220,7 +2227,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. 
self-signed), use `--compat-options no-certifi` * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior * ~~yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [aria2c](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is~~ -* yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this +* yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filters` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this * yt-dlp versions between 2021.11.10 and 2023.06.21 estimated `filesize_approx` values for fragmented/manifest formats. This was added for convenience in [f2fe69](https://github.com/yt-dlp/yt-dlp/commit/f2fe69c7b0d208bdb1f6292b4ae92bc1e1a7444a), but was reverted in [0dff8e](https://github.com/yt-dlp/yt-dlp/commit/0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f) due to the potentially extreme inaccuracy of the estimated values. Use `--compat-options manifest-filesize-approx` to keep extracting the estimated values * yt-dlp uses modern http client backends such as `requests`. 
Use `--compat-options prefer-legacy-http-handler` to prefer the legacy http handler (`urllib`) to be used for standard http requests. * The sub-modules `swfinterp`, `casefold` are removed. @@ -2266,11 +2273,11 @@ While these options are redundant, they are still expected to be used due to the --get-thumbnail --print thumbnail -e, --get-title --print title -g, --get-url --print urls - --match-title REGEX --match-filter "title ~= (?i)REGEX" - --reject-title REGEX --match-filter "title !~= (?i)REGEX" - --min-views COUNT --match-filter "view_count >=? COUNT" - --max-views COUNT --match-filter "view_count <=? COUNT" - --break-on-reject Use --break-match-filter + --match-title REGEX --match-filters "title ~= (?i)REGEX" + --reject-title REGEX --match-filters "title !~= (?i)REGEX" + --min-views COUNT --match-filters "view_count >=? COUNT" + --max-views COUNT --match-filters "view_count <=? COUNT" + --break-on-reject Use --break-match-filters --user-agent UA --add-header "User-Agent:UA" --referer URL --add-header "Referer:URL" --playlist-start NUMBER -I NUMBER: diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 5189de2d7..7be750cfb 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -185,5 +185,10 @@ "action": "add", "when": "6075a029dba70a89675ae1250e7cdfd91f0eba41", "short": "[priority] Security: [[ie/douyutv] Do not use dangerous javascript source/URL](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3v33-3wmw-3785)\n - A dependency on potentially malicious third-party JavaScript code has been removed from the Douyu extractors" + }, + { + "action": "add", + "when": "fb8b7f226d251e521a89b23c415e249e5b788e5c", + "short": "[priority] **The minimum *recommended* Python version has been raised to 3.9**\nSince Python 3.8 will reach end-of-life in October 2024, support for it will be dropped soon. 
[Read more](https://github.com/yt-dlp/yt-dlp/issues/10086)" } ] diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index a5d59f3c0..8135689c7 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -46,6 +46,14 @@ VERBOSE_TMPL = ''' render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. '''.strip() NO_SKIP = ''' diff --git a/pyproject.toml b/pyproject.toml index d5480e1c6..f54980d57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,14 +49,14 @@ dependencies = [ "pycryptodomex", "requests>=2.32.2,<3", "urllib3>=1.26.17,<3", - "websockets>=12.0", + "websockets>=13.0", ] [project.optional-dependencies] default = [] curl-cffi = [ "curl-cffi==0.5.10; os_name=='nt' and implementation_name=='cpython'", - "curl-cffi>=0.5.10,!=0.6.*,<0.8; os_name!='nt' and implementation_name=='cpython'", + "curl-cffi>=0.5.10,!=0.6.*,<0.7.2; os_name!='nt' and implementation_name=='cpython'", ] secretstorage = [ "cffi", @@ -76,13 +76,13 @@ dev = [ ] static-analysis = [ "autopep8~=2.0", - "ruff~=0.5.0", + "ruff~=0.6.0", ] test = [ "pytest~=8.1", ] pyinstaller = [ - "pyinstaller>=6.7.0", # for compat with setuptools>=70 + "pyinstaller>=6.10.0", # Windows temp cleanup fixed in 6.10.0 ] py2exe = [ "py2exe>=0.12", diff --git a/supportedsites.md b/supportedsites.md index d1ba12258..153774677 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -143,6 +143,7 @@ - **BBVTV**: [*bbvtv*](## "netrc machine") - **BBVTVLive**: [*bbvtv*](## "netrc machine") - **BBVTVRecordings**: [*bbvtv*](## "netrc machine") + - **BeaconTv** - **BeatBumpPlaylist** - **BeatBumpVideo** - **Beatport** @@ -505,6 +506,7 @@ - 
**gem.cbc.ca:playlist** - **Genius** - **GeniusLyrics** + - **Germanupa**: germanupa.de - **GetCourseRu**: [*getcourseru*](## "netrc machine") - **GetCourseRuPlayer** - **Gettr** @@ -580,6 +582,7 @@ - **HungamaAlbumPlaylist** - **HungamaSong** - **huya:live**: huya.com + - **huya:video**: 虎牙视频 - **Hypem** - **Hytale** - **Icareus** @@ -660,6 +663,7 @@ - **kick:vod** - **Kicker** - **KickStarter** + - **Kika**: KiKA.de - **kinja:embed** - **KinoPoisk** - **Kommunetv** @@ -722,7 +726,6 @@ - **livestream:original** - **Livestreamfails** - **Lnk** - - **LnkGo** - **loc**: Library of Congress - **loom** - **loom:folder** @@ -756,7 +759,7 @@ - **Masters** - **MatchTV** - **MBN**: mbn.co.kr (매일방송) - - **MDR**: MDR.DE and KiKA + - **MDR**: MDR.DE - **MedalTV** - **media.ccc.de** - **media.ccc.de:lists** @@ -811,6 +814,7 @@ - **MNetTVLive**: [*mnettv*](## "netrc machine") - **MNetTVRecordings**: [*mnettv*](## "netrc machine") - **MochaVideo** + - **Mojevideo**: mojevideo.sk - **Mojvideo** - **Monstercat** - **MonsterSirenHypergryphMusic** @@ -1285,12 +1289,14 @@ - **Screencast** - **Screencastify** - **ScreencastOMatic** + - **ScreenRec** - **ScrippsNetworks** - **scrippsnetworks:watch** - **Scrolller** - **SCTE**: [*scte*](## "netrc machine") (**Currently broken**) - **SCTECourse**: [*scte*](## "netrc machine") (**Currently broken**) - **sejm** + - **Sen** - **SenalColombiaLive**: (**Currently broken**) - **SenateGov** - **SenateISVP** @@ -1327,6 +1333,7 @@ - **SlidesLive** - **Slutload** - **Smotrim** + - **SnapchatSpotlight** - **Snotr** - **Sohu** - **SohuV** @@ -1610,6 +1617,7 @@ - **videomore:season** - **videomore:video** - **VideoPress** + - **Vidflex** - **Vidio**: [*vidio*](## "netrc machine") - **VidioLive**: [*vidio*](## "netrc machine") - **VidioPremier**: [*vidio*](## "netrc machine") @@ -1738,7 +1746,7 @@ - **XiaoHongShu**: 小红书 - **ximalaya**: 喜马拉雅FM - **ximalaya:album**: 喜马拉雅FM 专辑 - - **xinpianchang**: xinpianchang.com (**Currently broken**) + - 
**Xinpianchang**: 新片场 - **XMinus**: (**Currently broken**) - **XNXX** - **Xstream** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 1847c4ffd..a99e62408 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -236,6 +236,35 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot') + def test_format_selection_by_vcodec_sort(self): + formats = [ + {'format_id': 'av1-format', 'ext': 'mp4', 'vcodec': 'av1', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vp9-hdr-format', 'ext': 'mp4', 'vcodec': 'vp09.02.50.10.01.09.18.09.00', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vp9-sdr-format', 'ext': 'mp4', 'vcodec': 'vp09.00.50.08', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'h265-format', 'ext': 'mp4', 'vcodec': 'h265', 'acodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['vcodec:vp9.2']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-hdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['vcodec:vp9']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-sdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['+vcodec:vp9.2']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-hdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['+vcodec:vp9']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-sdr-format') + def test_format_selection_string_ops(self): formats = [ {'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL}, diff --git a/test/test_networking.py b/test/test_networking.py 
index 826f11a56..d96624af1 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -822,6 +822,24 @@ class TestRequestHandlerMisc: rh.close() assert len(logging_handlers) == before_count + def test_wrap_request_errors(self): + class TestRequestHandler(RequestHandler): + def _validate(self, request): + if request.headers.get('x-fail'): + raise UnsupportedRequest('test error') + + def _send(self, request: Request): + raise RequestError('test error') + + with TestRequestHandler(logger=FakeLogger()) as rh: + with pytest.raises(UnsupportedRequest, match='test error') as exc_info: + rh.validate(Request('http://example.com', headers={'x-fail': '1'})) + assert exc_info.value.handler is rh + + with pytest.raises(RequestError, match='test error') as exc_info: + rh.send(Request('http://example.com')) + assert exc_info.value.handler is rh + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) class TestUrllibRequestHandler(TestRequestHandlerBase): diff --git a/test/test_utils.py b/test/test_utils.py index a2b459352..4f5fa1e10 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -921,6 +921,11 @@ class TestUtil(unittest.TestCase): 'acodec': 'none', 'dynamic_range': 'HDR10', }) + self.assertEqual(parse_codecs('vp09.02.50.10.01.09.18.09.00'), { + 'vcodec': 'vp09.02.50.10.01.09.18.09.00', + 'acodec': 'none', + 'dynamic_range': 'HDR10', + }) self.assertEqual(parse_codecs('av01.0.12M.10.0.110.09.16.09.0'), { 'vcodec': 'av01.0.12M.10.0.110.09.16.09.0', 'acodec': 'none', diff --git a/test/test_websockets.py b/test/test_websockets.py index 43f20ac65..06112cc0b 100644 --- a/test/test_websockets.py +++ b/test/test_websockets.py @@ -88,7 +88,7 @@ def create_wss_websocket_server(): certfn = os.path.join(TEST_DIR, 'testcert.pem') sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) sslctx.load_cert_chain(certfn, None) - return create_websocket_server(ssl_context=sslctx) + return create_websocket_server(ssl=sslctx) MTLS_CERT_DIR = os.path.join(TEST_DIR, 'testdata', 
'certificate') @@ -103,7 +103,7 @@ def create_mtls_wss_websocket_server(): sslctx.load_verify_locations(cafile=cacertfn) sslctx.load_cert_chain(certfn, None) - return create_websocket_server(ssl_context=sslctx) + return create_websocket_server(ssl=sslctx) def create_legacy_wss_websocket_server(): @@ -112,7 +112,7 @@ def create_legacy_wss_websocket_server(): sslctx.maximum_version = ssl.TLSVersion.TLSv1_2 sslctx.set_ciphers('SHA1:AESCCM:aDSS:eNULL:aNULL') sslctx.load_cert_chain(certfn, None) - return create_websocket_server(ssl_context=sslctx) + return create_websocket_server(ssl=sslctx) def ws_validate_and_send(rh, req): @@ -139,7 +139,7 @@ class TestWebsSocketRequestHandlerConformance: cls.wss_thread, cls.wss_port = create_wss_websocket_server() cls.wss_base_url = f'wss://127.0.0.1:{cls.wss_port}' - cls.bad_wss_thread, cls.bad_wss_port = create_websocket_server(ssl_context=ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)) + cls.bad_wss_thread, cls.bad_wss_port = create_websocket_server(ssl=ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)) cls.bad_wss_host = f'wss://127.0.0.1:{cls.bad_wss_port}' cls.mtls_wss_thread, cls.mtls_wss_port = create_mtls_wss_websocket_server() diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index c0b8e3b50..c2d19f94a 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -235,6 +235,11 @@ def validate_options(opts): validate_regex('format sorting', f, FormatSorter.regex) # Postprocessor formats + if opts.convertsubtitles == 'none': + opts.convertsubtitles = None + if opts.convertthumbnails == 'none': + opts.convertthumbnails = None + validate_regex('merge output format', opts.merge_output_format, r'({0})(/({0}))*'.format('|'.join(map(re.escape, FFmpegMergerPP.SUPPORTED_EXTS)))) validate_regex('audio format', opts.audioformat, FFmpegExtractAudioPP.FORMAT_RE) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 070d2fcb9..cff8d74a7 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1053,8 +1053,9 @@ def 
_decrypt_windows_dpapi(ciphertext, logger): ctypes.byref(blob_out), # pDataOut ) if not ret: - logger.warning('failed to decrypt with DPAPI', only_once=True) - return None + message = 'Failed to decrypt with DPAPI. See https://github.com/yt-dlp/yt-dlp/issues/10927 for more info' + logger.error(message) + raise DownloadError(message) # force exit result = ctypes.string_at(blob_out.pbData, blob_out.cbData) ctypes.windll.kernel32.LocalFree(blob_out.pbData) diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index ae2372915..6c1ec403c 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -508,7 +508,7 @@ class FFmpegFD(ExternalFD): env = None proxy = self.params.get('proxy') if proxy: - if not re.match(r'^[\da-zA-Z]+://', proxy): + if not re.match(r'[\da-zA-Z]+://', proxy): proxy = f'http://{proxy}' if proxy.startswith('socks'): @@ -559,7 +559,7 @@ class FFmpegFD(ExternalFD): selected_formats = info_dict.get('requested_formats') or [info_dict] for i, fmt in enumerate(selected_formats): - is_http = re.match(r'^https?://', fmt['url']) + is_http = re.match(r'https?://', fmt['url']) cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) if is_http else [] if cookies: args.extend(['-cookies', ''.join( diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 60e8cc782..933ef1787 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -217,6 +217,7 @@ from .bbc import ( BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, ) +from .beacon import BeaconTvIE from .beatbump import ( BeatBumpPlaylistIE, BeatBumpVideoIE, @@ -729,6 +730,7 @@ from .genius import ( GeniusIE, GeniusLyricsIE, ) +from .germanupa import GermanupaIE from .getcourseru import ( GetCourseRuIE, GetCourseRuPlayerIE, @@ -822,7 +824,10 @@ from .hungama import ( HungamaIE, HungamaSongIE, ) -from .huya import HuyaLiveIE +from .huya import ( + HuyaLiveIE, + HuyaVideoIE, +) from .hypem import HypemIE from 
.hypergryph import MonsterSirenHypergryphMusicIE from .hytale import HytaleIE @@ -945,6 +950,7 @@ from .kick import ( ) from .kicker import KickerIE from .kickstarter import KickStarterIE +from .kika import KikaIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .kommunetv import KommunetvIE @@ -1036,10 +1042,7 @@ from .livestream import ( LivestreamShortenerIE, ) from .livestreamfails import LivestreamfailsIE -from .lnkgo import ( - LnkGoIE, - LnkIE, -) +from .lnk import LnkIE from .loom import ( LoomFolderIE, LoomIE, @@ -1164,6 +1167,7 @@ from .mlb import ( ) from .mlssoccer import MLSSoccerIE from .mocha import MochaVideoIE +from .mojevideo import MojevideoIE from .mojvideo import MojvideoIE from .monstercat import MonstercatIE from .motherless import ( @@ -1810,6 +1814,7 @@ from .screen9 import Screen9IE from .screencast import ScreencastIE from .screencastify import ScreencastifyIE from .screencastomatic import ScreencastOMaticIE +from .screenrec import ScreenRecIE from .scrippsnetworks import ( ScrippsNetworksIE, ScrippsNetworksWatchIE, @@ -1820,6 +1825,7 @@ from .scte import ( SCTECourseIE, ) from .sejmpl import SejmIE +from .sen import SenIE from .senalcolombia import SenalColombiaLiveIE from .senategov import ( SenateGovIE, @@ -1875,6 +1881,7 @@ from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE from .slutload import SlutloadIE from .smotrim import SmotrimIE +from .snapchat import SnapchatSpotlightIE from .snotr import SnotrIE from .sohu import ( SohuIE, @@ -2315,6 +2322,7 @@ from .videomore import ( VideomoreVideoIE, ) from .videopress import VideoPressIE +from .vidflex import VidflexIE from .vidio import ( VidioIE, VidioLiveIE, diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index 7518ba6f0..7296be73b 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -387,17 +387,27 @@ class ABCIViewShowSeriesIE(InfoExtractor): 'thumbnail': 
r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$', }, 'playlist_count': 15, + 'skip': 'This program is not currently available in ABC iview', + }, { + 'url': 'https://iview.abc.net.au/show/inbestigators', + 'info_dict': { + 'id': '175343-1', + 'title': 'Series 1', + 'description': 'md5:b9976935a6450e5b78ce2a940a755685', + 'series': 'The Inbestigators', + 'season': 'Series 1', + 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.+\.jpg', + }, + 'playlist_count': 17, }] def _real_extract(self, url): show_id = self._match_id(url) webpage = self._download_webpage(url, show_id) - webpage_data = self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;', - webpage, 'initial state') - video_data = self._parse_json( - unescapeHTML(webpage_data).encode().decode('unicode_escape'), show_id) - video_data = video_data['route']['pageData']['_embedded'] + video_data = self._search_json( + r'window\.__INITIAL_STATE__\s*=\s*[\'"]', webpage, 'initial state', show_id, + transform_source=lambda x: x.encode().decode('unicode_escape'), + end_pattern=r'[\'"]\s*;')['route']['pageData']['_embedded'] highlight = try_get(video_data, lambda x: x['highlightVideo']['shareUrl']) if not self._yes_playlist(show_id, bool(highlight), video_label='highlight video'): diff --git a/yt_dlp/extractor/academicearth.py b/yt_dlp/extractor/academicearth.py index d9691cb5c..b997a0288 100644 --- a/yt_dlp/extractor/academicearth.py +++ b/yt_dlp/extractor/academicearth.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class AcademicEarthCourseIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P[^?#/]+)' + _VALID_URL = r'https?://(?:www\.)?academicearth\.org/playlists/(?P[^?#/]+)' IE_NAME = 'AcademicEarth:Course' _TEST = { 'url': 'http://academicearth.org/playlists/laws-of-nature/', diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index 337071794..c8a261375 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ 
-49,9 +49,9 @@ class ADNBaseIE(InfoExtractor): class ADNIE(ADNBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.com/(?:(?Pde)/)?video/[^/?#]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?animationdigitalnetwork\.com/(?:(?Pde)/)?video/[^/?#]+/(?P\d+)' _TESTS = [{ - 'url': 'https://animationdigitalnetwork.com/video/fruits-basket/9841-episode-1-a-ce-soir', + 'url': 'https://animationdigitalnetwork.com/video/558-fruits-basket/9841-episode-1-a-ce-soir', 'md5': '1c9ef066ceb302c86f80c2b371615261', 'info_dict': { 'id': '9841', @@ -71,10 +71,7 @@ class ADNIE(ADNBaseIE): }, 'skip': 'Only available in French and German speaking Europe', }, { - 'url': 'http://animedigitalnetwork.com/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'only_matching': True, - }, { - 'url': 'https://animationdigitalnetwork.com/de/video/the-eminence-in-shadow/23550-folge-1', + 'url': 'https://animationdigitalnetwork.com/de/video/973-the-eminence-in-shadow/23550-folge-1', 'md5': '5c5651bf5791fa6fcd7906012b9d94e8', 'info_dict': { 'id': '23550', @@ -167,7 +164,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' 'username': username, })) or {}).get('accessToken') if access_token: - self._HEADERS = {'authorization': 'Bearer ' + access_token} + self._HEADERS['Authorization'] = f'Bearer {access_token}' except ExtractorError as e: message = None if isinstance(e.cause, HTTPError) and e.cause.status == 401: @@ -178,6 +175,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' def _real_extract(self, url): lang, video_id = self._match_valid_url(url).group('lang', 'id') + self._HEADERS['X-Target-Distribution'] = lang or 'fr' video_base_url = self._PLAYER_BASE_URL + f'video/{video_id}/' player = self._download_json( video_base_url + 'configuration', video_id, @@ -218,7 +216,6 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' links_data = self._download_json( links_url, video_id, 
'Downloading links JSON metadata', headers={ 'X-Player-Token': authorization, - 'X-Target-Distribution': lang or 'fr', **self._HEADERS, }, query={ 'freeWithAds': 'true', @@ -257,6 +254,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' load_balancer_data = self._download_json( load_balancer_url, video_id, f'Downloading {format_id} {quality} JSON metadata', + headers=self._HEADERS, fatal=False) or {} m3u8_url = load_balancer_data.get('location') if not m3u8_url: @@ -277,7 +275,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' video = (self._download_json( self._API_BASE_URL + f'video/{video_id}', video_id, - 'Downloading additional video metadata', fatal=False) or {}).get('video') or {} + 'Downloading additional video metadata', fatal=False, headers=self._HEADERS) or {}).get('video') or {} show = video.get('show') or {} return { @@ -299,9 +297,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' class ADNSeasonIE(ADNBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.com/(?:(?Pde)/)?video/(?P[^/?#]+)/?(?:$|[#?])' + _VALID_URL = r'https?://(?:www\.)?animationdigitalnetwork\.com/(?:(?Pde)/)?video/(?P\d+)[^/?#]*/?(?:$|[#?])' _TESTS = [{ - 'url': 'https://animationdigitalnetwork.com/video/tokyo-mew-mew-new', + 'url': 'https://animationdigitalnetwork.com/video/911-tokyo-mew-mew-new', 'playlist_count': 12, 'info_dict': { 'id': '911', @@ -312,16 +310,14 @@ class ADNSeasonIE(ADNBaseIE): def _real_extract(self, url): lang, video_show_slug = self._match_valid_url(url).group('lang', 'id') + self._HEADERS['X-Target-Distribution'] = lang or 'fr' show = self._download_json( f'{self._API_BASE_URL}show/{video_show_slug}/', video_show_slug, 'Downloading show JSON metadata', headers=self._HEADERS)['show'] show_id = str(show['id']) episodes = self._download_json( f'{self._API_BASE_URL}video/show/{show_id}', video_show_slug, - 'Downloading episode list', headers={ - 
'X-Target-Distribution': lang or 'fr', - **self._HEADERS, - }, query={ + 'Downloading episode list', headers=self._HEADERS, query={ 'order': 'asc', 'limit': '-1', }) diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 6fd641347..efc79dd14 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -231,7 +231,7 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(InfoExtractor): IE_NAME = 'ARDMediathek' - _VALID_URL = r'''(?x)https:// + _VALID_URL = r'''(?x)https?:// (?:(?:beta|www)\.)?ardmediathek\.de/ (?:[^/]+/)? (?:player|live|video)/ @@ -470,7 +470,7 @@ class ARDBetaMediathekIE(InfoExtractor): class ARDMediathekCollectionIE(InfoExtractor): - _VALID_URL = r'''(?x)https:// + _VALID_URL = r'''(?x)https?:// (?:(?:beta|www)\.)?ardmediathek\.de/ (?:[^/?#]+/)? (?Psendung|serie|sammlung)/ diff --git a/yt_dlp/extractor/asobistage.py b/yt_dlp/extractor/asobistage.py index 8fa8f3edb..0437908bf 100644 --- a/yt_dlp/extractor/asobistage.py +++ b/yt_dlp/extractor/asobistage.py @@ -101,9 +101,10 @@ class AsobiStageIE(InfoExtractor): self._HEADERS['Authorization'] = f'Bearer {token}' def _real_extract(self, url): - video_id, event, type_, slug = self._match_valid_url(url).group('id', 'event', 'type', 'slug') + webpage, urlh = self._download_webpage_handle(url, self._match_id(url)) + video_id, event, type_, slug = self._match_valid_url(urlh.url).group('id', 'event', 'type', 'slug') video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_] - webpage = self._download_webpage(url, video_id) + event_data = traverse_obj( self._search_nextjs_data(webpage, video_id, default={}), ('props', 'pageProps', 'eventCMSData', { diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 61cbab5a7..0abe05982 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -1,3 +1,5 @@ +import functools +import json import random import re import time @@ -6,7 +8,9 @@ from .common import InfoExtractor from ..utils import ( 
KNOWN_EXTENSIONS, ExtractorError, + extract_attributes, float_or_none, + get_element_html_by_id, int_or_none, parse_filesize, str_or_none, @@ -17,6 +21,7 @@ from ..utils import ( url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj class BandcampIE(InfoExtractor): @@ -459,7 +464,7 @@ class BandcampUserIE(InfoExtractor): }, }, { 'url': 'https://coldworldofficial.bandcamp.com/music', - 'playlist_mincount': 10, + 'playlist_mincount': 7, 'info_dict': { 'id': 'coldworldofficial', 'title': 'Discography of coldworldofficial', @@ -473,12 +478,19 @@ class BandcampUserIE(InfoExtractor): }, }] + def _yield_items(self, webpage): + yield from ( + re.findall(r'
  • ]+>\s*]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) + + yield from traverse_obj(webpage, ( + {functools.partial(get_element_html_by_id, 'music-grid')}, {extract_attributes}, + 'data-client-items', {json.loads}, ..., 'page_url', {str})) + def _real_extract(self, url): uploader = self._match_id(url) webpage = self._download_webpage(url, uploader) - discography_data = (re.findall(r'
  • ]+>\s*]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) - return self.playlist_from_matches( - discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x)) + self._yield_items(webpage), uploader, f'Discography of {uploader}', + getter=functools.partial(urljoin, url)) diff --git a/yt_dlp/extractor/beacon.py b/yt_dlp/extractor/beacon.py new file mode 100644 index 000000000..ae47687cc --- /dev/null +++ b/yt_dlp/extractor/beacon.py @@ -0,0 +1,68 @@ +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, + traverse_obj, +) + + +class BeaconTvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?beacon\.tv/content/(?P[\w-]+)' + + _TESTS = [{ + 'url': 'https://beacon.tv/content/welcome-to-beacon', + 'md5': 'b3f5932d437f288e662f10f3bfc5bd04', + 'info_dict': { + 'id': 'welcome-to-beacon', + 'ext': 'mp4', + 'upload_date': '20240509', + 'description': 'md5:ea2bd32e71acf3f9fca6937412cc3563', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/I4CkkEvN/poster.jpg?width=720', + 'title': 'Your home for Critical Role!', + 'timestamp': 1715227200, + 'duration': 105.494, + }, + }, { + 'url': 'https://beacon.tv/content/re-slayers-take-trailer', + 'md5': 'd879b091485dbed2245094c8152afd89', + 'info_dict': { + 'id': 're-slayers-take-trailer', + 'ext': 'mp4', + 'title': 'The Re-Slayer’s Take | Official Trailer', + 'timestamp': 1715189040, + 'upload_date': '20240508', + 'duration': 53.249, + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/PW5ApIw3/poster.jpg?width=720', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + content_data = traverse_obj(self._search_nextjs_data(webpage, video_id), ( + 'props', 'pageProps', '__APOLLO_STATE__', + lambda k, v: k.startswith('Content:') and v['slug'] == video_id, any)) + if not content_data: + raise ExtractorError('Failed to extract content data') + + jwplayer_data = 
traverse_obj(content_data, ( + (('contentVideo', 'video', 'videoData'), + ('contentPodcast', 'podcast', 'audioData')), {json.loads}, {dict}, any)) + if not jwplayer_data: + if content_data.get('contentType') not in ('videoPodcast', 'video', 'podcast'): + raise ExtractorError('Content is not a video/podcast', expected=True) + if traverse_obj(content_data, ('contentTier', '__ref')) != 'MemberTier:65b258d178f89be87b4dc0a4': + self.raise_login_required('This video/podcast is for members only') + raise ExtractorError('Failed to extract content') + + return { + **self._parse_jwplayer_data(jwplayer_data, video_id), + **traverse_obj(content_data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('publishedAt', {parse_iso8601}), + }), + } diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index a84b7a6f7..62f68fbc6 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -46,6 +46,7 @@ from ..utils import ( class BilibiliBaseIE(InfoExtractor): + _HEADERS = {'Referer': 'https://www.bilibili.com/'} _FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?') _WBI_KEY_CACHE_TIMEOUT = 30 # exact expire timeout is unclear, use 30s for one session _wbi_key_cache = {} @@ -192,7 +193,7 @@ class BilibiliBaseIE(InfoExtractor): video_info = self._download_json( 'https://api.bilibili.com/x/player/v2', video_id, query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid}, - note=f'Extracting subtitle info {cid}') + note=f'Extracting subtitle info {cid}', headers=self._HEADERS) if traverse_obj(video_info, ('data', 'need_login_subtitle')): self.report_warning( f'Subtitles are only available when logged in. 
{self._login_hint()}', only_once=True) @@ -207,7 +208,7 @@ class BilibiliBaseIE(InfoExtractor): def _get_chapters(self, aid, cid): chapters = aid and cid and self._download_json( 'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid}, - note='Extracting chapters', fatal=False) + note='Extracting chapters', fatal=False, headers=self._HEADERS) return traverse_obj(chapters, ('data', 'view_points', ..., { 'title': 'content', 'start_time': 'from', @@ -298,7 +299,7 @@ class BilibiliBaseIE(InfoExtractor): class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/[^/?#]+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -622,6 +623,10 @@ class BiliBiliIE(BilibiliBaseIE): 'ext': 'mp4', }, 'skip': 'geo-restricted', + }, { + 'note': 'has - in the last path segment of the url', + 'url': 'https://www.bilibili.com/festival/bh3-7th?bvid=BV1tr4y1f7p2&', + 'only_matching': True, }] def _real_extract(self, url): @@ -1017,8 +1022,6 @@ class BiliBiliBangumiSeasonIE(BilibiliBaseIE): class BilibiliCheeseBaseIE(BilibiliBaseIE): - _HEADERS = {'Referer': 'https://www.bilibili.com/'} - def _extract_episode(self, season_info, ep_id): episode_info = traverse_obj(season_info, ( 'episodes', lambda _, v: v['id'] == int(ep_id)), get_all=False) @@ -1848,7 +1851,7 @@ class BiliBiliPlayerIE(InfoExtractor): class BiliIntlBaseIE(InfoExtractor): _API_URL = 'https://api.bilibili.tv/intl/gateway' _NETRC_MACHINE = 'biliintl' - _HEADERS = {'Referer': 'https://www.bilibili.com/'} + _HEADERS = {'Referer': 'https://www.bilibili.tv/'} def _call_api(self, endpoint, *args, **kwargs): json = self._download_json(self._API_URL + endpoint, *args, **kwargs) diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py index b7061a7d1..ee2e56f8e 100644 --- 
a/yt_dlp/extractor/callin.py +++ b/yt_dlp/extractor/callin.py @@ -3,7 +3,7 @@ from ..utils import float_or_none, int_or_none, make_archive_id, traverse_obj class CallinIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P[-a-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?callin\.com/episode/(?P[-a-zA-Z]+)' _TESTS = [{ 'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc', 'info_dict': { diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 187f73e7b..486a4ea3c 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -35,6 +35,7 @@ from ..networking import HEADRequest, Request from ..networking.exceptions import ( HTTPError, IncompleteRead, + TransportError, network_exceptions, ) from ..networking.impersonate import ImpersonateTarget @@ -965,6 +966,9 @@ class InfoExtractor: return False content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data) + if content is False: + assert not fatal + return False return (content, urlh) @staticmethod @@ -1039,7 +1043,15 @@ class InfoExtractor: def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None, data=None): - webpage_bytes = urlh.read() + try: + webpage_bytes = urlh.read() + except TransportError as err: + errmsg = f'{video_id}: Error reading response: {err.msg}' + if fatal: + raise ExtractorError(errmsg, cause=err) + self.report_warning(errmsg) + return False + if prefix is not None: webpage_bytes = prefix + webpage_bytes if self.get_param('dump_intermediate_pages', False): @@ -2065,7 +2077,7 @@ class InfoExtractor: has_drm = HlsFD._has_drm(m3u8_doc) def format_url(url): - return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) + return url if re.match(r'https?://', url) else urllib.parse.urljoin(m3u8_url, url) if self.get_param('hls_split_discontinuity', 
False): def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None): @@ -2800,11 +2812,11 @@ class InfoExtractor: base_url_e = element.find(_add_ns('BaseURL')) if try_call(lambda: base_url_e.text) is not None: base_url = base_url_e.text + base_url - if re.match(r'^https?://', base_url): + if re.match(r'https?://', base_url): break if mpd_base_url and base_url.startswith('/'): base_url = urllib.parse.urljoin(mpd_base_url, base_url) - elif mpd_base_url and not re.match(r'^https?://', base_url): + elif mpd_base_url and not re.match(r'https?://', base_url): if not mpd_base_url.endswith('/'): mpd_base_url += '/' base_url = mpd_base_url + base_url @@ -2894,7 +2906,7 @@ class InfoExtractor: } def location_key(location): - return 'url' if re.match(r'^https?://', location) else 'path' + return 'url' if re.match(r'https?://', location) else 'path' if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: @@ -3489,7 +3501,7 @@ class InfoExtractor: continue urls.add(source_url) source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) + ext = determine_ext(source_url, default_ext=mimetype2ext(source_type)) if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index 8d7707271..86950b244 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -319,32 +319,6 @@ class DPlayIE(DPlayBaseIE): url, display_id, host, 'dplay' + country, country, domain) -class HGTVDeIE(DPlayBaseIE): - _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX - _TESTS = [{ - 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', - 'info_dict': { - 'id': '151205', - 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette', - 'ext': 'mp4', - 
'title': 'Wer braucht schon eine Toilette', - 'description': 'md5:05b40a27e7aed2c9172de34d459134e2', - 'duration': 1177.024, - 'timestamp': 1595705400, - 'upload_date': '20200725', - 'creator': 'HGTV', - 'series': 'Tiny House - klein, aber oho', - 'season_number': 3, - 'episode_number': 3, - }, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - return self._get_disco_api_info( - url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') - - class DiscoveryPlusBaseIE(DPlayBaseIE): """Subclasses must set _PRODUCT, _DISCO_API_PARAMS""" @@ -373,6 +347,45 @@ class DiscoveryPlusBaseIE(DPlayBaseIE): return self._get_disco_api_info(url, self._match_id(url), **self._DISCO_API_PARAMS) +class HGTVDeIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://de.hgtv.com/sendungen/mein-kleinstadt-traumhaus/vom-landleben-ins-loft', + 'info_dict': { + 'id': '7332936', + 'ext': 'mp4', + 'display_id': 'mein-kleinstadt-traumhaus/vom-landleben-ins-loft', + 'title': 'Vom Landleben ins Loft', + 'description': 'md5:e5f72c02c853970796dd3818f2e25745', + 'episode': 'Episode 7', + 'episode_number': 7, + 'season': 'Season 7', + 'season_number': 7, + 'series': 'Mein Kleinstadt-Traumhaus', + 'duration': 2645.0, + 'timestamp': 1725998100, + 'upload_date': '20240910', + 'creators': ['HGTV'], + 'tags': [], + 'thumbnail': 'https://eu1-prod-images.disco-api.com/2024/08/09/82a386b9-c688-32c7-b9ff-0b13865f0bae.jpeg', + }, + }] + + _PRODUCT = 'hgtv' + _DISCO_API_PARAMS = { + 'disco_host': 'eu1-prod.disco-api.com', + 'realm': 'hgtv', + 'country': 'de', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': f'realm={realm}', + 'x-disco-client': 'Alps:HyogaPlayer:0.0.0', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + + class GoDiscoveryIE(DiscoveryPlusBaseIE): _VALID_URL = r'https?://(?:go\.)?discovery\.com/video' + 
DPlayBaseIE._PATH_REGEX _TESTS = [{ diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py index 51b40df42..c12209623 100644 --- a/yt_dlp/extractor/dropbox.py +++ b/yt_dlp/extractor/dropbox.py @@ -6,8 +6,10 @@ import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, + update_url, update_url_query, url_basename, + urlencode_postdata, ) @@ -36,43 +38,58 @@ class DropboxIE(InfoExtractor): }, ] + def _yield_decoded_parts(self, webpage): + for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)): + yield base64.b64decode(encoded).decode('utf-8', 'ignore') + def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) fn = urllib.parse.unquote(url_basename(url)) title = os.path.splitext(fn)[0] - password = self.get_param('videopassword') - if (self._og_search_title(webpage) == 'Dropbox - Password Required' - or 'Enter the password for this link' in webpage): + for part in self._yield_decoded_parts(webpage): + if '/sm/password' in part: + webpage = self._download_webpage( + update_url('https://www.dropbox.com/sm/password', query=part.partition('?')[2]), video_id) + break + + if (self._og_search_title(webpage, default=None) == 'Dropbox - Password Required' + or 'Enter the password for this link' in webpage): if password: - content_id = self._search_regex(r'content_id=(.*?)["\']', webpage, 'content_id') - payload = f'is_xhr=true&t={self._get_cookies("https://www.dropbox.com").get("t").value}&content_id={content_id}&password={password}&url={url}' response = self._download_json( - 'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password', data=payload.encode(), - headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'}) + 'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password', + headers={'content-type': 'application/x-www-form-urlencoded; 
charset=UTF-8'}, + data=urlencode_postdata({ + 'is_xhr': 'true', + 't': self._get_cookies('https://www.dropbox.com')['t'].value, + 'content_id': self._search_regex(r'content_id=([\w.+=/-]+)["\']', webpage, 'content id'), + 'password': password, + 'url': url, + })) if response.get('status') != 'authed': - raise ExtractorError('Authentication failed!', expected=True) - webpage = self._download_webpage(url, video_id) - elif self._get_cookies('https://dropbox.com').get('sm_auth'): - webpage = self._download_webpage(url, video_id) - else: + raise ExtractorError('Invalid password', expected=True) + elif not self._get_cookies('https://dropbox.com').get('sm_auth'): raise ExtractorError('Password protected video, use --video-password ', expected=True) + webpage = self._download_webpage(url, video_id) - formats, subtitles, has_anonymous_download = [], {}, False - for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)): - decoded = base64.b64decode(encoded).decode('utf-8', 'ignore') + formats, subtitles = [], {} + has_anonymous_download = False + thumbnail = None + for part in self._yield_decoded_parts(webpage): if not has_anonymous_download: has_anonymous_download = self._search_regex( - r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False) + r'(anonymous:\tanonymous)', part, 'anonymous', default=False) transcode_url = self._search_regex( - r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded, 'transcode url', default=None) + r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', part, 'transcode url', default=None) if not transcode_url: continue formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id, 'mp4') + thumbnail = self._search_regex( + r'(https://www\.dropbox\.com/temp_thumb_from_token/[\w/?&=]+)', part, 'thumbnail', default=None) break # downloads enabled we can get the original file @@ -89,4 +106,5 @@ class DropboxIE(InfoExtractor): 'title': title, 'formats': formats, 'subtitles': 
subtitles, + 'thumbnail': thumbnail, } diff --git a/yt_dlp/extractor/ertgr.py b/yt_dlp/extractor/ertgr.py index 864aa6dc5..6f3f60ff4 100644 --- a/yt_dlp/extractor/ertgr.py +++ b/yt_dlp/extractor/ertgr.py @@ -17,6 +17,7 @@ from ..utils import ( url_or_none, variadic, ) +from ..utils.traversal import traverse_obj class ERTFlixBaseIE(InfoExtractor): @@ -74,29 +75,28 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): def _extract_formats_and_subs(self, video_id): media_info = self._call_api(video_id, codename=video_id) - formats, subs = [], {} - for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []: - for media in try_get(media_file, lambda x: x['Formats'], list) or []: - fmt_url = url_or_none(try_get(media, lambda x: x['Url'])) - if not fmt_url: - continue - ext = determine_ext(fmt_url) - if ext == 'm3u8': - formats_, subs_ = self._extract_m3u8_formats_and_subtitles( - fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False) - elif ext == 'mpd': - formats_, subs_ = self._extract_mpd_formats_and_subtitles( - fmt_url, video_id, mpd_id='dash', fatal=False) - else: - formats.append({ - 'url': fmt_url, - 'format_id': str_or_none(media.get('Id')), - }) - continue - formats.extend(formats_) - self._merge_subtitles(subs_, target=subs) + formats, subtitles = [], {} + for media in traverse_obj(media_info, ( + 'MediaFiles', lambda _, v: v['RoleCodename'] == 'main', + 'Formats', lambda _, v: url_or_none(v['Url']))): + fmt_url = media['Url'] + ext = determine_ext(fmt_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + fmt_url, video_id, mpd_id='dash', fatal=False) + else: + formats.append({ + 'url': fmt_url, + 'format_id': str_or_none(media.get('Id')), + }) + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) - return formats, subs + return formats, subtitles def 
_real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py index 4e9b63524..552f9af12 100644 --- a/yt_dlp/extractor/espn.py +++ b/yt_dlp/extractor/espn.py @@ -294,37 +294,37 @@ class ESPNCricInfoIE(InfoExtractor): class WatchESPNIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?espn\.com/(?:watch|espnplus)/player/_/id/(?P[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})' _TESTS = [{ - 'url': 'https://www.espn.com/watch/player/_/id/dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'url': 'https://www.espn.com/watch/player/_/id/11ce417a-6ac9-42b6-8a15-46aeb9ad5710', 'info_dict': { - 'id': 'dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'id': '11ce417a-6ac9-42b6-8a15-46aeb9ad5710', 'ext': 'mp4', - 'title': 'Huddersfield vs. Burnley', - 'duration': 7500, - 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/dbbc6b1d-c084-4b47-9878-5f13c56ce309/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', + 'title': 'Abilene Chrstn vs. Texas Tech', + 'duration': 14166, + 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/11ce417a-6ac9-42b6-8a15-46aeb9ad5710/16x9.jpg?timestamp=202407252343&showBadge=true&cb=12&package=ESPN_PLUS', }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://www.espn.com/watch/player/_/id/a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'url': 'https://www.espn.com/watch/player/_/id/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3', 'info_dict': { - 'id': 'a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'id': '90a2c85d-75e0-4b1e-a878-8e428a3cb2f3', 'ext': 'mp4', - 'title': 'Dynamo Dresden vs. VfB Stuttgart (Round #1) (German Cup)', - 'duration': 8335, - 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/bd1f3d12-0654-47d9-852e-71b85ea695c7/16x9.jpg?timestamp=202201112217&showBadge=true&cb=12&package=ESPN_PLUS', + 'title': 'UC Davis vs. 
California', + 'duration': 9547, + 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://www.espn.com/espnplus/player/_/id/317f5fd1-c78a-4ebe-824a-129e0d348421', + 'url': 'https://www.espn.com/watch/player/_/id/c4313bbe-95b5-4bb8-b251-ac143ea0fc54', 'info_dict': { - 'id': '317f5fd1-c78a-4ebe-824a-129e0d348421', + 'id': 'c4313bbe-95b5-4bb8-b251-ac143ea0fc54', 'ext': 'mp4', - 'title': 'The Wheel - Episode 10', - 'duration': 3352, - 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/317f5fd1-c78a-4ebe-824a-129e0d348421/16x9.jpg?timestamp=202205031523&showBadge=true&cb=12&package=ESPN_PLUS', + 'title': 'The College Football Show', + 'duration': 3639, + 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/c4313bbe-95b5-4bb8-b251-ac143ea0fc54/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', }, 'params': { 'skip_download': True, @@ -353,6 +353,13 @@ class WatchESPNIE(AdobePassIE): if not cookie: self.raise_login_required(method='cookies') + jwt = self._search_regex(r'=([^|]+)\|', cookie.value, 'cookie jwt') + id_token = self._download_json( + 'https://registerdisney.go.com/jgc/v6/client/ESPN-ONESITE.WEB-PROD/guest/refresh-auth', + None, 'Refreshing token', headers={'Content-Type': 'application/json'}, data=json.dumps({ + 'refreshToken': json.loads(base64.urlsafe_b64decode(f'{jwt}==='))['refresh_token'], + }).encode())['data']['token']['id_token'] + assertion = self._call_bamgrid_api( 'devices', video_id, headers={'Content-Type': 'application/json; charset=UTF-8'}, @@ -371,7 +378,7 @@ class WatchESPNIE(AdobePassIE): })['access_token'] assertion = self._call_bamgrid_api( - 'accounts/grant', video_id, payload={'id_token': cookie.value.split('|')[1]}, + 'accounts/grant', video_id, payload={'id_token': id_token}, headers={ 'Authorization': token, 'Content-Type': 
'application/json; charset=UTF-8', diff --git a/yt_dlp/extractor/eurosport.py b/yt_dlp/extractor/eurosport.py index 0c5e1238d..682546f8f 100644 --- a/yt_dlp/extractor/eurosport.py +++ b/yt_dlp/extractor/eurosport.py @@ -3,7 +3,12 @@ from ..utils import traverse_obj class EurosportIE(InfoExtractor): - _VALID_URL = r'https?://www\.eurosport\.com/\w+/(?:[\w-]+/[\d-]+/)?[\w-]+_(?Pvid\d+)' + _VALID_URL = r'''(?x) + https?://(?: + (?:(?:www|espanol)\.)?eurosport\.(?:com(?:\.tr)?|de|dk|es|fr|hu|it|nl|no|ro)| + eurosport\.tvn24\.pl + )/[\w-]+/(?:[\w-]+/[\d-]+/)?[\w.-]+_(?Pvid\d+) + ''' _TESTS = [{ 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml', 'info_dict': { @@ -70,6 +75,42 @@ class EurosportIE(InfoExtractor): 'duration': 105.0, 'upload_date': '20230518', }, + }, { + 'url': 'https://www.eurosport.de/radsport/vuelta-a-espana/2024/vuelta-a-espana-2024-wout-van-aert-und-co.-verzweifeln-an-mcnulty-zeitfahr-krimi-in-lissabon_vid2219478/video.shtml', + 'only_matching': True, + }, { + 'url': 'https://www.eurosport.dk/speedway/mikkel-michelsen-misser-finalen-i-cardiff-se-danskeren-i-semifinalen-her_vid2219363/video.shtml', + 'only_matching': True, + }, { + 'url': 'https://www.eurosport.nl/mixed-martial-arts/ufc/2022/ufc-305-respect-tussen-adesanya-en-du-plessis_vid2219650/video.shtml', + 'only_matching': True, + }, { + 'url': 'https://www.eurosport.es/ciclismo/la-vuelta-2024-carlos-rodriguez-olvida-la-crono-y-ya-espera-que-llegue-la-montana-no-me-encontre-nada-comodo_vid2219682/video.shtml', + 'only_matching': True, + }, { + 'url': 'https://www.eurosport.fr/football/supercoupe-d-europe/2024-2025/kylian-mbappe-vinicius-junior-eduardo-camavinga-touche.-extraits-de-l-entrainement-du-real-madrid-en-video_vid2216993/video.shtml', + 'only_matching': True, + }, { + 'url': 
'https://www.eurosport.it/calcio/serie-a/2024-2025/samardzic-a-bergamo-per-le-visite-mediche-con-l-atalanta_vid2219680/video.shtml', + 'only_matching': True, + }, { + 'url': 'https://www.eurosport.hu/kerekpar/vuelta-a-espana/2024/dramai-harc-a-masodpercekert-meglepetesgyoztes-a-vuelta-nyitoszakaszan_vid2219481/video.shtml', + 'only_matching': True, + }, { + 'url': 'https://www.eurosport.no/golf/fedex-st-jude-championship/2024/ligger-pa-andreplass-sa-skjer-dette-drama_vid30000618/video.shtml', + 'only_matching': True, + }, { + 'url': 'https://www.eurosport.no/golf/fedex-st-jude-championship/2024/ligger-pa-andreplass-sa-skjer-dette-drama_vid2219531/video.shtml', + 'only_matching': True, + }, { + 'url': 'https://www.eurosport.ro/tenis/western-southern-open-2/2024/rezumatul-partidei-dintre-zverev-si-shelton-de-la-cincinnati_vid2219657/video.shtml', + 'only_matching': True, + }, { + 'url': 'https://www.eurosport.com.tr/hentbol/olympic-games-paris-2024/2024/paris-2024-denmark-ile-germany-olimpiyatlarin-onemli-anlari_vid2215836/video.shtml', + 'only_matching': True, + }, { + 'url': 'https://eurosport.tvn24.pl/kolarstwo/tour-de-france-kobiet/2024/kasia-niewiadoma-przed-ostatnim-8.-etapem-tour-de-france-kobiet_vid2219765/video.shtml', + 'only_matching': True, }] _TOKEN = None @@ -77,6 +118,7 @@ class EurosportIE(InfoExtractor): # actually defined in https://netsport.eurosport.io/?variables={"databaseId":,"playoutType":"VDP"}&extensions={"persistedQuery":{"version":1 .. 
# but this method require to get sha256 hash _GEO_COUNTRIES = ['DE', 'NL', 'EU', 'IT', 'FR'] # Not complete list but it should work + _GEO_BYPASS = False def _real_initialize(self): if EurosportIE._TOKEN is None: @@ -98,13 +140,13 @@ class EurosportIE(InfoExtractor): for stream_type in json_data['attributes']['streaming']: if stream_type == 'hls': fmts, subs = self._extract_m3u8_formats_and_subtitles( - traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, ext='mp4') + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, ext='mp4', fatal=False) elif stream_type == 'dash': fmts, subs = self._extract_mpd_formats_and_subtitles( - traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id) + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, fatal=False) elif stream_type == 'mss': fmts, subs = self._extract_ism_formats_and_subtitles( - traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id) + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 6aba477a6..1adb35b5f 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -84,7 +84,7 @@ class FacebookIE(InfoExtractor): 'timestamp': 1692346159, 'thumbnail': r're:^https?://.*', 'uploader_id': '100063551323670', - 'duration': 3132.184, + 'duration': 3133.583, 'view_count': int, 'concurrent_view_count': 0, }, @@ -112,9 +112,10 @@ class FacebookIE(InfoExtractor): 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', - 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl', + 'uploader_id': 'pfbid05AzrFTXgY37tqwaSgbFTTEpCLBjjEJHkigogwGiRPtKEpAsJYJpzE94H1RxYXWEtl', 'duration': 131.03, 
'concurrent_view_count': int, + 'view_count': int, }, }, { 'note': 'Video with DASH manifest', @@ -167,7 +168,7 @@ class FacebookIE(InfoExtractor): # have 1080P, but only up to 720p in swf params # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': 'ca63897a90c9452efee5f8c40d080e25', + 'md5': '1659aa21fb3dd1585874f668e81a72c8', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -180,9 +181,10 @@ class FacebookIE(InfoExtractor): 'view_count': int, 'uploader_id': '100059479812265', 'concurrent_view_count': int, - 'duration': 44.478, + 'duration': 44.181, }, }, { + # FIXME: unable to extract uploader, no formats found # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', @@ -241,9 +243,9 @@ class FacebookIE(InfoExtractor): 'timestamp': 1511548260, 'upload_date': '20171124', 'uploader': 'Vickie Gentry', - 'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl', + 'uploader_id': 'pfbid0FkkycT95ySNNyfCw4Cho6u5G7WbbZEcxT496Hq8rtx1K3LcTCATpR3wnyYhmyGC5l', 'thumbnail': r're:^https?://.*', - 'duration': 148.435, + 'duration': 148.224, }, }, { # data.node.comet_sections.content.story.attachments[].styles.attachment.media @@ -271,7 +273,7 @@ class FacebookIE(InfoExtractor): 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. 
What a proud moment as we all cheered and...', 'thumbnail': r're:^https?://.*', 'uploader': 'Lela Evans', - 'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl', + 'uploader_id': 'pfbid0swT2y7t6TAsZVBvcyeYPdhTMefGaS26mzUwML3vd1ma6ndGZKxsyS4Ssu3jitZLXl', 'upload_date': '20231228', 'timestamp': 1703804085, 'duration': 394.347, @@ -322,7 +324,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20180523', 'uploader': 'ESL One Dota 2', 'uploader_id': '100066514874195', - 'duration': 4524.212, + 'duration': 4524.001, 'view_count': int, 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, @@ -339,9 +341,9 @@ class FacebookIE(InfoExtractor): 'title': 'Josef', 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, - 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl', + 'uploader_id': 'pfbid02gpfwRM2XvdEJfsERupwQiNmBiDArc38RMRYZnap372q6Vs7MtFTVy72mmFWpJBTKl', 'timestamp': 1549275572, - 'duration': 3.413, + 'duration': 3.283, 'uploader': 'Josef Novak', 'description': '', 'upload_date': '20190204', @@ -396,6 +398,7 @@ class FacebookIE(InfoExtractor): 'playlist_count': 1, 'skip': 'Requires logging in', }, { + # FIXME: Cannot parse data error # data.event.cover_media_renderer.cover_video 'url': 'https://m.facebook.com/events/1509582499515440', 'info_dict': { @@ -498,7 +501,8 @@ class FacebookIE(InfoExtractor): or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name'])) or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) or get_first(post, ('node', 'actors', ..., {dict})) - or get_first(post, ('event', 'event_creator', {dict})) or {}) + or get_first(post, ('event', 'event_creator', {dict})) + or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {}) uploader = uploader_data.get('name') or ( clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or 
self._search_regex( @@ -524,6 +528,11 @@ class FacebookIE(InfoExtractor): webpage, 'view count', default=None)), 'concurrent_view_count': get_first(post, ( ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), + **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', { + 'like_count': ('likers', 'count', {int}), + 'comment_count': ('total_comment_count', {int}), + 'repost_count': ('share_count_reduced', {parse_count}), + }), get_all=False), } info_json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -932,18 +941,21 @@ class FacebookReelIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/reel/1195289147628387', - 'md5': 'f13dd37f2633595982db5ed8765474d3', + 'md5': 'a53256d10fc2105441fe0c4212ed8cea', 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', - 'description': 'md5:22f03309b216ac84720183961441d8db', - 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1', + 'title': r're:9\.6K views · 355 reactions .+ Let the “Slapathon” commence!! 
.+ LL COOL J · Mama Said Knock You Out$', + 'description': r're:When your trying to help your partner .+ LL COOL J · Mama Said Knock You Out$', + 'uploader': 'Beast Camp Training', 'uploader_id': '100040874179269', 'duration': 9.579, 'timestamp': 1637502609, 'upload_date': '20211121', 'thumbnail': r're:^https?://.*', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }] @@ -963,6 +975,7 @@ class FacebookAdsIE(InfoExtractor): 'id': '899206155126718', 'ext': 'mp4', 'title': 'video by Kandao', + 'description': 'md5:0822724069e3aca97cbed5dabbab282e', 'uploader': 'Kandao', 'uploader_id': '774114102743284', 'uploader_url': r're:^https?://.*', @@ -971,6 +984,22 @@ class FacebookAdsIE(InfoExtractor): 'upload_date': '20231214', 'like_count': int, }, + }, { + # key 'watermarked_video_sd_url' missing + 'url': 'https://www.facebook.com/ads/library/?id=501152689226254', + 'info_dict': { + 'id': '501152689226254', + 'ext': 'mp4', + 'title': 'video by mat.nawrocki', + 'description': 'md5:02a446ace7ff8c3c37a2892922492490', + 'uploader': 'mat.nawrocki', + 'uploader_id': '148586968341456', + 'uploader_url': r're:^https?://.*', + 'timestamp': 1723452305, + 'thumbnail': r're:^https?://.*', + 'upload_date': '20240812', + 'like_count': int, + }, }, { 'url': 'https://www.facebook.com/ads/library/?id=893637265423481', 'info_dict': { @@ -1017,34 +1046,42 @@ class FacebookAdsIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - post_data = [self._parse_json(j, video_id, fatal=False) - for j in re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage)] - data = traverse_obj(post_data, ( - ..., 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}), get_all=False) + post_data = traverse_obj( + re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage), (..., {json.loads})) + data = get_first(post_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., + 'entryPointRoot', 'otherProps', 
'deeplinkAdCard', 'snapshot', {dict})) if not data: raise ExtractorError('Unable to extract ad data') title = data.get('title') if not title or title == '{{product.name}}': title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data) - - info_dict = traverse_obj(data, { - 'description': ('link_description', {str}, {lambda x: x if x != '{{product.description}}' else None}), + markup_id = traverse_obj(data, ('body', '__m', {str})) + markup = traverse_obj(post_data, ( + ..., 'require', ..., ..., ..., '__bbox', 'markup', lambda _, v: v[0].startswith(markup_id), + ..., '__html', {clean_html}, {lambda x: not x.startswith('{{product.') and x}, any)) + + info_dict = merge_dicts({ + 'title': title, + 'description': markup or None, + }, traverse_obj(data, { + 'description': ('link_description', {lambda x: x if not x.startswith('{{product.') else None}), 'uploader': ('page_name', {str}), 'uploader_id': ('page_id', {str_or_none}), 'uploader_url': ('page_profile_uri', {url_or_none}), 'timestamp': ('creation_time', {int_or_none}), 'like_count': ('page_like_count', {int_or_none}), - }) + })) entries = [] for idx, entry in enumerate(traverse_obj( - data, (('videos', 'cards'), lambda _, v: any(url_or_none(v[f]) for f in self._FORMATS_MAP))), 1, + data, (('videos', 'cards'), lambda _, v: any(url_or_none(v.get(f)) for f in self._FORMATS_MAP))), 1, ): entries.append({ 'id': f'{video_id}_{idx}', 'title': entry.get('title') or title, - 'description': entry.get('link_description') or info_dict.get('description'), + 'description': traverse_obj(entry, 'body', 'link_description') or info_dict.get('description'), 'thumbnail': url_or_none(entry.get('video_preview_image_url')), 'formats': self._extract_formats(entry), }) diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py index eac70f6a9..f7b883155 100644 --- a/yt_dlp/extractor/fc2.py +++ b/yt_dlp/extractor/fc2.py @@ -14,7 +14,7 @@ from ..utils import ( class FC2IE(InfoExtractor): - _VALID_URL = 
r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P[^/]+)' + _VALID_URL = r'(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 04cffaa86..592800287 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2340,7 +2340,7 @@ class GenericIE(InfoExtractor): default_search = 'fixup_error' if default_search in ('auto', 'auto_warning', 'fixup_error'): - if re.match(r'^[^\s/]+\.[^\s/]+/', url): + if re.match(r'[^\s/]+\.[^\s/]+/', url): self.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) elif default_search != 'fixup_error': @@ -2400,7 +2400,7 @@ class GenericIE(InfoExtractor): # Check for direct link to a video content_type = full_response.headers.get('Content-Type', '').lower() - m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) + m = re.match(r'(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) if m: self.report_detected('direct video link') headers = filter_dict({'Referer': smuggled_data.get('referer')}) diff --git a/yt_dlp/extractor/germanupa.py b/yt_dlp/extractor/germanupa.py new file mode 100644 index 000000000..e40f016b2 --- /dev/null +++ b/yt_dlp/extractor/germanupa.py @@ -0,0 +1,91 @@ +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..utils import ( + parse_qs, + traverse_obj, + url_or_none, +) + + +class GermanupaIE(InfoExtractor): + IE_DESC = 'germanupa.de' + _VALID_URL = r'https?://germanupa\.de/mediathek/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://germanupa.de/mediathek/4-figma-beratung-deine-sprechstunde-fuer-figma-fragen', + 'info_dict': { + 'id': '909179246', + 'title': 'Tutorial: #4 Figma Beratung - Deine Sprechstunde für Figma-Fragen', + 'ext': 'mp4', + 'uploader': 'German UPA', + 
'uploader_id': 'germanupa', + 'thumbnail': 'https://i.vimeocdn.com/video/1792564420-7415283ccef8bf8702dab8c6b7515555ceeb7a1c11371ffcc133b8e887dbf70e-d_1280', + 'uploader_url': 'https://vimeo.com/germanupa', + 'duration': 3987, + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'audio, uses GenericIE', + 'url': 'https://germanupa.de/mediathek/live-vom-ux-festival-neuigkeiten-von-figma-jobmarkt-agenturszene-interview-zu-sustainable', + 'info_dict': { + 'id': '1867346676', + 'title': 'Live vom UX Festival: Neuigkeiten von Figma, Jobmarkt, Agenturszene & Interview zu Sustainable UX', + 'ext': 'opus', + 'timestamp': 1720545088, + 'upload_date': '20240709', + 'duration': 3910.557, + 'like_count': int, + 'description': 'md5:db2aed5ff131e177a7b33901e9a8db05', + 'uploader': 'German UPA', + 'repost_count': int, + 'genres': ['Science'], + 'license': 'all-rights-reserved', + 'uploader_url': 'https://soundcloud.com/user-80097677', + 'uploader_id': '471579486', + 'view_count': int, + 'comment_count': int, + 'thumbnail': 'https://i1.sndcdn.com/artworks-oCti2e9GhaZFWBqY-48ybGw-original.jpg', + }, + }, { + 'note': 'Nur für Mitglieder/Just for members', + 'url': 'https://germanupa.de/mediathek/ux-festival-2024-usability-tests-und-ai', + 'info_dict': { + 'id': '986994430', + 'title': 'UX Festival 2024 "Usability Tests und AI" von Lennart Weber', + 'ext': 'mp4', + 'release_date': '20240719', + 'uploader_url': 'https://vimeo.com/germanupa', + 'timestamp': 1721373980, + 'license': 'by-sa', + 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/1904187064-2a672630c30f9ad787bd390bff3f51d7506a3e8416763ba6dbf465732b165c5c-d_1280', + 'duration': 2146, + 'release_timestamp': 1721373980, + 'uploader': 'German UPA', + 'uploader_id': 'germanupa', + 'upload_date': '20240719', + 'comment_count': int, + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + 'skip': 'login required', + }] + + def 
_real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + param_url = traverse_obj( + self._search_regex( + r']+data-src\s*?=\s*?([\'"])(?Phttps://germanupa\.de/media/oembed\?url=(?:(?!\1).)+)\1', + webpage, 'embedded video', default=None, group='url'), + ({parse_qs}, 'url', 0, {url_or_none})) + + if not param_url: + if self._search_regex( + r']+class\s*?=\s*?([\'"])(?:(?!\1).)*login-wrapper(?:(?!\1).)*\1', + webpage, 'login wrapper', default=None): + self.raise_login_required('This video is only available for members') + return self.url_result(url, 'Generic') # Fall back to generic to extract audio + + real_url = param_url.replace('https://vimeo.com/', 'https://player.vimeo.com/video/') + return self.url_result(VimeoIE._smuggle_referrer(real_url, url), VimeoIE, video_id) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 53b881011..b7581d77e 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -52,7 +52,7 @@ class GetCourseRuIE(InfoExtractor): _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' _VALID_URL = [ rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P[^?#]+)', - rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', + rf'{_BASE_URL_RE}/(?:pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', ] _TESTS = [{ 'url': 'http://academymel.online/3video_1', diff --git a/yt_dlp/extractor/golem.py b/yt_dlp/extractor/golem.py index 90d2fe6c2..964bf6519 100644 --- a/yt_dlp/extractor/golem.py +++ b/yt_dlp/extractor/golem.py @@ -7,7 +7,7 @@ from ..utils import ( class GolemIE(InfoExtractor): - _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P.+?)/' + _VALID_URL = r'https?://video\.golem\.de/.+?/(?P.+?)/' _TEST = { 'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html', 'md5': 'c1a2c0a3c863319651c7c992c5ee29bf', diff --git a/yt_dlp/extractor/hrfensehen.py 
b/yt_dlp/extractor/hrfensehen.py index 17673d5b8..b5a7b14a5 100644 --- a/yt_dlp/extractor/hrfensehen.py +++ b/yt_dlp/extractor/hrfensehen.py @@ -13,7 +13,7 @@ from ..utils import ( class HRFernsehenIE(InfoExtractor): IE_NAME = 'hrfernsehen' - _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P[0-9]{6})\.html' + _VALID_URL = r'https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P[0-9]{6})\.html' _TESTS = [{ 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', 'md5': '5c4e0ba94677c516a2f65a84110fc536', diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py index 5663a78a3..f79e032e4 100644 --- a/yt_dlp/extractor/huya.py +++ b/yt_dlp/extractor/huya.py @@ -8,15 +8,19 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + parse_duration, str_or_none, try_get, unescapeHTML, + unified_strdate, update_url_query, + url_or_none, ) +from ..utils.traversal import traverse_obj class HuyaLiveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?P[^/#?&]+)(?:\D|$)' + _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?!(?:video/play/))(?P[^/#?&]+)(?:\D|$)' IE_NAME = 'huya:live' IE_DESC = 'huya.com' TESTS = [{ @@ -24,6 +28,7 @@ class HuyaLiveIE(InfoExtractor): 'info_dict': { 'id': '572329', 'title': str, + 'ext': 'flv', 'description': str, 'is_live': True, 'view_count': int, @@ -131,3 +136,76 @@ class HuyaLiveIE(InfoExtractor): fm = base64.b64decode(params['fm']).decode().split('_', 1)[0] ss = hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']])) return fm, ss + + +class HuyaVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?huya\.com/video/play/(?P\d+)\.html' + IE_NAME = 'huya:video' + IE_DESC = '虎牙视频' + + _TESTS = [{ + 'url': 'https://www.huya.com/video/play/1002412640.html', + 'info_dict': { + 'id': '1002412640', + 'ext': 'mp4', + 'title': '8月3日', + 'thumbnail': r're:https?://.*\.jpg', + 'duration': 14, + 'uploader': 
'虎牙-ATS欧卡车队青木', + 'uploader_id': '1564376151', + 'upload_date': '20240803', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + }, + }, + { + 'url': 'https://www.huya.com/video/play/556054543.html', + 'info_dict': { + 'id': '556054543', + 'ext': 'mp4', + 'title': '我不挑事 也不怕事', + 'thumbnail': r're:https?://.*\.jpg', + 'duration': 1864, + 'uploader': '卡尔', + 'uploader_id': '367138632', + 'upload_date': '20210811', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + }, + }] + + def _real_extract(self, url: str): + video_id = self._match_id(url) + video_data = self._download_json( + 'https://liveapi.huya.com/moment/getMomentContent', video_id, + query={'videoId': video_id})['data']['moment']['videoInfo'] + + formats = [] + for definition in traverse_obj(video_data, ('definitions', lambda _, v: url_or_none(v['url']))): + formats.append({ + 'url': definition['url'], + **traverse_obj(definition, { + 'format_id': ('defName', {str}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + }), + }) + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(video_data, { + 'title': ('videoTitle', {str}), + 'thumbnail': ('videoCover', {url_or_none}), + 'duration': ('videoDuration', {parse_duration}), + 'uploader': ('nickName', {str}), + 'uploader_id': ('uid', {str_or_none}), + 'upload_date': ('videoUploadTime', {unified_strdate}), + 'view_count': ('videoPlayNum', {int_or_none}), + 'comment_count': ('videoCommentNum', {int_or_none}), + 'like_count': ('favorCount', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index ab26dc5ef..9b91a454b 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -25,9 +25,29 @@ class IPrimaIE(InfoExtractor): 'id': 'p51388', 'ext': 'mp4', 'title': 'Partička (92)', - 'description': 'md5:859d53beae4609e6dd7796413f1b6cac', - 'upload_date': '20201103', - 'timestamp': 1604437480, + 
'description': 'md5:57943f6a50d6188288c3a579d2fd5f01', + 'episode': 'Partička (92)', + 'season': 'Partička', + 'series': 'Prima Partička', + 'episode_number': 92, + 'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-ef6cf9de-c980-4443-92e4-17fe8bccd45c-16x9.jpeg', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, { + 'url': 'https://zoom.iprima.cz/porady/krasy-kanarskych-ostrovu/tenerife-v-risi-ohne', + 'info_dict': { + 'id': 'p1412199', + 'ext': 'mp4', + 'episode_number': 3, + 'episode': 'Tenerife: V říši ohně', + 'description': 'md5:4b4a05c574b5eaef130e68d4811c3f2c', + 'duration': 3111.0, + 'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-f66dd7fb-c1a0-47d1-b3bc-7db328d566c5-16x9-1711636518.jpg/t_16x9_medium_1366_768', + 'title': 'Tenerife: V říši ohně', + 'timestamp': 1711825800, + 'upload_date': '20240330', }, 'params': { 'skip_download': True, # m3u8 download @@ -131,6 +151,7 @@ class IPrimaIE(InfoExtractor): video_id = self._search_regex(( r'productId\s*=\s*([\'"])(?Pp\d+)\1', r'pproduct_id\s*=\s*([\'"])(?Pp\d+)\1', + r'let\s+videos\s*=\s*([\'"])(?Pp\d+)\1', ), webpage, 'real id', group='id', default=None) if not video_id: @@ -176,7 +197,7 @@ class IPrimaIE(InfoExtractor): final_result = self._search_json_ld(webpage, video_id, default={}) final_result.update({ 'id': video_id, - 'title': title, + 'title': final_result.get('title') or title, 'thumbnail': self._html_search_meta( ['thumbnail', 'og:image', 'twitter:image'], webpage, 'thumbnail', default=None), diff --git a/yt_dlp/extractor/japandiet.py b/yt_dlp/extractor/japandiet.py index 2ef091aff..994da22ae 100644 --- a/yt_dlp/extractor/japandiet.py +++ b/yt_dlp/extractor/japandiet.py @@ -194,11 +194,14 @@ class ShugiinItvVodIE(ShugiinItvBaseIE): class SangiinInstructionIE(InfoExtractor): - _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php' + _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php' IE_DESC = 
False # this shouldn't be listed as a supported site def _real_extract(self, url): - raise ExtractorError('Copy the link from the botton below the video description or player, and use the link to download. If there are no button in the frame, get the URL of the frame showing the video.', expected=True) + raise ExtractorError( + 'Copy the link from the button below the video description/player ' + 'and use that link to download. If there is no button in the frame, ' + 'get the URL of the frame showing the video.', expected=True) class SangiinIE(InfoExtractor): diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py index e5737b1e9..6d51e32f6 100644 --- a/yt_dlp/extractor/kaltura.py +++ b/yt_dlp/extractor/kaltura.py @@ -22,7 +22,7 @@ class KalturaIE(InfoExtractor): (?: kaltura:(?P\w+):(?P\w+)(?::(?P\w+))?| https?:// - (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/ + (?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/ (?: (?: # flash player diff --git a/yt_dlp/extractor/khanacademy.py b/yt_dlp/extractor/khanacademy.py index 3f03f9e4c..42eef3c92 100644 --- a/yt_dlp/extractor/khanacademy.py +++ b/yt_dlp/extractor/khanacademy.py @@ -15,7 +15,7 @@ from ..utils import ( class KhanAcademyBaseIE(InfoExtractor): _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P(?:[^/]+/){%s}%s[^?#/&]+)' - _PUBLISHED_CONTENT_VERSION = '171419ab20465d931b356f22d20527f13969bb70' + _PUBLISHED_CONTENT_VERSION = 'dc34750f0572c80f5effe7134082fe351143c1e4' def _parse_video(self, video): return { @@ -39,7 +39,7 @@ class KhanAcademyBaseIE(InfoExtractor): query={ 'fastly_cacheable': 'persist_until_publish', 'pcv': self._PUBLISHED_CONTENT_VERSION, - 'hash': '1242644265', + 'hash': '3712657851', 'variables': json.dumps({ 'path': display_id, 'countryCode': 'US', diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index 1c1b2a177..bd21e5950 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -67,7 +67,7 @@ class KickIE(KickBaseIE): @classmethod 
def suitable(cls, url): - return False if KickClipIE.suitable(url) else super().suitable(url) + return False if (KickVODIE.suitable(url) or KickClipIE.suitable(url)) else super().suitable(url) def _real_extract(self, url): channel = self._match_id(url) @@ -98,25 +98,25 @@ class KickIE(KickBaseIE): class KickVODIE(KickBaseIE): IE_NAME = 'kick:vod' - _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+/videos/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' _TESTS = [{ - 'url': 'https://kick.com/video/e74614f4-5270-4319-90ad-32179f19a45c', + 'url': 'https://kick.com/xqc/videos/8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea', 'md5': '3870f94153e40e7121a6e46c068b70cb', 'info_dict': { - 'id': 'e74614f4-5270-4319-90ad-32179f19a45c', + 'id': '8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea', 'ext': 'mp4', - 'title': r're:❎ MEGA DRAMA ❎ LIVE ❎ CLICK ❎ ULTIMATE SKILLS .+', + 'title': '18+ #ad 🛑LIVE🛑CLICK🛑DRAMA🛑NEWS🛑STUFF🛑REACT🛑GET IN HHERE🛑BOP BOP🛑WEEEE WOOOO🛑', 'description': 'THE BEST AT ABSOLUTELY EVERYTHING. THE JUICER. 
LEADER OF THE JUICERS.', 'channel': 'xqc', 'channel_id': '668', 'uploader': 'xQc', 'uploader_id': '676', - 'upload_date': '20240724', - 'timestamp': 1721796562, - 'duration': 18566.0, + 'upload_date': '20240909', + 'timestamp': 1725919141, + 'duration': 10155.0, 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, - 'categories': ['VALORANT'], + 'categories': ['Just Chatting'], 'age_limit': 0, }, 'params': {'skip_download': 'm3u8'}, @@ -148,7 +148,7 @@ class KickVODIE(KickBaseIE): class KickClipIE(KickBaseIE): IE_NAME = 'kick:clips' - _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+/?\?(?:[^#]+&)?clip=(?Pclip_[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+(?:/clips/|/?\?(?:[^#]+&)?clip=)(?Pclip_[\w-]+)' _TESTS = [{ 'url': 'https://kick.com/mxddy?clip=clip_01GYXVB5Y8PWAPWCWMSBCFB05X', 'info_dict': { @@ -189,6 +189,26 @@ class KickClipIE(KickBaseIE): 'age_limit': 0, }, 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://kick.com/spreen/clips/clip_01J8RGZRKHXHXXKJEHGRM932A5', + 'info_dict': { + 'id': 'clip_01J8RGZRKHXHXXKJEHGRM932A5', + 'ext': 'mp4', + 'title': 'KLJASLDJKLJKASDLJKDAS', + 'channel': 'spreen', + 'channel_id': '5312671', + 'uploader': 'AnormalBarraBaja', + 'uploader_id': '26518262', + 'duration': 43.0, + 'upload_date': '20240927', + 'timestamp': 1727399987, + 'thumbnail': 'https://clips.kick.com/clips/f2/clip_01J8RGZRKHXHXXKJEHGRM932A5/thumbnail.webp', + 'view_count': int, + 'like_count': int, + 'categories': ['Minecraft'], + 'age_limit': 0, + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py new file mode 100644 index 000000000..852a4de3f --- /dev/null +++ b/yt_dlp/extractor/kika.py @@ -0,0 +1,126 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class KikaIE(InfoExtractor): + IE_DESC = 
'KiKA.de' + _VALID_URL = r'https?://(?:www\.)?kika\.de/[\w/-]+/videos/(?P[a-z-]+\d+)' + _GEO_COUNTRIES = ['DE'] + + _TESTS = [{ + 'url': 'https://www.kika.de/logo/videos/logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100', + 'md5': 'fbfc8da483719ef06f396e5e5b938c69', + 'info_dict': { + 'id': 'logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100', + 'ext': 'mp4', + 'upload_date': '20240831', + 'timestamp': 1725126600, + 'season_number': 2024, + 'modified_date': '20240831', + 'episode': 'Episode 476', + 'episode_number': 476, + 'season': 'Season 2024', + 'duration': 634, + 'title': 'logo! vom Samstag, 31. August 2024', + 'modified_timestamp': 1725129983, + }, + }, { + 'url': 'https://www.kika.de/kaltstart/videos/video92498', + 'md5': '710ece827e5055094afeb474beacb7aa', + 'info_dict': { + 'id': 'video92498', + 'ext': 'mp4', + 'title': '7. Wo ist Leo?', + 'description': 'md5:fb48396a5b75068bcac1df74f1524920', + 'duration': 436, + 'timestamp': 1702926876, + 'upload_date': '20231218', + 'episode_number': 7, + 'modified_date': '20240319', + 'modified_timestamp': 1710880610, + 'episode': 'Episode 7', + 'season_number': 1, + 'season': 'Season 1', + }, + }, { + 'url': 'https://www.kika.de/bernd-das-brot/astrobrot/videos/video90088', + 'md5': 'ffd1b700d7de0a6616a1d08544c77294', + 'info_dict': { + 'id': 'video90088', + 'ext': 'mp4', + 'upload_date': '20221102', + 'timestamp': 1667390580, + 'duration': 197, + 'modified_timestamp': 1711093771, + 'episode_number': 8, + 'title': 'Es ist nicht leicht, ein Astrobrot zu sein', + 'modified_date': '20240322', + 'description': 'md5:d3641deaf1b5515a160788b2be4159a9', + 'season_number': 1, + 'episode': 'Episode 8', + 'season': 'Season 1', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + doc = self._download_json(f'https://www.kika.de/_next-api/proxy/v1/videos/{video_id}', video_id) + video_assets = self._download_json(doc['assets']['url'], video_id) + + subtitles = {} + if 
ttml_resource := url_or_none(video_assets.get('videoSubtitle')): + subtitles['de'] = [{ + 'url': ttml_resource, + 'ext': 'ttml', + }] + if webvtt_resource := url_or_none(video_assets.get('webvttUrl')): + subtitles.setdefault('de', []).append({ + 'url': webvtt_resource, + 'ext': 'vtt', + }) + + return { + 'id': video_id, + 'formats': list(self._extract_formats(video_assets, video_id)), + 'subtitles': subtitles, + **traverse_obj(doc, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('date', {parse_iso8601}), + 'modified_timestamp': ('modificationDate', {parse_iso8601}), + 'duration': (( + ('durationInSeconds', {int_or_none}), + ('duration', {parse_duration})), any), + 'episode_number': ('episodeNumber', {int_or_none}), + 'season_number': ('season', {int_or_none}), + }), + } + + def _extract_formats(self, media_info, video_id): + for media in traverse_obj(media_info, ('assets', lambda _, v: url_or_none(v['url']))): + stream_url = media['url'] + ext = determine_ext(stream_url) + if ext == 'm3u8': + yield from self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + else: + yield { + 'url': stream_url, + 'format_id': ext, + **traverse_obj(media, { + 'width': ('frameWidth', {int_or_none}), + 'height': ('frameHeight', {int_or_none}), + # NB: filesize is 0 if unknown, bitrate is -1 if unknown + 'filesize': ('fileSize', {int_or_none}, {lambda x: x or None}), + 'abr': ('bitrateAudio', {int_or_none}, {lambda x: None if x == -1 else x}), + 'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}), + }), + } diff --git a/yt_dlp/extractor/lnkgo.py b/yt_dlp/extractor/lnk.py similarity index 53% rename from yt_dlp/extractor/lnkgo.py rename to yt_dlp/extractor/lnk.py index 31a7cefd8..593f73410 100644 --- a/yt_dlp/extractor/lnkgo.py +++ b/yt_dlp/extractor/lnk.py @@ -1,86 +1,11 @@ from .common import InfoExtractor from ..utils import ( - clean_html, format_field, int_or_none, - parse_iso8601, 
unified_strdate, ) -class LnkGoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P[A-Za-z0-9-]+)(?:/(?P\d+))?' - _TESTS = [{ - 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', - 'info_dict': { - 'id': '10809', - 'ext': 'mp4', - 'title': "Put'ka: Trys Klausimai", - 'upload_date': '20161216', - 'description': 'Seniai matytas Put’ka užduoda tris klausimėlius. Pabandykime surasti atsakymus.', - 'age_limit': 18, - 'duration': 117, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1481904000, - }, - 'params': { - 'skip_download': True, # HLS download - }, - }, { - 'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2', - 'info_dict': { - 'id': '10467', - 'ext': 'mp4', - 'title': 'Nėrdas: Kompiuterio Valymas', - 'upload_date': '20150113', - 'description': 'md5:7352d113a242a808676ff17e69db6a69', - 'age_limit': 18, - 'duration': 346, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421164800, - }, - 'params': { - 'skip_download': True, # HLS download - }, - }, { - 'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413', - 'only_matching': True, - }] - _AGE_LIMITS = { - 'N-7': 7, - 'N-14': 14, - 'S': 18, - } - _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s' - - def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).groups() - - video_info = self._download_json( - 'https://lnk.lt/api/main/video-page/{}/{}/false'.format(display_id, video_id or '0'), - display_id)['videoConfig']['videoInfo'] - - video_id = str(video_info['id']) - title = video_info['title'] - prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4' - formats = self._extract_m3u8_formats( - self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''), - video_id, 'mp4', 'm3u8_native') - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, 
- 'formats': formats, - 'thumbnail': format_field(video_info, 'posterImage', 'https://lnk.lt/all-images/%s'), - 'duration': int_or_none(video_info.get('duration')), - 'description': clean_html(video_info.get('htmlDescription')), - 'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0), - 'timestamp': parse_iso8601(video_info.get('airDate')), - 'view_count': int_or_none(video_info.get('viewsCount')), - } - - class LnkIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?lnk\.lt/[^/]+/(?P\d+)' diff --git a/yt_dlp/extractor/loom.py b/yt_dlp/extractor/loom.py index 1191aa17e..b0878c33e 100644 --- a/yt_dlp/extractor/loom.py +++ b/yt_dlp/extractor/loom.py @@ -92,9 +92,9 @@ class LoomIE(InfoExtractor): }, 'params': {'videopassword': 'seniorinfants2'}, }, { - # embed, transcoded-url endpoint sends empty JSON response + # embed, transcoded-url endpoint sends empty JSON response, split video and audio HLS formats 'url': 'https://www.loom.com/embed/ddcf1c1ad21f451ea7468b1e33917e4e', - 'md5': '8488817242a0db1cb2ad0ea522553cf6', + 'md5': 'b321d261656848c184a94e3b93eae28d', 'info_dict': { 'id': 'ddcf1c1ad21f451ea7468b1e33917e4e', 'ext': 'mp4', @@ -104,6 +104,7 @@ class LoomIE(InfoExtractor): 'timestamp': 1657216459, 'duration': 181, }, + 'params': {'format': 'bestvideo'}, # Test video-only fixup 'expected_warnings': ['Failed to parse JSON'], }] _WEBPAGE_TESTS = [{ @@ -293,7 +294,11 @@ class LoomIE(InfoExtractor): format_url = format_url.replace('-split.m3u8', '.m3u8') m3u8_formats = self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id=f'hls-{format_id}', fatal=False, quality=quality) + # Sometimes only split video/audio formats are available, need to fixup video-only formats + is_not_premerged = 'none' in traverse_obj(m3u8_formats, (..., 'vcodec')) for fmt in m3u8_formats: + if is_not_premerged and fmt.get('vcodec') != 'none': + fmt['acodec'] = 'none' yield { **fmt, 'url': update_url(fmt['url'], query=query), diff --git a/yt_dlp/extractor/mailru.py 
b/yt_dlp/extractor/mailru.py index cca678f14..0496a87f0 100644 --- a/yt_dlp/extractor/mailru.py +++ b/yt_dlp/extractor/mailru.py @@ -126,7 +126,7 @@ class MailRuIE(InfoExtractor): video_data = None # fix meta_url if missing the host address - if re.match(r'^\/\+\/', meta_url): + if re.match(r'\/\+\/', meta_url): meta_url = urljoin('https://my.mail.ru', meta_url) if meta_url: diff --git a/yt_dlp/extractor/mdr.py b/yt_dlp/extractor/mdr.py index 46097fa20..dfda3cc53 100644 --- a/yt_dlp/extractor/mdr.py +++ b/yt_dlp/extractor/mdr.py @@ -13,8 +13,8 @@ from ..utils import ( class MDRIE(InfoExtractor): - IE_DESC = 'MDR.DE and KiKA' - _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' + IE_DESC = 'MDR.DE' + _VALID_URL = r'https?://(?:www\.)?mdr\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' _GEO_COUNTRIES = ['DE'] @@ -34,30 +34,6 @@ class MDRIE(InfoExtractor): 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. 
Oktober 2015', - 'duration': 134, - 'uploader': 'KIKA', - }, - 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - 'timestamp': 1482541200, - 'upload_date': '20161224', - 'duration': 4628, - 'uploader': 'KIKA', - }, }, { # audio with alternative playerURL pattern 'url': 'http://www.mdr.de/kultur/videos-und-audios/audio-radio/operation-mindfuck-robert-wilson100.html', @@ -68,28 +44,7 @@ class MDRIE(InfoExtractor): 'duration': 3239, 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, - }, { - # empty bitrateVideo and bitrateAudio - 'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html', - 'info_dict': { - 'id': '128372', - 'ext': 'mp4', - 'title': 'Der kleine Wichtel kehrt zurück', - 'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a', - 'duration': 4876, - 'timestamp': 1607823300, - 'upload_date': '20201213', - 'uploader': 'ZDF', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', - 'only_matching': True, - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', - 'only_matching': True, + 'skip': '404 not found', }, { 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html', 'only_matching': True, diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py index f51342060..197e91d1d 100644 --- a/yt_dlp/extractor/mediaklikk.py +++ b/yt_dlp/extractor/mediaklikk.py @@ -16,6 +16,15 @@ class MediaKlikkIE(InfoExtractor): (?P[^/#?_]+)''' _TESTS = [{ + 'url': 'https://mediaklikk.hu/filmajanlo/cikk/az-ajto/', + 'info_dict': { + 'id': '668177', + 'title': 'Az ajtó', + 'display_id': 'az-ajto', + 'ext': 'mp4', + 
'thumbnail': 'https://cdn.cms.mtv.hu/wp-content/uploads/sites/4/2016/01/vlcsnap-2023-07-31-14h18m52s111.jpg', + }, + }, { # (old) mediaklikk. date in html. 'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/', 'info_dict': { @@ -37,6 +46,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230903', 'thumbnail': 'https://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg', }, + 'skip': 'Webpage redirects to 404 page', }, { # (old) m4sport 'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/', @@ -59,6 +69,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230908', 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-08-22h43m18s691.jpg', }, + 'skip': 'Webpage redirects to 404 page', }, { # m4sport with *video/ url and no date 'url': 'https://m4sport.hu/bl-video/real-madrid-chelsea-1-1/', @@ -69,6 +80,7 @@ class MediaKlikkIE(InfoExtractor): 'ext': 'mp4', 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png', }, + 'skip': 'Webpage redirects to 404 page', }, { # (old) hirado 'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/', @@ -90,6 +102,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230911', 'thumbnail': 'https://hirado.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-11-09h16m09s882.jpg', }, + 'skip': 'Webpage redirects to video list page', }, { # (old) petofilive 'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/', @@ -112,6 +125,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230909', 'thumbnail': 'https://petofilive.hu/wp-content/uploads/sites/4/2023/09/Clipboard11-2.jpg', }, + 'skip': 'Webpage redirects to video list page', }] def _real_extract(self, url): @@ -143,14 +157,14 @@ class MediaKlikkIE(InfoExtractor): if not playlist_url: raise ExtractorError('Unable to extract playlist url') - formats = self._extract_wowza_formats( - 
playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash']) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(playlist_url, video_id) return { 'id': video_id, 'title': title, 'display_id': display_id, 'formats': formats, + 'subtitles': subtitles, 'upload_date': upload_date, 'thumbnail': player_data.get('bgImage') or self._og_search_thumbnail(webpage), } diff --git a/yt_dlp/extractor/mgtv.py b/yt_dlp/extractor/mgtv.py index d5dda06f9..c793626fd 100644 --- a/yt_dlp/extractor/mgtv.py +++ b/yt_dlp/extractor/mgtv.py @@ -16,7 +16,7 @@ from ..utils import ( class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P\d+)\.html' + _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/[bv]/(?:[^/]+/)*(?P\d+)\.html' IE_DESC = '芒果TV' IE_NAME = 'MangoTV' diff --git a/yt_dlp/extractor/mit.py b/yt_dlp/extractor/mit.py index e75c540a2..66c3b0793 100644 --- a/yt_dlp/extractor/mit.py +++ b/yt_dlp/extractor/mit.py @@ -65,7 +65,7 @@ class TechTVMITIE(InfoExtractor): class OCWMITIE(InfoExtractor): IE_NAME = 'ocw.mit.edu' - _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P[a-z0-9\-]+)' + _VALID_URL = r'https?://ocw\.mit\.edu/courses/(?P[a-z0-9\-]+)' _BASE_URL = 'http://ocw.mit.edu/' _TESTS = [ diff --git a/yt_dlp/extractor/mojevideo.py b/yt_dlp/extractor/mojevideo.py new file mode 100644 index 000000000..145e30697 --- /dev/null +++ b/yt_dlp/extractor/mojevideo.py @@ -0,0 +1,121 @@ +from .common import InfoExtractor +from ..utils import js_to_json, remove_end, update_url_query + + +class MojevideoIE(InfoExtractor): + IE_DESC = 'mojevideo.sk' + _VALID_URL = r'https?://(?:www\.)?mojevideo\.sk/video/(?P\w+)/(?P[\w()]+?)\.html' + + _TESTS = [{ + 'url': 'https://www.mojevideo.sk/video/3d17c/chlapci_dobetonovali_sme_mame_hotovo.html', + 'md5': '384a4628bd2bbd261c5206cf77c38c17', + 'info_dict': { + 'id': '3d17c', + 'ext': 'mp4', + 'title': 'Chlapci dobetónovali sme, máme hotovo!', + 'display_id': 'chlapci_dobetonovali_sme_mame_hotovo', + 
'description': 'md5:a0822126044050d304a9ef58c92ddb34', + 'thumbnail': 'https://fs5.mojevideo.sk/imgfb/250236.jpg', + 'duration': 21.0, + 'upload_date': '20230919', + 'timestamp': 1695129706, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + # 720p + 'url': 'https://www.mojevideo.sk/video/14677/den_blbec.html', + 'md5': '517c3e111c53a67d10b429c1f344ba2f', + 'info_dict': { + 'id': '14677', + 'ext': 'mp4', + 'title': 'Deň blbec?', + 'display_id': 'den_blbec', + 'description': 'I maličkosť vám môže zmeniť celý deň. Nikdy nezahadzujte žuvačky na zem!', + 'thumbnail': 'https://fs5.mojevideo.sk/imgfb/83575.jpg', + 'duration': 100.0, + 'upload_date': '20120515', + 'timestamp': 1337076481, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + # 1080p + 'url': 'https://www.mojevideo.sk/video/2feb2/band_maid_onset_(instrumental)_live_zepp_tokyo_(full_hd).html', + 'md5': '64599a23d3ac31cf2fe069e4353d8162', + 'info_dict': { + 'id': '2feb2', + 'ext': 'mp4', + 'title': 'BAND-MAID - onset (Instrumental) Live - Zepp Tokyo (Full HD)', + 'display_id': 'band_maid_onset_(instrumental)_live_zepp_tokyo_(full_hd)', + 'description': 'Výborná inštrumentálna skladba od skupiny BAND-MAID.', + 'thumbnail': 'https://fs5.mojevideo.sk/imgfb/196274.jpg', + 'duration': 240.0, + 'upload_date': '20190708', + 'timestamp': 1562576592, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + # 720p + 'url': 'https://www.mojevideo.sk/video/358c8/dva_nissany_skyline_strielaju_v_londyne.html', + 'only_matching': True, + }, { + # 720p + 'url': 'https://www.mojevideo.sk/video/2455d/gopro_hero4_session_nova_sportova_vodotesna_kamera.html', + 'only_matching': True, + }, { + # 1080p + 'url': 'https://www.mojevideo.sk/video/352ee/amd_rx_6800_xt_vs_nvidia_rtx_3080_(test_v_9_hrach).html', + 'only_matching': True, + }, { + # 1080p + 'url': 
'https://www.mojevideo.sk/video/2cbeb/trailer_z_avengers_infinity_war.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, video_id) + + video_id_dec = self._search_regex( + r'\bvId\s*=\s*(\d+)', webpage, 'video id', fatal=False) or str(int(video_id, 16)) + video_exp = self._search_regex(r'\bvEx\s*=\s*["\'](\d+)', webpage, 'video expiry') + video_hashes = self._search_json( + r'\bvHash\s*=', webpage, 'video hashes', video_id, + contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json) + + formats = [] + for video_hash, (suffix, quality, format_note) in zip(video_hashes, [ + ('', 1, 'normálna kvalita'), + ('_lq', 0, 'nízka kvalita'), + ('_hd', 2, 'HD-720p'), + ('_fhd', 3, 'FULL HD-1080p'), + ('_2k', 4, '2K-1440p'), + ]): + formats.append({ + 'format_id': f'mp4-{quality}', + 'quality': quality, + 'format_note': format_note, + 'url': update_url_query( + f'https://cache01.mojevideo.sk/securevideos69/{video_id_dec}{suffix}.mp4', { + 'md5': video_hash, + 'expires': video_exp, + }), + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'title': (self._og_search_title(webpage, default=None) + or remove_end(self._html_extract_title(webpage, 'title'), ' - Mojevideo')), + 'description': self._og_search_description(webpage), + **self._search_json_ld(webpage, video_id, default={}), + } diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 179e7a9b1..e06740d62 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -420,7 +420,7 @@ class NiconicoIE(InfoExtractor): 'x-request-with': 'https://www.nicovideo.jp', })['data']['contentUrl'] # Getting all audio formats results in duplicate video formats which we filter out later - dms_fmts = self._extract_m3u8_formats(dms_m3u8_url, video_id) + dms_fmts = self._extract_m3u8_formats(dms_m3u8_url, video_id, 'mp4') # m3u8 extraction does 
not provide audio bitrates, so extract from the API data and fix for audio_fmt in traverse_obj(dms_fmts, lambda _, v: v['vcodec'] == 'none'): @@ -432,7 +432,6 @@ class NiconicoIE(InfoExtractor): 'asr': ('samplingRate', {int_or_none}), }), get_all=False), 'acodec': 'aac', - 'ext': 'm4a', } # Sort before removing dupes to keep the format dicts with the lowest tbr diff --git a/yt_dlp/extractor/nzonscreen.py b/yt_dlp/extractor/nzonscreen.py index 5fc516daf..755039804 100644 --- a/yt_dlp/extractor/nzonscreen.py +++ b/yt_dlp/extractor/nzonscreen.py @@ -10,7 +10,7 @@ from ..utils import ( class NZOnScreenIE(InfoExtractor): - _VALID_URL = r'^https?://www\.nzonscreen\.com/title/(?P[^/?#]+)' + _VALID_URL = r'https?://www\.nzonscreen\.com/title/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.nzonscreen.com/title/shoop-shoop-diddy-wop-cumma-cumma-wang-dang-1982', 'info_dict': { diff --git a/yt_dlp/extractor/nzz.py b/yt_dlp/extractor/nzz.py index ac3b73156..047c4e1ac 100644 --- a/yt_dlp/extractor/nzz.py +++ b/yt_dlp/extractor/nzz.py @@ -1,9 +1,6 @@ import re from .common import InfoExtractor -from ..utils import ( - extract_attributes, -) class NZZIE(InfoExtractor): @@ -22,19 +19,14 @@ class NZZIE(InfoExtractor): 'playlist_count': 1, }] + def _entries(self, webpage, page_id): + for script in re.findall(r'(?s)]* data-hid="jw-video-jw[^>]+>(.+?)', webpage): + settings = self._search_json(r'var\s+settings\s*=[^{]*', script, 'settings', page_id, fatal=False) + if entry := self._parse_jwplayer_data(settings, page_id): + yield entry + def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - entries = [] - for player_element in re.findall( - r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage): - player_params = extract_attributes(player_element) - if player_params.get('data-type') not in ('kaltura_singleArticle',): - self.report_warning('Unsupported player type') - continue - entry_id = player_params['data-id'] - 
entries.append(self.url_result( - 'kaltura:1750922:' + entry_id, 'Kaltura', entry_id)) - - return self.playlist_result(entries, page_id) + return self.playlist_result(self._entries(webpage, page_id), page_id) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 7d6e8439c..4489d533a 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -420,7 +420,7 @@ class PatreonIE(PatreonBaseIE): class PatreonCampaignIE(PatreonBaseIE): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m/(?P\d+))|(?P[-\w]+))' + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m|api/campaigns)/(?P\d+)|(?P[-\w]+))' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', 'info_dict': { @@ -442,25 +442,44 @@ class PatreonCampaignIE(PatreonBaseIE): 'url': 'https://www.patreon.com/m/4767637/posts', 'info_dict': { 'title': 'Not Just Bikes', - 'channel_follower_count': int, 'id': '4767637', 'channel_id': '4767637', 'channel_url': 'https://www.patreon.com/notjustbikes', - 'description': 'md5:595c6e7dca76ae615b1d38c298a287a1', + 'description': 'md5:9f4b70051216c4d5c58afe580ffc8d0f', 'age_limit': 0, 'channel': 'Not Just Bikes', 'uploader_url': 'https://www.patreon.com/notjustbikes', - 'uploader': 'Not Just Bikes', + 'uploader': 'Jason', 'uploader_id': '37306634', 'thumbnail': r're:^https?://.*$', }, 'playlist_mincount': 71, + }, { + 'url': 'https://www.patreon.com/api/campaigns/4243769/posts', + 'info_dict': { + 'title': 'Second Thought', + 'channel_follower_count': int, + 'id': '4243769', + 'channel_id': '4243769', + 'channel_url': 'https://www.patreon.com/secondthought', + 'description': 'md5:69c89a3aba43efdb76e85eb023e8de8b', + 'age_limit': 0, + 'channel': 'Second Thought', + 'uploader_url': 'https://www.patreon.com/secondthought', + 'uploader': 'JT Chapman', + 'uploader_id': '32718287', + 'thumbnail': r're:^https?://.*$', + }, + 'playlist_mincount': 201, }, { 'url': 'https://www.patreon.com/dissonancepod/posts', 'only_matching': 
True, }, { 'url': 'https://www.patreon.com/m/5932659', 'only_matching': True, + }, { + 'url': 'https://www.patreon.com/api/campaigns/4243769', + 'only_matching': True, }] @classmethod diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py index 07f249498..f0b38893b 100644 --- a/yt_dlp/extractor/pinterest.py +++ b/yt_dlp/extractor/pinterest.py @@ -109,7 +109,7 @@ class PinterestBaseIE(InfoExtractor): class PinterestIE(PinterestBaseIE): - _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?P\d+)' + _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?:[\w-]+--)?(?P\d+)' _TESTS = [{ # formats found in data['videos'] 'url': 'https://www.pinterest.com/pin/664281013778109217/', @@ -174,6 +174,25 @@ class PinterestIE(PinterestBaseIE): }, { 'url': 'https://co.pinterest.com/pin/824721750502199491/', 'only_matching': True, + }, + { + 'url': 'https://pinterest.com/pin/dive-into-serenity-blue-lagoon-pedi-nails-for-a-tranquil-and-refreshing-spa-experience-video-in-2024--2885187256207927', + 'info_dict': { + 'id': '2885187256207927', + 'ext': 'mp4', + 'title': 'Dive into Serenity: Blue Lagoon Pedi Nails for a Tranquil and Refreshing Spa Experience! 
💙💅', + 'description': 'md5:5da41c767d2317e42e49b663b0b2150f', + 'uploader': 'Glamour Artistry |Everyday Outfits, Luxury Fashion & Nail Designs', + 'uploader_id': '1142999717836434688', + 'upload_date': '20240702', + 'timestamp': 1719939156, + 'duration': 7.967, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': ['#BlueLagoonPediNails', '#SpaExperience'], + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 679dc6323..e1e9777e8 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -628,8 +628,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): page_entries = self._extract_entries(webpage, host) if not page_entries: break - for e in page_entries: - yield e + yield from page_entries if not self._has_more(webpage): break diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py index b0b6681c9..f94d6a3e7 100644 --- a/yt_dlp/extractor/radiko.py +++ b/yt_dlp/extractor/radiko.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, clean_html, + join_nonempty, time_seconds, try_call, unified_timestamp, @@ -167,7 +168,7 @@ class RadikoBaseIE(InfoExtractor): class RadikoIE(RadikoBaseIE): - _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/ts/(?P[A-Z0-9-]+)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/ts/(?P[A-Z0-9-]+)/(?P\d+)' _TESTS = [{ # QRR (文化放送) station provides @@ -183,8 +184,9 @@ class RadikoIE(RadikoBaseIE): }] def _real_extract(self, url): - station, video_id = self._match_valid_url(url).groups() - vid_int = unified_timestamp(video_id, False) + station, timestring = self._match_valid_url(url).group('station', 'timestring') + video_id = join_nonempty(station, timestring) + vid_int = unified_timestamp(timestring, False) prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) auth_token, 
area_id = self._auth_client() @@ -207,7 +209,7 @@ class RadikoIE(RadikoBaseIE): 'ft': radio_begin, 'end_at': radio_end, 'to': radio_end, - 'seek': video_id, + 'seek': timestring, }, ), } diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py index ff2196354..9d9043984 100644 --- a/yt_dlp/extractor/radiofrance.py +++ b/yt_dlp/extractor/radiofrance.py @@ -16,7 +16,7 @@ from ..utils import ( class RadioFranceIE(InfoExtractor): - _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P[^?#]+)' + _VALID_URL = r'https?://maison\.radiofrance\.fr/radiovisions/(?P[^?#]+)' IE_NAME = 'radiofrance' _TEST = { diff --git a/yt_dlp/extractor/reverbnation.py b/yt_dlp/extractor/reverbnation.py index ddf8c3753..f3bcc2c32 100644 --- a/yt_dlp/extractor/reverbnation.py +++ b/yt_dlp/extractor/reverbnation.py @@ -6,7 +6,7 @@ from ..utils import ( class ReverbNationIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P\d+).*?$' + _VALID_URL = r'https?://(?:www\.)?reverbnation\.com/.*?/song/(?P\d+).*?$' _TESTS = [{ 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', 'md5': 'c0aaf339bcee189495fdf5a8c8ba8645', diff --git a/yt_dlp/extractor/rtp.py b/yt_dlp/extractor/rtp.py index 944e8636a..26aec2e4c 100644 --- a/yt_dlp/extractor/rtp.py +++ b/yt_dlp/extractor/rtp.py @@ -8,7 +8,7 @@ from ..utils import js_to_json class RTPIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)/(?P[^/?#]+)/?' 
+ _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P[0-9]+)/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', 'md5': 'e736ce0c665e459ddb818546220b4ef8', @@ -19,9 +19,25 @@ class RTPIE(InfoExtractor): 'description': 'As paixões musicais de António Cartaxo e António Macedo', 'thumbnail': r're:^https?://.*\.jpg', }, + }, { + 'url': 'https://www.rtp.pt/play/zigzag/p13166/e757904/25-curiosidades-25-de-abril', + 'md5': '9a81ed53f2b2197cfa7ed455b12f8ade', + 'info_dict': { + 'id': 'e757904', + 'ext': 'mp4', + 'title': '25 Curiosidades, 25 de Abril', + 'description': 'Estudar ou não estudar - Em cada um dos episódios descobrimos uma curiosidade acerca de como era viver em Portugal antes da revolução do 25 de abr', + 'thumbnail': r're:^https?://.*\.jpg', + }, }, { 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', 'only_matching': True, + }, { + 'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/portugues-1-ano', + 'only_matching': True, + }, { + 'url': 'https://www.rtp.pt/play/palco/p13785/l7nnon', + 'only_matching': True, }] _RX_OBFUSCATION = re.compile(r'''(?xs) @@ -49,17 +65,17 @@ class RTPIE(InfoExtractor): f, config = self._search_regex( r'''(?sx) - var\s+f\s*=\s*(?P".*?"|{[^;]+?});\s* + (?:var\s+f\s*=\s*(?P".*?"|{[^;]+?});\s*)? 
var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P{(?:(?!\*/).)+?})\);(?!\s*\*/) ''', webpage, 'player config', group=('f', 'config')) - f = self._parse_json( - f, video_id, - lambda data: self.__unobfuscate(data, video_id=video_id)) config = self._parse_json( config, video_id, lambda data: self.__unobfuscate(data, video_id=video_id)) + f = config['file'] if not f else self._parse_json( + f, video_id, + lambda data: self.__unobfuscate(data, video_id=video_id)) formats = [] if isinstance(f, dict): diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index db780a2cf..74c7e4f17 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -8,14 +8,17 @@ from ..utils import ( UnsupportedError, clean_html, determine_ext, + extract_attributes, format_field, get_element_by_class, + get_elements_html_by_class, int_or_none, join_nonempty, parse_count, parse_iso8601, traverse_obj, unescapeHTML, + urljoin, ) @@ -382,8 +385,10 @@ class RumbleChannelIE(InfoExtractor): if isinstance(e.cause, HTTPError) and e.cause.status == 404: break raise - for video_url in re.findall(r'class="[^>"]*videostream__link[^>]+href="([^"]+\.html)"', webpage): - yield self.url_result('https://rumble.com' + video_url) + for video_url in traverse_obj( + get_elements_html_by_class('videostream__link', webpage), (..., {extract_attributes}, 'href'), + ): + yield self.url_result(urljoin('https://rumble.com', video_url)) def _real_extract(self, url): url, playlist_id = self._match_valid_url(url).groups() diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py index d389b3209..2c416811a 100644 --- a/yt_dlp/extractor/rutube.py +++ b/yt_dlp/extractor/rutube.py @@ -6,6 +6,7 @@ from ..utils import ( determine_ext, int_or_none, parse_qs, + traverse_obj, try_get, unified_timestamp, url_or_none, @@ -80,6 +81,8 @@ class RutubeBaseIE(InfoExtractor): 'url': format_url, 'format_id': format_id, }) + for hls_url in traverse_obj(options, ('live_streams', 'hls', ..., 'url', 
{url_or_none})): + formats.extend(self._extract_m3u8_formats(hls_url, video_id, ext='mp4', fatal=False)) return formats def _download_and_extract_formats(self, video_id, query=None): @@ -90,7 +93,7 @@ class RutubeBaseIE(InfoExtractor): class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/(?:video(?:/private)?|(?:play/)?embed)/(?P[\da-z]{32})' + _VALID_URL = r'https?://rutube\.ru/(?:(?:live/)?video(?:/private)?|(?:play/)?embed)/(?P[\da-z]{32})' _EMBED_REGEX = [r']+?src=(["\'])(?P(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1'] _TESTS = [{ @@ -164,6 +167,29 @@ class RutubeIE(RutubeBaseIE): 'uploader': 'Стас Быков', }, 'expected_warnings': ['Unable to download f4m'], + }, { + 'url': 'https://rutube.ru/live/video/c58f502c7bb34a8fcdd976b221fca292/', + 'info_dict': { + 'id': 'c58f502c7bb34a8fcdd976b221fca292', + 'ext': 'mp4', + 'categories': ['Телепередачи'], + 'description': '', + 'thumbnail': 'http://pic.rutubelist.ru/video/14/19/14190807c0c48b40361aca93ad0867c7.jpg', + 'live_status': 'is_live', + 'age_limit': 0, + 'uploader_id': '23460655', + 'timestamp': 1652972968, + 'view_count': int, + 'upload_date': '20220519', + 'title': r're:Первый канал. 
Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'uploader': 'Первый канал', + }, + }, { + 'url': 'https://rutube.ru/video/5ab908fccfac5bb43ef2b1e4182256b0/', + 'only_matching': True, + }, { + 'url': 'https://rutube.ru/live/video/private/c58f502c7bb34a8fcdd976b221fca292/', + 'only_matching': True, }] @classmethod diff --git a/yt_dlp/extractor/samplefocus.py b/yt_dlp/extractor/samplefocus.py index 36ceb0254..3db3ce142 100644 --- a/yt_dlp/extractor/samplefocus.py +++ b/yt_dlp/extractor/samplefocus.py @@ -36,7 +36,7 @@ class SampleFocusIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + webpage = self._download_webpage(url, display_id, impersonate=True) sample_id = self._search_regex( r']+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P\d+)', @@ -82,7 +82,15 @@ class SampleFocusIE(InfoExtractor): return { 'id': sample_id, 'title': title, - 'url': mp3_url, + 'formats': [{ + 'url': mp3_url, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + 'http_headers': { + 'Referer': url, + }, + }], 'display_id': display_id, 'thumbnail': thumbnail, 'uploader': uploader, diff --git a/yt_dlp/extractor/screenrec.py b/yt_dlp/extractor/screenrec.py new file mode 100644 index 000000000..64f8d2494 --- /dev/null +++ b/yt_dlp/extractor/screenrec.py @@ -0,0 +1,33 @@ +from .common import InfoExtractor + + +class ScreenRecIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?screenrec\.com/share/(?P\w{10})' + _TESTS = [{ + 'url': 'https://screenrec.com/share/DasLtbknYo', + 'info_dict': { + 'id': 'DasLtbknYo', + 'ext': 'mp4', + 'title': '02.05.2024_03.01.25_REC', + 'description': 'Recorded with ScreenRec', + 'thumbnail': r're:^https?://.*\.gif$', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = self._search_regex( + r'customUrl\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'm3u8 
URL', group='url') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage, default=None) or self._html_extract_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'), + } diff --git a/yt_dlp/extractor/sen.py b/yt_dlp/extractor/sen.py new file mode 100644 index 000000000..d8f14ecdc --- /dev/null +++ b/yt_dlp/extractor/sen.py @@ -0,0 +1,36 @@ +from .common import InfoExtractor +from ..utils import url_or_none +from ..utils.traversal import traverse_obj + + +class SenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sen\.com/video/(?P[0-9a-f-]+)' + _TEST = { + 'url': 'https://www.sen.com/video/eef46eb1-4d79-4e28-be9d-bd937767f8c4', + 'md5': 'ff615aca9691053c94f8f10d96cd7884', + 'info_dict': { + 'id': 'eef46eb1-4d79-4e28-be9d-bd937767f8c4', + 'ext': 'mp4', + 'description': 'Florida, 28 Sep 2022', + 'title': 'Hurricane Ian', + 'tags': ['North America', 'Storm', 'Weather'], + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + api_data = self._download_json(f'https://api.sen.com/content/public/video/{video_id}', video_id) + m3u8_url = (traverse_obj(api_data, ( + 'data', 'nodes', lambda _, v: v['id'] == 'player', 'video', 'url', {url_or_none}, any)) + or f'https://vod.sen.com/videos/{video_id}/manifest.m3u8') + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4'), + **traverse_obj(api_data, ('data', 'nodes', lambda _, v: v['id'] == 'details', any, 'content', { + 'title': ('title', 'text', {str}), + 'description': ('descriptions', 0, 'text', {str}), + 'tags': ('badges', ..., 'text', {str}), + })), + } diff --git a/yt_dlp/extractor/servus.py b/yt_dlp/extractor/servus.py index 117f18081..841c7ebf3 100644 --- a/yt_dlp/extractor/servus.py +++ b/yt_dlp/extractor/servus.py @@ -27,7 +27,7 @@ class ServusIE(InfoExtractor): 'info_dict': { 'id': 
'AA-28BYCQNH92111', 'ext': 'mp4', - 'title': 'Klettersteige in den Alpen', + 'title': 'Vie Ferrate - Klettersteige in den Alpen', 'description': 'md5:25e47ddd83a009a0f9789ba18f2850ce', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2823, @@ -38,6 +38,7 @@ class ServusIE(InfoExtractor): 'season_number': 11, 'episode': 'Episode 8 - Vie Ferrate – Klettersteige in den Alpen', 'episode_number': 8, + 'categories': ['Bergwelten'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -71,8 +72,11 @@ class ServusIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url).upper() + webpage = self._download_webpage(url, video_id) + next_data = self._search_nextjs_data(webpage, video_id, fatal=False) + video = self._download_json( - 'https://api-player.redbull.com/stv/servus-tv?timeZone=Europe/Berlin', + 'https://api-player.redbull.com/stv/servus-tv-playnet', video_id, 'Downloading video JSON', query={'videoId': video_id}) if not video.get('videoUrl'): self._report_errors(video) @@ -89,7 +93,7 @@ class ServusIE(InfoExtractor): return { 'id': video_id, 'title': video.get('title'), - 'description': self._get_description(video_id) or video.get('description'), + 'description': self._get_description(next_data) or video.get('description'), 'thumbnail': video.get('poster'), 'duration': float_or_none(video.get('duration')), 'timestamp': unified_timestamp(video.get('currentSunrise')), @@ -100,16 +104,19 @@ class ServusIE(InfoExtractor): 'episode_number': episode_number, 'formats': formats, 'subtitles': subtitles, + **traverse_obj(next_data, ('props', 'pageProps', 'data', { + 'title': ('title', 'rendered', {str}), + 'timestamp': ('stv_date', 'raw', {int}), + 'duration': ('stv_duration', {float_or_none}), + 'categories': ('category_names', ..., {str}), + })), } - def _get_description(self, video_id): - info = self._download_json( - f'https://backend.servustv.com/wp-json/rbmh/v2/media_asset/aa_id/{video_id}?fieldset=page', - video_id, fatal=False) - - return 
join_nonempty(*traverse_obj(info, ( - ('stv_short_description', 'stv_long_description'), - {lambda x: unescapeHTML(x.replace('\n\n', '\n'))})), delim='\n\n') + def _get_description(self, next_data): + return join_nonempty(*traverse_obj(next_data, ( + 'props', 'pageProps', 'data', + ('stv_short_description', 'stv_long_description'), {str}, + {lambda x: x.replace('\n\n', '\n')}, {unescapeHTML})), delim='\n\n') def _report_errors(self, video): playability_errors = traverse_obj(video, ('playabilityErrors', ...)) diff --git a/yt_dlp/extractor/snapchat.py b/yt_dlp/extractor/snapchat.py new file mode 100644 index 000000000..732677c19 --- /dev/null +++ b/yt_dlp/extractor/snapchat.py @@ -0,0 +1,76 @@ +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none, url_or_none +from ..utils.traversal import traverse_obj + + +class SnapchatSpotlightIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?snapchat\.com/spotlight/(?P\w+)' + + _TESTS = [{ + 'url': 'https://www.snapchat.com/spotlight/W7_EDlXWTBiXAEEniNoMPwAAYYWtidGhudGZpAX1TKn0JAX1TKnXJAAAAAA', + 'md5': '46c580f63592d0cbb76e974d2f9f0fcc', + 'info_dict': { + 'id': 'W7_EDlXWTBiXAEEniNoMPwAAYYWtidGhudGZpAX1TKn0JAX1TKnXJAAAAAA', + 'ext': 'mp4', + 'title': 'Views 💕', + 'description': '', + 'thumbnail': r're:https://cf-st\.sc-cdn\.net/d/kKJHIR1QAznRKK9jgYYDq\.256\.IRZXSOY', + 'duration': 4.665, + 'timestamp': 1637777831.369, + 'upload_date': '20211124', + 'repost_count': int, + 'uploader': 'shreypatel57', + 'uploader_url': 'https://www.snapchat.com/add/shreypatel57', + }, + }, { + 'url': 'https://www.snapchat.com/spotlight/W7_EDlXWTBiXAEEniNoMPwAAYcnVjYWdwcGV1AZEaIYn5AZEaIYnrAAAAAQ', + 'md5': '4cd9626458c1a0e3e6dbe72c544a9ec2', + 'info_dict': { + 'id': 'W7_EDlXWTBiXAEEniNoMPwAAYcnVjYWdwcGV1AZEaIYn5AZEaIYnrAAAAAQ', + 'ext': 'mp4', + 'title': 'Spotlight Snap', + 'description': 'How he flirt her teacher🤭🤭🤩😍 #kdrama#cdrama #dramaclips #dramaspotlight', + 'thumbnail': 
r're:https://cf-st\.sc-cdn\.net/i/ztfr6xFs0FOcFhwVczWfj\.256\.IRZXSOY', + 'duration': 10.91, + 'timestamp': 1722720291.307, + 'upload_date': '20240803', + 'view_count': int, + 'repost_count': int, + 'uploader': 'ganda0535', + 'uploader_url': 'https://www.snapchat.com/add/ganda0535', + 'tags': ['#dramaspotlight', '#dramaclips', '#cdrama', '#kdrama'], + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + page_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps'] + video_data = traverse_obj(page_props, ( + 'spotlightFeed', 'spotlightStories', + lambda _, v: v['story']['storyId']['value'] == video_id, 'metadata', any), None) + + return { + 'id': video_id, + 'ext': 'mp4', + **traverse_obj(video_data, ('videoMetadata', { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'timestamp': ('uploadDateMs', {lambda x: float_or_none(x, 1000)}), + 'view_count': ('viewCount', {int_or_none}, {lambda x: None if x == -1 else x}), + 'repost_count': ('shareCount', {int_or_none}), + 'url': ('contentUrl', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'duration': ('durationMs', {lambda x: float_or_none(x, 1000)}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'uploader': ('creator', 'personCreator', 'username', {str}), + 'uploader_url': ('creator', 'personCreator', 'url', {url_or_none}), + })), + **traverse_obj(video_data, { + 'description': ('description', {str}), + 'tags': ('hashtags', ..., {str}), + 'view_count': ('engagementStats', 'viewCount', {int_or_none}, {lambda x: None if x == -1 else x}), + 'repost_count': ('engagementStats', 'shareCount', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 38782abac..b5df2e1a1 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -472,7 +472,7 @@ class SVTPageIE(SVTBaseIE): title = self._og_search_title(webpage) 
urql_state = self._search_json( - r'window\.svt\.nyh\.urqlState\s*=', webpage, 'json data', display_id) + r'window\.svt\.(?:nyh\.)?urqlState\s*=', webpage, 'json data', display_id) data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {} diff --git a/yt_dlp/extractor/tele13.py b/yt_dlp/extractor/tele13.py index c5ca208fb..0d721773e 100644 --- a/yt_dlp/extractor/tele13.py +++ b/yt_dlp/extractor/tele13.py @@ -8,7 +8,7 @@ from ..utils import ( class Tele13IE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P[\w-]+)' _TESTS = [ { 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index d8c556ace..07db58347 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -1,33 +1,31 @@ -import base64 -import datetime as dt import functools import itertools from .common import InfoExtractor from ..networking import HEADRequest -from ..utils import int_or_none, traverse_obj, urlencode_postdata, urljoin +from ..utils import int_or_none, traverse_obj, url_or_none, urljoin class TenPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?Ptpv\d{6}[a-z]{5})' _NETRC_MACHINE = '10play' _TESTS = [{ - 'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd', + 'url': 'https://10play.com.au/neighbours/web-extras/season-41/heres-a-first-look-at-mischa-bartons-neighbours-debut/tpv230911hyxnz', 'info_dict': { - 'id': '6226844312001', + 'id': '6336940246112', 'ext': 'mp4', - 'title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', - 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', - 
'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43', - 'duration': 186, - 'season': 'Season 39', - 'season_number': 39, + 'title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut', + 'alt_title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut', + 'description': 'Neighbours Premieres Monday, September 18 At 4:30pm On 10 And 10 Play And 6:30pm On 10 Peach', + 'duration': 74, + 'season': 'Season 41', + 'season_number': 41, 'series': 'Neighbours', 'thumbnail': r're:https://.*\.jpg', 'uploader': 'Channel 10', 'age_limit': 15, - 'timestamp': 1611810000, - 'upload_date': '20210128', + 'timestamp': 1694386800, + 'upload_date': '20230910', 'uploader_id': '2199827728001', }, 'params': { @@ -35,21 +33,30 @@ class TenPlayIE(InfoExtractor): }, 'skip': 'Only available in Australia', }, { - 'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh', + 'url': 'https://10play.com.au/neighbours/episodes/season-42/episode-9107/tpv240902nzqyp', 'info_dict': { - 'id': '6192880312001', + 'id': '9000000000091177', 'ext': 'mp4', - 'title': "Todd Sampson's Body Hack - S4 Ep. 2", - 'description': 'md5:fa278820ad90f08ea187f9458316ac74', + 'title': 'Neighbours - S42 Ep. 
9107', + 'alt_title': 'Thu 05 Sep', + 'description': 'md5:37a1f4271be34b9ee2b533426a5fbaef', + 'duration': 1388, + 'episode': 'Episode 9107', + 'episode_number': 9107, + 'season': 'Season 42', + 'season_number': 42, + 'series': 'Neighbours', + 'thumbnail': r're:https://.*\.jpg', 'age_limit': 15, - 'timestamp': 1600770600, - 'upload_date': '20200922', + 'timestamp': 1725517860, + 'upload_date': '20240905', 'uploader': 'Channel 10', 'uploader_id': '2199827728001', }, 'params': { 'skip_download': True, }, + 'skip': 'Only available in Australia', }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, @@ -66,55 +73,42 @@ class TenPlayIE(InfoExtractor): 'X': 18, } - def _get_bearer_token(self, video_id): - username, password = self._get_login_info() - if username is None or password is None: - self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.') - _timestamp = dt.datetime.now().strftime('%Y%m%d000000') - _auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii') - data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={ - 'X-Network-Ten-Auth': _auth_header, - }, data=urlencode_postdata({ - 'email': username, - 'password': password, - })) - return 'Bearer ' + data['jwt']['accessToken'] - def _real_extract(self, url): content_id = self._match_id(url) data = self._download_json( 'https://10play.com.au/api/v1/videos/' + content_id, content_id) - headers = {} - - if data.get('memberGated') is True: - _token = self._get_bearer_token(content_id) - headers = {'Authorization': _token} - _video_url = self._download_json( - data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', - headers=headers).get('source') - m3u8_url = self._request_webpage(HEADRequest( - _video_url), content_id).url + video_data = self._download_json( + 
f'https://vod.ten.com.au/api/videos/bcquery?command=find_videos_by_id&video_id={data["altId"]}', + content_id, 'Downloading video JSON') + m3u8_url = self._request_webpage( + HEADRequest(video_data['items'][0]['HLSURL']), + content_id, 'Checking stream URL').url if '10play-not-in-oz' in m3u8_url: self.raise_geo_restricted(countries=['AU']) + # Attempt to get a higher quality stream + m3u8_url = m3u8_url.replace(',150,75,55,0000', ',300,150,75,55,0000') formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') return { + 'id': content_id, 'formats': formats, - 'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None, - 'id': data.get('altId') or content_id, - 'duration': data.get('duration'), - 'title': data.get('subtitle'), - 'alt_title': data.get('title'), - 'description': data.get('description'), - 'age_limit': self._AUS_AGES.get(data.get('classification')), - 'series': data.get('tvShow'), - 'season_number': int_or_none(data.get('season')), - 'episode_number': int_or_none(data.get('episode')), - 'timestamp': data.get('published'), - 'thumbnail': data.get('imageUrl'), + 'subtitles': {'en': [{'url': data['captionUrl']}]} if url_or_none(data.get('captionUrl')) else None, 'uploader': 'Channel 10', 'uploader_id': '2199827728001', + **traverse_obj(data, { + 'id': ('altId', {str}), + 'duration': ('duration', {int_or_none}), + 'title': ('subtitle', {str}), + 'alt_title': ('title', {str}), + 'description': ('description', {str}), + 'age_limit': ('classification', {self._AUS_AGES.get}), + 'series': ('tvShow', {str}), + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + 'timestamp': ('published', {int_or_none}), + 'thumbnail': ('imageUrl', {url_or_none}), + }), } diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 9d823a315..f7e103fe9 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -542,16 +542,12 @@ class TikTokBaseIE(InfoExtractor): 
**COMMON_FORMAT_INFO, 'format_id': 'download', 'url': self._proto_relative_url(download_url), + 'format_note': 'watermarked', + 'preference': -2, }) self._remove_duplicate_formats(formats) - for f in traverse_obj(formats, lambda _, v: 'unwatermarked' not in v['url']): - f.update({ - 'format_note': join_nonempty(f.get('format_note'), 'watermarked', delim=', '), - 'preference': f.get('preference') or -2, - }) - # Is it a slideshow with only audio for download? if not formats and traverse_obj(aweme_detail, ('music', 'playUrl', {url_or_none})): audio_url = aweme_detail['music']['playUrl'] @@ -565,7 +561,8 @@ class TikTokBaseIE(InfoExtractor): 'vcodec': 'none', }) - return formats + # Filter out broken formats, see https://github.com/yt-dlp/yt-dlp/issues/11034 + return [f for f in formats if urllib.parse.urlparse(f['url']).hostname != 'www.tiktok.com'] def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id, extract_flat=False): author_info = traverse_obj(aweme_detail, (('authorInfo', 'author', None), { diff --git a/yt_dlp/extractor/tvn24.py b/yt_dlp/extractor/tvn24.py index 0dc43a9d4..a0590e4f7 100644 --- a/yt_dlp/extractor/tvn24.py +++ b/yt_dlp/extractor/tvn24.py @@ -8,7 +8,7 @@ from ..utils import ( class TVN24IE(InfoExtractor): _WORKING = False - _VALID_URL = r'https?://(?:(?:[^/]+)\.)?tvn24(?:bis)?\.pl/(?:[^/]+/)*(?P[^/]+)' + _VALID_URL = r'https?://(?:(?!eurosport)[^/]+\.)?tvn24(?:bis)?\.pl/(?:[^/?#]+/)*(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.tvn24.pl/wiadomosci-z-kraju,3/oredzie-artura-andrusa,702428.html', 'md5': 'fbdec753d7bc29d96036808275f2130c', diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 53b408469..bf9c6348c 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -270,7 +270,7 @@ class TwitCastingLiveIE(InfoExtractor): class TwitCastingUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P[^/?#]+)/(:?show|archive)/?(?:[#?]|$)' + _VALID_URL = 
r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P[^/?#]+)/(?:show|archive)/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://twitcasting.tv/natsuiromatsuri/archive/', 'info_dict': { diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index d056797f3..aca94df2d 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1764,7 +1764,7 @@ class TwitterSpacesIE(TwitterBaseIE): 'release_timestamp': 1659904215, 'release_date': '20220807', }, - 'params': {'skip_download': 'm3u8'}, + 'skip': 'No longer available', }, { # post_live/TimedOut but downloadable 'url': 'https://twitter.com/i/spaces/1vAxRAVQWONJl', @@ -1780,6 +1780,8 @@ class TwitterSpacesIE(TwitterBaseIE): 'upload_date': '20230413', 'release_timestamp': 1681839000, 'release_date': '20230418', + 'protocol': 'm3u8', # ffmpeg is forced + 'container': 'm4a_dash', # audio-only format fixup is applied }, 'params': {'skip_download': 'm3u8'}, }, { @@ -1790,11 +1792,31 @@ class TwitterSpacesIE(TwitterBaseIE): 'ext': 'm4a', 'title': 'あ', 'description': 'Twitter Space participated by nobody yet', - 'uploader': '息根とめる🔪Twitchで復活', + 'uploader': '息根とめる', 'uploader_id': 'tomeru_ikinone', 'live_status': 'was_live', 'timestamp': 1685617198, 'upload_date': '20230601', + 'protocol': 'm3u8', # ffmpeg is forced + 'container': 'm4a_dash', # audio-only format fixup is applied + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # Video Space + 'url': 'https://x.com/i/spaces/1DXGydznBYWKM', + 'info_dict': { + 'id': '1DXGydznBYWKM', + 'ext': 'mp4', + 'title': 'America and Israel’s “special relationship”', + 'description': 'Twitter Space participated by nobody yet', + 'uploader': 'Candace Owens', + 'uploader_id': 'RealCandaceO', + 'live_status': 'was_live', + 'timestamp': 1723931351, + 'upload_date': '20240817', + 'release_timestamp': 1723932000, + 'release_date': '20240817', + 'protocol': 'm3u8_native', # not ffmpeg, detected as video space }, 'params': {'skip_download': 'm3u8'}, }] @@ -1854,13 +1876,17 @@ class 
TwitterSpacesIE(TwitterBaseIE): source = traverse_obj( self._call_api(f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key']), ('source', ('noRedirectPlaybackUrl', 'location'), {url_or_none}), get_all=False) - formats = self._extract_m3u8_formats( # XXX: Some Spaces need ffmpeg as downloader - source, metadata['media_key'], 'm4a', entry_protocol='m3u8', live=is_live, - headers=headers, fatal=False) if source else [] - for fmt in formats: - fmt.update({'vcodec': 'none', 'acodec': 'aac'}) - if not is_live: - fmt['container'] = 'm4a_dash' + is_audio_space = source and 'audio-space' in source + formats = self._extract_m3u8_formats( + source, metadata['media_key'], 'm4a' if is_audio_space else 'mp4', + # XXX: Some audio-only Spaces need ffmpeg as downloader + entry_protocol='m3u8' if is_audio_space else 'm3u8_native', + live=is_live, headers=headers, fatal=False) if source else [] + if is_audio_space: + for fmt in formats: + fmt.update({'vcodec': 'none', 'acodec': 'aac'}) + if not is_live: + fmt['container'] = 'm4a_dash' participants = ', '.join(traverse_obj( space_data, ('participants', 'speakers', ..., 'display_name'))) or 'nobody yet' diff --git a/yt_dlp/extractor/vidflex.py b/yt_dlp/extractor/vidflex.py new file mode 100644 index 000000000..ce0880b47 --- /dev/null +++ b/yt_dlp/extractor/vidflex.py @@ -0,0 +1,148 @@ +import base64 +import json + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + join_nonempty, + mimetype2ext, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class VidflexIE(InfoExtractor): + _DOMAINS_RE = [ + r'[^.]+\.vidflex\.tv', + r'(?:www\.)?acactv\.ca', + r'(?:www\.)?albertalacrossetv\.com', + r'(?:www\.)?cjfltv\.com', + r'(?:www\.)?figureitoutbaseball\.com', + r'(?:www\.)?ocaalive\.com', + r'(?:www\.)?pegasussports\.tv', + r'(?:www\.)?praxisseries\.ca', + r'(?:www\.)?silenticetv\.com', + r'(?:www\.)?tuffhedemantv\.com', + r'(?:www\.)?watchfuntv\.com', + r'live\.ofsaa\.on\.ca', + 
r'tv\.procoro\.ca', + r'tv\.realcastmedia\.net', + r'tv\.fringetheatre\.ca', + r'video\.haisla\.ca', + r'video\.hockeycanada\.ca', + r'video\.huuayaht\.org', + r'video\.turningpointensemble\.ca', + r'videos\.livingworks\.net', + r'videos\.telusworldofscienceedmonton\.ca', + r'watch\.binghamtonbulldogs\.com', + r'watch\.rekindle\.tv', + r'watch\.wpca\.com', + ] + _VALID_URL = rf'https?://(?:{"|".join(_DOMAINS_RE)})/[a-z]{{2}}(?:-[a-z]{{2}})?/c/[\w-]+\.(?P\d+)' + _TESTS = [{ + 'url': 'https://video.hockeycanada.ca/en/c/nwt-micd-up-with-jamie-lee-rattray.107486', + 'only_matching': True, + }, { + # m3u8 + https + 'url': 'https://video.hockeycanada.ca/en-us/c/nwt-micd-up-with-jamie-lee-rattray.107486', + 'info_dict': { + 'id': '107486', + 'title': 'NWT: Mic’d up with Jamie Lee Rattray', + 'ext': 'mp4', + 'duration': 115, + 'timestamp': 1634310409, + 'upload_date': '20211015', + 'tags': ['English', '2021', "National Women's Team"], + 'description': 'md5:efb1cf6165b48cc3f5555c4262dd5b23', + 'thumbnail': r're:^https?://wpmedia01-a\.akamaihd\.net/en/asset/public/image/.+', + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://video.hockeycanada.ca/en/c/mwc-remembering-the-wild-ride-in-riga.112307', + 'info_dict': { + 'id': '112307', + 'title': 'MWC: Remembering the wild ride in Riga', + 'ext': 'mp4', + 'duration': 322, + 'timestamp': 1716235607, + 'upload_date': '20240520', + 'tags': ['English', '2024', "National Men's Team", 'IIHF World Championship', 'Fan'], + 'description': r're:.+Canada’s National Men’s Team.+', + 'thumbnail': r're:^https?://wpmedia01-a\.akamaihd\.net/en/asset/public/image/.+', + }, + 'params': {'skip_download': True}, + }, { + # the same video in French + 'url': 'https://video.hockeycanada.ca/fr/c/cmm-retour-sur-un-parcours-endiable-a-riga.112304', + 'info_dict': { + 'id': '112304', + 'title': 'CMM : Retour sur un parcours endiablé à Riga', + 'ext': 'mp4', + 'duration': 322, + 'timestamp': 1716235545, + 'upload_date': '20240520', + 
'tags': ['French', '2024', "National Men's Team", 'IIHF World Championship', 'Fan'], + 'description': 'md5:cf825222882a3dab1cd62cffcf3b4d1f', + 'thumbnail': r're:^https?://wpmedia01-a\.akamaihd\.net/en/asset/public/image/.+', + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://myfbcgreenville.vidflex.tv/en/c/may-12th-2024.658', + 'only_matching': True, + }, { + 'url': 'https://www.figureitoutbaseball.com/en/c/fiob-podcast-14-dan-bertolini-ncaa-d1-head-coach-recorded-11-29-2018.1367', + 'only_matching': True, + }, { + 'url': 'https://videos.telusworldofscienceedmonton.ca/en/c/the-aurora-project-timelapse-4.577', + 'only_matching': True, + }, { + 'url': 'https://www.tuffhedemantv.com/en/c/2022-tuff-hedeman-tour-hobbs-nm-january-22.227', + 'only_matching': True, + }, { + 'url': 'https://www.albertalacrossetv.com/en/c/up-floor-ground-balls-one-more.3449', + 'only_matching': True, + }, { + 'url': 'https://www.silenticetv.com/en/c/jp-unlocked-day-in-the-life-of-langley-ha-15u.5197', + 'only_matching': True, + }, { + 'url': 'https://jphl.vidflex.tv/en/c/jp-unlocked-day-in-the-life-of-langley-ha-15u.5197', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data_url = self._html_search_regex( + r'content_api:\s*(["\'])(?Phttps?://(?:(?!\1).)+)\1', webpage, 'content api url', group='url') + media_config = traverse_obj( + self._download_json(data_url, video_id), + ('config', {base64.b64decode}, {bytes.decode}, {json.loads}, {dict})) + + return { + 'id': video_id, + 'formats': list(self._yield_formats(media_config, video_id)), + **self._search_json_ld( + webpage.replace('/**/', ''), video_id), + } + + def _yield_formats(self, media_config, video_id): + for media_source in traverse_obj(media_config, ('media', 'source', lambda _, v: url_or_none(v['src']))): + media_url = media_source['src'] + media_type = mimetype2ext(media_source.get('type')) + + if media_type == 
'm3u8': + yield from self._extract_m3u8_formats(media_url, video_id, fatal=False, m3u8_id='hls') + elif media_type == 'mp4': + bitrate = self._search_regex(r'_(\d+)k\.mp4', media_url, 'bitrate', default=None) + yield { + 'format_id': join_nonempty('http', bitrate), + 'url': media_url, + 'ext': 'mp4', + 'tbr': int_or_none(bitrate), + } + else: + yield { + 'url': media_url, + 'ext': media_type, + } diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index a20cf4b17..367d5e583 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -21,6 +21,7 @@ from ..utils import ( parse_filesize, parse_iso8601, parse_qs, + qualities, smuggle_url, str_or_none, traverse_obj, @@ -146,6 +147,8 @@ class VimeoBaseInfoExtractor(InfoExtractor): }) # TODO: fix handling of 308 status code returned for live archive manifest requests + QUALITIES = ('low', 'medium', 'high') + quality = qualities(QUALITIES) sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): @@ -166,6 +169,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): m_url, video_id, 'mp4', live=is_live, m3u8_id=f_id, note=f'Downloading {cdn_name} m3u8 information', fatal=False) + # m3u8 doesn't give audio bitrates; need to prioritize based on GROUP-ID + # See: https://github.com/yt-dlp/yt-dlp/issues/10854 + for f in fmts: + if mobj := re.search(rf'audio-({"|".join(QUALITIES)})', f['format_id']): + f['quality'] = quality(mobj.group(1)) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif files_type == 'dash': @@ -234,13 +242,30 @@ class VimeoBaseInfoExtractor(InfoExtractor): '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } - def _extract_original_format(self, url, video_id, unlisted_hash=None): + def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None, **kwargs): + return self._download_json( + 
join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), + video_id, 'Downloading API JSON', headers={ + 'Authorization': f'jwt {jwt_token}', + 'Accept': 'application/json', + }, query={ + 'fields': ','.join(( + 'config_url', 'created_time', 'description', 'download', 'license', + 'metadata.connections.comments.total', 'metadata.connections.likes.total', + 'release_time', 'stats.plays')), + }, **kwargs) + + def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): + # Original/source formats are only available when logged in + if not self._get_cookies('https://vimeo.com/').get('vimeo'): + return + query = {'action': 'load_download_config'} if unlisted_hash: query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, query=query, - headers={'X-Requested-With': 'XMLHttpRequest'}, + url, video_id, 'Loading download config JSON', fatal=False, + query=query, headers={'X-Requested-With': 'XMLHttpRequest'}, expected_status=(403, 404)) or {} source_file = download_data.get('source_file') download_url = try_get(source_file, lambda x: x['download_url']) @@ -261,15 +286,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'quality': 1, } - jwt_response = self._download_json( - 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {} - if not jwt_response.get('jwt'): + jwt = jwt or traverse_obj(self._download_json( + 'https://vimeo.com/_rv/viewer', video_id, 'Downloading jwt token', fatal=False), ('jwt', {str})) + if not jwt: return - headers = {'Authorization': 'jwt {}'.format(jwt_response['jwt']), 'Accept': 'application/json'} - original_response = self._download_json( - f'https://api.vimeo.com/videos/{video_id}', video_id, - headers=headers, fatal=False, expected_status=(403, 404)) or {} - for download_data in original_response.get('download') or []: + original_response = api_data or self._call_videos_api( + video_id, jwt, 
unlisted_hash, fatal=False, expected_status=(403, 404)) + for download_data in traverse_obj(original_response, ('download', ..., {dict})): download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': continue @@ -354,7 +377,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'No longer available', }, { - 'url': 'http://player.vimeo.com/video/54469442', + 'url': 'https://player.vimeo.com/video/54469442', 'md5': '619b811a4417aa4abe78dc653becf511', 'note': 'Videos that embed the url in the player page', 'info_dict': { @@ -370,6 +393,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/68375962', @@ -379,22 +403,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, + 'timestamp': 1371214555, 'upload_date': '20130614', + 'release_timestamp': 1371214555, + 'release_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', - 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, 'comment_count': int, 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/channels/keypeele/75629013', @@ -418,29 +443,38 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, }, 'params': {'format': 'http-1080p'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 
'url': 'http://vimeo.com/76979871', 'note': 'Video with subtitles', 'info_dict': { 'id': '76979871', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'The New Vimeo Player (You Know, For Videos)', - 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', - 'timestamp': 1381846109, + 'description': str, # FIXME: Dynamic SEO spam description + 'timestamp': 1381860509, 'upload_date': '20131015', + 'release_timestamp': 1381860509, + 'release_date': '20131015', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', - 'uploader': 'Vimeo Staff', + 'uploader': 'Vimeo', 'duration': 62, + 'comment_count': int, + 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/452001751-8216e0571c251a09d7a8387550942d89f7f86f6398f8ed886e639b0dd50d3c90-d_1280', 'subtitles': { - 'de': [{'ext': 'vtt'}], - 'en': [{'ext': 'vtt'}], - 'es': [{'ext': 'vtt'}], - 'fr': [{'ext': 'vtt'}], + 'de': 'count:3', + 'en': 'count:3', + 'es': 'count:3', + 'fr': 'count:3', }, }, - 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'expected_warnings': [ + 'Ignoring subtitle tracks found in the HLS manifest', + 'Failed to parse XML: not well-formed', + ], }, { # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ @@ -456,11 +490,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 118, 'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - # contains original format + # contains Original format 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + # 'md5': '53c688fa95a55bf4b7293d37a89c5c53', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -476,15 +511,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280', 'like_count': int, + 
'tags': 'count:11', }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - 'note': 'Contains original format not accessible in webpage', + 'note': 'Contains source format not accessible in webpage', 'url': 'https://vimeo.com/393756517', - 'md5': 'c464af248b592190a5ffbb5d33f382b0', + # 'md5': 'c464af248b592190a5ffbb5d33f382b0', 'info_dict': { 'id': '393756517', - 'ext': 'mov', + # 'ext': 'mov', + 'ext': 'mp4', 'timestamp': 1582642091, 'uploader_id': 'frameworkla', 'title': 'Straight To Hell - Sabrina: Netflix', @@ -495,6 +534,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280', 'uploader_url': 'https://vimeo.com/frameworkla', }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # only available via https://vimeo.com/channels/tributes/6213729 and @@ -511,16 +552,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 'channel_id': 'tributes', 'timestamp': 1250886430, 'upload_date': '20090821', - 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + 'description': str, # FIXME: Dynamic SEO spam description 'duration': 321, 'comment_count': int, 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280', 'like_count': int, + 'tags': ['[the shining', 'vimeohq', 'cv', 'vimeo tribute]'], }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # redirects to ondemand extractor and should be passed through it @@ -543,28 +586,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'this page is no longer available.', }, { - 'url': 'http://player.vimeo.com/video/68375962', + 'url': 'https://player.vimeo.com/video/68375962', 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', 'info_dict': { 'id': '68375962', 'ext': 'mp4', 'title': 
'youtube-dl password protected test video', - 'timestamp': 1371200155, - 'upload_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, - 'comment_count': int, - 'like_count': int, }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', @@ -592,7 +630,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc", 'uploader': 'Philipp Hagemeister', 'uploader_id': 'user20132939', - 'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b', + 'description': str, # FIXME: Dynamic SEO spam description 'upload_date': '20150209', 'timestamp': 1423518307, 'thumbnail': 'https://i.vimeocdn.com/video/default_1280', @@ -606,6 +644,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # source file returns 403: Forbidden @@ -633,11 +672,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'release_date': '20160329', }, 'params': {'skip_download': True}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/138909882', 'info_dict': { 'id': '138909882', + # 'ext': 'm4v', 'ext': 'mp4', 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', 'description': 'md5:5967e090768a831488f6e74b7821b3c1', @@ -645,11 +686,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Firework Champions', 'upload_date': '20150910', 'timestamp': 1441901895, + 'thumbnail': 
'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d_1280', + 'uploader_url': 'https://vimeo.com/fireworkchampions', + 'tags': 'count:6', + 'duration': 229, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, 'params': { 'skip_download': True, - 'format': 'Original', + # 'format': 'source', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/channels/staffpicks/143603739', @@ -670,8 +719,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, 'uploader_url': 'https://vimeo.com/karimhd', 'channel_url': 'https://vimeo.com/channels/staffpicks', + 'tags': 'count:6', }, 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # requires passing unlisted_hash(a52724358e) to load_download_config request @@ -701,6 +752,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # chapters must be sorted, see: https://github.com/yt-dlp/yt-dlp/issues/5308 @@ -735,6 +787,48 @@ class VimeoIE(VimeoBaseInfoExtractor): }, 'expected_warnings': ['Failed to parse XML: not well-formed'], }, + { + # vimeo.com URL with unlisted hash and Original format + 'url': 'https://vimeo.com/144579403/ec02229140', + # 'md5': '6b662c2884e0373183fbde2a0d15cb78', + 'info_dict': { + 'id': '144579403', + 'ext': 'mp4', + 'title': 'SALESMANSHIP', + 'description': 'md5:4338302f347a1ff8841b4a3aecaa09f0', + 'uploader': 'Off the Picture Pictures', + 'uploader_id': 'offthepicturepictures', + 'uploader_url': 'https://vimeo.com/offthepicturepictures', + 'duration': 669, + 'upload_date': '20151104', + 'timestamp': 1446607180, + 'release_date': '20151104', + 'release_timestamp': 1446607180, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1018638656-[\da-f]+-d_1280', + }, + # 
'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, + { + # player.vimeo.com URL with source format + 'url': 'https://player.vimeo.com/video/859028877', + # 'md5': '19ca3d2463441dee2d2f0671ac2916a2', + 'info_dict': { + 'id': '859028877', + 'ext': 'mp4', + 'title': 'Ariana Grande - Honeymoon Avenue (Live from London)', + 'uploader': 'Raja Virdi', + 'uploader_id': 'rajavirdi', + 'uploader_url': 'https://vimeo.com/rajavirdi', + 'duration': 309, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d_1280', + }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { # user playlist alias -> https://vimeo.com/258705797 'url': 'https://vimeo.com/user26785108/newspiritualguide', @@ -768,16 +862,6 @@ class VimeoIE(VimeoBaseInfoExtractor): raise ExtractorError('Wrong video password', expected=True) return checked - def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None): - return self._download_json( - join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), - video_id, 'Downloading API JSON', headers={ - 'Authorization': f'jwt {jwt_token}', - 'Accept': 'application/json', - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) - def _extract_from_api(self, video_id, unlisted_hash=None): viewer = self._download_json( 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info') @@ -798,6 +882,11 @@ class VimeoIE(VimeoBaseInfoExtractor): info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash, jwt=viewer['jwt'], api_data=video) + if source_format: + info['formats'].append(source_format) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) 
info.update({ 'description': video.get('description'), @@ -899,7 +988,12 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password( redirect_url, video_id, headers) - return self._parse_config(config, video_id) + info = self._parse_config(config, video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash) + if source_format: + info['formats'].append(source_format) + return info vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: @@ -1269,6 +1363,20 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): IE_DESC = 'Review pages on vimeo' _VALID_URL = r'https?://vimeo\.com/(?P[^/?#]+)/review/(?P\d+)/(?P[\da-f]{10})' _TESTS = [{ + 'url': 'https://vimeo.com/user170863801/review/996447483/a316d6ed8d', + 'info_dict': { + 'id': '996447483', + 'ext': 'mp4', + 'title': 'Rodeo day 1-_2', + 'uploader': 'BROADKAST', + 'uploader_id': 'user170863801', + 'uploader_url': 'https://vimeo.com/user170863801', + 'duration': 30, + 'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML'], + }, { 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', 'info_dict': { @@ -1282,6 +1390,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280', 'uploader_url': 'https://vimeo.com/user21297594', }, + 'skip': '404 Not Found', }, { 'note': 'video player needs Referer', 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', @@ -1316,6 +1425,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash') data_url = 
f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}' data = self._download_json(data_url, video_id) + viewer = {} if data.get('isLocked') is True: video_password = self._get_video_password() viewer = self._download_json( @@ -1327,8 +1437,8 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( - f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', video_id, - unlisted_hash=traverse_obj(config_url, ({parse_qs}, 'h', -1))) + f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', + video_id, unlisted_hash=clip_data.get('unlistedHash'), jwt=viewer.get('jwt')) if source_format: info_dict['formats'].append(source_format) info_dict['description'] = clean_html(clip_data.get('description')) diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py index 01e59352b..f4ed96bf6 100644 --- a/yt_dlp/extractor/viu.py +++ b/yt_dlp/extractor/viu.py @@ -90,7 +90,7 @@ class ViuIE(ViuBaseIE): formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4') for key, value in video_data.items(): - mobj = re.match(r'^subtitle_(?P[^_]+)_(?P(vtt|srt))', key) + mobj = re.match(r'subtitle_(?P[^_]+)_(?P(vtt|srt))', key) if not mobj: continue subtitles.setdefault(mobj.group('lang'), []).append({ diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index fb2a8648f..df7ecb3cd 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -8,6 +8,7 @@ from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, + filter_dict, float_or_none, int_or_none, parse_qs, @@ -25,16 +26,25 @@ class WistiaBaseIE(InfoExtractor): def _download_embed_config(self, config_type, config_id, referer): base_url = self._EMBED_BASE_URL + f'{config_type}/{config_id}' + video_password = self.get_param('videopassword') embed_config = 
self._download_json( base_url + '.json', config_id, headers={ 'Referer': referer if referer.startswith('http') else base_url, # Some videos require this. - }) + }, query=filter_dict({'password': video_password})) error = traverse_obj(embed_config, 'error') if error: raise ExtractorError( f'Error while getting the playlist: {error}', expected=True) + if traverse_obj(embed_config, ( + 'media', ('embed_options', 'embedOptions'), 'plugin', + 'passwordProtectedVideo', 'on', any)) == 'true': + if video_password: + raise ExtractorError('Invalid video password', expected=True) + raise ExtractorError( + 'This content is password-protected. Use the --video-password option', expected=True) + return embed_config def _get_real_ext(self, url): diff --git a/yt_dlp/extractor/ximalaya.py b/yt_dlp/extractor/ximalaya.py index e900a4ad9..02bf6a7be 100644 --- a/yt_dlp/extractor/ximalaya.py +++ b/yt_dlp/extractor/ximalaya.py @@ -1,7 +1,17 @@ +import base64 import math +import time from .common import InfoExtractor -from ..utils import InAdvancePagedList, str_or_none, traverse_obj, try_call +from .videa import VideaIE +from ..utils import ( + InAdvancePagedList, + int_or_none, + str_or_none, + traverse_obj, + try_call, + update_url_query, +) class XimalayaBaseIE(InfoExtractor): @@ -11,7 +21,7 @@ class XimalayaBaseIE(InfoExtractor): class XimalayaIE(XimalayaBaseIE): IE_NAME = 'ximalaya' IE_DESC = '喜马拉雅FM' - _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(:?(?P\d+)/)?sound/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?:(?P\d+)/)?sound/(?P[0-9]+)' _TESTS = [ { 'url': 'http://www.ximalaya.com/sound/47740352/', @@ -71,23 +81,92 @@ class XimalayaIE(XimalayaBaseIE): 'like_count': int, }, }, + { + # VIP-restricted audio + 'url': 'https://www.ximalaya.com/sound/562111701', + 'only_matching': True, + }, ] + @staticmethod + def _decrypt_filename(file_id, seed): + cgstr = '' + key = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890' + for _ in key: + 
seed = float(int(211 * seed + 30031) % 65536) + r = int(seed / 65536 * len(key)) + cgstr += key[r] + key = key.replace(key[r], '') + parts = file_id.split('*') + filename = ''.join(cgstr[int(part)] for part in parts if part.isdecimal()) + if not filename.startswith('/'): + filename = '/' + filename + return filename + + @staticmethod + def _decrypt_url_params(encrypted_params): + params = VideaIE.rc4( + base64.b64decode(encrypted_params), 'xkt3a41psizxrh9l').split('-') + # sign, token, timestamp + return params[1], params[2], params[3] + def _real_extract(self, url): scheme = 'https' if url.startswith('https') else 'http' audio_id = self._match_id(url) - audio_info_file = f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json' audio_info = self._download_json( - audio_info_file, audio_id, - f'Downloading info json {audio_info_file}', 'Unable to download info file') - - formats = [{ + f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json', audio_id, + 'Downloading info json', 'Unable to download info file') + + formats = [] + # NOTE: VIP-restricted audio + if audio_info.get('is_paid'): + ts = int(time.time()) + vip_info = self._download_json( + f'{scheme}://mpay.ximalaya.com/mobile/track/pay/{audio_id}/{ts}', + audio_id, 'Downloading VIP info json', 'Unable to download VIP info file', + query={'device': 'pc', 'isBackend': 'true', '_': ts}) + filename = self._decrypt_filename(vip_info['fileId'], vip_info['seed']) + sign, token, timestamp = self._decrypt_url_params(vip_info['ep']) + vip_url = update_url_query( + f'{vip_info["domain"]}/download/{vip_info["apiVersion"]}{filename}', { + 'sign': sign, + 'token': token, + 'timestamp': timestamp, + 'buy_key': vip_info['buyKey'], + 'duration': vip_info['duration'], + }) + fmt = { + 'format_id': 'vip', + 'url': vip_url, + 'vcodec': 'none', + } + if '_preview_' in vip_url: + self.report_warning( + f'This tracks requires a VIP account. Using a sample instead. 
{self._login_hint()}') + fmt.update({ + 'format_note': 'Sample', + 'preference': -10, + **traverse_obj(vip_info, { + 'filesize': ('sampleLength', {int_or_none}), + 'duration': ('sampleDuration', {int_or_none}), + }), + }) + else: + fmt.update(traverse_obj(vip_info, { + 'filesize': ('totalLength', {int_or_none}), + 'duration': ('duration', {int_or_none}), + })) + + fmt['abr'] = try_call(lambda: fmt['filesize'] * 8 / fmt['duration'] / 1024) + formats.append(fmt) + + formats.extend([{ 'format_id': f'{bps}k', 'url': audio_info[k], 'abr': bps, 'vcodec': 'none', - } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)] + } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)]) thumbnails = [] for k in audio_info: diff --git a/yt_dlp/extractor/xinpianchang.py b/yt_dlp/extractor/xinpianchang.py index 10849916b..23ed9270d 100644 --- a/yt_dlp/extractor/xinpianchang.py +++ b/yt_dlp/extractor/xinpianchang.py @@ -3,16 +3,13 @@ from ..utils import ( int_or_none, str_or_none, try_get, - update_url_query, url_or_none, ) class XinpianchangIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://www\.xinpianchang\.com/(?P[^/]+?)(?:\D|$)' - IE_NAME = 'xinpianchang' - IE_DESC = 'xinpianchang.com' + _VALID_URL = r'https?://(www\.)?xinpianchang\.com/(?Pa\d+)' + IE_DESC = '新片场' _TESTS = [{ 'url': 'https://www.xinpianchang.com/a11766551', 'info_dict': { @@ -49,11 +46,11 @@ class XinpianchangIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id=video_id) - domain = self.find_value_with_regex(var='requireNewDomain', webpage=webpage) - vid = self.find_value_with_regex(var='vid', webpage=webpage) - app_key = self.find_value_with_regex(var='modeServerAppKey', webpage=webpage) - api = update_url_query(f'{domain}/mod/api/v2/media/{vid}', {'appKey': app_key}) - data = self._download_json(api, video_id=video_id)['data'] + video_data = 
self._search_nextjs_data(webpage, video_id)['props']['pageProps']['detail']['video'] + + data = self._download_json( + f'https://mod-api.xinpianchang.com/mod/api/v2/media/{video_data["vid"]}', video_id, + query={'appKey': video_data['appKey']})['data'] formats, subtitles = [], {} for k, v in data.get('resource').items(): if k in ('dash', 'hls'): @@ -72,6 +69,10 @@ class XinpianchangIE(InfoExtractor): 'width': int_or_none(prog.get('width')), 'height': int_or_none(prog.get('height')), 'ext': 'mp4', + 'http_headers': { + # NB: Server returns 403 without the Range header + 'Range': 'bytes=0-', + }, } for prog in v if prog.get('url') or []]) return { @@ -87,6 +88,3 @@ class XinpianchangIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } - - def find_value_with_regex(self, var, webpage): - return self._search_regex(rf'var\s{var}\s=\s\"(?P[^\"]+)\"', webpage, name=var) diff --git a/yt_dlp/extractor/yle_areena.py b/yt_dlp/extractor/yle_areena.py index ef9e96804..c0a218e2f 100644 --- a/yt_dlp/extractor/yle_areena.py +++ b/yt_dlp/extractor/yle_areena.py @@ -10,7 +10,7 @@ from ..utils import ( class YleAreenaIE(InfoExtractor): - _VALID_URL = r'https?://areena\.yle\.fi/(?P[\d-]+)' + _VALID_URL = r'https?://areena\.yle\.fi/(?Ppodcastit/)?(?P[\d-]+)' _GEO_COUNTRIES = ['FI'] _TESTS = [ { @@ -77,7 +77,7 @@ class YleAreenaIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) + video_id, is_podcast = self._match_valid_url(url).group('id', 'podcast') info = self._search_json_ld(self._download_webpage(url, video_id), video_id, default={}) video_data = self._download_json( f'https://player.api.yle.fi/v1/preview/{video_id}.json?app_id=player_static_prod&app_key=8930d72170e48303cf5f3867780d549b', @@ -103,8 +103,11 @@ class YleAreenaIE(InfoExtractor): 'name': sub.get('kind'), }) - kaltura_id = traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id'), expected_type=str) - if kaltura_id: + if is_podcast: + info_dict = { + 'url': 
video_data['data']['ongoing_ondemand']['media_url'], + } + elif kaltura_id := traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id', {str})): info_dict = { '_type': 'url_transparent', 'url': smuggle_url(f'kaltura:1955031:{kaltura_id}', {'source_url': url}), @@ -114,13 +117,11 @@ class YleAreenaIE(InfoExtractor): formats, subs = self._extract_m3u8_formats_and_subtitles( video_data['data']['ongoing_ondemand']['manifest_url'], video_id, 'mp4', m3u8_id='hls') self._merge_subtitles(subs, target=subtitles) - info_dict = { - 'id': video_id, - 'formats': formats, - } + info_dict = {'formats': formats} return { **info_dict, + 'id': video_id, 'title': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'title', 'fin'), expected_type=str) or episode or info.get('title')), 'description': description, diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 224c9b988..1382c01b6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -69,6 +69,8 @@ from ..utils import ( ) STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' +STREAMING_DATA_PO_TOKEN = '__yt_dlp_po_token' + # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -79,6 +81,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, }, # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats 'web_safari': { @@ -90,6 +93,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, }, 'web_embedded': { 'INNERTUBE_CONTEXT': { @@ -132,6 +136,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, 'android_music': { 'INNERTUBE_CONTEXT': { @@ -146,6 +151,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, 'android_creator': { 'INNERTUBE_CONTEXT': { @@ -160,6 +166,7 @@ 
INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, # YouTube Kids videos aren't returned on this client for some reason 'android_vr': { @@ -323,6 +330,7 @@ def build_innertube_clients(): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) + ytcfg.setdefault('REQUIRE_PO_TOKEN', False) ytcfg.setdefault('PLAYER_PARAMS', None) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') @@ -688,31 +696,46 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, 'identity token', default=None, fatal=False) - @staticmethod - def _extract_account_syncid(*args): + def _data_sync_id_to_delegated_session_id(self, data_sync_id): + if not data_sync_id: + return + # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel + # and just "user_syncid||" for primary channel. We only want the channel_syncid + channel_syncid, _, user_syncid = data_sync_id.partition('||') + if user_syncid: + return channel_syncid + + def _extract_account_syncid(self, *args): """ - Extract syncId required to download private playlists of secondary channels + Extract current session ID required to download private playlists of secondary channels @params response and/or ytcfg """ - for data in args: - # ytcfg includes channel_syncid if on secondary channel - delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str) - if delegated_sid: - return delegated_sid - sync_ids = (try_get( - data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], - lambda x: x['DATASYNC_ID']), str) or '').split('||') - if len(sync_ids) >= 2 and sync_ids[1]: - # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel - # and just "user_syncid||" for primary channel. 
We only want the channel_syncid - return sync_ids[0] + # ytcfg includes channel_syncid if on secondary channel + if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)): + return delegated_sid - @staticmethod - def _extract_visitor_data(*args): + data_sync_id = self._extract_data_sync_id(*args) + return self._data_sync_id_to_delegated_session_id(data_sync_id) + + def _extract_data_sync_id(self, *args): + """ + Extract current account dataSyncId. + In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID|| + @params response and/or ytcfg + """ + if data_sync_id := self._configuration_arg('data_sync_id', [None], ie_key=YoutubeIE, casesense=True)[0]: + return data_sync_id + + return traverse_obj( + args, (..., ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId')), {str}, any)) + + def _extract_visitor_data(self, *args): """ Extracts visitorData from an API response or ytcfg Appears to be used to track session state """ + if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=YoutubeIE, casesense=True)[0]: + return visitor_data return get_first( args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) @@ -1334,11 +1357,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') - _POTOKEN_EXPERIMENTS = ('51217476', '51217102') - _BROKEN_CLIENTS = { - short_client_name(client): client - for client in ('android', 'android_creator', 'android_music') - } + _DEFAULT_CLIENTS = ('ios', 'web_creator') _GEO_BYPASS = False @@ -3700,6 +3719,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor): **cls._get_checkok_params(), } + def _get_config_po_token(self, client): + po_token_strs = self._configuration_arg('po_token', [], ie_key=YoutubeIE, casesense=True) + for token_str in 
po_token_strs: + po_token_client, sep, po_token = token_str.partition('+') + if not sep: + self.report_warning( + f'Invalid po_token configuration format. Expected "client+po_token", got "{token_str}"', only_once=True) + continue + if po_token_client == client: + return po_token + + def fetch_po_token(self, client='web', visitor_data=None, data_sync_id=None, player_url=None, **kwargs): + # PO Token is bound to visitor_data / Visitor ID when logged out. Must have visitor_data for it to function. + if not visitor_data and not self.is_authenticated and player_url: + self.report_warning( + f'Unable to fetch PO Token for {client} client: Missing required Visitor Data. ' + f'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"') + return + + config_po_token = self._get_config_po_token(client) + if config_po_token: + # PO token is bound to data_sync_id / account Session ID when logged in. However, for the config po_token, + # if using first channel in an account then we don't need the data_sync_id anymore... + if not data_sync_id and self.is_authenticated and player_url: + self.report_warning( + f'Got a PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.' + f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + + return config_po_token + + # Require PO Token if logged in for external fetching + if not data_sync_id and self.is_authenticated and player_url: + self.report_warning( + f'Unable to fetch PO Token for {client} client: Missing required Data Sync ID for account. 
' + f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + return + + return self._fetch_po_token( + client=client, + visitor_data=visitor_data, + data_sync_id=data_sync_id, + player_url=player_url, + **kwargs, + ) + + def _fetch_po_token(self, client, visitor_data=None, data_sync_id=None, player_url=None, **kwargs): + """External PO Token fetch stub""" + @staticmethod def _is_agegated(player_response): if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')): @@ -3716,13 +3783,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): - - session_index = self._extract_session_index(player_ytcfg, master_ytcfg) - syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) - sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None + def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token): headers = self.generate_api_headers( - ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client) + ytcfg=player_ytcfg, + default_client=client, + visitor_data=visitor_data, + session_index=self._extract_session_index(master_ytcfg, player_ytcfg), + account_syncid=( + self._data_sync_id_to_delegated_session_id(data_sync_id) + or self._extract_account_syncid(master_ytcfg, initial_pr, player_ytcfg) + ), + ) yt_query = { 'videoId': video_id, @@ -3733,6 +3804,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]: yt_query['params'] = player_params + if po_token: + yt_query['serviceIntegrityDimensions'] = 
{'poToken': po_token} + + sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None yt_query.update(self._generate_player_context(sts)) return self._extract_response( item_id=video_id, ep='player', query=yt_query, @@ -3743,26 +3818,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _get_requested_clients(self, url, smuggled_data): requested_clients = [] - broken_clients = [] - default = ['ios', 'web_creator'] + excluded_clients = [] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) for client in self._configuration_arg('player_client'): if client == 'default': - requested_clients.extend(default) + requested_clients.extend(self._DEFAULT_CLIENTS) elif client == 'all': requested_clients.extend(allowed_clients) + elif client.startswith('-'): + excluded_clients.append(client[1:]) elif client not in allowed_clients: - self.report_warning(f'Skipping unsupported client {client}') - elif client in self._BROKEN_CLIENTS.values(): - broken_clients.append(client) + self.report_warning(f'Skipping unsupported client "{client}"') else: requested_clients.append(client) - # Force deprioritization of _BROKEN_CLIENTS for format de-duplication - requested_clients.extend(broken_clients) if not requested_clients: - requested_clients = default + requested_clients.extend(self._DEFAULT_CLIENTS) + for excluded_client in excluded_clients: + if excluded_client in requested_clients: + requested_clients.remove(excluded_client) + if not requested_clients: + raise ExtractorError('No player clients have been requested', expected=True) if smuggled_data.get('is_music_url') or self.is_music_url(url): for requested_client in requested_clients: @@ -3780,19 +3857,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return pr_id def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): - initial_pr = 
ignore_initial_response = None + initial_pr = None if webpage: - if 'web' in clients: - experiments = traverse_obj(master_ytcfg, ( - 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'serializedExperimentIds', {lambda x: x.split(',')}, ...)) - if all(x in experiments for x in self._POTOKEN_EXPERIMENTS): - self.report_warning( - 'Webpage contains broken formats (poToken experiment detected). Ignoring initial player response') - ignore_initial_response = True initial_pr = self._search_json( self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) prs = [] + deprioritized_prs = [] + if initial_pr and not self._invalid_player_response(initial_pr, video_id): # Android player_response does not have microFormats which are needed for # extraction of some data. So we return the initial_pr with formats @@ -3814,14 +3886,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return tried_iframe_fallback = False - player_url = None + player_url = visitor_data = data_sync_id = None skipped_clients = {} while clients: + deprioritize_pr = False client, base_client, variant = _split_innertube_client(clients.pop()) - player_ytcfg = {} - if client == 'web': - player_ytcfg = self._get_default_ytcfg() if ignore_initial_response else master_ytcfg - elif 'configs' not in self._configuration_arg('player_skip'): + player_ytcfg = master_ytcfg if client == 'web' else {} + if 'configs' not in self._configuration_arg('player_skip') and client != 'web': player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) @@ -3834,34 +3905,53 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_url = self._download_player_url(video_id) tried_iframe_fallback = True - pr = initial_pr if client == 'web' and not ignore_initial_response else None - for retry in self.RetryManager(fatal=False): - try: - pr = pr or self._extract_player_response( - client, video_id, player_ytcfg or 
master_ytcfg, player_ytcfg, - player_url if require_js_player else None, initial_pr, smuggled_data) - except ExtractorError as e: - self.report_warning(e) - break - experiments = traverse_obj(pr, ( - 'responseContext', 'serviceTrackingParams', lambda _, v: v['service'] == 'GFEEDBACK', - 'params', lambda _, v: v['key'] == 'e', 'value', {lambda x: x.split(',')}, ...)) - if all(x in experiments for x in self._POTOKEN_EXPERIMENTS): - pr = None - retry.error = ExtractorError('API returned broken formats (poToken experiment detected)', expected=True) - if not pr: + visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) + data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) + po_token = self.fetch_po_token( + client=client, visitor_data=visitor_data, + data_sync_id=data_sync_id if self.is_authenticated else None, + player_url=player_url if require_js_player else None, + ) + + require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN') + if not po_token and require_po_token: + self.report_warning( + f'No PO Token provided for {client} client, ' + f'which is required for working {client} formats. 
' + f'You can manually pass a PO Token for this client with ' + f'--extractor-args "youtube:po_token={client}+XXX"', + only_once=True) + deprioritize_pr = True + + pr = initial_pr if client == 'web' else None + try: + pr = pr or self._extract_player_response( + client, video_id, + master_ytcfg=player_ytcfg or master_ytcfg, + player_ytcfg=player_ytcfg, + player_url=player_url, + initial_pr=initial_pr, + visitor_data=visitor_data, + data_sync_id=data_sync_id, + po_token=po_token) + except ExtractorError as e: + self.report_warning(e) continue if pr_id := self._invalid_player_response(pr, video_id): skipped_clients[client] = pr_id elif pr: # Save client name for introspection later - name = short_client_name(client) sd = traverse_obj(pr, ('streamingData', {dict})) or {} - sd[STREAMING_DATA_CLIENT_NAME] = name + sd[STREAMING_DATA_CLIENT_NAME] = client + sd[STREAMING_DATA_PO_TOKEN] = po_token for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): - f[STREAMING_DATA_CLIENT_NAME] = name - prs.append(pr) + f[STREAMING_DATA_CLIENT_NAME] = client + f[STREAMING_DATA_PO_TOKEN] = po_token + if deprioritize_pr: + deprioritized_prs.append(pr) + else: + prs.append(pr) # tv_embedded can work around age-gate and age-verification IF the video is embeddable if self._is_agegated(pr) and variant != 'tv_embedded': @@ -3885,6 +3975,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # _producer, _testsuite, & _vr variants can also work around age-verification append_client('web_creator', 'mediaconnect') + prs.extend(deprioritized_prs) + if skipped_clients: self.report_warning( f'Skipping player responses from {"/".join(skipped_clients)} clients ' @@ -4018,14 +4110,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning( f'{video_id}: Some formats are possibly damaged. 
They will be deprioritized', only_once=True) - client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) - # _BROKEN_CLIENTS return videoplayback URLs that expire after 30 seconds - # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554 - is_broken = client_name in self._BROKEN_CLIENTS + client_name = fmt[STREAMING_DATA_CLIENT_NAME] + po_token = fmt.get(STREAMING_DATA_PO_TOKEN) + + if po_token: + fmt_url = update_url_query(fmt_url, {'pot': po_token}) + + # Clients that require PO Token return videoplayback URLs that may return 403 + is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) if is_broken: self.report_warning( - f'{video_id}: {self._BROKEN_CLIENTS[client_name]} client formats are broken ' - 'and may yield HTTP Error 403. They will be deprioritized', only_once=True) + f'{video_id}: {client_name} client formats require a PO Token which was not provided. ' + 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' fps = int_or_none(fmt.get('fps')) or 0 @@ -4039,7 +4135,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), is_damaged and 'DAMAGED', is_broken and 'BROKEN', - (self.get_param('verbose') or all_formats) and client_name, + (self.get_param('verbose') or all_formats) and short_client_name(client_name), delim=', '), # Format 22 is likely to be damaged. 
See https://github.com/yt-dlp/yt-dlp/issues/3372 'source_preference': (-5 if itag == '22' else -1) + (100 if 'Premium' in name else 0), @@ -4101,12 +4197,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': skip_manifests.add('dash') - def process_manifest_format(f, proto, client_name, itag): + def process_manifest_format(f, proto, client_name, itag, po_token): key = (proto, f.get('language')) if not all_formats and key in itags[itag]: return False itags[itag].add(key) + if f.get('source_preference') is None: + f['source_preference'] = -1 + + # Clients that require PO Token return videoplayback URLs that may return 403 + # hls does not currently require PO Token + if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls': + self.report_warning( + f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. ' + 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) + f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ') + f['source_preference'] -= 20 + if itag and all_formats: f['format_id'] = f'{itag}-{proto}' elif any(p != proto for p, _ in itags[itag]): @@ -4118,9 +4226,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') f['language_preference'] = PREFERRED_LANG_VALUE - if f.get('source_preference') is None: - f['source_preference'] = -1 - if itag in ('616', '235'): f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') f['source_preference'] += 100 @@ -4129,7 +4234,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if f['quality'] == -1 and f.get('height'): f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) if self.get_param('verbose') or all_formats: - f['format_note'] = join_nonempty(f.get('format_note'), client_name, 
delim=', ') + f['format_note'] = join_nonempty( + f.get('format_note'), short_client_name(client_name), delim=', ') if f.get('fps') and f['fps'] <= 1: del f['fps'] @@ -4140,24 +4246,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): subtitles = {} for sd in streaming_data: - client_name = sd.get(STREAMING_DATA_CLIENT_NAME) - + client_name = sd[STREAMING_DATA_CLIENT_NAME] + po_token = sd.get(STREAMING_DATA_PO_TOKEN) hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: + if po_token: + hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' fmts, subs = self._extract_m3u8_formats_and_subtitles( hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') subtitles = self._merge_subtitles(subs, subtitles) for f in fmts: if process_manifest_format(f, 'hls', client_name, self._search_regex( - r'/itag/(\d+)', f['url'], 'itag', default=None)): + r'/itag/(\d+)', f['url'], 'itag', default=None), po_token): yield f dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') if dash_manifest_url: + if po_token: + dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH for f in formats: - if process_manifest_format(f, 'dash', client_name, f['format_id']): + if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token): f['filesize'] = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if needs_live_processing: @@ -4979,7 +5089,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _rich_entries(self, rich_grid_renderer): renderer = traverse_obj( rich_grid_renderer, - ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {} + ('content', ('videoRenderer', 
'reelItemRenderer', 'playlistRenderer', 'shortsLockupViewModel'), any)) or {} video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) @@ -4991,6 +5101,21 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=self._get_text(renderer, 'title')) return + # shortsLockupViewModel extraction + entity_id = renderer.get('entityId') + if entity_id: + video_id = traverse_obj(renderer, ('onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId', {str})) + if not video_id: + return + yield self.url_result( + f'https://www.youtube.com/shorts/{video_id}', + ie=YoutubeIE, video_id=video_id, + **traverse_obj(renderer, ('overlayMetadata', { + 'title': ('primaryText', 'content', {str}), + 'view_count': ('secondaryText', 'content', {parse_count}), + })), + thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources')) + return def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') @@ -7530,6 +7655,8 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'id': clip_id, 'section_start': int(clip_data['startTimeMs']) / 1000, 'section_end': int(clip_data['endTimeMs']) / 1000, + '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility + 'proto:https', 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang'), } diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py index e8a67b734..0643348e7 100644 --- a/yt_dlp/networking/_curlcffi.py +++ b/yt_dlp/networking/_curlcffi.py @@ -31,9 +31,9 @@ if curl_cffi is None: curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3])) -if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 8, 0)): +if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 7, 2)): curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)' - raise ImportError('Only 
curl_cffi versions 0.5.10, 0.7.X are supported') + raise ImportError('Only curl_cffi versions 0.5.10, 0.7.0 and 0.7.1 are supported') import curl_cffi.requests from curl_cffi.const import CurlECode, CurlOpt diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index fe3354ea2..b86d3606d 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -10,7 +10,7 @@ import typing import urllib.parse import urllib.request -from .exceptions import RequestError, UnsupportedRequest +from .exceptions import RequestError from ..dependencies import certifi from ..socks import ProxyType, sockssocket from ..utils import format_field, traverse_obj @@ -206,7 +206,7 @@ def wrap_request_errors(func): def wrapper(self, *args, **kwargs): try: return func(self, *args, **kwargs) - except UnsupportedRequest as e: + except RequestError as e: if e.handler is None: e.handler = self raise diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py index 492af1154..ec55567da 100644 --- a/yt_dlp/networking/_websockets.py +++ b/yt_dlp/networking/_websockets.py @@ -33,8 +33,8 @@ if not websockets: import websockets.version websockets_version = tuple(map(int_or_none, websockets.version.version.split('.'))) -if websockets_version < (12, 0): - raise ImportError('Only websockets>=12.0 is supported') +if websockets_version < (13, 0): + raise ImportError('Only websockets>=13.0 is supported') import websockets.sync.client from websockets.uri import parse_uri @@ -47,10 +47,7 @@ from websockets.uri import parse_uri # 2: "AttributeError: 'ClientConnection' object has no attribute 'recv_events_exc'. Did you mean: 'recv_events'?" 
import websockets.sync.connection # isort: split with contextlib.suppress(Exception): - # > 12.0 websockets.sync.connection.Connection.recv_exc = None - # 12.0 - websockets.sync.connection.Connection.recv_events_exc = None class WebsocketsResponseAdapter(WebSocketResponse): @@ -162,7 +159,7 @@ class WebsocketsRH(WebSocketRequestHandler): additional_headers=headers, open_timeout=timeout, user_agent_header=None, - ssl_context=ssl_ctx if wsuri.secure else None, + ssl=ssl_ctx if wsuri.secure else None, close_timeout=0, # not ideal, but prevents yt-dlp hanging ) return WebsocketsResponseAdapter(conn, url=request.url) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index ffe2463fe..9980b7fc3 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -647,16 +647,16 @@ def create_parser(): 'You can also simply specify a field to match if the field is present, ' 'use "!field" to check if the field is not present, and "&" to check multiple conditions. ' 'Use a "\\" to escape "&" or quotes if needed. If used multiple times, ' - 'the filter matches if at least one of the conditions is met. E.g. --match-filter ' - '!is_live --match-filter "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' + 'the filter matches if at least one of the conditions is met. E.g. --match-filters ' + '!is_live --match-filters "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' 'matches only videos that are not live OR those that have a like count more than 100 ' '(or the like field is not available) and also has a description ' 'that contains the phrase "cats & dogs" (caseless). 
' - 'Use "--match-filter -" to interactively ask whether to download each video')) + 'Use "--match-filters -" to interactively ask whether to download each video')) selection.add_option( '--no-match-filters', dest='match_filter', action='store_const', const=None, - help='Do not use any --match-filter (default)') + help='Do not use any --match-filters (default)') selection.add_option( '--break-match-filters', metavar='FILTER', dest='breaking_match_filter', action='append', @@ -704,7 +704,7 @@ def create_parser(): selection.add_option( '--break-per-input', action='store_true', dest='break_per_url', default=False, - help='Alters --max-downloads, --break-on-existing, --break-match-filter, and autonumber to reset per input URL') + help='Alters --max-downloads, --break-on-existing, --break-match-filters, and autonumber to reset per input URL') selection.add_option( '--no-break-per-input', action='store_false', dest='break_per_url', @@ -1725,15 +1725,17 @@ def create_parser(): '--convert-subs', '--convert-sub', '--convert-subtitles', metavar='FORMAT', dest='convertsubtitles', default=None, help=( - 'Convert the subtitles to another format (currently supported: {}) ' - '(Alias: --convert-subtitles)'.format(', '.join(sorted(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))))) + 'Convert the subtitles to another format ' + f'(currently supported: {", ".join(sorted(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))}). ' + 'Use "--convert-subs none" to disable conversion (default) (Alias: --convert-subtitles)')) postproc.add_option( '--convert-thumbnails', metavar='FORMAT', dest='convertthumbnails', default=None, help=( 'Convert the thumbnails to another format ' f'(currently supported: {", ".join(sorted(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS))}). ' - 'You can specify multiple rules using similar syntax as --remux-video')) + 'You can specify multiple rules using similar syntax as "--remux-video". 
' + 'Use "--convert-thumbnails none" to disable conversion (default)')) postproc.add_option( '--split-chapters', '--split-tracks', dest='split_chapters', action='store_true', default=False, diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index 6cf9ab62e..b3fc8b54a 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -33,7 +33,7 @@ class SponsorBlockPP(FFmpegPostProcessor): def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'): FFmpegPostProcessor.__init__(self, downloader) self._categories = tuple(categories or self.CATEGORIES.keys()) - self._API_URL = api if re.match('^https?://', api) else 'https://' + api + self._API_URL = api if re.match('https?://', api) else 'https://' + api def run(self, info): extractor = info['extractor_key'] diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 72ae29084..4cf3bdc32 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -135,20 +135,42 @@ def _get_binary_name(): def _get_system_deprecation(): - MIN_SUPPORTED, MIN_RECOMMENDED = (3, 8), (3, 8) + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 8), (3, 9) if sys.version_info > MIN_RECOMMENDED: return None major, minor = sys.version_info[:2] + PYTHON_MSG = f'Please update to Python {".".join(map(str, MIN_RECOMMENDED))} or above' + if sys.version_info < MIN_SUPPORTED: - msg = f'Python version {major}.{minor} is no longer supported' - else: - msg = (f'Support for Python version {major}.{minor} has been deprecated. ' - '\nYou may stop receiving updates on this version at any time') + return f'Python version {major}.{minor} is no longer supported! {PYTHON_MSG}' + + EXE_MSG_TMPL = ('Support for {} has been deprecated. ' + 'See https://github.com/yt-dlp/yt-dlp/{} for details.\n{}') + STOP_MSG = 'You may stop receiving updates on this version at any time!' 
+ variant = detect_variant() + + # Temporary until Windows builds use 3.9, which will drop support for Win7 and 2008ServerR2 + if variant in ('win_exe', 'win_x86_exe', 'py2exe'): + platform_name = platform.platform() + if any(platform_name.startswith(f'Windows-{name}') for name in ('7', '2008ServerR2')): + return EXE_MSG_TMPL.format('Windows 7/Server 2008 R2', 'issues/10086', STOP_MSG) + elif variant == 'py2exe': + return EXE_MSG_TMPL.format( + 'py2exe builds (yt-dlp_min.exe)', 'issues/10087', + 'In a future update you will be migrated to the PyInstaller-bundled executable. ' + 'This will be done automatically; no action is required on your part') + return None + + # Temporary until aarch64/armv7l build flow is bumped to Ubuntu 20.04 and Python 3.9 + elif variant in ('linux_aarch64_exe', 'linux_armv7l_exe'): + libc_ver = version_tuple(os.confstr('CS_GNU_LIBC_VERSION').partition(' ')[2]) + if libc_ver < (2, 31): + return EXE_MSG_TMPL.format('system glibc version < 2.31', 'pull/8638', STOP_MSG) + return None - major, minor = MIN_RECOMMENDED - return f'{msg}! Please update to Python {major}.{minor} or above' + return f'Support for Python version {major}.{minor} has been deprecated. 
{PYTHON_MSG}' def _sha256_file(path): diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 0d3e707c5..e1b3c48d6 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1954,7 +1954,7 @@ def urljoin(base, path): path = path.decode() if not isinstance(path, str) or not path: return None - if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): + if re.match(r'(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): return path if isinstance(base, bytes): base = base.decode() @@ -2007,7 +2007,7 @@ def url_or_none(url): if not url or not isinstance(url, str): return None url = url.strip() - return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None + return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None def strftime_or_none(timestamp, date_format='%Y%m%d', default=None): @@ -2919,6 +2919,7 @@ def mimetype2ext(mt, default=NO_DEFAULT): 'audio/webm': 'webm', 'audio/x-matroska': 'mka', 'audio/x-mpegurl': 'm3u', + 'aacp': 'aac', 'midi': 'mid', 'ogg': 'ogg', 'wav': 'wav', @@ -3112,7 +3113,7 @@ def is_html(first_bytes): while first_bytes.startswith(bom): encoding, first_bytes = enc, first_bytes[len(bom):] - return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace')) + return re.match(r'\s*<', first_bytes.decode(encoding, 'replace')) def determine_protocol(info_dict): @@ -5280,7 +5281,7 @@ class FormatSorter: settings = { 'vcodec': {'type': 'ordered', 'regex': True, - 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, + 'order': ['av0?1', 'vp0?9.0?2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, 'acodec': {'type': 'ordered', 'regex': True, 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', diff 
--git a/yt_dlp/version.py b/yt_dlp/version.py index 6633a11b9..76b8bf0ee 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.08.06' +__version__ = '2024.09.27' -RELEASE_GIT_HEAD = '4d9231208332d4c32364b8cd814bff8b20232cae' +RELEASE_GIT_HEAD = 'c6387abc1af9842bb0541288a5610abba9b1ab51' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.08.06' +_pkg_version = '2024.09.27'