From 8a8b54523addf46dfd50ef599761a81bc22362e6 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 14 Oct 2023 12:33:00 +1300 Subject: [PATCH] [rh:requests] Add handler for `requests` HTTP library (#3668) Adds support for HTTPS proxies and persistent connections (keep-alive) Closes https://github.com/yt-dlp/yt-dlp/issues/1890 Resolves https://github.com/yt-dlp/yt-dlp/issues/4070 Resolves https://github.com/ytdl-org/youtube-dl/issues/32549 Resolves https://github.com/ytdl-org/youtube-dl/issues/14523 Resolves https://github.com/ytdl-org/youtube-dl/issues/13734 Authored by: coletdjnz, Grub4K, bashonly --- .github/workflows/core.yml | 2 +- README.md | 4 +- requirements.txt | 2 + setup.py | 9 +- test/test_networking.py | 168 +++++++++--- test/test_socks.py | 36 +-- yt_dlp/YoutubeDL.py | 7 +- yt_dlp/__pyinstaller/hook-yt_dlp.py | 4 +- yt_dlp/dependencies/__init__.py | 9 + yt_dlp/networking/__init__.py | 10 + yt_dlp/networking/_helper.py | 20 +- yt_dlp/networking/_requests.py | 398 ++++++++++++++++++++++++++++ yt_dlp/networking/_urllib.py | 26 +- yt_dlp/options.py | 3 +- 14 files changed, 619 insertions(+), 79 deletions(-) create mode 100644 yt_dlp/networking/_requests.py diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 7acaee1e8..049faf373 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -32,7 +32,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install pytest + - name: Install dependencies run: pip install pytest -r requirements.txt - name: Run tests continue-on-error: False diff --git a/README.md b/README.md index dd4652d43..3b7432474 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. 
You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior * yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [~~aria2c~~](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is * yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this +* yt-dlp uses modern HTTP client backends such as `requests`. Use `--compat-options prefer-legacy-http-handler` to prefer the legacy HTTP handler (`urllib`) for standard HTTP requests. For ease of use, a few more compat options are available: @@ -164,7 +165,7 @@ For ease of use, a few more compat options are available: * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` -* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress`. Use this to enable all future compat options +* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler`. 
Use this to enable all future compat options # INSTALLATION @@ -274,6 +275,7 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly * [**certifi**](https://github.com/certifi/python-certifi)\* - Provides Mozilla's root certificate bundle. Licensed under [MPLv2](https://github.com/certifi/python-certifi/blob/master/LICENSE) * [**brotli**](https://github.com/google/brotli)\* or [**brotlicffi**](https://github.com/python-hyper/brotlicffi) - [Brotli](https://en.wikipedia.org/wiki/Brotli) content encoding support. Both licensed under MIT [1](https://github.com/google/brotli/blob/master/LICENSE) [2](https://github.com/python-hyper/brotlicffi/blob/master/LICENSE) * [**websockets**](https://github.com/aaugustin/websockets)\* - For downloading over websocket. Licensed under [BSD-3-Clause](https://github.com/aaugustin/websockets/blob/main/LICENSE) +* [**requests**](https://github.com/psf/requests)\* - HTTP library. For HTTPS proxy and persistent connections support. Licensed under [Apache-2.0](https://github.com/psf/requests/blob/main/LICENSE) ### Metadata diff --git a/requirements.txt b/requirements.txt index dde37120f..112c30aeb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ websockets brotli; platform_python_implementation=='CPython' brotlicffi; platform_python_implementation!='CPython' certifi +requests>=2.31.0,<3 +urllib3>=1.26.17,<3 \ No newline at end of file diff --git a/setup.py b/setup.py index a2f9f55c3..1740db27d 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,14 @@ def py2exe_params(): 'compressed': 1, 'optimize': 2, 'dist_dir': './dist', - 'excludes': ['Crypto', 'Cryptodome'], # py2exe cannot import Crypto + 'excludes': [ + # py2exe cannot import Crypto + 'Crypto', + 'Cryptodome', + # py2exe appears to confuse this with our socks library. + # We don't use pysocks and urllib3.contrib.socks would fail to import if tried. 
+ 'urllib3.contrib.socks' + ], 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], # Modules that are only imported dynamically must be added here 'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated', diff --git a/test/test_networking.py b/test/test_networking.py index 5308c8d6f..2b45deac7 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -28,7 +28,7 @@ from http.cookiejar import CookieJar from test.helper import FakeYDL, http_server_port from yt_dlp.cookies import YoutubeDLCookieJar -from yt_dlp.dependencies import brotli +from yt_dlp.dependencies import brotli, requests, urllib3 from yt_dlp.networking import ( HEADRequest, PUTRequest, @@ -43,6 +43,7 @@ from yt_dlp.networking.exceptions import ( HTTPError, IncompleteRead, NoSupportingHandlers, + ProxyError, RequestError, SSLError, TransportError, @@ -305,7 +306,7 @@ class TestRequestHandlerBase: class TestHTTPRequestHandler(TestRequestHandlerBase): - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_verify_cert(self, handler): with handler() as rh: with pytest.raises(CertificateVerifyError): @@ -316,7 +317,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert r.status == 200 r.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_ssl_error(self, handler): # HTTPS server with too old TLS version # XXX: is there a better way to test this than to create a new server? 
@@ -334,7 +335,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) assert not issubclass(exc_info.type, CertificateVerifyError) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_percent_encode(self, handler): with handler() as rh: # Unicode characters should be encoded with uppercase percent-encoding @@ -346,7 +347,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res.status == 200 res.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_remove_dot_segments(self, handler): with handler() as rh: # This isn't a comprehensive test, @@ -361,14 +362,14 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res.url == f'http://127.0.0.1:{self.http_port}/headers' res.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_unicode_path_redirection(self, handler): with handler() as rh: r = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect')) assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html' r.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_raise_http_error(self, handler): with handler() as rh: for bad_status in (400, 500, 599, 302): @@ -378,7 +379,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): # Should not raise an error validate_and_send(rh, Request('http://127.0.0.1:%d/gen_200' % self.http_port)).close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_response_url(self, 
handler): with handler() as rh: # Response url should be that of the last url in redirect chain @@ -389,7 +390,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res2.url == f'http://127.0.0.1:{self.http_port}/gen_200' res2.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_redirect(self, handler): with handler() as rh: def do_req(redirect_status, method, assert_no_content=False): @@ -444,7 +445,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): with pytest.raises(HTTPError): do_req(code, 'GET') - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_request_cookie_header(self, handler): # We should accept a Cookie header being passed as in normal headers and handle it appropriately. with handler() as rh: @@ -476,19 +477,19 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert b'Cookie: test=ytdlp' not in data assert b'Cookie: test=test' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_redirect_loop(self, handler): with handler() as rh: with pytest.raises(HTTPError, match='redirect loop'): validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop')) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_incompleteread(self, handler): with handler(timeout=2) as rh: with pytest.raises(IncompleteRead): validate_and_send(rh, Request('http://127.0.0.1:%d/incompleteread' % self.http_port)).read() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_cookies(self, handler): cookiejar = YoutubeDLCookieJar() 
cookiejar.set_cookie(http.cookiejar.Cookie( @@ -505,7 +506,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read() assert b'Cookie: test=ytdlp' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_headers(self, handler): with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: @@ -521,7 +522,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert b'Test2: test2' not in data assert b'Test3: test3' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_timeout(self, handler): with handler() as rh: # Default timeout is 20 seconds, so this should go through @@ -537,7 +538,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): validate_and_send( rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4})) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' with handler(source_address=source_address) as rh: @@ -545,13 +546,13 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode() assert source_address == data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_gzip_trailing_garbage(self, handler): with handler() as rh: data = validate_and_send(rh, Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode() assert data == '