[Misc] Under-the-hood fixes, improvements and tests (pt 2) #32481

Merged · 3 commits · Jul 29, 2023
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -278,7 +278,7 @@ jobs:
#-------- Jython ------
- name: Set up Java 8
if: ${{ matrix.python-impl == 'jython' }}
uses: actions/setup-java@v2
uses: actions/setup-java@v3
with:
java-version: 8
distribution: 'zulu'
30 changes: 17 additions & 13 deletions test/test_http.py
@@ -180,6 +180,12 @@ def gzip_compress(p):
respond()
elif self.path == '/%c7%9f':
respond()
elif self.path == '/redirect_dotsegments':
self.send_response(301)
# redirect to /headers but with dot segments before
self.send_header('Location', '/a/b/./../../headers')
self.send_header('Content-Length', '0')
self.end_headers()
elif self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
@@ -461,33 +467,23 @@ def __test_compression(self, encoding):
sanitized_Request(
self._test_url('content-encoding'),
headers={'ytdl-encoding': encoding}))
self.assertEqual(res.headers.get('Content-Encoding'), encoding)
# decoded encodings are removed: only check for valid decompressed data
self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')

@unittest.skipUnless(brotli, 'brotli support is not installed')
@unittest.expectedFailure
def test_brotli(self):
self.__test_compression('br')

@unittest.expectedFailure
def test_deflate(self):
self.__test_compression('deflate')

@unittest.expectedFailure
def test_gzip(self):
self.__test_compression('gzip')

@unittest.expectedFailure # not yet implemented
def test_multiple_encodings(self):
# https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4
with FakeYDL() as ydl:
for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
res = ydl.urlopen(
sanitized_Request(
self._test_url('content-encoding'),
headers={'ytdl-encoding': pair}))
self.assertEqual(res.headers.get('Content-Encoding'), pair)
self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
self.__test_compression(pair)

def test_unsupported_encoding(self):
# it should return the raw content
@@ -499,6 +495,14 @@ def test_unsupported_encoding(self):
self.assertEqual(res.headers.get('Content-Encoding'), 'unsupported')
self.assertEqual(res.read(), b'raw')

def test_remove_dot_segments(self):
with FakeYDL() as ydl:
res = ydl.urlopen(sanitized_Request(self._test_url('a/b/./../../headers')))
self.assertEqual(compat_urllib_parse.urlparse(res.geturl()).path, '/headers')

res = ydl.urlopen(sanitized_Request(self._test_url('redirect_dotsegments')))
self.assertEqual(compat_urllib_parse.urlparse(res.geturl()).path, '/headers')


def _build_proxy_handler(name):
class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
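
Note on the multiple-encodings cases above: per RFC 9110 §8.4, Content-Encoding lists codings in the order they were applied, so a client has to undo them right to left. A minimal sketch of that decode loop, limited to the zlib-backed codings and not taken from this PR:

```python
import zlib

def decode_content(data, content_encoding):
    # Undo the codings named in Content-Encoding, right to left
    # (RFC 9110 sect. 8.4 lists them in the order they were applied).
    codings = [c.strip() for c in content_encoding.split(',') if c.strip()]
    for coding in reversed(codings):
        if coding == 'gzip':
            data = zlib.decompress(data, 16 + zlib.MAX_WBITS)    # gzip wrapper
        elif coding == 'deflate':
            try:
                data = zlib.decompress(data)                     # zlib-wrapped deflate
            except zlib.error:
                data = zlib.decompress(data, -zlib.MAX_WBITS)    # raw deflate
        else:
            break  # unknown coding: stop and return what we have
    return data

# e.g. decode_content(payload, 'gzip, deflate') first inflates the outer
# deflate layer, then unwraps the gzip layer underneath.
```
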
29 changes: 28 additions & 1 deletion test/test_utils.py
@@ -64,6 +64,7 @@
parse_age_limit,
parse_duration,
parse_filesize,
parse_codecs,
parse_count,
parse_iso8601,
parse_resolution,
@@ -114,7 +115,7 @@
cli_option,
cli_valueless_option,
cli_bool_option,
parse_codecs,
YoutubeDLHandler,
)
from youtube_dl.compat import (
compat_chr,
@@ -905,6 +906,32 @@ def test_escape_url(self):
)
self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')

def test_remove_dot_segments(self):

def remove_dot_segments(p):
q = '' if p.startswith('/') else '/'
p = 'http://example.com' + q + p
p = compat_urlparse.urlsplit(YoutubeDLHandler._fix_path(p)).path
return p[1:] if q else p

self.assertEqual(remove_dot_segments('/a/b/c/./../../g'), '/a/g')
self.assertEqual(remove_dot_segments('mid/content=5/../6'), 'mid/6')
self.assertEqual(remove_dot_segments('/ad/../cd'), '/cd')
self.assertEqual(remove_dot_segments('/ad/../cd/'), '/cd/')
self.assertEqual(remove_dot_segments('/..'), '/')
self.assertEqual(remove_dot_segments('/./'), '/')
self.assertEqual(remove_dot_segments('/./a'), '/a')
self.assertEqual(remove_dot_segments('/abc/./.././d/././e/.././f/./../../ghi'), '/ghi')
self.assertEqual(remove_dot_segments('/'), '/')
self.assertEqual(remove_dot_segments('/t'), '/t')
self.assertEqual(remove_dot_segments('t'), 't')
self.assertEqual(remove_dot_segments(''), '')
self.assertEqual(remove_dot_segments('/../a/b/c'), '/a/b/c')
self.assertEqual(remove_dot_segments('../a'), 'a')
self.assertEqual(remove_dot_segments('./a'), 'a')
self.assertEqual(remove_dot_segments('.'), '')
self.assertEqual(remove_dot_segments('////'), '////')

def test_js_to_json_vars_strings(self):
self.assertDictEqual(
json.loads(js_to_json(
23 changes: 0 additions & 23 deletions youtube_dl/YoutubeDL.py
@@ -71,7 +71,6 @@
format_bytes,
formatSeconds,
GeoRestrictedError,
HEADRequest,
int_or_none,
ISO3166Utils,
join_nonempty,
@@ -88,7 +87,6 @@
preferredencoding,
prepend_extension,
process_communicate_or_kill,
PUTRequest,
register_socks_protocols,
render_table,
replace_extension,
@@ -2460,27 +2458,6 @@ def urlopen(self, req):
""" Start an HTTP download """
if isinstance(req, compat_basestring):
req = sanitized_Request(req)
# an embedded /../ sequence is not automatically handled by urllib2
# see https://github.com/yt-dlp/yt-dlp/issues/3355
url = req.get_full_url()
parts = url.partition('/../')
if parts[1]:
url = compat_urllib_parse.urljoin(parts[0] + parts[1][:1], parts[1][1:] + parts[2])
if url:
# worse, URL path may have initial /../ against RFCs: work-around
# by stripping such prefixes, like eg Firefox
parts = compat_urllib_parse.urlsplit(url)
path = parts.path
while path.startswith('/../'):
path = path[3:]
url = parts._replace(path=path).geturl()
# get a new Request with the munged URL
if url != req.get_full_url():
req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get(
req.get_method(), compat_urllib_request.Request)
req = req_type(
url, data=req.data, headers=dict(req.header_items()),
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
return self._opener.open(req, timeout=self._socket_timeout)

def print_debug_header(self):
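
The ad-hoc /../ munging removed from urlopen above is superseded by a proper path normalization (exposed to the tests as YoutubeDLHandler._fix_path and exercised by test_remove_dot_segments). A minimal sketch of the underlying algorithm, RFC 3986 §5.2.4 "Remove Dot Segments", written here as a hypothetical standalone helper rather than the PR's actual code; it matches the expectations in the test above, including the non-rooted and '////' cases:

```python
def remove_dot_segments(path):
    # Sketch of RFC 3986, sect. 5.2.4 ("Remove Dot Segments").
    output = []
    while path:
        if path.startswith('../'):
            path = path[3:]
        elif path.startswith('./'):
            path = path[2:]
        elif path.startswith('/./'):
            path = path[2:]                 # '/./x' -> '/x'
        elif path == '/.':
            path = '/'
        elif path.startswith('/../'):
            path = path[3:]                 # '/../x' -> '/x'
            if output:
                output.pop()                # drop the previous segment
        elif path == '/..':
            path = '/'
            if output:
                output.pop()
        elif path in ('.', '..'):
            path = ''
        else:
            # move the first segment (with its leading '/', if any) to the output
            cut = path.find('/', 1)
            output.append(path if cut < 0 else path[:cut])
            path = '' if cut < 0 else path[cut:]
    return ''.join(output)

assert remove_dot_segments('/a/b/c/./../../g') == '/a/g'
assert remove_dot_segments('mid/content=5/../6') == 'mid/6'
assert remove_dot_segments('////') == '////'
```
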
14 changes: 14 additions & 0 deletions youtube_dl/compat.py
@@ -3200,6 +3200,18 @@ def compat_register_utf8():
def compat_datetime_timedelta_total_seconds(td):
return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6

# optional decompression packages
# PyPi brotli package implements 'br' Content-Encoding
try:
import brotli as compat_brotli
except ImportError:
compat_brotli = None
# PyPi ncompress package implements 'compress' Content-Encoding
try:
import ncompress as compat_ncompress
except ImportError:
compat_ncompress = None
Comment on lines +3209 to +3213

Contributor:

Has this ever been observed in the wild? I'm against adding a whole new dependency for a use case that's never encountered.

Contributor:

https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding indicates compress has been deprecated for decades

Contributor (Author):

It says:

... this content-encoding is not used by many browsers today ...

Not by Mozilla, e.g. One guy on the Web seemed to want it, though it's hardly a yt-dl application and it was indeed more than two decades ago.

However, as compress is in RFC 9110 and easy* to support, I included it.

It's a fairly trivial point since encoding is negotiated and compress is arguably a different case from br (Myspace vs Instagram). We would never have seen a bug report if some site offered compress but not gzip or deflate. You could surely be absolved if yt-dlp failed to reflect this feature, even given the slicker dependency system.

Separately, I wonder what sites do today if the client says Accept-Encoding: identity (so no encoding). Would that provoke/alleviate fingerprinting issues?

* Considering negotiation, the PR doesn't advertise any optional encodings, though, since I forgot to include that change.
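
A minimal sketch, not the PR's actual wiring, of how the optional packages could back both sides of the negotiation discussed above: advertise only the codings we can decode, then dispatch on the response's Content-Encoding. brotli.decompress(bytes) is the PyPI brotli API; the equivalent ncompress call is an assumption here.

```python
from youtube_dl.compat import compat_brotli, compat_ncompress

def accept_encoding():
    # Only advertise codings we can actually decode (cf. the footnote above).
    codings = ['gzip', 'deflate']            # always available via zlib
    if compat_brotli:
        codings.append('br')
    if compat_ncompress:
        codings.append('compress')
    return ', '.join(codings)                # value for the Accept-Encoding header

def decode_optional(data, coding):
    if coding == 'br' and compat_brotli:
        return compat_brotli.decompress(data)     # PyPI brotli / brotlicffi API
    if coding == 'compress' and compat_ncompress:
        # assumed API: ncompress exposes a bytes-in/bytes-out decompress()
        return compat_ncompress.decompress(data)
    return data                                   # unknown/unavailable: pass through
```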



legacy = [
'compat_HTMLParseError',
@@ -3234,6 +3246,7 @@ def compat_datetime_timedelta_total_seconds(td):
'compat_Struct',
'compat_base64_b64decode',
'compat_basestring',
'compat_brotli',
'compat_casefold',
'compat_chr',
'compat_collections_abc',
@@ -3259,6 +3272,7 @@ def compat_datetime_timedelta_total_seconds(td):
'compat_itertools_zip_longest',
'compat_kwargs',
'compat_map',
'compat_ncompress',
'compat_numeric_types',
'compat_open',
'compat_ord',