From 25d25fe62e19de5282d368ef4d4d4ac6dca207fc Mon Sep 17 00:00:00 2001 From: Ian Carroll Date: Wed, 19 Jun 2024 16:21:45 +0000 Subject: [PATCH 1/4] respect Accept-Ranges none --- fsspec/implementations/http.py | 14 ++++++----- fsspec/implementations/tests/test_http.py | 8 ++++-- fsspec/tests/conftest.py | 30 +++++++++++------------ 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py index 4580764ce..ab4a57da2 100644 --- a/fsspec/implementations/http.py +++ b/fsspec/implementations/http.py @@ -358,9 +358,10 @@ def _open( kw = self.kwargs.copy() kw["asynchronous"] = self.asynchronous kw.update(kwargs) - size = size or self.info(path, **kwargs)["size"] + info = self.info(path, **kwargs) + size = size or info["size"] session = sync(self.loop, self.set_session) - if block_size and size: + if block_size and size and info.get("partial", True): return HTTPFile( self, path, @@ -834,10 +835,6 @@ async def _file_info(url, session, size_policy="head", **kwargs): async with r: r.raise_for_status() - # TODO: - # recognise lack of 'Accept-Ranges', - # or 'Accept-Ranges': 'none' (not 'bytes') - # to mean streaming only, no random access => return None if "Content-Length" in r.headers: # Some servers may choose to ignore Accept-Encoding and return # compressed content, in which case the returned size is unreliable. @@ -852,6 +849,11 @@ async def _file_info(url, session, size_policy="head", **kwargs): if "Content-Type" in r.headers: info["mimetype"] = r.headers["Content-Type"].partition(";")[0] + if r.headers.get("Accept-Ranges") == "none": + # Some servers may explicitly discourage partial content requests, but + # the lack of "Accept-Ranges" does not always indicate they would fail + info["partial"] = False + info["url"] = str(r.url) for checksum_field in ["ETag", "Content-MD5", "Digest"]: diff --git a/fsspec/implementations/tests/test_http.py b/fsspec/implementations/tests/test_http.py index 81e438a81..568284e22 100644 --- a/fsspec/implementations/tests/test_http.py +++ b/fsspec/implementations/tests/test_http.py @@ -237,9 +237,13 @@ def test_random_access(server, headers): @pytest.mark.parametrize( "headers", [ - {"ignore_range": "true", "head_ok": "true", "head_give_length": "true"}, + # HTTPFile seeks, response headers lack size, assumed no range support + {"head_ok": "true", "head_give_length": "true"}, + # HTTPFile seeks, response is not a range {"ignore_range": "true", "give_length": "true"}, {"ignore_range": "true", "give_range": "true"}, + # HTTPStreamFile does not seek (past 0) + {"accept_range": "none", "head_ok": "true", "give_length": "true"}, ], ) def test_no_range_support(server, headers): @@ -247,8 +251,8 @@ def test_no_range_support(server, headers): url = server + "/index/realfile" with h.open(url, "rb") as f: # Random access is not possible if the server doesn't respect Range - f.seek(5) with pytest.raises(ValueError): + f.seek(5) f.read(10) # Reading from the beginning should still work diff --git a/fsspec/tests/conftest.py b/fsspec/tests/conftest.py index fb1efb041..413ed717b 100644 --- a/fsspec/tests/conftest.py +++ b/fsspec/tests/conftest.py @@ -135,10 +135,10 @@ def read_chunks(self): self.rfile.readline() def do_HEAD(self): + r_headers = {} if "head_not_auth" in self.headers: - return self._respond( - 403, {"Content-Length": 123}, b"not authorized for HEAD request" - ) + r_headers["Content-Length"] = 123 + return self._respond(403, r_headers, b"not authorized for HEAD request") elif "head_ok" not in self.headers: return self._respond(405) @@ -148,23 +148,23 @@ def do_HEAD(self): return self._respond(404) if ("give_length" in self.headers) or ("head_give_length" in self.headers): - response_headers = {"Content-Length": len(file_data)} if "zero_length" in self.headers: - response_headers["Content-Length"] = 0 + r_headers["Content-Length"] = 0 elif "gzip_encoding" in self.headers: file_data = gzip.compress(file_data) - response_headers["Content-Encoding"] = "gzip" - response_headers["Content-Length"] = len(file_data) - - self._respond(200, response_headers) + r_headers["Content-Encoding"] = "gzip" + r_headers["Content-Length"] = len(file_data) + else: + r_headers["Content-Length"] = len(file_data) elif "give_range" in self.headers: - self._respond( - 200, {"Content-Range": f"0-{len(file_data) - 1}/{len(file_data)}"} - ) + r_headers["Content-Range"] = f"0-{len(file_data) - 1}/{len(file_data)}" elif "give_etag" in self.headers: - self._respond(200, {"ETag": "xxx"}) - else: - self._respond(200) # OK response, but no useful info + r_headers["ETag"] = "xxx" + + if self.headers.get("accept_range") == "none": + r_headers["Accept-Ranges"] = "none" + + self._respond(200, r_headers) @contextlib.contextmanager From 384ffdfcb7c5c2059e7081a986856ffc6d7d0a6b Mon Sep 17 00:00:00 2001 From: Ian Carroll Date: Thu, 20 Jun 2024 16:02:05 +0000 Subject: [PATCH 2/4] explicit size implies partial is ok --- fsspec/implementations/http.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py index ab4a57da2..a8b53840b 100644 --- a/fsspec/implementations/http.py +++ b/fsspec/implementations/http.py @@ -358,8 +358,8 @@ def _open( kw = self.kwargs.copy() kw["asynchronous"] = self.asynchronous kw.update(kwargs) - info = self.info(path, **kwargs) - size = size or info["size"] + info = {} + size = size or (info.update(self.info(path, **kwargs)) or info["size"]) session = sync(self.loop, self.set_session) if block_size and size and info.get("partial", True): return HTTPFile( @@ -521,9 +521,9 @@ async def _isdir(self, path): class HTTPFile(AbstractBufferedFile): """ - A file-like object pointing to a remove HTTP(S) resource + A file-like object pointing to a remote HTTP(S) resource - Supports only reading, with read-ahead of a predermined block-size. + Supports only reading, with read-ahead of a predetermined block-size. In the case that the server does not supply the filesize, only reading of the complete file in one go is supported. From fda602a98c64aa08d8af9aeeeb9207b2dad914dd Mon Sep 17 00:00:00 2001 From: Ian Carroll Date: Mon, 29 Jul 2024 02:15:16 +0000 Subject: [PATCH 3/4] lint --- fsspec/implementations/http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py index a8b53840b..e4a9ea290 100644 --- a/fsspec/implementations/http.py +++ b/fsspec/implementations/http.py @@ -359,7 +359,7 @@ def _open( kw["asynchronous"] = self.asynchronous kw.update(kwargs) info = {} - size = size or (info.update(self.info(path, **kwargs)) or info["size"]) + size = size or info.update(self.info(path, **kwargs)) or info["size"] session = sync(self.loop, self.set_session) if block_size and size and info.get("partial", True): return HTTPFile( From ec1a7f53d992f6dc5e7453c1bdf594fb56742e5b Mon Sep 17 00:00:00 2001 From: Ian Carroll Date: Wed, 19 Jun 2024 16:21:45 +0000 Subject: [PATCH 4/4] respect Accept-Ranges none --- fsspec/implementations/http.py | 14 ++++++----- fsspec/implementations/tests/test_http.py | 8 ++++-- fsspec/tests/conftest.py | 30 +++++++++++------------ 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py index 7b5a38bb3..f5fc068cd 100644 --- a/fsspec/implementations/http.py +++ b/fsspec/implementations/http.py @@ -358,9 +358,10 @@ def _open( kw = self.kwargs.copy() kw["asynchronous"] = self.asynchronous kw.update(kwargs) - size = size or self.info(path, **kwargs)["size"] + info = self.info(path, **kwargs) + size = size or info["size"] session = sync(self.loop, self.set_session) - if block_size and size: + if block_size and size and info.get("partial", True): return HTTPFile( self, path, @@ -835,10 +836,6 @@ async def _file_info(url, session, size_policy="head", **kwargs): async with r: r.raise_for_status() - # TODO: - # recognise lack of 'Accept-Ranges', - # or 'Accept-Ranges': 'none' (not 'bytes') - # to mean streaming only, no random access => return None if "Content-Length" in r.headers: # Some servers may choose to ignore Accept-Encoding and return # compressed content, in which case the returned size is unreliable. @@ -853,6 +850,11 @@ async def _file_info(url, session, size_policy="head", **kwargs): if "Content-Type" in r.headers: info["mimetype"] = r.headers["Content-Type"].partition(";")[0] + if r.headers.get("Accept-Ranges") == "none": + # Some servers may explicitly discourage partial content requests, but + # the lack of "Accept-Ranges" does not always indicate they would fail + info["partial"] = False + info["url"] = str(r.url) for checksum_field in ["ETag", "Content-MD5", "Digest"]: diff --git a/fsspec/implementations/tests/test_http.py b/fsspec/implementations/tests/test_http.py index 81e438a81..568284e22 100644 --- a/fsspec/implementations/tests/test_http.py +++ b/fsspec/implementations/tests/test_http.py @@ -237,9 +237,13 @@ def test_random_access(server, headers): @pytest.mark.parametrize( "headers", [ - {"ignore_range": "true", "head_ok": "true", "head_give_length": "true"}, + # HTTPFile seeks, response headers lack size, assumed no range support + {"head_ok": "true", "head_give_length": "true"}, + # HTTPFile seeks, response is not a range {"ignore_range": "true", "give_length": "true"}, {"ignore_range": "true", "give_range": "true"}, + # HTTPStreamFile does not seek (past 0) + {"accept_range": "none", "head_ok": "true", "give_length": "true"}, ], ) def test_no_range_support(server, headers): @@ -247,8 +251,8 @@ def test_no_range_support(server, headers): url = server + "/index/realfile" with h.open(url, "rb") as f: # Random access is not possible if the server doesn't respect Range - f.seek(5) with pytest.raises(ValueError): + f.seek(5) f.read(10) # Reading from the beginning should still work diff --git a/fsspec/tests/conftest.py b/fsspec/tests/conftest.py index fb1efb041..413ed717b 100644 --- a/fsspec/tests/conftest.py +++ b/fsspec/tests/conftest.py @@ -135,10 +135,10 @@ def read_chunks(self): self.rfile.readline() def do_HEAD(self): + r_headers = {} if "head_not_auth" in self.headers: - return self._respond( - 403, {"Content-Length": 123}, b"not authorized for HEAD request" - ) + r_headers["Content-Length"] = 123 + return self._respond(403, r_headers, b"not authorized for HEAD request") elif "head_ok" not in self.headers: return self._respond(405) @@ -148,23 +148,23 @@ def do_HEAD(self): return self._respond(404) if ("give_length" in self.headers) or ("head_give_length" in self.headers): - response_headers = {"Content-Length": len(file_data)} if "zero_length" in self.headers: - response_headers["Content-Length"] = 0 + r_headers["Content-Length"] = 0 elif "gzip_encoding" in self.headers: file_data = gzip.compress(file_data) - response_headers["Content-Encoding"] = "gzip" - response_headers["Content-Length"] = len(file_data) - - self._respond(200, response_headers) + r_headers["Content-Encoding"] = "gzip" + r_headers["Content-Length"] = len(file_data) + else: + r_headers["Content-Length"] = len(file_data) elif "give_range" in self.headers: - self._respond( - 200, {"Content-Range": f"0-{len(file_data) - 1}/{len(file_data)}"} - ) + r_headers["Content-Range"] = f"0-{len(file_data) - 1}/{len(file_data)}" elif "give_etag" in self.headers: - self._respond(200, {"ETag": "xxx"}) - else: - self._respond(200) # OK response, but no useful info + r_headers["ETag"] = "xxx" + + if self.headers.get("accept_range") == "none": + r_headers["Accept-Ranges"] = "none" + + self._respond(200, r_headers) @contextlib.contextmanager