Merge pull request #255 from SciCatProject/flatten-directories-download
Flatten directories in download
jl-wynen authored Dec 18, 2024
2 parents be7b8d4 + c2efaa9 commit bf53658
Showing 4 changed files with 82 additions and 38 deletions.
16 changes: 16 additions & 0 deletions src/scitacean/client.py
@@ -9,6 +9,7 @@
import json
import re
import warnings
from collections import Counter
from collections.abc import Callable, Iterable, Iterator
from contextlib import contextmanager
from pathlib import Path
@@ -414,6 +415,9 @@ def download_files(
Makes selected files available on the local filesystem using the file transfer
object stored in the client.
All files are downloaded to the ``target`` directory.
Any directory structure on the remote system is discarded, that is, all files
are placed directly in ``target``.

Parameters
----------
@@ -488,6 +492,7 @@ def download_files(
downloaded_files = [
f.downloaded(local_path=target / f.remote_path.to_local()) for f in files
]
_expect_no_duplicate_filenames(f.local_path for f in downloaded_files)
if not force:
to_download = _remove_up_to_date_local_files(
downloaded_files, checksum_algorithm=checksum_algorithm
@@ -1350,3 +1355,14 @@ def revert_upload(self, *files: File) -> None:
"""Raise if given files."""
if files:
raise RuntimeError("Internal error: Bad upload connection")


def _expect_no_duplicate_filenames(paths: Iterable[Path | None]) -> None:
counter = Counter(paths)
non_unique = [str(path) for path, count in counter.items() if count > 1]
if non_unique:
raise RuntimeError(
"Downloading selected files would result in duplicate local filenames: "
f"({', '.join(non_unique)}). Consider limiting which files get downloaded "
"using the `select` argument."
)
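
To make the new behaviour concrete, here is a minimal, self-contained sketch using only the Python standard library. The helper names flattened_local_names and check_no_duplicates are hypothetical; they mirror the logic of RemotePath.to_local and _expect_no_duplicate_filenames above but are not part of scitacean's API.

from collections import Counter
from pathlib import PurePath, PurePosixPath


def flattened_local_names(remote_paths: list[str]) -> list[PurePath]:
    # Only the file name survives; remote parent directories are discarded.
    return [PurePath(PurePosixPath(p).name) for p in remote_paths]


def check_no_duplicates(local_names: list[PurePath]) -> None:
    # Same check as _expect_no_duplicate_filenames: refuse clashing names.
    counter = Counter(local_names)
    non_unique = [str(name) for name, count in counter.items() if count > 1]
    if non_unique:
        raise RuntimeError(
            "Downloading selected files would result in duplicate local "
            f"filenames: ({', '.join(non_unique)})."
        )


check_no_duplicates(
    flattened_local_names(
        ["file1.dat", "log/what-happened.log", "/src/stibbons/774/thaum.dat"]
    )
)  # passes: file1.dat, what-happened.log, and thaum.dat are all distinct

try:
    check_no_duplicates(flattened_local_names(["a/file1.dat", "b/file1.dat"]))
except RuntimeError as err:
    print(err)  # both flatten to file1.dat, so the download is refused
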
14 changes: 9 additions & 5 deletions src/scitacean/filesystem.py
@@ -60,16 +60,20 @@ def from_local(cls, path: PurePath) -> RemotePath:
return RemotePath(path.as_posix())

def to_local(self) -> PurePath:
"""Return self as a local, OS-specific path."""
segments = self._path.split("/")
if segments[0] == "":
segments = ["/"] + segments[1:]
return PurePath(*segments)
"""Return self as a local, OS-specific path.
This returns only the file name, discarding all parent path segments.
"""
return PurePath(self.name)

def __truediv__(self, other: str | RemotePath) -> RemotePath:
"""Join two path segments."""
if isinstance(other, (PurePath, Path)): # type: ignore[unreachable]
raise TypeError("OS paths are not supported when concatenating RemotePath.")

if _posix(other).startswith("/"):
return RemotePath(other) # other is absolute, do not concatenate

this = _strip_trailing_slash(self.posix)
other = _strip_leading_slash(_strip_trailing_slash(_posix(other)))
return RemotePath(f"{this}/{other}")
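
A short usage sketch of the changed RemotePath semantics, mirroring the updated tests further down (assuming RemotePath is imported from scitacean.filesystem, the module shown above):

from pathlib import PurePath

from scitacean.filesystem import RemotePath

# to_local() now keeps only the file name and drops all parent directories.
assert RemotePath("/folder/file.dat").to_local() == PurePath("file.dat")
assert RemotePath("folder/sub/file.dat").to_local() == PurePath("file.dat")

# Joining with an absolute right-hand operand no longer concatenates;
# the absolute path replaces the left-hand side.
assert RemotePath("/source/123") / "/file.data" == RemotePath("/file.data")
# Relative operands still concatenate as before.
assert RemotePath("/source/123") / "file.data" == RemotePath("/source/123/file.data")
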
72 changes: 51 additions & 21 deletions tests/download_test.py
@@ -29,7 +29,8 @@ def data_files() -> tuple[list[DownloadDataFile], dict[str, bytes]]:
contents = {
"file1.dat": b"contents-of-file1",
"log/what-happened.log": b"ERROR Flux is off the scale",
"thaum.dat": b"0 4 2 59 330 2314552",
"/src/stibbons/774/thaum.dat": b"0 4 2 59 330 2314552",
"/src/ridcully/grades.csv": b"3,12,-2",
}
files = [
DownloadDataFile(
@@ -43,7 +44,7 @@ def data_files() -> tuple[list[DownloadDataFile], dict[str, bytes]]:
return files, contents


DatasetAndFiles = tuple[Dataset, dict[RemotePath | str, bytes]]
DatasetAndFiles = tuple[Dataset, dict[str, bytes]]


@pytest.fixture
@@ -79,10 +80,18 @@ def dataset_and_files(
dset = Dataset.from_download_models(
dataset_model=model, orig_datablock_models=[block]
)
return dset, {
dset.source_folder / name: content # type: ignore[operator]

content_abs_path = {
file_absolute_path(name, dset.source_folder): content
for name, content in data_files[1].items()
}
return dset, content_abs_path


def file_absolute_path(path: str, source_folder: RemotePath | None) -> str:
if path.startswith("/"):
return path
return (source_folder / path).posix # type: ignore[operator]


def load(name: str | Path) -> bytes:
@@ -100,10 +109,11 @@ def test_download_files_creates_local_files_select_all(
client.download_files(dataset, target="./download", select=True)
assert load("download/file1.dat") == contents["/src/stibbons/774/file1.dat"]
assert (
load("download/log/what-happened.log")
load("download/what-happened.log")
== contents["/src/stibbons/774/log/what-happened.log"]
)
assert load("download/thaum.dat") == contents["/src/stibbons/774/thaum.dat"]
assert load("download/grades.csv") == contents["/src/ridcully/grades.csv"]


def test_download_files_creates_local_files_select_none(
@@ -114,9 +124,7 @@ def test_download_files_creates_local_files_select_none(
url="/", file_transfer=FakeFileTransfer(fs=fs, files=contents)
)
client.download_files(dataset, target="./download", select=False)
assert not Path("download/file1.dat").exists()
assert not Path("download/log/what-happened.log").exists()
assert not Path("download/thaum.dat").exists()
assert not list(Path("download").iterdir())


def test_download_files_creates_local_files_select_one_by_string(
@@ -126,10 +134,11 @@ def test_download_files_creates_local_files_select_one_by_string(
client = Client.without_login(
url="/", file_transfer=FakeFileTransfer(fs=fs, files=contents)
)
client.download_files(dataset, target="./download", select="thaum.dat")
client.download_files(
dataset, target="./download", select="/src/stibbons/774/thaum.dat"
)
assert len(list(Path("download").iterdir())) == 1
assert load("download/thaum.dat") == contents["/src/stibbons/774/thaum.dat"]
assert not Path("download/file1.dat").exists()
assert not Path("download/log/what-happened.log").exists()


def test_download_files_creates_local_files_select_two_by_string(
@@ -140,11 +149,13 @@ def test_download_files_creates_local_files_select_two_by_string(
url="/", file_transfer=FakeFileTransfer(fs=fs, files=contents)
)
client.download_files(
dataset, target="./download", select=["thaum.dat", "log/what-happened.log"]
dataset,
target="./download",
select=["/src/stibbons/774/thaum.dat", "log/what-happened.log"],
)
assert not Path("download/file1.dat").exists()
assert len(list(Path("download").iterdir())) == 2
assert (
load("download/log/what-happened.log")
load("download/what-happened.log")
== contents["/src/stibbons/774/log/what-happened.log"]
)
assert load("download/thaum.dat") == contents["/src/stibbons/774/thaum.dat"]
@@ -158,8 +169,7 @@ def test_download_files_creates_local_files_select_one_by_regex_name_only(
url="/", file_transfer=FakeFileTransfer(fs=fs, files=contents)
)
client.download_files(dataset, target="./download", select=re.compile("thaum"))
assert not Path("download/file1.dat").exists()
assert not Path("download/log/what-happened.log").exists()
assert len(list(Path("download").iterdir())) == 1
assert load("download/thaum.dat") == contents["/src/stibbons/774/thaum.dat"]


@@ -171,8 +181,8 @@ def test_download_files_creates_local_files_select_two_by_regex_suffix(
url="/", file_transfer=FakeFileTransfer(fs=fs, files=contents)
)
client.download_files(dataset, target="./download", select=re.compile(r"\.dat"))
assert len(list(Path("download").iterdir())) == 2
assert load("download/file1.dat") == contents["/src/stibbons/774/file1.dat"]
assert not Path("download/log/what-happened.log").exists()
assert load("download/thaum.dat") == contents["/src/stibbons/774/thaum.dat"]


@@ -186,9 +196,8 @@ def test_download_files_creates_local_files_select_one_by_regex_full_path(
client.download_files(
dataset, target="./download", select=re.compile(r"^file1\.dat$")
)
assert len(list(Path("download").iterdir())) == 1
assert load("download/file1.dat") == contents["/src/stibbons/774/file1.dat"]
assert not Path("download/log/what-happened.log").exists()
assert not Path("download/thaum.dat").exists()


def test_download_files_creates_local_files_select_one_by_predicate(
@@ -201,9 +210,30 @@ def test_download_files_creates_local_files_select_one_by_predicate(
client.download_files(
dataset, target="./download", select=lambda f: f.remote_path == "file1.dat"
)
assert len(list(Path("download").iterdir())) == 1
assert load("download/file1.dat") == contents["/src/stibbons/774/file1.dat"]
assert not Path("download/log/what-happened.log").exists()
assert not Path("download/thaum.dat").exists()


def test_download_files_refuses_download_if_flattening_creates_clash(
fs: FakeFilesystem, dataset_and_files: DatasetAndFiles
) -> None:
dataset, contents = dataset_and_files
contents["sub_directory/file1.dat"] = b"second file with the same name"
dataset.add_files(
File.from_remote(
"sub_directory/file1.dat",
size=len(contents["sub_directory/file1.dat"]),
creation_time=parse_date("1995-08-06T14:14:14"),
checksum=_checksum(contents["sub_directory/file1.dat"]),
checksum_algorithm="md5",
)
)
client = Client.without_login(
url="/", file_transfer=FakeFileTransfer(fs=fs, files=contents)
)
with pytest.raises(RuntimeError, match="file1.dat"):
client.download_files(dataset, target="./download", select=True)
assert not list(Path("download").iterdir())


def test_download_files_returns_updated_dataset(
18 changes: 6 additions & 12 deletions tests/filesystem_test.py
@@ -70,10 +70,10 @@ def test_remote_path_repr() -> None:


def test_remote_path_to_local() -> None:
assert RemotePath("folder/file.dat").to_local() == PurePath("folder", "file.dat")
assert RemotePath("/folder/file.dat").to_local() == PurePath("/folder", "file.dat")
assert RemotePath("folder//file.dat").to_local() == PurePath("folder", "file.dat")
assert RemotePath("folder/file.dat/").to_local() == PurePath("folder", "file.dat")
assert RemotePath("folder/file.dat").to_local() == PurePath("file.dat")
assert RemotePath("/folder/file.dat").to_local() == PurePath("file.dat")
assert RemotePath("folder//file.dat").to_local() == PurePath("file.dat")
assert RemotePath("folder/file.dat/").to_local() == PurePath("file.dat")


@pytest.mark.parametrize(
@@ -99,8 +99,8 @@ def test_remote_path_join(types: Any) -> None:
ta, tb = types
assert ta("/source/123") / tb("file.data") == RemotePath("/source/123/file.data")
assert ta("/source/123/") / tb("file.data") == RemotePath("/source/123/file.data")
assert ta("/source/123") / tb("/file.data") == RemotePath("/source/123/file.data")
assert ta("/source/123/") / tb("/file.data") == RemotePath("/source/123/file.data")
assert ta("/source/123") / tb("/file.data") == RemotePath("/file.data")
assert ta("/source/123/") / tb("/file.data") == RemotePath("/file.data")


@pytest.mark.parametrize(
@@ -114,12 +114,6 @@ def test_remote_path_join_url(types: Any) -> None:
assert ta("https://server.eu/") / tb("1234-abcd/data.txt") == RemotePath(
"https://server.eu/1234-abcd/data.txt"
)
assert ta("https://server.eu") / tb("/1234-abcd/data.txt") == RemotePath(
"https://server.eu/1234-abcd/data.txt"
)
assert ta("https://server.eu/") / tb("/1234-abcd/data.txt") == RemotePath(
"https://server.eu/1234-abcd/data.txt"
)


def test_remote_path_join_rejects_os_path() -> None:
