From 41ec08f9faa9780135e998f2915d1bc420ae86c7 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 23 Nov 2024 12:13:19 -0500 Subject: [PATCH 1/3] add reader_kwargs argument to open_virtual_dataset, and pass it down to every reader --- virtualizarr/backend.py | 4 ++++ virtualizarr/readers/common.py | 2 ++ virtualizarr/readers/dmrpp.py | 6 ++++++ virtualizarr/readers/fits.py | 6 ++++++ virtualizarr/readers/hdf/hdf.py | 6 ++++++ virtualizarr/readers/hdf5.py | 6 ++++++ virtualizarr/readers/kerchunk.py | 6 ++++++ virtualizarr/readers/netcdf3.py | 6 ++++++ virtualizarr/readers/tiff.py | 6 ++++++ virtualizarr/readers/zarr_v3.py | 6 ++++++ 10 files changed, 54 insertions(+) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index fed2b756..dafebdbd 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -113,6 +113,7 @@ def open_virtual_dataset( cftime_variables: Iterable[str] | None = None, indexes: Mapping[str, Index] | None = None, virtual_array_class=ManifestArray, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, backend: Optional[VirtualBackend] = None, ) -> Dataset: @@ -147,6 +148,8 @@ def open_virtual_dataset( virtual_array_class Virtual array class to use to represent the references to the chunks in each on-disk array. Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. + reader_kwargs: dict, default is None + Dictionary of keyword arguments passed down to this reader. Allows passing arguments specific to certain readers. reader_options: dict, default {} Dict passed into Kerchunk file readers, to allow reading from remote filesystems. Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. @@ -201,6 +204,7 @@ def open_virtual_dataset( loadable_variables=loadable_variables, decode_times=decode_times, indexes=indexes, + reader_kwargs=reader_kwargs, reader_options=reader_options, ) diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 1ad24629..0f5fe083 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -175,6 +175,7 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: raise NotImplementedError() @@ -187,6 +188,7 @@ def open_virtual_datatree( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> DataTree: raise NotImplementedError() diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py index 5859ca92..870a1d02 100644 --- a/virtualizarr/readers/dmrpp.py +++ b/virtualizarr/readers/dmrpp.py @@ -23,6 +23,7 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: loadable_variables, drop_variables = check_for_collisions( @@ -30,6 +31,11 @@ def open_virtual_dataset( loadable_variables=loadable_variables, ) + if reader_kwargs: + raise NotImplementedError( + "DMR++ reader does not understand any reader_kwargs" + ) + if loadable_variables != [] or decode_times or indexes is None: raise NotImplementedError( "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files." diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py index de93bc1f..d1ccab2b 100644 --- a/virtualizarr/readers/fits.py +++ b/virtualizarr/readers/fits.py @@ -23,10 +23,16 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: from kerchunk.fits import process_file + if reader_kwargs: + raise NotImplementedError( + "FITS reader does not understand any reader_kwargs" + ) + # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 refs = KerchunkStoreRefs({"refs": process_file(filepath, **reader_options)}) diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py index 0eee63d5..203ae810 100644 --- a/virtualizarr/readers/hdf/hdf.py +++ b/virtualizarr/readers/hdf/hdf.py @@ -38,8 +38,14 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> xr.Dataset: + if reader_kwargs: + raise NotImplementedError( + "HDF reader does not understand any reader_kwargs" + ) + drop_variables, loadable_variables = check_for_collisions( drop_variables, loadable_variables, diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py index 91e5b6f9..12156681 100644 --- a/virtualizarr/readers/hdf5.py +++ b/virtualizarr/readers/hdf5.py @@ -23,10 +23,16 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: from kerchunk.hdf import SingleHdf5ToZarr + if reader_kwargs: + raise NotImplementedError( + "HDF5 reader does not understand any reader_kwargs" + ) + drop_variables, loadable_variables = check_for_collisions( drop_variables, loadable_variables, diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index 4a41548c..61a927e4 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -20,10 +20,16 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: """Reads existing kerchunk references (in JSON or parquet) format.""" + if reader_kwargs: + raise NotImplementedError( + "Kerchunk reader does not understand any reader_kwargs" + ) + if group: raise NotImplementedError() diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py index 25f212ca..b5127f68 100644 --- a/virtualizarr/readers/netcdf3.py +++ b/virtualizarr/readers/netcdf3.py @@ -23,10 +23,16 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: from kerchunk.netCDF3 import NetCDF3ToZarr + if reader_kwargs: + raise NotImplementedError( + "netcdf3 reader does not understand any reader_kwargs" + ) + drop_variables, loadable_variables = check_for_collisions( drop_variables, loadable_variables, diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py index d9c440ba..30848ac6 100644 --- a/virtualizarr/readers/tiff.py +++ b/virtualizarr/readers/tiff.py @@ -25,8 +25,14 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: + if reader_kwargs: + raise NotImplementedError( + "TIFF reader does not understand any reader_kwargs" + ) + from kerchunk.tiff import tiff_to_zarr drop_variables, loadable_variables = check_for_collisions( diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py index 4a867ffb..714bcda6 100644 --- a/virtualizarr/readers/zarr_v3.py +++ b/virtualizarr/readers/zarr_v3.py @@ -20,6 +20,7 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + reader_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: """ @@ -27,6 +28,11 @@ def open_virtual_dataset( This is experimental - chunk manifests are not part of the Zarr v3 Spec. """ + if reader_kwargs: + raise NotImplementedError( + "Zarr_v3 reader does not understand any reader_kwargs" + ) + storepath = Path(filepath) if group: From fdd6c05288cd48e460477f1ff9ef788b352fb345 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 27 Nov 2024 12:44:42 -0500 Subject: [PATCH 2/3] rename reader_kwargs -> virtual_backend_kwargs --- virtualizarr/backend.py | 6 +++--- virtualizarr/readers/common.py | 4 ++-- virtualizarr/readers/dmrpp.py | 6 +++--- virtualizarr/readers/fits.py | 6 +++--- virtualizarr/readers/hdf/hdf.py | 6 +++--- virtualizarr/readers/hdf5.py | 6 +++--- virtualizarr/readers/kerchunk.py | 6 +++--- virtualizarr/readers/netcdf3.py | 6 +++--- virtualizarr/readers/tiff.py | 6 +++--- virtualizarr/readers/zarr_v3.py | 6 +++--- 10 files changed, 29 insertions(+), 29 deletions(-) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index dafebdbd..b9df1923 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -113,7 +113,7 @@ def open_virtual_dataset( cftime_variables: Iterable[str] | None = None, indexes: Mapping[str, Index] | None = None, virtual_array_class=ManifestArray, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, backend: Optional[VirtualBackend] = None, ) -> Dataset: @@ -148,7 +148,7 @@ def open_virtual_dataset( virtual_array_class Virtual array class to use to represent the references to the chunks in each on-disk array. Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. - reader_kwargs: dict, default is None + virtual_backend_kwargs: dict, default is None Dictionary of keyword arguments passed down to this reader. Allows passing arguments specific to certain readers. reader_options: dict, default {} Dict passed into Kerchunk file readers, to allow reading from remote filesystems. @@ -204,7 +204,7 @@ def open_virtual_dataset( loadable_variables=loadable_variables, decode_times=decode_times, indexes=indexes, - reader_kwargs=reader_kwargs, + virtual_backend_kwargs=virtual_backend_kwargs, reader_options=reader_options, ) diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 83511fb4..d4b65872 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -168,7 +168,7 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: raise NotImplementedError() @@ -181,7 +181,7 @@ def open_virtual_datatree( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> DataTree: raise NotImplementedError() diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py index 870a1d02..5d74c412 100644 --- a/virtualizarr/readers/dmrpp.py +++ b/virtualizarr/readers/dmrpp.py @@ -23,7 +23,7 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: loadable_variables, drop_variables = check_for_collisions( @@ -31,9 +31,9 @@ def open_virtual_dataset( loadable_variables=loadable_variables, ) - if reader_kwargs: + if virtual_backend_kwargs: raise NotImplementedError( - "DMR++ reader does not understand any reader_kwargs" + "DMR++ reader does not understand any virtual_backend_kwargs" ) if loadable_variables != [] or decode_times or indexes is None: diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py index d1ccab2b..7b6221ff 100644 --- a/virtualizarr/readers/fits.py +++ b/virtualizarr/readers/fits.py @@ -23,14 +23,14 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: from kerchunk.fits import process_file - if reader_kwargs: + if virtual_backend_kwargs: raise NotImplementedError( - "FITS reader does not understand any reader_kwargs" + "FITS reader does not understand any virtual_backend_kwargs" ) # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py index 203ae810..ca921bb0 100644 --- a/virtualizarr/readers/hdf/hdf.py +++ b/virtualizarr/readers/hdf/hdf.py @@ -38,12 +38,12 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> xr.Dataset: - if reader_kwargs: + if virtual_backend_kwargs: raise NotImplementedError( - "HDF reader does not understand any reader_kwargs" + "HDF reader does not understand any virtual_backend_kwargs" ) drop_variables, loadable_variables = check_for_collisions( diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py index 12156681..ca910ab4 100644 --- a/virtualizarr/readers/hdf5.py +++ b/virtualizarr/readers/hdf5.py @@ -23,14 +23,14 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: from kerchunk.hdf import SingleHdf5ToZarr - if reader_kwargs: + if virtual_backend_kwargs: raise NotImplementedError( - "HDF5 reader does not understand any reader_kwargs" + "HDF5 reader does not understand any virtual_backend_kwargs" ) drop_variables, loadable_variables = check_for_collisions( diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index 61a927e4..43ad5341 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -20,14 +20,14 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: """Reads existing kerchunk references (in JSON or parquet) format.""" - if reader_kwargs: + if virtual_backend_kwargs: raise NotImplementedError( - "Kerchunk reader does not understand any reader_kwargs" + "Kerchunk reader does not understand any virtual_backend_kwargs" ) if group: diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py index b5127f68..f93dd2a6 100644 --- a/virtualizarr/readers/netcdf3.py +++ b/virtualizarr/readers/netcdf3.py @@ -23,14 +23,14 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: from kerchunk.netCDF3 import NetCDF3ToZarr - if reader_kwargs: + if virtual_backend_kwargs: raise NotImplementedError( - "netcdf3 reader does not understand any reader_kwargs" + "netcdf3 reader does not understand any virtual_backend_kwargs" ) drop_variables, loadable_variables = check_for_collisions( diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py index 30848ac6..0383e899 100644 --- a/virtualizarr/readers/tiff.py +++ b/virtualizarr/readers/tiff.py @@ -25,12 +25,12 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: - if reader_kwargs: + if virtual_backend_kwargs: raise NotImplementedError( - "TIFF reader does not understand any reader_kwargs" + "TIFF reader does not understand any virtual_backend_kwargs" ) from kerchunk.tiff import tiff_to_zarr diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py index 714bcda6..1491d2d2 100644 --- a/virtualizarr/readers/zarr_v3.py +++ b/virtualizarr/readers/zarr_v3.py @@ -20,7 +20,7 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, - reader_kwargs: Optional[dict] = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: """ @@ -28,9 +28,9 @@ def open_virtual_dataset( This is experimental - chunk manifests are not part of the Zarr v3 Spec. """ - if reader_kwargs: + if virtual_backend_kwargs: raise NotImplementedError( - "Zarr_v3 reader does not understand any reader_kwargs" + "Zarr_v3 reader does not understand any virtual_backend_kwargs" ) storepath = Path(filepath) From a15700301f0fb25e5f6fddfd8169a435cea09deb Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 27 Nov 2024 12:54:10 -0500 Subject: [PATCH 3/3] release note --- docs/releases.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/releases.rst b/docs/releases.rst index bde41778..ed29ab13 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -9,6 +9,9 @@ v1.1.1 (unreleased) New Features ~~~~~~~~~~~~ +- Add a ``virtual_backend_kwargs`` keyword argument to file readers and to ``open_virtual_dataset``, to allow reader-specific options to be passed down. + (:pull:`315`) By `Tom Nicholas `_. + Breaking changes ~~~~~~~~~~~~~~~~