Tiff zarr reader #295

Draft
wants to merge 7 commits into base: main
Changes from 6 commits
3 changes: 2 additions & 1 deletion ci/environment.yml
@@ -27,7 +27,8 @@ dependencies:
- fsspec
- s3fs
- fastparquet
# for opening tiff files
# for opening and creating test tiff files
- tifffile
- pillow
# for opening FITS files
- astropy
13 changes: 13 additions & 0 deletions conftest.py
@@ -120,3 +120,16 @@ def simple_netcdf4(tmpdir):
ds.to_netcdf(filepath)

return filepath


@pytest.fixture
def random_tiff(tmpdir):
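"""Write a small random uint8 greyscale image to a temporary single-page TIFF file and return its path."""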
from PIL import Image

array = np.random.randint(0, 255, (128, 128), dtype=np.uint8)
img = Image.fromarray(array)

filepath = tmpdir / "rand.tiff"
img.save(filepath)

return str(filepath)
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -48,6 +48,8 @@ test = [
"ruff",
"s3fs",
"scipy",
"tifffile",
"pillow",
]


2 changes: 1 addition & 1 deletion virtualizarr/__init__.py
@@ -1,6 +1,6 @@
from virtualizarr.manifests import ChunkManifest, ManifestArray # type: ignore # noqa
from virtualizarr.accessor import VirtualiZarrDatasetAccessor # type: ignore # noqa
from virtualizarr.backend import open_virtual_dataset # noqa: F401
from virtualizarr.backend import open_virtual_dataset, open_virtual_dataarray # noqa: F401

from importlib.metadata import version as _version

52 changes: 51 additions & 1 deletion virtualizarr/backend.py
@@ -7,7 +7,7 @@
Optional,
)

from xarray import Dataset, Index
from xarray import DataArray, Dataset, Index

from virtualizarr.manifests import ManifestArray
from virtualizarr.readers import (
@@ -198,3 +198,53 @@ def open_virtual_dataset(
)

return vds


def open_virtual_dataarray(
filepath: str,
*,
filetype: FileType | None = None,
group: str | None = None,
drop_variables: Iterable[str] | None = None,
loadable_variables: Iterable[str] | None = None,
decode_times: bool | None = None,
indexes: Mapping[str, Index] | None = None,
virtual_array_class=ManifestArray,
reader_options: Optional[dict] = None,
) -> DataArray:
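"""
Open a file containing a single array as a "virtual" xarray DataArray, whose data is a
ManifestArray referencing byte ranges in the original file rather than values loaded into memory.

Mirrors open_virtual_dataset above: the filetype is detected (or taken from the caller)
and the call is dispatched to the virtual backend registered for that filetype.
"""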

drop_variables, loadable_variables = check_for_collisions(
drop_variables,
loadable_variables,
)

if virtual_array_class is not ManifestArray:
raise NotImplementedError()

if reader_options is None:
reader_options = {}

if filetype is not None:
# if filetype is user defined, convert to FileType
filetype = FileType(filetype)
else:
filetype = automatically_determine_filetype(
filepath=filepath, reader_options=reader_options
)

backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower())

if backend_cls is None:
raise NotImplementedError(f"Unsupported file type: {filetype.name}")

vda = backend_cls.open_virtual_dataarray(
filepath,
group=group,
drop_variables=drop_variables,
loadable_variables=loadable_variables,
decode_times=decode_times,
indexes=indexes,
reader_options=reader_options,
)

return vda
40 changes: 35 additions & 5 deletions virtualizarr/readers/common.py
@@ -12,6 +12,7 @@

from xarray import (
Coordinates,
DataArray,
Dataset,
DataTree,
Index,
@@ -93,11 +94,11 @@ def open_loadable_vars_and_indexes(


def construct_virtual_dataset(
virtual_vars,
loadable_vars,
indexes,
coord_names,
attrs,
virtual_vars: Mapping[str, Variable],
loadable_vars: Mapping[str, Variable],
indexes: Mapping[str, Index],
coord_names: Iterable[str],
attrs: dict[str, str],
) -> Dataset:
"""Construct a virtual Datset from consistuent parts."""

@@ -117,6 +118,23 @@ def separate_coords(
return vds


def construct_virtual_dataarray(
virtual_var: Variable,
loadable_vars: Mapping[str, Variable],
indexes: Mapping[str, Index],
coord_names: Iterable[str],
attrs: dict[str, str],
) -> DataArray:
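"""Construct a virtual DataArray from constituent parts."""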

vda = DataArray(
data=virtual_var,
coords=coord_names,
# indexes={}, # TODO should be added in a later version of xarray
attrs=attrs,
)
return vda


def separate_coords(
vars: Mapping[str, Variable],
indexes: MutableMapping[str, Index],
@@ -167,6 +185,18 @@ def separate_coords(


class VirtualBackend(ABC):
@staticmethod
def open_virtual_dataarray(
filepath: str,
group: str | None = None,
drop_variables: Iterable[str] | None = None,
loadable_variables: Iterable[str] | None = None,
decode_times: bool | None = None,
indexes: Mapping[str, Index] | None = None,
reader_options: Optional[dict] = None,
) -> DataArray:
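"""Open a single-array file as a virtual DataArray; concrete backends override this."""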
raise NotImplementedError()

@staticmethod
def open_virtual_dataset(
filepath: str,
83 changes: 34 additions & 49 deletions virtualizarr/readers/tiff.py
@@ -1,24 +1,15 @@
import warnings
from typing import Iterable, Mapping, Optional

from xarray import Dataset, Index
from xarray import DataArray, Dataset, Index
import zarr

from virtualizarr.readers.common import (
VirtualBackend,
construct_virtual_dataset,
open_loadable_vars_and_indexes,
)
from virtualizarr.translators.kerchunk import (
extract_group,
virtual_vars_and_metadata_from_kerchunk_refs,
)
from virtualizarr.types.kerchunk import KerchunkStoreRefs
from virtualizarr.utils import check_for_collisions
from virtualizarr.readers.common import VirtualBackend
from virtualizarr.readers.zarr import virtual_variable_from_zarr_array


class TIFFVirtualBackend(VirtualBackend):
@staticmethod
def open_virtual_dataset(
def open_virtual_dataarray(
filepath: str,
group: str | None = None,
drop_variables: Iterable[str] | None = None,
@@ -27,46 +18,40 @@ def open_virtual_dataset(
indexes: Mapping[str, Index] | None = None,
reader_options: Optional[dict] = None,
) -> DataArray:
from kerchunk.tiff import tiff_to_zarr

from tifffile import imread

store = imread(filepath, aszarr=True)

drop_variables, loadable_variables = check_for_collisions(
drop_variables=drop_variables, loadable_variables=loadable_variables
)
# TODO exception handling for TIFF files with multiple arrays
za = zarr.open_array(store=store, mode="r")

if reader_options is None:
reader_options = {}
vv = virtual_variable_from_zarr_array(za)

reader_options.pop("storage_options", {})
warnings.warn(
"storage_options have been dropped from reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr",
UserWarning,
)
# TODO should we generate any pixel coordinate arrays like kerchunk seems to do?

# handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160
refs = KerchunkStoreRefs({"refs": tiff_to_zarr(filepath, **reader_options)})
return DataArray(data=vv, dims=vv.dims, attrs=za.attrs)

refs = extract_group(refs, group)
@staticmethod
def open_virtual_dataset(
filepath: str,
group: str | None = None,
drop_variables: Iterable[str] | None = None,
loadable_variables: Iterable[str] | None = None,
decode_times: bool | None = None,
indexes: Mapping[str, Index] | None = None,
reader_options: Optional[dict] = None,
) -> Dataset:

from tifffile import imread

virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs(
refs,
loadable_variables,
drop_variables,
)
store = imread(filepath, aszarr=True)

loadable_vars, indexes = open_loadable_vars_and_indexes(
filepath,
loadable_variables=loadable_variables,
reader_options=reader_options,
drop_variables=drop_variables,
indexes=indexes,
group=group,
decode_times=decode_times,
)
try:
zg = zarr.open_group(store, mode="r")
except zarr.errors.ContainsArrayError as err:
# TODO tidy this up
raise ValueError("TIFF file contains only a single array, please use `open_virtual_dataarray` instead") from err

return construct_virtual_dataset(
virtual_vars=virtual_vars,
loadable_vars=loadable_vars,
indexes=indexes,
coord_names=coord_names,
attrs=attrs,
)
raise NotImplementedError()
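open_virtual_dataset is currently left as a stub. A minimal sketch of how it might be completed, assuming a multi-series TIFF opens as a zarr group whose member arrays each carry the _ARRAY_DIMENSIONS attribute that virtual_variable_from_zarr_array (added in virtualizarr/readers/zarr.py below) relies on; the helper name _virtual_dataset_from_zarr_group is illustrative and not part of this PR:

import zarr
from xarray import Dataset

from virtualizarr.readers.common import construct_virtual_dataset
from virtualizarr.readers.zarr import virtual_variable_from_zarr_array


def _virtual_dataset_from_zarr_group(zg: zarr.Group) -> Dataset:
    # build one virtual variable per array in the group, then assemble them into a Dataset
    virtual_vars = {
        name: virtual_variable_from_zarr_array(za) for name, za in zg.arrays()
    }
    return construct_virtual_dataset(
        virtual_vars=virtual_vars,
        loadable_vars={},
        indexes={},
        coord_names=[],
        attrs=dict(zg.attrs),
    )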
50 changes: 50 additions & 0 deletions virtualizarr/readers/zarr.py
@@ -0,0 +1,50 @@
import io

from xarray import Variable
import zarr

from virtualizarr.zarr import ZArray
from virtualizarr.manifests import ChunkManifest, ManifestArray


def virtual_variable_from_zarr_array(za: zarr.Array) -> Variable:
"""
Create a virtual xarray.Variable wrapping a ManifestArray from a single zarr.Array.
"""

# TODO this only works with zarr-python v2 for now

attrs = dict(za.attrs)

# extract _ARRAY_DIMENSIONS and remove it from attrs
# TODO handle v3 DIMENSION_NAMES too
dims = attrs.pop("_ARRAY_DIMENSIONS")

zarray = ZArray(
shape=za.shape,
chunks=za.chunks,
dtype=za.dtype,
fill_value=za.fill_value,
order=za.order,
compressor=za.compressor,
filters=za.filters,
#zarr_format=za.zarr_format,
)

manifest = chunkmanifest_from_zarr_array(za)

ma = ManifestArray(chunkmanifest=manifest, zarray=zarray)

return Variable(data=ma, dims=dims, attrs=attrs)


def chunkmanifest_from_zarr_array(za: zarr.Array) -> ChunkManifest:
import ujson

of2 = io.StringIO()

# TODO handle remote urls
za.store.write_fsspec(of2)  # , url=url)
out = ujson.loads(of2.getvalue())
Member Author commented:
This is the challenging part mentioned in #291 (comment)


print(out)
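The comment above flags the remaining gap: converting the fsspec reference JSON written by write_fsspec into a ChunkManifest. A minimal sketch of one way to do this is below. It assumes the parsed JSON either is, or nests under a "refs" key, a mapping in which binary chunk entries have keys like "0.0" and [path, offset, length] values; that the store describes a single array (multi-series TIFFs would need the group prefix preserved); and that ChunkManifest accepts an entries dict of {"path", "offset", "length"} records, which is the form the new test asserts against. The helper name refs_to_chunkmanifest and the template handling are illustrative, not part of this PR.

import re

from virtualizarr.manifests import ChunkManifest

_CHUNK_KEY = re.compile(r"^\d+(\.\d+)*$")  # e.g. "0", "0.0", "1.2.3"


def refs_to_chunkmanifest(out: dict, default_path: str) -> ChunkManifest:
    # hypothetical helper: collect byte-range references, skipping metadata
    # entries such as .zarray / .zattrs / .zgroup and any inlined data
    refs = out.get("refs", out)
    entries = {}
    for key, ref in refs.items():
        chunk_key = key.rsplit("/", 1)[-1]
        if not _CHUNK_KEY.match(chunk_key) or not isinstance(ref, list) or len(ref) != 3:
            continue
        url, offset, length = ref
        # fall back to the original filepath when the url is empty or a "{{u}}"-style template
        path = default_path if (not url or url.startswith("{{")) else url
        entries[chunk_key] = {"path": path, "offset": int(offset), "length": int(length)}
    return ChunkManifest(entries=entries)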
1 change: 1 addition & 0 deletions virtualizarr/tests/__init__.py
@@ -37,6 +37,7 @@ def _importorskip(
has_s3fs, requires_s3fs = _importorskip("s3fs")
has_scipy, requires_scipy = _importorskip("scipy")
has_tifffile, requires_tifffile = _importorskip("tifffile")
has_pillow, requires_pillow = _importorskip("PIL")


def create_manifestarray(
22 changes: 22 additions & 0 deletions virtualizarr/tests/test_readers/test_tiff.py
@@ -0,0 +1,22 @@
import numpy as np
from xarray import DataArray

from virtualizarr import open_virtual_dataarray
from virtualizarr.manifests import ManifestArray
from virtualizarr.tests import requires_pillow


@requires_pillow
def test_random_tiff(random_tiff):
vda = open_virtual_dataarray(random_tiff, indexes={})

assert isinstance(vda, DataArray)

assert vda.sizes == {"X": 128, "Y": 128}
assert vda.dtype == np.uint8

assert isinstance(vda.data, ManifestArray)
manifest = vda.data.manifest
assert manifest.dict() == {
"0.0": {"path": random_tiff, "offset": 122, "length": 16384}
}