Merge pull request #568 from catalystneuro/new_backend_pydantic_backend_configuration_models

[Backend Configuration Ib] Pydantic models for communicating backend-specific configuration information
CodyCBakerPhD authored Nov 7, 2023
2 parents 3a9dc05 + 435053a commit 283c374
Showing 11 changed files with 259 additions and 19 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,10 @@
# Upcoming

### Features
* Added Pydantic data models of `BackendConfiguration` for both HDF5 and Zarr datasets (container/mapper of all the `DatasetConfiguration`s for a particular file). [PR #568](https://github.com/catalystneuro/neuroconv/pull/568)



# v0.4.5

### Back-compatibility break
@@ -33,11 +38,11 @@
### Deprecation
* Removed `use_times` and `buffer_size` from `add_photon_series`. [PR #600](https://github.com/catalystneuro/neuroconv/pull/600)


### Testing
* Adds `MockImagingInterface` as a general testing mechanism for ophys imaging interfaces [PR #604](https://github.com/catalystneuro/neuroconv/pull/604).



# v0.4.4

### Features
5 changes: 3 additions & 2 deletions requirements-minimal.txt
@@ -2,11 +2,12 @@ numpy>=1.22.0
jsonschema>=3.2.0
PyYAML>=5.4
scipy>=1.4.1
h5py>=2.10.0
h5py>=3.9.0
hdmf>=3.11.0
hdmf_zarr>=0.4.0
pynwb>=2.3.2;python_version>='3.8'
psutil>=5.8.0
tqdm>=4.60.0
dandi>=0.46.2
dandi>=0.57.0
pandas
parse
9 changes: 6 additions & 3 deletions src/neuroconv/tools/nwb_helpers/__init__.py
@@ -5,14 +5,17 @@
make_nwbfile_from_metadata,
make_or_load_nwbfile,
)
from ._models._base_dataset_models import DatasetConfiguration, DatasetInfo
from ._models._hdf5_dataset_models import (
from ._models._base_models import DatasetConfiguration, DatasetInfo
from ._models._hdf5_models import (
AVAILABLE_HDF5_COMPRESSION_METHODS,
HDF5BackendConfiguration,
HDF5DatasetConfiguration,
)
from ._models._zarr_dataset_models import (
from ._models._zarr_models import (
AVAILABLE_ZARR_COMPRESSION_METHODS,
ZarrBackendConfiguration,
ZarrDatasetConfiguration,
)

BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration)
BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration)
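
As an illustrative sketch (not part of the diff), these module-level mappings let downstream code resolve a backend name to the corresponding Pydantic model classes:

from neuroconv.tools.nwb_helpers import (
    BACKEND_TO_CONFIGURATION,
    BACKEND_TO_DATASET_CONFIGURATION,
)

backend = "hdf5"  # or "zarr"
print(BACKEND_TO_CONFIGURATION[backend].__name__)          # HDF5BackendConfiguration
print(BACKEND_TO_DATASET_CONFIGURATION[backend].__name__)  # HDF5DatasetConfiguration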
src/neuroconv/tools/nwb_helpers/_models/_base_models.py
@@ -1,11 +1,12 @@
"""Base Pydantic models for DatasetInfo and DatasetConfiguration."""
import math
from abc import ABC, abstractmethod
from typing import Any, Dict, Literal, Tuple, Union
from typing import Any, Dict, Literal, Tuple, Type, Union

import h5py
import numcodecs
import numpy as np
from hdmf.container import DataIO
from pydantic import BaseModel, Field, root_validator


@@ -180,3 +181,29 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
Fetch the properly structured dictionary of input arguments to be passed directly into a H5DataIO or ZarrDataIO.
"""
raise NotImplementedError


class BackendConfiguration(BaseModel):
"""A model for matching collections of DatasetConfigurations to a specific backend."""

backend: Literal["hdf5", "zarr"] = Field(description="The name of the backend used to configure the NWBFile.")
data_io_class: Type[DataIO] = Field(description="The DataIO class that is specific to this backend.")
dataset_configurations: Dict[str, DatasetConfiguration] = Field(
description=(
"A mapping from object locations (e.g. `acquisition/TestElectricalSeriesAP/data`) "
"to their DatasetConfiguration specification that contains all information "
"for writing the datasets to disk using the specific backend."
)
)

def __str__(self) -> str:
"""Not overriding __repr__ as this is intended to render only when wrapped in print()."""
string = (
f"\nConfigurable datasets identified using the {self.backend} backend"
f"\n{'-' * (43 + len(self.backend) + 8)}"
)

for dataset_configuration in self.dataset_configurations.values():
string += f"\n{dataset_configuration}"

return string
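
As a usage sketch (not part of the diff), printing a backend configuration, here built from the mock helper added later in this PR, triggers the __str__ above; the exact rendering is asserted in the new test files at the end of this diff:

from neuroconv.tools.testing import mock_HDF5BackendConfiguration

backend_configuration = mock_HDF5BackendConfiguration()
print(backend_configuration)  # header line for the backend followed by one block per dataset location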
src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py
@@ -1,11 +1,12 @@
"""Base Pydantic models for the HDF5DatasetConfiguration."""
from typing import Any, Dict, Literal, Union
from typing import Any, Dict, Literal, Type, Union

import h5py
from nwbinspector.utils import is_module_installed
from pydantic import Field
from pynwb import H5DataIO

from ._base_dataset_models import DatasetConfiguration
from ._base_models import BackendConfiguration, DatasetConfiguration

_base_hdf5_filters = set(h5py.filters.decode)
_excluded_hdf5_filters = set(
@@ -78,3 +79,20 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
compression_bundle = dict(compression=self.compression_method, compression_opts=self.compression_options)

return dict(chunks=self.chunk_shape, **compression_bundle)


class HDF5BackendConfiguration(BackendConfiguration):
"""A model for matching collections of DatasetConfigurations specific to the HDF5 backend."""

backend: Literal["hdf5"] = Field( # TODO: in pydantic v2 use property instead of class attribute
default="hdf5", description="The name of the backend used to configure the NWBFile."
)
data_io_class: Type[H5DataIO] = Field( # TODO: in pydantic v2 use property instead of class attribute
default=H5DataIO, description="The DataIO class that is specific to HDF5."
)
dataset_configurations: Dict[str, HDF5DatasetConfiguration] = Field(
description=(
"A mapping from object locations to their HDF5DatasetConfiguration specification that contains all "
"information for writing the datasets to disk using the HDF5 backend."
)
)
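
For illustration (not part of the diff), a single dataset configuration can be turned into H5DataIO keyword arguments with the get_data_io_kwargs() shown in the hunk above; the mock helper and its defaults come from this PR's testing utilities:

from neuroconv.tools.testing import mock_HDF5DatasetConfiguration

dataset_configuration = mock_HDF5DatasetConfiguration()
data_io_kwargs = dataset_configuration.get_data_io_kwargs()
# roughly dict(chunks=<chunk_shape>, compression=<compression_method>, compression_opts=<compression_options>)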
src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py
@@ -1,11 +1,13 @@
"""Base Pydantic models for the ZarrDatasetConfiguration."""
from typing import Any, Dict, List, Literal, Union
from typing import Any, Dict, List, Literal, Type, Union

import numcodecs
import psutil
import zarr
from hdmf_zarr import ZarrDataIO
from pydantic import Field, root_validator

from ._base_dataset_models import DatasetConfiguration
from ._base_models import BackendConfiguration, DatasetConfiguration

_base_zarr_codecs = set(zarr.codec_registry.keys())
_lossy_zarr_codecs = set(("astype", "bitround", "quantize"))
@@ -134,3 +136,32 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
compressor = False

return dict(chunks=self.chunk_shape, filters=filters, compressor=compressor)


class ZarrBackendConfiguration(BackendConfiguration):
"""A model for matching collections of DatasetConfigurations specific to the Zarr backend."""

backend: Literal["zarr"] = Field(
default="zarr", description="The name of the backend used to configure the NWBFile."
)
data_io_class: Type[ZarrDataIO] = Field(
default=ZarrDataIO, description="The DataIO class that is specific to Zarr."
)
dataset_configurations: Dict[str, ZarrDatasetConfiguration] = Field(
description=(
"A mapping from object locations to their ZarrDatasetConfiguration specification that contains all "
"information for writing the datasets to disk using the Zarr backend."
)
)
number_of_jobs: int = Field(
description=(
"Number of jobs to use in parallel during write. Negative values, starting from -1, "
"will use all the available CPUs (including logical), -2 is all except one, etc. "
"This is equivalent to the pattern of indexing of "
" `list(range(total_number_of_cpu))[number_of_jobs]`; for example, `-1` uses all available CPU, `-2` "
"uses all except one, etc."
),
ge=-psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count?
le=psutil.cpu_count(),
default=psutil.cpu_count() - 1,
)
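
A small sketch (not part of the diff) of the negative-value convention described above; the helper name and range-based indexing are illustrative assumptions, not part of the neuroconv API:

import psutil

def resolve_number_of_jobs(number_of_jobs: int) -> int:
    # Hypothetical helper: positive values pass through; negative values count back
    # from the total CPU count, so -1 -> all CPUs, -2 -> all but one, etc.
    total_number_of_cpu = psutil.cpu_count()
    if number_of_jobs > 0:
        return number_of_jobs
    return list(range(1, total_number_of_cpu + 1))[number_of_jobs]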
2 changes: 2 additions & 0 deletions src/neuroconv/tools/testing/__init__.py
@@ -1,6 +1,8 @@
from ._mock._mock_dataset_models import (
mock_DatasetInfo,
mock_HDF5BackendConfiguration,
mock_HDF5DatasetConfiguration,
mock_ZarrBackendConfiguration,
mock_ZarrDatasetConfiguration,
)
from .mock_files import generate_path_expander_demo_ibl
63 changes: 58 additions & 5 deletions src/neuroconv/tools/testing/_mock/_mock_dataset_models.py
@@ -8,18 +8,25 @@
AVAILABLE_HDF5_COMPRESSION_METHODS,
AVAILABLE_ZARR_COMPRESSION_METHODS,
DatasetInfo,
HDF5BackendConfiguration,
HDF5DatasetConfiguration,
ZarrBackendConfiguration,
ZarrDatasetConfiguration,
)


def mock_DatasetInfo() -> DatasetInfo:
def mock_DatasetInfo(
object_id: str = "481a0860-3a0c-40ec-b931-df4a3e9b101f",
location: str = "acquisition/TestElectricalSeries/data",
full_shape: Tuple[int, ...] = (60 * 30_000, 384), # ~1 minute of v1 NeuroPixels probe
dtype=np.dtype("int16"),
) -> DatasetInfo:
"""Mock instance of a DatasetInfo with NeuroPixel-like values to showcase chunk/buffer recommendations."""
return DatasetInfo(
object_id="481a0860-3a0c-40ec-b931-df4a3e9b101f",
location="acquisition/TestElectricalSeries/data",
full_shape=(60 * 30_000, 384), # ~1 minute of v1 NeuroPixels probe
dtype=np.dtype("int16"),
object_id=object_id,
location=location,
full_shape=full_shape,
dtype=dtype,
)


@@ -59,3 +66,49 @@ def mock_ZarrDatasetConfiguration(
filter_methods=filter_methods,
filter_options=filter_options,
)


def mock_HDF5BackendConfiguration() -> HDF5BackendConfiguration:
"""Mock instance of a HDF5BackendConfiguration with two NeuroPixel-like datasets."""
dataset_configurations = {
"acquisition/TestElectricalSeriesAP/data": HDF5DatasetConfiguration(
dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"),
chunk_shape=(78_125, 64), # ~10 MB
buffer_shape=(1_250_000, 384), # ~1 GB
),
"acquisition/TestElectricalSeriesLF/data": HDF5DatasetConfiguration(
dataset_info=mock_DatasetInfo(
object_id="bc37e164-519f-4b65-a976-206440f1d325",
location="acquisition/TestElectricalSeriesLF/data",
full_shape=(75_000, 384),
),
chunk_shape=(37_500, 128), # ~10 MB
buffer_shape=(75_000, 384),
),
}

return HDF5BackendConfiguration(dataset_configurations=dataset_configurations)


def mock_ZarrBackendConfiguration() -> ZarrBackendConfiguration:
"""Mock instance of a HDF5BackendConfiguration with several NeuroPixel-like datasets."""
dataset_configurations = {
"acquisition/TestElectricalSeriesAP/data": ZarrDatasetConfiguration(
dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"),
chunk_shape=(78_125, 64),
buffer_shape=(1_250_000, 384), # ~1 GB
filter_methods=["delta"],
),
"acquisition/TestElectricalSeriesLF/data": ZarrDatasetConfiguration(
dataset_info=mock_DatasetInfo(
object_id="bc37e164-519f-4b65-a976-206440f1d325",
location="acquisition/TestElectricalSeriesLF/data",
full_shape=(75_000, 384),
),
chunk_shape=(37_500, 128), # ~10 MB
buffer_shape=(75_000, 384),
filter_methods=["delta"],
),
}

return ZarrBackendConfiguration(dataset_configurations=dataset_configurations)
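
For illustration (not part of the diff), each entry of a backend configuration can be mapped to its backend-specific DataIO keyword arguments, here using the Zarr mock above:

from neuroconv.tools.testing import mock_ZarrBackendConfiguration

zarr_backend_configuration = mock_ZarrBackendConfiguration()
for location, dataset_configuration in zarr_backend_configuration.dataset_configurations.items():
    # per the Zarr hunk above, this is of the form dict(chunks=..., filters=..., compressor=...)
    print(location, dataset_configuration.get_data_io_kwargs())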
@@ -1,9 +1,7 @@
"""Unit tests for the DatasetConfiguration Pydantic model."""
import pytest

from neuroconv.tools.nwb_helpers._models._base_dataset_models import (
DatasetConfiguration,
)
from neuroconv.tools.nwb_helpers._models._base_models import DatasetConfiguration
from neuroconv.tools.testing import mock_DatasetInfo


@@ -0,0 +1,49 @@
"""Unit tests for the DatasetInfo Pydantic model."""
from io import StringIO
from unittest.mock import patch

from neuroconv.tools.testing import mock_HDF5BackendConfiguration


def test_hdf5_backend_configuration_print():
"""Test the printout display of a HDF5DatasetConfiguration model looks nice."""
hdf5_backend_configuration = mock_HDF5BackendConfiguration()

with patch("sys.stdout", new=StringIO()) as out:
print(hdf5_backend_configuration)

expected_print = """
Configurable datasets identified using the hdf5 backend
-------------------------------------------------------
acquisition/TestElectricalSeriesAP/data
---------------------------------------
dtype : int16
full shape of source array : (1800000, 384)
full size of source array : 1.38 GB
buffer shape : (1250000, 384)
maximum RAM usage per iteration : 0.96 GB
chunk shape : (78125, 64)
disk space usage per chunk : 10.00 MB
compression method : gzip
acquisition/TestElectricalSeriesLF/data
---------------------------------------
dtype : int16
full shape of source array : (75000, 384)
full size of source array : 0.06 GB
buffer shape : (75000, 384)
maximum RAM usage per iteration : 0.06 GB
chunk shape : (37500, 128)
disk space usage per chunk : 9.60 MB
compression method : gzip
"""
assert out.getvalue() == expected_print
@@ -0,0 +1,53 @@
"""Unit tests for the DatasetInfo Pydantic model."""
from io import StringIO
from unittest.mock import patch

from neuroconv.tools.testing import mock_ZarrBackendConfiguration


def test_zarr_backend_configuration_print():
"""Test the printout display of a HDF5DatasetConfiguration model looks nice."""
zarr_backend_configuration = mock_ZarrBackendConfiguration()

with patch("sys.stdout", new=StringIO()) as out:
print(zarr_backend_configuration)

expected_print = """
Configurable datasets identified using the zarr backend
-------------------------------------------------------
acquisition/TestElectricalSeriesAP/data
---------------------------------------
dtype : int16
full shape of source array : (1800000, 384)
full size of source array : 1.38 GB
buffer shape : (1250000, 384)
maximum RAM usage per iteration : 0.96 GB
chunk shape : (78125, 64)
disk space usage per chunk : 10.00 MB
compression method : gzip
filter methods : ['delta']
acquisition/TestElectricalSeriesLF/data
---------------------------------------
dtype : int16
full shape of source array : (75000, 384)
full size of source array : 0.06 GB
buffer shape : (75000, 384)
maximum RAM usage per iteration : 0.06 GB
chunk shape : (37500, 128)
disk space usage per chunk : 9.60 MB
compression method : gzip
filter methods : ['delta']
"""
assert out.getvalue() == expected_print
