Merge pull request #568 from catalystneuro/new_backend_pydantic_backend_configuration_models

[Backend Configuration Ib] Pydantic models for communicating backend-specific configuration information
CodyCBakerPhD authored Nov 7, 2023
2 parents 3a9dc05 + 435053a commit 283c374
Showing 11 changed files with 259 additions and 19 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,10 @@
# Upcoming

### Features
* Added Pydantic data models of `BackendConfiguration` for both HDF5 and Zarr datasets (container/mapper of all the `DatasetConfiguration`s for a particular file). [PR #568](https://github.com/catalystneuro/neuroconv/pull/568)



# v0.4.5

### Back-compatibility break
@@ -33,11 +38,11 @@
### Deprecation
* Removed `use_times` and `buffer_size` from `add_photon_series`. [PR #600](https://github.com/catalystneuro/neuroconv/pull/600)


### Testing
* Adds `MockImagingInterface` as a general testing mechanism for ophys imaging interfaces [PR #604](https://github.com/catalystneuro/neuroconv/pull/604).



# v0.4.4

### Features
5 changes: 3 additions & 2 deletions requirements-minimal.txt
@@ -2,11 +2,12 @@ numpy>=1.22.0
jsonschema>=3.2.0
PyYAML>=5.4
scipy>=1.4.1
h5py>=2.10.0
h5py>=3.9.0
hdmf>=3.11.0
hdmf_zarr>=0.4.0
pynwb>=2.3.2;python_version>='3.8'
psutil>=5.8.0
tqdm>=4.60.0
dandi>=0.46.2
dandi>=0.57.0
pandas
parse
9 changes: 6 additions & 3 deletions src/neuroconv/tools/nwb_helpers/__init__.py
@@ -5,14 +5,17 @@
make_nwbfile_from_metadata,
make_or_load_nwbfile,
)
from ._models._base_dataset_models import DatasetConfiguration, DatasetInfo
from ._models._hdf5_dataset_models import (
from ._models._base_models import DatasetConfiguration, DatasetInfo
from ._models._hdf5_models import (
AVAILABLE_HDF5_COMPRESSION_METHODS,
HDF5BackendConfiguration,
HDF5DatasetConfiguration,
)
from ._models._zarr_dataset_models import (
from ._models._zarr_models import (
AVAILABLE_ZARR_COMPRESSION_METHODS,
ZarrBackendConfiguration,
ZarrDatasetConfiguration,
)

BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration)
BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration)
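
As an illustrative sketch (not part of the diff), these module-level mappings let downstream code resolve a backend name to the corresponding Pydantic model classes:

from neuroconv.tools.nwb_helpers import (
    BACKEND_TO_CONFIGURATION,
    BACKEND_TO_DATASET_CONFIGURATION,
)

backend = "hdf5"  # or "zarr"
print(BACKEND_TO_CONFIGURATION[backend].__name__)          # HDF5BackendConfiguration
print(BACKEND_TO_DATASET_CONFIGURATION[backend].__name__)  # HDF5DatasetConfiguration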
src/neuroconv/tools/nwb_helpers/_models/_base_models.py
@@ -1,11 +1,12 @@
"""Base Pydantic models for DatasetInfo and DatasetConfiguration."""
import math
from abc import ABC, abstractmethod
from typing import Any, Dict, Literal, Tuple, Union
from typing import Any, Dict, Literal, Tuple, Type, Union

import h5py
import numcodecs
import numpy as np
from hdmf.container import DataIO
from pydantic import BaseModel, Field, root_validator


@@ -180,3 +181,29 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
Fetch the properly structured dictionary of input arguments to be passed directly into a H5DataIO or ZarrDataIO.
"""
raise NotImplementedError


class BackendConfiguration(BaseModel):
"""A model for matching collections of DatasetConfigurations to a specific backend."""

backend: Literal["hdf5", "zarr"] = Field(description="The name of the backend used to configure the NWBFile.")
data_io_class: Type[DataIO] = Field(description="The DataIO class that is specific to this backend.")
dataset_configurations: Dict[str, DatasetConfiguration] = Field(
description=(
"A mapping from object locations (e.g. `acquisition/TestElectricalSeriesAP/data`) "
"to their DatasetConfiguration specification that contains all information "
"for writing the datasets to disk using the specific backend."
)
)

def __str__(self) -> str:
"""Not overriding __repr__ as this is intended to render only when wrapped in print()."""
string = (
f"\nConfigurable datasets identified using the {self.backend} backend"
f"\n{'-' * (43 + len(self.backend) + 8)}"
)

for dataset_configuration in self.dataset_configurations.values():
string += f"\n{dataset_configuration}"

return string
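
As a usage sketch (not part of the diff), printing a backend configuration, here built from the mock helper added later in this PR, triggers the __str__ above; the exact rendering is asserted in the new test files at the end of this diff:

from neuroconv.tools.testing import mock_HDF5BackendConfiguration

backend_configuration = mock_HDF5BackendConfiguration()
print(backend_configuration)  # header line for the backend followed by one block per dataset location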
src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py
@@ -1,11 +1,12 @@
"""Base Pydantic models for the HDF5DatasetConfiguration."""
from typing import Any, Dict, Literal, Union
from typing import Any, Dict, Literal, Type, Union

import h5py
from nwbinspector.utils import is_module_installed
from pydantic import Field
from pynwb import H5DataIO

from ._base_dataset_models import DatasetConfiguration
from ._base_models import BackendConfiguration, DatasetConfiguration

_base_hdf5_filters = set(h5py.filters.decode)
_excluded_hdf5_filters = set(
@@ -78,3 +79,20 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
compression_bundle = dict(compression=self.compression_method, compression_opts=self.compression_options)

return dict(chunks=self.chunk_shape, **compression_bundle)


class HDF5BackendConfiguration(BackendConfiguration):
"""A model for matching collections of DatasetConfigurations specific to the HDF5 backend."""

backend: Literal["hdf5"] = Field( # TODO: in pydantic v2 use property instead of class attribute
default="hdf5", description="The name of the backend used to configure the NWBFile."
)
data_io_class: Type[H5DataIO] = Field( # TODO: in pydantic v2 use property instead of class attribute
default=H5DataIO, description="The DataIO class that is specific to HDF5."
)
dataset_configurations: Dict[str, HDF5DatasetConfiguration] = Field(
description=(
"A mapping from object locations to their HDF5DatasetConfiguration specification that contains all "
"information for writing the datasets to disk using the HDF5 backend."
)
)
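
For illustration (not part of the diff), a single dataset configuration can be turned into H5DataIO keyword arguments with the get_data_io_kwargs() shown in the hunk above; the mock helper and its defaults come from this PR's testing utilities:

from neuroconv.tools.testing import mock_HDF5DatasetConfiguration

dataset_configuration = mock_HDF5DatasetConfiguration()
data_io_kwargs = dataset_configuration.get_data_io_kwargs()
# roughly dict(chunks=<chunk_shape>, compression=<compression_method>, compression_opts=<compression_options>)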
src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py
@@ -1,11 +1,13 @@
"""Base Pydantic models for the ZarrDatasetConfiguration."""
from typing import Any, Dict, List, Literal, Union
from typing import Any, Dict, List, Literal, Type, Union

import numcodecs
import psutil
import zarr
from hdmf_zarr import ZarrDataIO
from pydantic import Field, root_validator

from ._base_dataset_models import DatasetConfiguration
from ._base_models import BackendConfiguration, DatasetConfiguration

_base_zarr_codecs = set(zarr.codec_registry.keys())
_lossy_zarr_codecs = set(("astype", "bitround", "quantize"))
@@ -134,3 +136,32 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
compressor = False

return dict(chunks=self.chunk_shape, filters=filters, compressor=compressor)


class ZarrBackendConfiguration(BackendConfiguration):
"""A model for matching collections of DatasetConfigurations specific to the Zarr backend."""

backend: Literal["zarr"] = Field(
default="zarr", description="The name of the backend used to configure the NWBFile."
)
data_io_class: Type[ZarrDataIO] = Field(
default=ZarrDataIO, description="The DataIO class that is specific to Zarr."
)
dataset_configurations: Dict[str, ZarrDatasetConfiguration] = Field(
description=(
"A mapping from object locations to their ZarrDatasetConfiguration specification that contains all "
"information for writing the datasets to disk using the Zarr backend."
)
)
number_of_jobs: int = Field(
description=(
"Number of jobs to use in parallel during write. Negative values, starting from -1, "
"will use all the available CPUs (including logical), -2 is all except one, etc. "
"This is equivalent to the pattern of indexing of "
" `list(range(total_number_of_cpu))[number_of_jobs]`; for example, `-1` uses all available CPU, `-2` "
"uses all except one, etc."
),
ge=-psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count?
le=psutil.cpu_count(),
default=psutil.cpu_count() - 1,
)
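
A small sketch (not part of the diff) of the negative-value convention described above; the helper name and range-based indexing are illustrative assumptions, not part of the neuroconv API:

import psutil

def resolve_number_of_jobs(number_of_jobs: int) -> int:
    # Hypothetical helper: positive values pass through; negative values count back
    # from the total CPU count, so -1 -> all CPUs, -2 -> all but one, etc.
    total_number_of_cpu = psutil.cpu_count()
    if number_of_jobs > 0:
        return number_of_jobs
    return list(range(1, total_number_of_cpu + 1))[number_of_jobs]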
2 changes: 2 additions & 0 deletions src/neuroconv/tools/testing/__init__.py
@@ -1,6 +1,8 @@
from ._mock._mock_dataset_models import (
mock_DatasetInfo,
mock_HDF5BackendConfiguration,
mock_HDF5DatasetConfiguration,
mock_ZarrBackendConfiguration,
mock_ZarrDatasetConfiguration,
)
from .mock_files import generate_path_expander_demo_ibl
63 changes: 58 additions & 5 deletions src/neuroconv/tools/testing/_mock/_mock_dataset_models.py
@@ -8,18 +8,25 @@
AVAILABLE_HDF5_COMPRESSION_METHODS,
AVAILABLE_ZARR_COMPRESSION_METHODS,
DatasetInfo,
HDF5BackendConfiguration,
HDF5DatasetConfiguration,
ZarrBackendConfiguration,
ZarrDatasetConfiguration,
)


def mock_DatasetInfo() -> DatasetInfo:
def mock_DatasetInfo(
object_id: str = "481a0860-3a0c-40ec-b931-df4a3e9b101f",
location: str = "acquisition/TestElectricalSeries/data",
full_shape: Tuple[int, ...] = (60 * 30_000, 384), # ~1 minute of v1 NeuroPixels probe
dtype=np.dtype("int16"),
) -> DatasetInfo:
"""Mock instance of a DatasetInfo with NeuroPixel-like values to showcase chunk/buffer recommendations."""
return DatasetInfo(
object_id="481a0860-3a0c-40ec-b931-df4a3e9b101f",
location="acquisition/TestElectricalSeries/data",
full_shape=(60 * 30_000, 384), # ~1 minute of v1 NeuroPixels probe
dtype=np.dtype("int16"),
object_id=object_id,
location=location,
full_shape=full_shape,
dtype=dtype,
)


@@ -59,3 +66,49 @@ def mock_ZarrDatasetConfiguration(
filter_methods=filter_methods,
filter_options=filter_options,
)


def mock_HDF5BackendConfiguration() -> HDF5BackendConfiguration:
"""Mock instance of a HDF5BackendConfiguration with two NeuroPixel-like datasets."""
dataset_configurations = {
"acquisition/TestElectricalSeriesAP/data": HDF5DatasetConfiguration(
dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"),
chunk_shape=(78_125, 64), # ~10 MB
buffer_shape=(1_250_000, 384), # ~1 GB
),
"acquisition/TestElectricalSeriesLF/data": HDF5DatasetConfiguration(
dataset_info=mock_DatasetInfo(
object_id="bc37e164-519f-4b65-a976-206440f1d325",
location="acquisition/TestElectricalSeriesLF/data",
full_shape=(75_000, 384),
),
chunk_shape=(37_500, 128), # ~10 MB
buffer_shape=(75_000, 384),
),
}

return HDF5BackendConfiguration(dataset_configurations=dataset_configurations)


def mock_ZarrBackendConfiguration() -> ZarrBackendConfiguration:
"""Mock instance of a HDF5BackendConfiguration with several NeuroPixel-like datasets."""
dataset_configurations = {
"acquisition/TestElectricalSeriesAP/data": ZarrDatasetConfiguration(
dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"),
chunk_shape=(78_125, 64),
buffer_shape=(1_250_000, 384), # ~1 GB
filter_methods=["delta"],
),
"acquisition/TestElectricalSeriesLF/data": ZarrDatasetConfiguration(
dataset_info=mock_DatasetInfo(
object_id="bc37e164-519f-4b65-a976-206440f1d325",
location="acquisition/TestElectricalSeriesLF/data",
full_shape=(75_000, 384),
),
chunk_shape=(37_500, 128), # ~10 MB
buffer_shape=(75_000, 384),
filter_methods=["delta"],
),
}

return ZarrBackendConfiguration(dataset_configurations=dataset_configurations)
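
For illustration (not part of the diff), each entry of a backend configuration can be mapped to its backend-specific DataIO keyword arguments, here using the Zarr mock above:

from neuroconv.tools.testing import mock_ZarrBackendConfiguration

zarr_backend_configuration = mock_ZarrBackendConfiguration()
for location, dataset_configuration in zarr_backend_configuration.dataset_configurations.items():
    # per the Zarr hunk above, this is of the form dict(chunks=..., filters=..., compressor=...)
    print(location, dataset_configuration.get_data_io_kwargs())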
@@ -1,9 +1,7 @@
"""Unit tests for the DatasetConfiguration Pydantic model."""
import pytest

from neuroconv.tools.nwb_helpers._models._base_dataset_models import (
DatasetConfiguration,
)
from neuroconv.tools.nwb_helpers._models._base_models import DatasetConfiguration
from neuroconv.tools.testing import mock_DatasetInfo


@@ -0,0 +1,49 @@
"""Unit tests for the DatasetInfo Pydantic model."""
from io import StringIO
from unittest.mock import patch

from neuroconv.tools.testing import mock_HDF5BackendConfiguration


def test_hdf5_backend_configuration_print():
"""Test the printout display of a HDF5DatasetConfiguration model looks nice."""
hdf5_backend_configuration = mock_HDF5BackendConfiguration()

with patch("sys.stdout", new=StringIO()) as out:
print(hdf5_backend_configuration)

expected_print = """
Configurable datasets identified using the hdf5 backend
-------------------------------------------------------
acquisition/TestElectricalSeriesAP/data
---------------------------------------
dtype : int16
full shape of source array : (1800000, 384)
full size of source array : 1.38 GB
buffer shape : (1250000, 384)
maximum RAM usage per iteration : 0.96 GB
chunk shape : (78125, 64)
disk space usage per chunk : 10.00 MB
compression method : gzip
acquisition/TestElectricalSeriesLF/data
---------------------------------------
dtype : int16
full shape of source array : (75000, 384)
full size of source array : 0.06 GB
buffer shape : (75000, 384)
maximum RAM usage per iteration : 0.06 GB
chunk shape : (37500, 128)
disk space usage per chunk : 9.60 MB
compression method : gzip
"""
assert out.getvalue() == expected_print
@@ -0,0 +1,53 @@
"""Unit tests for the DatasetInfo Pydantic model."""
from io import StringIO
from unittest.mock import patch

from neuroconv.tools.testing import mock_ZarrBackendConfiguration


def test_zarr_backend_configuration_print():
"""Test the printout display of a HDF5DatasetConfiguration model looks nice."""
zarr_backend_configuration = mock_ZarrBackendConfiguration()

with patch("sys.stdout", new=StringIO()) as out:
print(zarr_backend_configuration)

expected_print = """
Configurable datasets identified using the zarr backend
-------------------------------------------------------
acquisition/TestElectricalSeriesAP/data
---------------------------------------
dtype : int16
full shape of source array : (1800000, 384)
full size of source array : 1.38 GB
buffer shape : (1250000, 384)
maximum RAM usage per iteration : 0.96 GB
chunk shape : (78125, 64)
disk space usage per chunk : 10.00 MB
compression method : gzip
filter methods : ['delta']
acquisition/TestElectricalSeriesLF/data
---------------------------------------
dtype : int16
full shape of source array : (75000, 384)
full size of source array : 0.06 GB
buffer shape : (75000, 384)
maximum RAM usage per iteration : 0.06 GB
chunk shape : (37500, 128)
disk space usage per chunk : 9.60 MB
compression method : gzip
filter methods : ['delta']
"""
assert out.getvalue() == expected_print
