From fe6253404543821a7d1c67417866dd07116837e5 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Sun, 17 Sep 2023 19:06:12 -0400 Subject: [PATCH 01/13] add backends and their tests --- src/neuroconv/tools/nwb_helpers/__init__.py | 6 +- ...base_dataset_models.py => _base_models.py} | 30 ++++++++- ...hdf5_dataset_models.py => _hdf5_models.py} | 20 +++++- ...zarr_dataset_models.py => _zarr_models.py} | 29 ++++++++- src/neuroconv/tools/testing/__init__.py | 8 ++- .../testing/_mock/_mock_dataset_models.py | 63 +++++++++++++++++-- .../test_dataset_configuration_model.py | 4 +- .../test_dataset_configuration_models.py | 50 --------------- .../test_hdf5_backend_configuration_model.py | 38 +++++++++++ .../test_zarr_backend_configuration_model.py | 40 ++++++++++++ 10 files changed, 221 insertions(+), 67 deletions(-) rename src/neuroconv/tools/nwb_helpers/_models/{_base_dataset_models.py => _base_models.py} (80%) rename src/neuroconv/tools/nwb_helpers/_models/{_hdf5_dataset_models.py => _hdf5_models.py} (80%) rename src/neuroconv/tools/nwb_helpers/_models/{_zarr_dataset_models.py => _zarr_models.py} (82%) delete mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_models.py create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 07dd54ffc..d5f7c0b0a 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -5,8 +5,8 @@ make_nwbfile_from_metadata, make_or_load_nwbfile, ) -from ._models._base_dataset_models import DatasetConfiguration, DatasetInfo -from ._models._hdf5_dataset_models import HDF5DatasetConfiguration, AVAILABLE_HDF5_COMPRESSION_METHODS -from ._models._zarr_dataset_models import ZarrDatasetConfiguration, AVAILABLE_ZARR_COMPRESSION_METHODS +from ._models._base_models import DatasetConfiguration, DatasetInfo +from ._models._hdf5_models import HDF5BackendConfiguration, HDF5DatasetConfiguration, AVAILABLE_HDF5_COMPRESSION_METHODS +from ._models._zarr_models import ZarrBackendConfiguration, ZarrDatasetConfiguration, AVAILABLE_ZARR_COMPRESSION_METHODS BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_base_dataset_models.py b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py similarity index 80% rename from src/neuroconv/tools/nwb_helpers/_models/_base_dataset_models.py rename to src/neuroconv/tools/nwb_helpers/_models/_base_models.py index 624896466..eb0408c2e 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_base_dataset_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py @@ -1,10 +1,11 @@ """Base Pydantic models for DatasetInfo and DatasetConfiguration.""" -from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, Tuple, Union, Literal, Type import h5py import numcodecs import numpy as np -from pydantic import BaseModel, Field, root_validator, validator +from hdmf.container import DataIO +from pydantic import BaseModel, Field, root_validator class DatasetInfo(BaseModel): @@ -113,3 +114,28 @@ def get_data_io_keyword_arguments(self): Fetch the properly structured dictionary of input arguments to be passed directly into a H5DataIO or 
ZarrDataIO. """ raise NotImplementedError + + +class BackendConfiguration(BaseModel): + """A model for matching collections of DatasetConfigurations to a specific backend.""" + + backend: Literal["hdf5", "zarr"] = Field(description="The name of the backend used to configure the NWBFile.") + data_io_class: Type[DataIO] = Field(description="The DataIO class that is specific to this backend.") + dataset_configurations: Dict[str, DatasetConfiguration] = Field( + description=( + "A mapping from object locations to their DatasetConfiguration specification that contains all information " + "for writing the datasets to disk using the specific backend." + ) + ) + + def __str__(self) -> str: + """Not overriding __repr__ as this is intended to render only when wrapped in print().""" + string = ( + f"\nConfigurable datasets identified using the {self.backend} backend" + f"\n{'-' * (43 + len(self.backend) + 8)}" + ) + + for dataset_configuration in self.dataset_configurations.values(): + string += f"\n{dataset_configuration}" + + return string diff --git a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_dataset_models.py b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py similarity index 80% rename from src/neuroconv/tools/nwb_helpers/_models/_hdf5_dataset_models.py rename to src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py index df9e98c9c..e21015f89 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_dataset_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py @@ -1,11 +1,12 @@ """Base Pydantic models for the HDF5DatasetConfiguration.""" -from typing import Any, Dict, Literal, Union +from typing import Any, Dict, Literal, Union, Type import h5py +from pynwb import H5DataIO from nwbinspector.utils import is_module_installed from pydantic import Field -from ._base_dataset_models import DatasetConfiguration +from ._base_models import DatasetConfiguration, BackendConfiguration _base_hdf5_filters = set(h5py.filters.decode) - set( ( @@ -74,3 +75,18 @@ def get_data_io_keyword_arguments(self) -> Dict[str, Any]: compression_bundle = dict(compression=self.compression_method, compression_opts=self.compression_options) return dict(chunks=self.chunk_shape, **compression_bundle) + + +class HDF5BackendConfiguration(BackendConfiguration): + """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" + + backend: Literal["hdf5"] = Field( + default="hdf5", description="The name of the backend used to configure the NWBFile." + ) + data_io_class: Type[H5DataIO] = Field(default=H5DataIO, description="The DataIO class that is specific to HDF5.") + dataset_configurations: Dict[str, HDF5DatasetConfiguration] = Field( + description=( + "A mapping from object locations to their HDF5DatasetConfiguration specification that contains all " + "information for writing the datasets to disk using the HDF5 backend." 
+ ) + ) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_zarr_dataset_models.py b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py similarity index 82% rename from src/neuroconv/tools/nwb_helpers/_models/_zarr_dataset_models.py rename to src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py index d8691c677..0e83d58e5 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_zarr_dataset_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py @@ -1,11 +1,13 @@ """Base Pydantic models for the ZarrDatasetConfiguration.""" -from typing import Any, Dict, Literal, Union, List +from typing import Any, Dict, Literal, Union, List, Type import numcodecs import zarr +import psutil +from hdmf_zarr import ZarrDataIO from pydantic import Field, root_validator -from ._base_dataset_models import DatasetConfiguration +from ._base_models import DatasetConfiguration, BackendConfiguration _available_zarr_filters = ( set(zarr.codec_registry.keys()) @@ -120,3 +122,26 @@ def get_data_io_keyword_arguments(self) -> Dict[str, Any]: compressor = False return dict(chunks=self.chunk_shape, filters=filters, compressor=compressor) + + +class ZarrBackendConfiguration(BackendConfiguration): + """A model for matching collections of DatasetConfigurations specific to the Zarr backend.""" + + backend: Literal["zarr"] = Field( + default="zarr", description="The name of the backend used to configure the NWBFile." + ) + data_io_class: Type[ZarrDataIO] = Field( + default=ZarrDataIO, description="The DataIO class that is specific to Zarr." + ) + dataset_configurations: Dict[str, ZarrDatasetConfiguration] = Field( + description=( + "A mapping from object locations to their ZarrDatasetConfiguration specification that contains all " + "information for writing the datasets to disk using the Zarr backend." + ) + ) + number_of_jobs: int = Field( + description="Number of jobs to use in parallel during write.", + ge=-psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count? 
+ le=psutil.cpu_count(), + default=-2, # -2 translates to 'all CPU except for one' + ) diff --git a/src/neuroconv/tools/testing/__init__.py b/src/neuroconv/tools/testing/__init__.py index 731c19dbe..3c987fdd0 100644 --- a/src/neuroconv/tools/testing/__init__.py +++ b/src/neuroconv/tools/testing/__init__.py @@ -1,4 +1,10 @@ from .mock_files import generate_path_expander_demo_ibl from .mock_interfaces import MockBehaviorEventInterface, MockSpikeGLXNIDQInterface from .mock_ttl_signals import generate_mock_ttl_signal, regenerate_test_cases -from ._mock._mock_dataset_models import mock_DatasetInfo, mock_HDF5DatasetConfiguration, mock_ZarrDatasetConfiguration +from ._mock._mock_dataset_models import ( + mock_DatasetInfo, + mock_HDF5BackendConfiguration, + mock_ZarrBackendConfiguration, + mock_HDF5DatasetConfiguration, + mock_ZarrDatasetConfiguration, +) diff --git a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py index e1d6a4e19..024ad9d3c 100644 --- a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py +++ b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py @@ -6,20 +6,27 @@ from ...nwb_helpers import ( DatasetInfo, + HDF5BackendConfiguration, HDF5DatasetConfiguration, + ZarrBackendConfiguration, ZarrDatasetConfiguration, AVAILABLE_HDF5_COMPRESSION_METHODS, AVAILABLE_ZARR_COMPRESSION_METHODS, ) -def mock_DatasetInfo() -> DatasetInfo: +def mock_DatasetInfo( + object_id: str = "481a0860-3a0c-40ec-b931-df4a3e9b101f", + location: str = "acquisition/TestElectricalSeries/data", + full_shape: Tuple[int, ...] = (60 * 30_000, 384), # ~1 minute of v1 NeuroPixels probe + dtype=np.dtype("int16"), +) -> DatasetInfo: """Mock instance of a DatasetInfo with NeuroPixel-like values to showcase chunk/buffer recommendations.""" return DatasetInfo( - object_id="481a0860-3a0c-40ec-b931-df4a3e9b101f", - location="acquisition/TestElectricalSeries/data", - full_shape=(60 * 30_000, 384), # ~1 minute of v1 NeuroPixels probe - dtype=np.dtype("int16"), + object_id=object_id, + location=location, + full_shape=full_shape, + dtype=dtype, ) @@ -55,3 +62,49 @@ def mock_ZarrDatasetConfiguration( filter_methods=filter_methods, filter_options=filter_options, ) + + +def mock_HDF5BackendConfiguration() -> HDF5BackendConfiguration: + """Mock instance of a HDF5BackendConfiguration with several NeuroPixel-like datasets.""" + dataset_configurations = { + "acquisition/TestElectricalSeriesAP/data": HDF5DatasetConfiguration( + dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"), + chunk_shape=(78_125, 64), # ~10 MB + buffer_shape=(1_250_000, 384), # ~1 GB + ), + "acquisition/TestElectricalSeriesLF/data": HDF5DatasetConfiguration( + dataset_info=mock_DatasetInfo( + object_id="bc37e164-519f-4b65-a976-206440f1d325", + location="acquisition/TestElectricalSeriesLF/data", + full_shape=(75_000, 384), + ), + chunk_shape=(37_500, 128), # ~10 MB + buffer_shape=(75_000, 384), + ), + } + + return HDF5BackendConfiguration(dataset_configurations=dataset_configurations) + + +def mock_ZarrBackendConfiguration() -> ZarrBackendConfiguration: + """Mock instance of a HDF5BackendConfiguration with several NeuroPixel-like datasets.""" + dataset_configurations = { + "acquisition/TestElectricalSeriesAP/data": ZarrDatasetConfiguration( + dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"), + chunk_shape=(78_125, 64), + buffer_shape=(1_250_000, 384), # ~1 GB + filter_methods=["delta"], + ), + 
"acquisition/TestElectricalSeriesLF/data": ZarrDatasetConfiguration( + dataset_info=mock_DatasetInfo( + object_id="bc37e164-519f-4b65-a976-206440f1d325", + location="acquisition/TestElectricalSeriesLF/data", + full_shape=(75_000, 384), + ), + chunk_shape=(37_500, 128), # ~10 MB + buffer_shape=(75_000, 384), + filter_methods=["delta"], + ), + } + + return ZarrBackendConfiguration(dataset_configurations=dataset_configurations) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_model.py index 614fd92e3..b0f5a5e76 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_model.py @@ -1,7 +1,7 @@ """Unit tests for the DatasetConfiguration Pydantic model.""" import pytest -from neuroconv.tools.nwb_helpers._models._base_dataset_models import DatasetConfiguration +from neuroconv.tools.nwb_helpers._models._base_models import DatasetConfiguration from neuroconv.tools.testing import mock_DatasetInfo @@ -11,6 +11,6 @@ def test_get_data_io_keyword_arguments_not_implemented(): chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384), ) - + with pytest.raises(NotImplementedError): dataset_configuration.get_data_io_keyword_arguments() diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_models.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_models.py deleted file mode 100644 index 273610fe4..000000000 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_models.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Unit tests for the DatasetInfo Pydantic model.""" -from io import StringIO -from unittest.mock import patch - -import numpy as np - -from neuroconv.tools.nwb_helpers import - - -def MockHDF5DatasetConfig() -> DatasetInfo: - return DatasetInfo( - object_id="abc123", - location="TestParent/data", - full_shape=(2, 4), - dtype=np.dtype("int16"), - ) - - -def test_dataset_info_print(): - """Test the printout display of a Dataset modellooks nice.""" - dataset_info = MockDatasetInfo() - - with patch("sys.stdout", new=StringIO()) as out: - print(dataset_info) - - expected_print = """ -TestParent/data ---------------- - maxshape: (2, 4) - dtype: int16 -""" - assert out.getvalue() == expected_print - - -def test_dataset_info_repr(): - """Test the programmatic repr of a Dataset model is more dataclass-like.""" - dataset_info = MockDatasetInfo() - - # Important to keep the `repr` unmodified for appearance inside iterables of DatasetInfo objects - expected_repr = ( - "DatasetInfo(object_id='abc123', location='TestParent/data', full_shape=(2, 4), dtype=dtype('int16'))" - ) - assert repr(dataset_info) == expected_repr - - -def test_dataset_info_hashability(): - dataset_info = MockDatasetInfo() - - test_dict = {dataset_info: True} # Technically this alone would raise an error if it didn't work... - assert test_dict[dataset_info] is True # ... but asserting this for good measure. 
diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py new file mode 100644 index 000000000..e77694d08 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py @@ -0,0 +1,38 @@ +"""Unit tests for the DatasetInfo Pydantic model.""" +from io import StringIO +from unittest.mock import patch + + +from neuroconv.tools.testing import mock_HDF5BackendConfiguration + + +def test_hdf5_backend_configuration_print(): + """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" + hdf5_backend_configuration = mock_HDF5BackendConfiguration() + + with patch("sys.stdout", new=StringIO()) as out: + print(hdf5_backend_configuration) + + expected_print = """ +Configurable datasets identified using the hdf5 backend +------------------------------------------------------- + +acquisition/TestElectricalSeriesAP/data +--------------------------------------- + maxshape: (1800000, 384) + dtype: int16 + + chunk_shape: (78125, 64) + buffer_shape: (1250000, 384) + compression_method: gzip + +acquisition/TestElectricalSeriesLF/data +--------------------------------------- + maxshape: (75000, 384) + dtype: int16 + + chunk_shape: (37500, 128) + buffer_shape: (75000, 384) + compression_method: gzip +""" + assert out.getvalue() == expected_print diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py new file mode 100644 index 000000000..66d7dbc03 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py @@ -0,0 +1,40 @@ +"""Unit tests for the DatasetInfo Pydantic model.""" +from io import StringIO +from unittest.mock import patch + + +from neuroconv.tools.testing import mock_ZarrBackendConfiguration + + +def test_zarr_backend_configuration_print(): + """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" + zarr_backend_configuration = mock_ZarrBackendConfiguration() + + with patch("sys.stdout", new=StringIO()) as out: + print(zarr_backend_configuration) + + expected_print = """ +Configurable datasets identified using the zarr backend +------------------------------------------------------- + +acquisition/TestElectricalSeriesAP/data +--------------------------------------- + maxshape: (1800000, 384) + dtype: int16 + + chunk_shape: (78125, 64) + buffer_shape: (1250000, 384) + compression_method: gzip + filter_methods: ['delta'] + +acquisition/TestElectricalSeriesLF/data +--------------------------------------- + maxshape: (75000, 384) + dtype: int16 + + chunk_shape: (37500, 128) + buffer_shape: (75000, 384) + compression_method: gzip + filter_methods: ['delta'] +""" + assert out.getvalue() == expected_print From 81261d167139100f872cb8ef7b8313c0a237b3f5 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Sun, 17 Sep 2023 19:07:31 -0400 Subject: [PATCH 02/13] include global mapper --- src/neuroconv/tools/nwb_helpers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index d5f7c0b0a..89c738d51 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ 
b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -10,3 +10,4 @@ from ._models._zarr_models import ZarrBackendConfiguration, ZarrDatasetConfiguration, AVAILABLE_ZARR_COMPRESSION_METHODS BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) +BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) From fa15d6aa6e70b50d8771486b6b9f566812560024 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 02:50:30 +0000 Subject: [PATCH 03/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/nwb_helpers/__init__.py | 12 ++++++++++-- .../tools/nwb_helpers/_models/_base_models.py | 2 +- .../tools/nwb_helpers/_models/_hdf5_models.py | 6 +++--- .../tools/nwb_helpers/_models/_zarr_models.py | 6 +++--- src/neuroconv/tools/testing/__init__.py | 9 ++------- .../test_hdf5_backend_configuration_model.py | 1 - .../test_zarr_backend_configuration_model.py | 1 - 7 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 89c738d51..0982439bb 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -6,8 +6,16 @@ make_or_load_nwbfile, ) from ._models._base_models import DatasetConfiguration, DatasetInfo -from ._models._hdf5_models import HDF5BackendConfiguration, HDF5DatasetConfiguration, AVAILABLE_HDF5_COMPRESSION_METHODS -from ._models._zarr_models import ZarrBackendConfiguration, ZarrDatasetConfiguration, AVAILABLE_ZARR_COMPRESSION_METHODS +from ._models._hdf5_models import ( + AVAILABLE_HDF5_COMPRESSION_METHODS, + HDF5BackendConfiguration, + HDF5DatasetConfiguration, +) +from ._models._zarr_models import ( + AVAILABLE_ZARR_COMPRESSION_METHODS, + ZarrBackendConfiguration, + ZarrDatasetConfiguration, +) BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py index eb0408c2e..ad940fb6f 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py @@ -1,5 +1,5 @@ """Base Pydantic models for DatasetInfo and DatasetConfiguration.""" -from typing import Any, Dict, Tuple, Union, Literal, Type +from typing import Any, Dict, Literal, Tuple, Type, Union import h5py import numcodecs diff --git a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py index e21015f89..6e1108432 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py @@ -1,12 +1,12 @@ """Base Pydantic models for the HDF5DatasetConfiguration.""" -from typing import Any, Dict, Literal, Union, Type +from typing import Any, Dict, Literal, Type, Union import h5py -from pynwb import H5DataIO from nwbinspector.utils import is_module_installed from pydantic import Field +from pynwb import H5DataIO -from ._base_models import DatasetConfiguration, BackendConfiguration +from ._base_models import BackendConfiguration, DatasetConfiguration _base_hdf5_filters = set(h5py.filters.decode) - set( ( diff --git 
a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py index 0e83d58e5..7b7b2dcbc 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py @@ -1,13 +1,13 @@ """Base Pydantic models for the ZarrDatasetConfiguration.""" -from typing import Any, Dict, Literal, Union, List, Type +from typing import Any, Dict, List, Literal, Type, Union import numcodecs -import zarr import psutil +import zarr from hdmf_zarr import ZarrDataIO from pydantic import Field, root_validator -from ._base_models import DatasetConfiguration, BackendConfiguration +from ._base_models import BackendConfiguration, DatasetConfiguration _available_zarr_filters = ( set(zarr.codec_registry.keys()) diff --git a/src/neuroconv/tools/testing/__init__.py b/src/neuroconv/tools/testing/__init__.py index 1afc05495..502634466 100644 --- a/src/neuroconv/tools/testing/__init__.py +++ b/src/neuroconv/tools/testing/__init__.py @@ -1,15 +1,10 @@ from ._mock._mock_dataset_models import ( mock_DatasetInfo, + mock_HDF5BackendConfiguration, mock_HDF5DatasetConfiguration, + mock_ZarrBackendConfiguration, mock_ZarrDatasetConfiguration, ) from .mock_files import generate_path_expander_demo_ibl from .mock_interfaces import MockBehaviorEventInterface, MockSpikeGLXNIDQInterface from .mock_ttl_signals import generate_mock_ttl_signal, regenerate_test_cases -from ._mock._mock_dataset_models import ( - mock_DatasetInfo, - mock_HDF5BackendConfiguration, - mock_ZarrBackendConfiguration, - mock_HDF5DatasetConfiguration, - mock_ZarrDatasetConfiguration, -) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py index e77694d08..290ca6a1b 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py @@ -2,7 +2,6 @@ from io import StringIO from unittest.mock import patch - from neuroconv.tools.testing import mock_HDF5BackendConfiguration diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py index 66d7dbc03..9e235df77 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py @@ -2,7 +2,6 @@ from io import StringIO from unittest.mock import patch - from neuroconv.tools.testing import mock_ZarrBackendConfiguration From fd7c5e4e1da0c278778c9d4359dd580de4196545 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 17 Sep 2023 22:53:46 -0400 Subject: [PATCH 04/13] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f1375b46..7ee7c7c08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ### Features * Added Pydantic data models of `DatasetInfo` (immutable summary of core dataset values such as maximum shape and dtype) and `DatasetConfiguration` for both HDF5 and Zarr datasets (the optional layer that 
specifies chunk/buffering/compression). [PR #567](https://github.com/catalystneuro/neuroconv/pull/567) +* Added Pydantic data models of `BackendConfiguration` for both HDF5 and Zarr datasets (container/mapper of all the `DatasetConfiguration`s for a particular file). [PR #568](https://github.com/catalystneuro/neuroconv/pull/568) From eb705b4bc7ce3b24269092a4fcae542bb454749d Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 17 Sep 2023 23:55:11 -0400 Subject: [PATCH 05/13] Update requirements-minimal.txt --- requirements-minimal.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-minimal.txt b/requirements-minimal.txt index 7ba056107..26545b661 100644 --- a/requirements-minimal.txt +++ b/requirements-minimal.txt @@ -3,6 +3,7 @@ jsonschema>=3.2.0 PyYAML>=5.4 scipy>=1.4.1 h5py>=2.10.0 +hdmf_zarr>=3.0.0 hdmf>=3.4.7 pynwb>=2.3.2;python_version>='3.8' psutil>=5.8.0 From 9bb6c0434ccb520e0647dfbe97255a99ab5a7726 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Wed, 4 Oct 2023 03:36:55 -0400 Subject: [PATCH 06/13] Update src/neuroconv/tools/testing/_mock/_mock_dataset_models.py Co-authored-by: Heberto Mayorquin --- src/neuroconv/tools/testing/_mock/_mock_dataset_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py index 67b82c1bb..c1cbb0f3a 100644 --- a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py +++ b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py @@ -65,7 +65,7 @@ def mock_ZarrDatasetConfiguration( def mock_HDF5BackendConfiguration() -> HDF5BackendConfiguration: - """Mock instance of a HDF5BackendConfiguration with several NeuroPixel-like datasets.""" + """Mock instance of a HDF5BackendConfiguration with two NeuroPixel-like datasets.""" dataset_configurations = { "acquisition/TestElectricalSeriesAP/data": HDF5DatasetConfiguration( dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"), From 5a8b139620a59cd1c1b2d6b87ebd5b9f49d344c0 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Wed, 4 Oct 2023 03:40:01 -0400 Subject: [PATCH 07/13] Update src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py --- src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py index 7b7b2dcbc..9156e233e 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py @@ -140,7 +140,7 @@ class ZarrBackendConfiguration(BackendConfiguration): ) ) number_of_jobs: int = Field( - description="Number of jobs to use in parallel during write.", + description="Number of jobs to use in parallel during write. Negative slicing conforms with the pattern of indexing `list(range(total_number_of_cpu))[number_of_jobs]`; for example, `-1` uses all available CPU, `-2` uses all except one, etc.", ge=-psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count? 
le=psutil.cpu_count(), default=-2, # -2 translates to 'all CPU except for one' From 56fd1bb16328d9558692bc9098e9f76c20c66b1d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 Nov 2023 14:54:25 +0000 Subject: [PATCH 08/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/nwb_helpers/_models/_base_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py index 88fd837cc..313454678 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py @@ -1,7 +1,7 @@ """Base Pydantic models for DatasetInfo and DatasetConfiguration.""" import math from abc import ABC, abstractmethod -from typing import Any, Dict, Literal, Tuple, Union, Type +from typing import Any, Dict, Literal, Tuple, Type, Union import h5py import numcodecs From d0fd8b8c0fcf9da43fb833918e18cc45551cd6a9 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Mon, 6 Nov 2023 10:33:55 -0500 Subject: [PATCH 09/13] fix tests --- .../tools/nwb_helpers/_models/_hdf5_models.py | 6 ++- .../tools/nwb_helpers/_models/_zarr_models.py | 6 ++- .../test_hdf5_backend_configuration_model.py | 32 +++++++++++----- .../test_zarr_backend_configuration_model.py | 38 +++++++++++++------ 4 files changed, 57 insertions(+), 25 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py index 6a97f1b50..daf772688 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py @@ -84,10 +84,12 @@ def get_data_io_kwargs(self) -> Dict[str, Any]: class HDF5BackendConfiguration(BackendConfiguration): """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" - backend: Literal["hdf5"] = Field( + backend: Literal["hdf5"] = Field( # TODO: in pydantic v2 use property instead of class attribute default="hdf5", description="The name of the backend used to configure the NWBFile." ) - data_io_class: Type[H5DataIO] = Field(default=H5DataIO, description="The DataIO class that is specific to HDF5.") + data_io_class: Type[H5DataIO] = Field( # TODO: in pydantic v2 use property instead of class attribute + default=H5DataIO, description="The DataIO class that is specific to HDF5." + ) dataset_configurations: Dict[str, HDF5DatasetConfiguration] = Field( description=( "A mapping from object locations to their HDF5DatasetConfiguration specification that contains all " diff --git a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py index 55c434536..8874e212b 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py @@ -154,7 +154,11 @@ class ZarrBackendConfiguration(BackendConfiguration): ) ) number_of_jobs: int = Field( - description="Number of jobs to use in parallel during write. Negative slicing conforms with the pattern of indexing `list(range(total_number_of_cpu))[number_of_jobs]`; for example, `-1` uses all available CPU, `-2` uses all except one, etc.", + description=( + "Number of jobs to use in parallel during write. 
Negative slicing conforms with the pattern of indexing " + " `list(range(total_number_of_cpu))[number_of_jobs]`; for example, `-1` uses all available CPU, `-2` " + "uses all except one, etc." + ), ge=-psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count? le=psutil.cpu_count(), default=-2, # -2 translates to 'all CPU except for one' diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py index 290ca6a1b..2d6242ad1 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py @@ -18,20 +18,32 @@ def test_hdf5_backend_configuration_print(): acquisition/TestElectricalSeriesAP/data --------------------------------------- - maxshape: (1800000, 384) - dtype: int16 + dtype : int16 + full shape of source array : (1800000, 384) + full size of source array : 1.38 GB + + buffer shape : (1250000, 384) + maximum RAM usage per iteration : 0.96 GB + + chunk shape : (78125, 64) + disk space usage per chunk : 10.00 MB + + compression method : gzip - chunk_shape: (78125, 64) - buffer_shape: (1250000, 384) - compression_method: gzip acquisition/TestElectricalSeriesLF/data --------------------------------------- - maxshape: (75000, 384) - dtype: int16 + dtype : int16 + full shape of source array : (75000, 384) + full size of source array : 0.06 GB + + buffer shape : (75000, 384) + maximum RAM usage per iteration : 0.06 GB + + chunk shape : (37500, 128) + disk space usage per chunk : 9.60 MB + + compression method : gzip - chunk_shape: (37500, 128) - buffer_shape: (75000, 384) - compression_method: gzip """ assert out.getvalue() == expected_print diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py index 9e235df77..e8017c719 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py @@ -18,22 +18,36 @@ def test_zarr_backend_configuration_print(): acquisition/TestElectricalSeriesAP/data --------------------------------------- - maxshape: (1800000, 384) - dtype: int16 + dtype : int16 + full shape of source array : (1800000, 384) + full size of source array : 1.38 GB + + buffer shape : (1250000, 384) + maximum RAM usage per iteration : 0.96 GB + + chunk shape : (78125, 64) + disk space usage per chunk : 10.00 MB + + compression method : gzip + + filter methods : ['delta'] - chunk_shape: (78125, 64) - buffer_shape: (1250000, 384) - compression_method: gzip - filter_methods: ['delta'] acquisition/TestElectricalSeriesLF/data --------------------------------------- - maxshape: (75000, 384) - dtype: int16 + dtype : int16 + full shape of source array : (75000, 384) + full size of source array : 0.06 GB + + buffer shape : (75000, 384) + maximum RAM usage per iteration : 0.06 GB + + chunk shape : (37500, 128) + disk space usage per chunk : 9.60 MB + + compression method : gzip + + filter methods : ['delta'] - chunk_shape: (37500, 128) - buffer_shape: (75000, 384) - compression_method: gzip - 
filter_methods: ['delta'] """ assert out.getvalue() == expected_print From b014351f4b15aefe7f88b8ade6af663c14b348ec Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:33:17 -0500 Subject: [PATCH 10/13] Apply suggestions from code review Co-authored-by: Heberto Mayorquin --- src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py index 8874e212b..353dd2a0d 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py @@ -155,11 +155,11 @@ class ZarrBackendConfiguration(BackendConfiguration): ) number_of_jobs: int = Field( description=( - "Number of jobs to use in parallel during write. Negative slicing conforms with the pattern of indexing " + "Number of jobs to use in parallel during write. Negative values, starting from -1, will use all the available CPUs (including logical), -2 is all except one, etc. This is equivalent to the pattern of indexing of " " `list(range(total_number_of_cpu))[number_of_jobs]`; for example, `-1` uses all available CPU, `-2` " "uses all except one, etc." ), ge=-psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count? le=psutil.cpu_count(), - default=-2, # -2 translates to 'all CPU except for one' + default=psutil.cpu_count() -1 ) From bdb6fd0f030ada801333a8b90a0f2300dbb01048 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:33:30 -0500 Subject: [PATCH 11/13] Update src/neuroconv/tools/nwb_helpers/_models/_base_models.py Co-authored-by: Heberto Mayorquin --- src/neuroconv/tools/nwb_helpers/_models/_base_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py index 313454678..ceba9781e 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py @@ -190,7 +190,8 @@ class BackendConfiguration(BaseModel): data_io_class: Type[DataIO] = Field(description="The DataIO class that is specific to this backend.") dataset_configurations: Dict[str, DatasetConfiguration] = Field( description=( - "A mapping from object locations to their DatasetConfiguration specification that contains all information " + "A mapping from object locations (e.g. ` +acquisition/TestElectricalSeriesAP/data`) to their DatasetConfiguration specification that contains all information " "for writing the datasets to disk using the specific backend." 
) ) From 8c1d8561f5b4cfc8b88e3a44bb2cda7ffd9ec127 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Nov 2023 15:35:22 +0000 Subject: [PATCH 12/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py index 353dd2a0d..4d96bcc43 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py @@ -161,5 +161,5 @@ class ZarrBackendConfiguration(BackendConfiguration): ), ge=-psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count? le=psutil.cpu_count(), - default=psutil.cpu_count() -1 + default=psutil.cpu_count() - 1, ) From 435053a2e212b8ce10d086dd04e9dc1cd1c09616 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:41:48 -0500 Subject: [PATCH 13/13] Apply suggestions from code review --- src/neuroconv/tools/nwb_helpers/_models/_base_models.py | 4 ++-- src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py index ceba9781e..72b364dea 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py @@ -190,8 +190,8 @@ class BackendConfiguration(BaseModel): data_io_class: Type[DataIO] = Field(description="The DataIO class that is specific to this backend.") dataset_configurations: Dict[str, DatasetConfiguration] = Field( description=( - "A mapping from object locations (e.g. ` -acquisition/TestElectricalSeriesAP/data`) to their DatasetConfiguration specification that contains all information " + "A mapping from object locations (e.g. `acquisition/TestElectricalSeriesAP/data`) " + "to their DatasetConfiguration specification that contains all information " "for writing the datasets to disk using the specific backend." ) ) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py index 4d96bcc43..760c7c2a9 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py @@ -155,7 +155,9 @@ class ZarrBackendConfiguration(BackendConfiguration): ) number_of_jobs: int = Field( description=( - "Number of jobs to use in parallel during write. Negative values, starting from -1, will use all the available CPUs (including logical), -2 is all except one, etc. This is equivalent to the pattern of indexing of " + "Number of jobs to use in parallel during write. Negative values, starting from -1, " + "will use all the available CPUs (including logical), -2 is all except one, etc. " + "This is equivalent to the pattern of indexing of " " `list(range(total_number_of_cpu))[number_of_jobs]`; for example, `-1` uses all available CPU, `-2` " "uses all except one, etc." ),
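
A closing note on the `number_of_jobs` semantics settled in the last few commits: the field description defines negative values through the indexing pattern `list(range(total_number_of_cpu))[number_of_jobs]`. The sketch below shows how that convention resolves to a concrete worker count; `resolve_number_of_jobs` is a hypothetical helper named here only for illustration, since the model itself merely enforces the `ge`/`le` bounds.

    import psutil

    def resolve_number_of_jobs(number_of_jobs: int) -> int:
        """Hypothetical illustration of the negative-indexing convention described above."""
        total_number_of_cpu = psutil.cpu_count()
        if number_of_jobs < 0:
            # list(range(total))[n] is the zero-based index of the last CPU used,
            # so adding 1 turns it into a count: -1 -> all CPUs, -2 -> all except one.
            return list(range(total_number_of_cpu))[number_of_jobs] + 1
        return number_of_jobs

    # On an 8-CPU machine: resolve_number_of_jobs(-1) == 8 and resolve_number_of_jobs(-2) == 7,
    # which matches the final default of psutil.cpu_count() - 1 (all CPUs except one).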