Skip to content

Commit

Permalink
Speed up dataset tests, compressed writes to zarr3 arrays (#963)
Browse files Browse the repository at this point in the history
* Add pytest-timestamper to debug slow tests

* add shortcut for reading from zarrita array

* smaller shard shape

* do full test again

* update test durations

* changelog

* fix after merge
  • Loading branch information
fm3 authored Nov 8, 2023
1 parent 3a4771e commit c09101f
Show file tree
Hide file tree
Showing 6 changed files with 542 additions and 458 deletions.
930 changes: 486 additions & 444 deletions webknossos/.test_durations

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions webknossos/Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ For upgrade instructions, please check the respective _Breaking Changes_ section
### Added

### Changed
- Performance improvements for reading from and writing to sharded zarr3 datasets, also speeding up the automated tests [#963](https://github.com/scalableminds/webknossos-libs/pull/963)

### Fixed

Expand Down
32 changes: 29 additions & 3 deletions webknossos/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions webknossos/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ JPype1 = { version = "^1.3.0", optional = true }
pims = { version = "^0.6.0", optional = true }
tifffile = { version = ">=2021.11.2", optional = true }
pylibCZIrw = { version = "3.5.1", source = "scm", optional = true }
pytest-timestamper = "^0.0.9"

[tool.poetry.extras]
pims = ["pims"]
Expand Down
22 changes: 19 additions & 3 deletions webknossos/tests/dataset/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2571,7 +2571,14 @@ def test_aligned_downsampling(data_format: DataFormat, output_path: Path) -> Non
num_channels=3,
data_format=input_layer.data_format,
)
test_mag = test_layer.add_mag("1")

chunks_per_shard = None
if data_format == DataFormat.Zarr3:
# Writing compressed zarr with large shard shape is slow
# compare https://github.com/scalableminds/webknossos-libs/issues/964
chunks_per_shard = (4, 4, 4)

test_mag = test_layer.add_mag("1", chunks_per_shard=chunks_per_shard)
test_mag.write(
absolute_offset=(0, 0, 0),
# assuming the layer has 3 channels:
Expand Down Expand Up @@ -2601,8 +2608,15 @@ def test_guided_downsampling(data_format: DataFormat, output_path: Path) -> None

input_dataset = Dataset.open(ds_path)
input_layer = input_dataset.get_layer("color")

chunks_per_shard = None
if data_format == DataFormat.Zarr3:
# Writing compressed zarr with large shard shape is slow
# compare https://github.com/scalableminds/webknossos-libs/issues/964
chunks_per_shard = (4, 4, 4)

# Adding additional mags to the input dataset for testing
input_layer.get_or_add_mag("2-2-1")
input_layer.add_mag("2-2-1", chunks_per_shard=chunks_per_shard)
input_layer.redownsample()
assert len(input_layer.mags) == 2
# Use the mag with the best resolution
Expand All @@ -2619,7 +2633,9 @@ def test_guided_downsampling(data_format: DataFormat, output_path: Path) -> None
data_format=input_layer.data_format,
)
# Create the same mag in the new output dataset
output_mag = output_layer.add_mag(finest_input_mag.mag)
output_mag = output_layer.add_mag(
finest_input_mag.mag, chunks_per_shard=chunks_per_shard
)
# Copying some data into the output dataset
input_data = finest_input_mag.read(absolute_offset=(0, 0, 0), size=(24, 24, 24))
output_mag.write(absolute_offset=(0, 0, 0), data=input_data)
Expand Down
14 changes: 6 additions & 8 deletions webknossos/webknossos/dataset/_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,10 +346,9 @@ def read(self, offset: Vec3IntLike, shape: Vec3IntLike) -> np.ndarray:
offset.y : (offset.y + shape.y),
offset.z : (offset.z + shape.z),
]
if data.shape != shape:
padded_data = np.zeros(
(self.info.num_channels,) + shape.to_tuple(), dtype=data.dtype
)
shape_with_channels = (self.info.num_channels,) + shape.to_tuple()
if data.shape != shape and data.shape != shape_with_channels:
padded_data = np.zeros(shape_with_channels, dtype=data.dtype)
padded_data[
:,
0 : data.shape[1],
Expand Down Expand Up @@ -591,10 +590,9 @@ def read(self, offset: Vec3IntLike, shape: Vec3IntLike) -> np.ndarray:
offset.y : (offset.y + shape.y),
offset.z : (offset.z + shape.z),
]
if data.shape != shape:
padded_data = np.zeros(
(self.info.num_channels,) + shape.to_tuple(), dtype=data.dtype
)
shape_with_channels = (self.info.num_channels,) + shape.to_tuple()
if data.shape != shape and data.shape != shape_with_channels:
padded_data = np.zeros(shape_with_channels, dtype=data.dtype)
padded_data[
:,
0 : data.shape[1],
Expand Down

0 comments on commit c09101f

Please sign in to comment.