From f62c515f0dea49281f9e6d3aa83ef3e16deb8883 Mon Sep 17 00:00:00 2001 From: Tony Tung Date: Fri, 1 Jun 2018 13:29:57 -0700 Subject: [PATCH] Set up the caching numpy array to match the dtype of the tiles. (#208) This is particularly important for passthrough pipeline stages that don't do much computation, but reading the data and immediately writing the data alters it. --- starfish/errors.py | 5 ++ starfish/image/_stack.py | 55 ++++++++++++++-- tests/test_slicedimage_dtype.py | 112 ++++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+), 5 deletions(-) create mode 100644 starfish/errors.py create mode 100644 tests/test_slicedimage_dtype.py diff --git a/starfish/errors.py b/starfish/errors.py new file mode 100644 index 000000000..1bcd70380 --- /dev/null +++ b/starfish/errors.py @@ -0,0 +1,5 @@ +class DataFormatWarning(Warning): + """ + Warnings given by starfish when the data is not formatted as expected, though not fatally. + """ + pass diff --git a/starfish/image/_stack.py b/starfish/image/_stack.py index ceb1d7641..a2b219c24 100644 --- a/starfish/image/_stack.py +++ b/starfish/image/_stack.py @@ -2,6 +2,7 @@ import os from functools import partial from typing import Any, Iterable, Iterator, Mapping, MutableSequence, Sequence, Tuple, Union +from warnings import warn import numpy from scipy.stats import scoreatpercentile @@ -9,6 +10,7 @@ from slicedimage import Reader, Writer from starfish.constants import Coordinates, Indices +from starfish.errors import DataFormatWarning from ._base import ImageBase @@ -30,14 +32,57 @@ def __init__(self, image_partition): self._num_zlayers = 1 self._tile_shape = tuple(image_partition.default_tile_shape) - self._data = numpy.zeros((self._num_hybs, self._num_chs, self._num_zlayers) + self._tile_shape) - self._data_needs_writeback = False - - for tile in image_partition.tiles(): + # Examine the tiles to figure out the right kind (int, float, etc.) and size. We require that all the tiles + # have the same kind of data type, but we do not require that they all have the same size of data type. The + # allocated array is the highest size we encounter. + kind = None + max_size = 0 + for tile in self._image_partition.tiles(): + dtype = tile.numpy_array.dtype + if kind is None: + kind = dtype.kind + else: + if kind != dtype.kind: + raise TypeError("All tiles should have the same kind of dtype") + if dtype.itemsize > max_size: + max_size = dtype.itemsize + + # now that we know the tile data type (kind and size), we can allocate the data array. + self._data = numpy.zeros( + shape=(self._num_hybs, self._num_chs, self._num_zlayers) + self._tile_shape, + dtype=numpy.dtype(f"{kind}{max_size}") + ) + + # iterate through the tiles and set the data. + for tile in self._image_partition.tiles(): h = tile.indices[Indices.HYB] c = tile.indices[Indices.CH] zlayer = tile.indices.get(Indices.Z, 0) - self.set_slice(indices={Indices.HYB: h, Indices.CH: c, Indices.Z: zlayer}, data=tile.numpy_array) + data = tile.numpy_array + if max_size != data.dtype.itemsize: + # this source tile has a smaller data size than the other ones, though the same kind. need to scale the + # data. + if data.dtype.kind == "f": + # floating point can be done with numpy.interp. + src_finfo = numpy.finfo(data.dtype) + dst_finfo = numpy.finfo(self._data.dtype) + data = numpy.interp( + data, + (src_finfo.min, src_finfo.max), + (dst_finfo.min, dst_finfo.max)) + else: + # fixed point can be done with a simple multiply. + src_max = numpy.iinfo(data.dtype).max + dst_max = numpy.iinfo(self._data.dtype).max + data = data * (dst_max / src_max) + warn( + f"Tile " + f"(H: {tile.indices[Indices.HYB]} C: {tile.indices[Indices.CH]} Z: {tile.indices[Indices.Z]}) has " + f"dtype {data.dtype}. One or more tiles is of a larger dtype {self._data.dtype}.", + DataFormatWarning) + self.set_slice(indices={Indices.HYB: h, Indices.CH: c, Indices.Z: zlayer}, data=data) + # set_slice will mark the data as needing writeback, so we need to unset that. + self._data_needs_writeback = False @classmethod def from_url(cls, relativeurl, baseurl): diff --git a/tests/test_slicedimage_dtype.py b/tests/test_slicedimage_dtype.py new file mode 100644 index 000000000..3102aa6f9 --- /dev/null +++ b/tests/test_slicedimage_dtype.py @@ -0,0 +1,112 @@ +import unittest +import warnings + +import numpy +from slicedimage import Tile, TileSet + +from starfish.constants import Coordinates, Indices +from starfish.errors import DataFormatWarning +from starfish.image import ImageStack + + +class TestSlicedImageDtype(unittest.TestCase): + NUM_HYB = 2 + NUM_CH = 2 + NUM_Z = 2 + HEIGHT = 10 + WIDTH = 10 + + def make_stack(self, dtype: numpy.number, corner_dtype: numpy.number): + """ + Makes a stack that's all of the same type, except the hyb=0,ch=0,z=0 corner, which is a different type. All the + tiles are initialized with ones. + + Parameters + ---------- + dtype : numpy.number + The data type of all the tiles except the hyd=0,ch=0,z=0 corner. + corner_dtype + The data type of the tile in the hyd=0,ch=0,z=0 corner. + + Returns + ------- + ImageStack : + The image stack with the tiles initialized as described. + """ + img = TileSet( + {Coordinates.X, Coordinates.Y, Indices.HYB, Indices.CH, Indices.Z}, + { + Indices.HYB: TestSlicedImageDtype.NUM_HYB, + Indices.CH: TestSlicedImageDtype.NUM_CH, + Indices.Z: TestSlicedImageDtype.NUM_Z, + }, + default_tile_shape=(TestSlicedImageDtype.HEIGHT, TestSlicedImageDtype.WIDTH), + ) + for hyb in range(TestSlicedImageDtype.NUM_HYB): + for ch in range(TestSlicedImageDtype.NUM_CH): + for z in range(TestSlicedImageDtype.NUM_Z): + tile = Tile( + { + Coordinates.X: (0.0, 0.001), + Coordinates.Y: (0.0, 0.001), + Coordinates.Z: (0.0, 0.001), + }, + { + Indices.HYB: hyb, + Indices.CH: ch, + Indices.Z: z, + } + ) + if hyb == 0 and ch == 0 and z == 0: + data = numpy.ones((TestSlicedImageDtype.HEIGHT, TestSlicedImageDtype.WIDTH), dtype=corner_dtype) + else: + data = numpy.ones((TestSlicedImageDtype.HEIGHT, TestSlicedImageDtype.WIDTH), dtype=dtype) + tile.numpy_array = data + + img.add_tile(tile) + + return ImageStack(img) + + def test_different_kind(self): + with self.assertRaises(TypeError): + self.make_stack(numpy.uint32, numpy.float32) + + def test_all_identical(self): + stack = self.make_stack(numpy.uint32, numpy.uint32) + self.assertEqual(stack.numpy_array.all(), 1) + + def test_int_type_promotion(self): + with warnings.catch_warnings(record=True) as w: + stack = self.make_stack(numpy.int32, numpy.int8) + self.assertEqual(len(w), 1) + self.assertTrue(issubclass(w[0].category, DataFormatWarning)) + expected = numpy.empty( + (TestSlicedImageDtype.NUM_HYB, + TestSlicedImageDtype.NUM_CH, + TestSlicedImageDtype.NUM_Z, + TestSlicedImageDtype.HEIGHT, + TestSlicedImageDtype.WIDTH), dtype=numpy.int32) + expected.fill(16777216) + corner = numpy.ones( + (TestSlicedImageDtype.HEIGHT, + TestSlicedImageDtype.WIDTH), dtype=numpy.int32) + expected[0, 0, 0] = corner + self.assertEqual(stack.numpy_array.all(), expected.all()) + + def test_float_type_promotion(self): + with warnings.catch_warnings(record=True) as w: + stack = self.make_stack(numpy.float64, numpy.float32) + self.assertEqual(len(w), 1) + self.assertTrue(issubclass(w[0].category, DataFormatWarning)) + expected = numpy.empty( + (TestSlicedImageDtype.NUM_HYB, + TestSlicedImageDtype.NUM_CH, + TestSlicedImageDtype.NUM_Z, + TestSlicedImageDtype.HEIGHT, + TestSlicedImageDtype.WIDTH), dtype=numpy.int64) + expected.fill(2.0) + corner = numpy.ones( + (TestSlicedImageDtype.HEIGHT, + TestSlicedImageDtype.WIDTH), dtype=numpy.int64) + expected[0, 0, 0] = corner + self.assertEqual(stack.numpy_array.all(), expected.all())