diff --git a/resources/legacy_hist.npz b/resources/legacy_hist.npz new file mode 100644 index 00000000..ccde9886 Binary files /dev/null and b/resources/legacy_hist.npz differ diff --git a/src/faim_hcs/UIntHistogram.py b/src/faim_hcs/UIntHistogram.py index 7fe7fd31..be94afec 100644 --- a/src/faim_hcs/UIntHistogram.py +++ b/src/faim_hcs/UIntHistogram.py @@ -12,9 +12,9 @@ def __init__(self, data=None): """ if data is not None: assert data.min() >= 0, "Negative data is not supported." - self.offset, self.bins, self.frequencies = self._get_hist(data) + self.offset, _, self.frequencies = self._get_hist(data) else: - self.offset, self.bins, self.frequencies = None, None, None + self.offset, self.frequencies = None, None @staticmethod def _add(list_a, list_b): @@ -34,8 +34,8 @@ def _get_hist(data): :return: offset, bins, frequencies """ offset = int(data.min()) - bins = int(data.max()) + 2 - offset - freq = np.histogram(data, np.arange(offset, offset + bins))[0].tolist() + bins = int(data.max()) + 1 - offset + freq = np.histogram(data, np.arange(offset, offset + bins + 1))[0].tolist() return offset, bins, freq def _aggregate_histograms(self, offset_data, bins, freq): @@ -52,7 +52,7 @@ def _aggregate_histograms(self, offset_data, bins, freq): "frequencies must be of " "type List." ) lower_shift = offset_data - self.offset - upper_shift = self.offset + self.bins - (offset_data + bins) + upper_shift = self.offset + self.n_bins() + 1 - (offset_data + bins + 1) if lower_shift == 0 and upper_shift == 0: # Old and new frequencies cover the same range: @@ -61,13 +61,13 @@ def _aggregate_histograms(self, offset_data, bins, freq): self.frequencies = self._add(self.frequencies, freq) elif ( lower_shift < 0 - and (offset_data + bins - 1 >= self.offset) - and (offset_data + bins - 1 <= self.offset + self.bins - 1) + and (offset_data + bins >= self.offset) + and (offset_data + bins <= self.offset + self.n_bins()) ): # New frequencies have additional lower ones. # [old frequencies] # [new frequencies] - frequencies_to = offset_data + bins - self.offset - 1 + frequencies_to = offset_data + bins - self.offset freq_from = self.offset - offset_data self.frequencies[:frequencies_to] = self._add( self.frequencies[:frequencies_to], @@ -75,43 +75,39 @@ def _aggregate_histograms(self, offset_data, bins, freq): ) self.frequencies = freq[:freq_from] + self.frequencies self.offset = offset_data - self.bins = len(self.frequencies) + 1 - elif lower_shift < 0 and (offset_data + bins - 1 < self.offset): + elif lower_shift < 0 and (offset_data + bins < self.offset): # New frequencies only have additional lower ones. # [old frequencies] # [new frequencies] gap_freq = [ 0, - ] * (self.offset - offset_data - bins + 1) + ] * (self.offset - offset_data - bins) self.frequencies = freq + gap_freq + self.frequencies self.offset = offset_data - self.bins = len(self.frequencies) + 1 elif ( self.offset <= offset_data - <= (self.offset + self.bins - 2) - < offset_data + bins - 2 + <= (self.offset + self.n_bins() - 1) + < offset_data + bins - 1 ): # New frequencies have additional upper ones. # [old frequencies] # [new frequencies] - from_frequencies = self.offset + self.bins - offset_data - 1 - to_freq = self.offset + self.bins - offset_data + from_frequencies = self.offset + self.n_bins() - offset_data + to_freq = self.offset + self.n_bins() + 1 - offset_data self.frequencies[-from_frequencies:] = self._add( self.frequencies[-from_frequencies:], freq[:to_freq], ) self.frequencies = self.frequencies + freq[from_frequencies:] - self.bins = len(self.frequencies) + 1 - elif (offset_data) > (self.offset + self.bins - 2): + elif (offset_data) > (self.offset + self.n_bins() - 1): # New frequencies have only additional upper ones. # [old frequencies] # [new frequencies] gap_freq = [ 0, - ] * (offset_data - self.offset - self.bins + 1) + ] * (offset_data - self.offset - self.n_bins()) self.frequencies = self.frequencies + gap_freq + freq - self.bins = len(self.frequencies) + 1 elif lower_shift >= 0 and upper_shift >= 0: # New frequencies are completely covered. # [ old frequencies ] @@ -129,14 +125,13 @@ def _aggregate_histograms(self, offset_data, bins, freq): # [old frequencies] # [ new frequencies ] from_ = self.offset - offset_data - to = from_ + self.bins - 1 + to = from_ + self.n_bins() self.frequencies = self._add( self.frequencies, freq[from_:to], ) self.frequencies = freq[:from_] + self.frequencies + freq[to:] self.offset = offset_data - self.bins = len(self.frequencies) + 1 def combine(self, histogram): """ @@ -146,13 +141,12 @@ def combine(self, histogram): """ if self.frequencies is None: self.frequencies = histogram.frequencies - self.bins = histogram.bins self.offset = histogram.offset else: if histogram.frequencies is not None: self._aggregate_histograms( offset_data=histogram.offset, - bins=histogram.bins, + bins=histogram.n_bins(), freq=histogram.frequencies, ) @@ -168,7 +162,7 @@ def update(self, data): assert data.min() >= 0, "Negative data is not supported." if self.frequencies is None: - self.offset, self.bins, self.frequencies = self._get_hist(data) + self.offset, _, self.frequencies = self._get_hist(data) else: offset_data, bins, freq = self._get_hist(data) self._aggregate_histograms(offset_data=offset_data, bins=bins, freq=freq) @@ -181,13 +175,13 @@ def plot(self, width=1): """ if width > 1: heights = [] - for i in range(self.offset, self.offset + self.bins - 1, width): + for i in range(self.offset, self.offset + self.n_bins(), width): heights.append(np.sum(self.frequencies[i : i + width])) else: heights = self.frequencies plt.bar( - np.arange(self.offset, self.offset + self.bins - 1, width), + np.arange(self.offset, self.offset + self.n_bins(), width), heights, width=width, ) @@ -201,7 +195,7 @@ def mean(self): if self.frequencies is None: return 0 return np.sum( - np.arange(self.offset, self.offset + self.bins - 1) * self.frequencies + np.arange(self.offset, self.offset + self.n_bins()) * self.frequencies ) / np.sum(self.frequencies) def std(self): @@ -213,7 +207,7 @@ def std(self): return 0 return np.sqrt( np.sum( - (np.arange(self.offset, self.offset + self.bins - 1) - self.mean()) ** 2 + (np.arange(self.offset, self.offset + self.n_bins()) - self.mean()) ** 2 * self.frequencies ) / np.sum(self.frequencies) @@ -248,10 +242,19 @@ def max(self): """ if self.frequencies is None: return 0 - return self.offset + self.bins - 2 + return self.offset + self.n_bins() - 1 + + def n_bins(self): + """Return number of bins. + + :return: uint + """ + if self.frequencies is None: + return None + return len(self.frequencies) def save(self, path): - np.savez(path, frequencies=self.frequencies, offset=self.offset, bins=self.bins) + np.savez(path, frequencies=self.frequencies, offset=self.offset) @staticmethod def load(path): @@ -259,5 +262,4 @@ def load(path): hist = UIntHistogram() hist.frequencies = storage["frequencies"].tolist() hist.offset = storage["offset"] - hist.bins = storage["bins"] return hist diff --git a/tests/test_UIntHistogram.py b/tests/test_UIntHistogram.py index 01db2264..be75e9d8 100644 --- a/tests/test_UIntHistogram.py +++ b/tests/test_UIntHistogram.py @@ -1,18 +1,21 @@ import tempfile import unittest from os.path import join +from pathlib import Path import numpy as np from numpy.testing import assert_almost_equal, assert_array_equal, assert_equal from faim_hcs.UIntHistogram import UIntHistogram +ROOT_DIR = Path(__file__).parent + class TestUIntHistogram(unittest.TestCase): def test_bins(self): data = np.array([4, 5, 6]) hist = UIntHistogram(data) - assert hist.bins == len(hist.frequencies) + 1 + assert hist.n_bins() == len(hist.frequencies) def test_update_same_length(self): # Old and new frequencies cover the same range: @@ -23,7 +26,7 @@ def test_update_same_length(self): hist.update(data) assert_array_equal(hist.frequencies, np.array([2, 2, 2])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) def test_update_lower_overlap(self): @@ -36,7 +39,7 @@ def test_update_lower_overlap(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 1, 2, 1, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 2) update_data = np.array([2, 3, 4, 5, 6]) @@ -44,7 +47,7 @@ def test_update_lower_overlap(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 1, 2, 2, 2])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 2) def test_update_lower_concat(self): @@ -57,7 +60,7 @@ def test_update_lower_concat(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 1, 1, 1, 1, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 1) def test_update_lower_gap(self): @@ -70,7 +73,7 @@ def test_update_lower_gap(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 1, 0, 1, 1, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 1) def test_update_upper_overlap(self): @@ -83,7 +86,7 @@ def test_update_upper_overlap(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 2, 2, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) update_data = np.array([4, 5, 6, 7]) @@ -91,7 +94,7 @@ def test_update_upper_overlap(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([2, 2, 2, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) def test_update_upper_concat(self): @@ -104,7 +107,7 @@ def test_update_upper_concat(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 1, 1, 1, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) def test_update_upper_gap(self): @@ -117,7 +120,7 @@ def test_update_upper_gap(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 1, 1, 0, 1, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) def test_update_covered(self): @@ -130,7 +133,7 @@ def test_update_covered(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 2, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) data = np.array([4, 5, 6, 7]) @@ -139,7 +142,7 @@ def test_update_covered(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 2, 2, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) data = np.array([4, 5, 6, 7]) @@ -148,7 +151,7 @@ def test_update_covered(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([2, 2, 2, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) data = np.array([4, 5, 6, 7]) @@ -157,7 +160,7 @@ def test_update_covered(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 2, 2, 2])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) def test_old_frequencies_covered(self): @@ -170,7 +173,7 @@ def test_old_frequencies_covered(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 2, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) data = np.array([5]) @@ -179,7 +182,7 @@ def test_old_frequencies_covered(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 0, 2, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 3) data = np.array([5]) @@ -188,7 +191,7 @@ def test_old_frequencies_covered(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 2, 0, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 4) data = np.array([5]) @@ -197,7 +200,7 @@ def test_old_frequencies_covered(self): hist.update(update_data) assert_array_equal(hist.frequencies, np.array([1, 0, 2, 0, 1])) - assert_equal(hist.bins, len(hist.frequencies) + 1) + assert_equal(hist.n_bins(), len(hist.frequencies)) assert_equal(hist.offset, 3) def test_mean(self): @@ -327,15 +330,39 @@ def test_save_load(self): hist_ = UIntHistogram.load(join(temp_dir, "hist.npz")) - assert_equal(hist_.bins, hist.bins) assert_equal(hist_.offset, hist.offset) assert_array_equal(hist_.frequencies, hist.frequencies) assert hist != hist_ assert isinstance(hist.frequencies, list) - assert isinstance(hist.bins, int) assert isinstance(hist.offset, int) + def test_empty_histogram(self): + hist = UIntHistogram() + assert hist.mean() == 0 + assert hist.std() == 0 + assert hist.min() == 0 + assert hist.max() == 0 + assert hist.n_bins() is None + + def test_combine_empty_histogram(self): + hist = UIntHistogram() + update_data = np.array([5]) + update_hist = UIntHistogram(update_data) + hist.combine(update_hist) + assert hist.offset == update_hist.offset + assert hist.frequencies == update_hist.frequencies + + def test_load_legacy(self): + legacy_hist_path = ROOT_DIR.parent / "resources" / "legacy_hist.npz" + + raw = np.load(legacy_hist_path) + assert "bins" in raw # `bins` exists in the file but is now ignored + + hist = UIntHistogram.load(legacy_hist_path) + assert hist.frequencies == [1, 2, 1] + assert hist.offset == 4 + if __name__ == "__main__": unittest.main()