Merge pull request #19 from int-brain-lab/bincount2D
Add `bincount2D` and bump version
chris-langfield authored Jun 27, 2023
2 parents 727dc23 + 4e88879 commit 177bf30
Showing 4 changed files with 194 additions and 45 deletions.
14 changes: 13 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,17 @@
# Changelog
## [Latest](https://github.com/int-brain-lab/iblutil/commits/main) [1.5.0]
## [Latest](https://github.com/int-brain-lab/iblutil/commits/main) [1.7.0]

### Added

- bincount2D moved from ibllib.processing to iblutil.numerical

## [1.6.0]

### Added

- numerical.rcoeff function that computes pairwise Pearson correlation coefficients for matrices

## [1.5.0]

### Added

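For downstream users, the move noted in the changelog amounts to an import change along these lines (a sketch only; whether ibllib.processing keeps a deprecated alias is not part of this diff):

# before this release, bincount2D lived in ibllib:
# from ibllib.processing import bincount2D
# from iblutil 1.7.0 onwards:
from iblutil.numerical import bincount2D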
2 changes: 1 addition & 1 deletion iblutil/__init__.py
@@ -1 +1 @@
__version__ = '1.6.0'
__version__ = '1.7.0'
116 changes: 95 additions & 21 deletions iblutil/numerical.py
@@ -3,7 +3,7 @@
import numpy as np
from numba import jit

D = TypeVar('D', bound=np.generic)
D = TypeVar("D", bound=np.generic)
Array = Union[np.ndarray, Sequence]


@@ -26,7 +26,7 @@ def between_sorted(sorted_v, bounds=None):
stops = stops[sbounds]
sel = sorted_v * 0
sel[np.searchsorted(sorted_v, starts)] = 1
istops = np.searchsorted(sorted_v, stops, side='right')
istops = np.searchsorted(sorted_v, stops, side="right")
sel[istops[istops < sorted_v.size]] += -1
return np.cumsum(sel).astype(bool)

@@ -93,16 +93,78 @@ def intersect2d(a0, a1, assume_unique=False):
:return: index of a0 such as intersection = a0[ia, :]
:return: index of b0 such as intersection = b0[ib, :]
"""
_, i0, i1 = np.intersect1d(a0[:, 0], a1[:, 0],
return_indices=True, assume_unique=assume_unique)
_, i0, i1 = np.intersect1d(
a0[:, 0], a1[:, 0], return_indices=True, assume_unique=assume_unique
)
for n in np.arange(1, a0.shape[1]):
_, ii0, ii1 = np.intersect1d(a0[i0, n], a1[i1, n],
return_indices=True, assume_unique=assume_unique)
_, ii0, ii1 = np.intersect1d(
a0[i0, n], a1[i1, n], return_indices=True, assume_unique=assume_unique
)
i0 = i0[ii0]
i1 = i1[ii1]
return a0[i0, :], i0, i1


def bincount2D(x, y, xbin=0, ybin=0, xlim=None, ylim=None, weights=None):
"""
Computes a 2D histogram by aggregating values in a 2D array.
:param x: values to bin along the 2nd dimension (c-contiguous)
:param y: values to bin along the 1st dimension
:param xbin:
scalar: bin size along 2nd dimension
0: aggregate according to unique values
array: aggregate according to exact values (count reduce operation)
:param ybin:
scalar: bin size along 1st dimension
0: aggregate according to unique values
array: aggregate according to exact values (count reduce operation)
:param xlim: (optional) 2 values (array or list) that restrict range along 2nd dimension
:param ylim: (optional) 2 values (array or list) that restrict range along 1st dimension
:param weights: (optional) defaults to None, weights to apply to each value for aggregation
:return: 3 numpy arrays MAP [ny,nx] image, xscale [nx], yscale [ny]
"""
# if no bounds provided, use min/max of vectors
if xlim is None:
xlim = [np.min(x), np.max(x)]
if ylim is None:
ylim = [np.min(y), np.max(y)]

def _get_scale_and_indices(v, bin, lim):
# if bin is a nonzero scalar, this is a bin size: create scale and indices
if np.isscalar(bin) and bin != 0:
scale = np.arange(lim[0], lim[1] + bin / 2, bin)
ind = (np.floor((v - lim[0]) / bin)).astype(np.int64)
# if bin == 0, aggregate over unique values
else:
scale, ind = np.unique(v, return_inverse=True)
return scale, ind

xscale, xind = _get_scale_and_indices(x, xbin, xlim)
yscale, yind = _get_scale_and_indices(y, ybin, ylim)
# aggregate by using bincount on absolute indices for a 2d array
nx, ny = [xscale.size, yscale.size]
ind2d = np.ravel_multi_index(np.c_[yind, xind].transpose(), dims=(ny, nx))
r = np.bincount(ind2d, minlength=nx * ny, weights=weights).reshape(ny, nx)

# if a set of specific values is requested output an array matching the scale dimensions
if not np.isscalar(xbin) and xbin.size > 1:
_, iout, ir = np.intersect1d(xbin, xscale, return_indices=True)
_r = r.copy()
r = np.zeros((ny, xbin.size))
r[:, iout] = _r[:, ir]
xscale = xbin

if not np.isscalar(ybin) and ybin.size > 1:
_, iout, ir = np.intersect1d(ybin, yscale, return_indices=True)
_r = r.copy()
r = np.zeros((ybin.size, r.shape[1]))
r[iout, :] = _r[ir, :]
yscale = ybin

return r, xscale, yscale


@jit(nopython=True)
def find_first_2d(mat, val):
"""
@@ -120,25 +182,35 @@ def find_first_2d(mat, val):

def rcoeff(x, y):
"""
Computes pairwise Person correlation coefficients for matrices.
Computes pairwise Pearson correlation coefficients for matrices.
That is for 2 matrices the same size, computes the row to row coefficients and outputs
a vector corresponding to the number of rows of the first matrix
If the second array is a vector then computes the correlation coefficient for all rows
a vector corresponding to the number of rows of the first matrix.
If the second array is a vector then computes the correlation coefficient for all rows.
:param x: np array [nc, ns]
:param y: np array [nc, ns] or [ns]
:return: r [nc]
"""

def normalize(z):
mean = np.mean(z, axis=-1)
return z - mean if mean.size == 1 else z - mean[:, np.newaxis]

xnorm = normalize(x)
ynorm = normalize(y)
rcor = np.sum(xnorm * ynorm, axis=-1) / np.sqrt(np.sum(np.square(xnorm), axis=-1) * np.sum(np.square(ynorm), axis=-1))
rcor = np.sum(xnorm * ynorm, axis=-1) / np.sqrt(
np.sum(np.square(xnorm), axis=-1) * np.sum(np.square(ynorm), axis=-1)
)
return rcor


def within_ranges(x: np.ndarray, ranges: Array, labels: Optional[Array] = None,
mode: str = 'vector', dtype: Type[D] = 'int8') -> np.ndarray:
def within_ranges(
x: np.ndarray,
ranges: Array,
labels: Optional[Array] = None,
mode: str = "vector",
dtype: Type[D] = "int8",
) -> np.ndarray:
"""
Detects which points of the input vector lie within one of the ranges specified in the ranges.
Returns an array the size of x with a 1 if the corresponding point is within a range.
@@ -220,32 +292,34 @@ def within_ranges(x: np.ndarray, ranges: Array, labels: Optional[Array] = None,

if labels is None:
# In 'matrix' mode default row index is 0
labels = np.zeros((n_ranges,), dtype='uint32')
if mode == 'vector': # Otherwise default numerical label is 1
labels = np.zeros((n_ranges,), dtype="uint32")
if mode == "vector": # Otherwise default numerical label is 1
labels += 1
assert len(labels) >= n_ranges, 'range labels do not match number of ranges'
assert len(labels) >= n_ranges, "range labels do not match number of ranges"
n_labels = np.unique(labels).size

# If no ranges given, short circuit function and return zeros
if n_ranges == 0:
return np.zeros_like(x, dtype=dtype)

# Check end comes after start in each case
assert np.all(np.diff(ranges, axis=1) > 0), 'ranges ends must all be greater than starts'
assert np.all(
np.diff(ranges, axis=1) > 0
), "ranges ends must all be greater than starts"

# Make array containing points, starts and finishes

# This order means it will be inclusive
to_sort = np.concatenate((ranges[:, 0], x, ranges[:, 1]))
# worst case O(n*log(n)) but will be better than this as most of the array is ordered;
# memory overhead ~n/2
idx = np.argsort(to_sort, kind='stable')
idx = np.argsort(to_sort, kind="stable")

# Make delta array containing 1 for every start and -1 for every stop
# with one row for each range label
if mode == 'matrix':
if mode == "matrix":
delta_shape = (n_labels, n_points + 2 * n_ranges)
delta = np.zeros(delta_shape, dtype='int8')
delta = np.zeros(delta_shape, dtype="int8")

delta[labels, np.arange(n_ranges)] = 1
delta[labels, n_points + n_ranges + np.arange(n_ranges)] = -1
@@ -261,9 +335,9 @@ def within_ranges(x: np.ndarray, ranges: Array, labels: Optional[Array] = None,
reordered[:, idx] = summed.reshape(delta_shape[0], -1)
return reordered[:, np.arange(n_ranges, n_points + n_ranges)]

elif mode == 'vector':
elif mode == "vector":
delta_shape = (n_points + 2 * n_ranges,)
r_delta = np.zeros(delta_shape, dtype='int32')
r_delta = np.zeros(delta_shape, dtype="int32")
r_delta[np.arange(n_ranges)] = labels
r_delta[n_points + n_ranges + np.arange(n_ranges)] = -labels

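As context for the new function above, here is a minimal usage sketch of `bincount2D` (illustrative values only; this snippet is not part of the committed code):

import numpy as np
from iblutil.numerical import bincount2D

# spike-raster style example: times along the 2nd dimension, cluster ids along the 1st
t = np.array([0.1, 0.2, 0.25, 0.9, 1.4, 1.45])  # values to bin along x
c = np.array([0, 1, 1, 0, 2, 2])                 # values to aggregate along y

# fixed-size time bins of 0.5, aggregation over unique cluster values (ybin=0)
r, tscale, cscale = bincount2D(t, c, xbin=0.5, ybin=0)
# r has shape (cscale.size, tscale.size); r[i, j] counts samples whose cluster is
# cscale[i] and whose time falls in the bin starting at tscale[j]

Passing an array for `xbin` or `ybin` instead pins the output to that exact scale, as exercised by the new `TestBincount2D` case below.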
107 changes: 85 additions & 22 deletions tests/test_numerical.py
@@ -5,7 +5,6 @@


class TestRcoeff(unittest.TestCase):

def test_rcoeff(self):
x = np.random.rand(2, 1000)
y = x[0, :]
@@ -17,13 +16,14 @@ def test_rcoeff(self):


class TestBetweeenSorted(unittest.TestCase):

def test_between_sorted_single_time(self):
# test single time, falling right on edges
t = np.arange(100)
bounds = [10, 25]
ind = num.between_sorted(t, bounds)
assert np.all(t[ind] == np.arange(int(np.ceil(bounds[0])), int(np.floor(bounds[1] + 1))))
assert np.all(
t[ind] == np.arange(int(np.ceil(bounds[0])), int(np.floor(bounds[1] + 1)))
)
# test single time in between edges
bounds = [10.4, 25.2]
ind = num.between_sorted(t, bounds)
@@ -36,23 +36,31 @@ def test_between_sorted_multiple_times(self):
t = np.arange(100)
# non overlapping ranges
bounds = np.array([[10.4, 25.2], [67.2, 86.4]])
ind_ = np.logical_or(num.between_sorted(t, bounds[0]), num.between_sorted(t, bounds[1]))
ind_ = np.logical_or(
num.between_sorted(t, bounds[0]), num.between_sorted(t, bounds[1])
)
ind = num.between_sorted(t, bounds)
assert np.all(ind == ind_)
# overlapping ranges
bounds = np.array([[10.4, 83.2], [67.2, 86.4]])
ind_ = np.logical_or(num.between_sorted(t, bounds[0]), num.between_sorted(t, bounds[1]))
ind_ = np.logical_or(
num.between_sorted(t, bounds[0]), num.between_sorted(t, bounds[1])
)
ind = num.between_sorted(t, bounds)
assert np.all(ind == ind_)
# one range contains the other
bounds = np.array([[10.4, 83.2], [34, 78]])
ind_ = np.logical_or(num.between_sorted(t, bounds[0]), num.between_sorted(t, bounds[1]))
ind_ = np.logical_or(
num.between_sorted(t, bounds[0]), num.between_sorted(t, bounds[1])
)
ind = num.between_sorted(t, bounds)
assert np.all(ind == ind_)
# case when one range starts exactly where another stops
bounds = np.array([[10.4, 83.2], [83.2, 84]])
ind = num.between_sorted(t, bounds)
ind_ = np.logical_or(num.between_sorted(t, bounds[0]), num.between_sorted(t, bounds[1]))
ind_ = np.logical_or(
num.between_sorted(t, bounds[0]), num.between_sorted(t, bounds[1])
)
assert np.all(ind == ind_)

def test_between_sorted_out_of_range(self):
@@ -64,7 +72,6 @@ def test_between_sorted_out_of_range(self):


class TestIsmember(unittest.TestCase):

def test_ismember2d(self):
b = np.reshape([0, 0, 0, 1, 1, 1], [3, 2])
locb = np.array([0, 1, 0, 2, 2, 1])
@@ -80,8 +87,12 @@ def test_ismember2d_uuids(self):
a = np.random.randint(0, nb + 3, na)
b = np.arange(nb)
lia, locb = num.ismember(a, b)
bb = np.random.randint(low=np.iinfo(np.int64).min, high=np.iinfo(np.int64).max,
size=(nb, 2), dtype=np.int64)
bb = np.random.randint(
low=np.iinfo(np.int64).min,
high=np.iinfo(np.int64).max,
size=(nb, 2),
dtype=np.int64,
)
aa = np.zeros((na, 2), dtype=np.int64)
aa[lia, :] = bb[locb, :]
lia_, locb_ = num.ismember2d(aa, bb)
@@ -127,27 +138,38 @@ def test_within_ranges(self):

# Matrix mode
ranges = np.array([[1, 2], [5, 8]])
verifiable = num.within_ranges(np.arange(10) + 1, ranges,
labels=np.array([0, 1]), mode='matrix')
expected = np.array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 1, 1, 1, 0, 0]], dtype=int)
verifiable = num.within_ranges(
np.arange(10) + 1, ranges, labels=np.array([0, 1]), mode="matrix"
)
expected = np.array(
[[1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 1, 1, 0, 0]], dtype=int
)
np.testing.assert_array_equal(verifiable, expected)

# Test overlap
verifiable = num.within_ranges(np.arange(11), [(1, 2), (5, 8), (4, 6)],
labels=[0, 1, 1], mode='matrix')
expected = np.array([[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 2, 2, 1, 1, 0, 0]], dtype=int)
verifiable = num.within_ranges(
np.arange(11), [(1, 2), (5, 8), (4, 6)], labels=[0, 1, 1], mode="matrix"
)
expected = np.array(
[[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 2, 2, 1, 1, 0, 0]],
dtype=int,
)
np.testing.assert_array_equal(verifiable, expected)

# Vector mode
verifiable = num.within_ranges(np.arange(10) + 1, ranges, np.array([3, 1]), mode='vector')
verifiable = num.within_ranges(
np.arange(10) + 1, ranges, np.array([3, 1]), mode="vector"
)
expected = np.array([3, 3, 0, 0, 1, 1, 1, 1, 0, 0], dtype=int)
np.testing.assert_array_equal(verifiable, expected)

# Boolean
verifiable = num.within_ranges(np.arange(11), [(1, 2), (5, 8), (4, 6)], dtype=bool)
expected = np.array([False, True, True, False, True, True, True, True, True, False, False])
verifiable = num.within_ranges(
np.arange(11), [(1, 2), (5, 8), (4, 6)], dtype=bool
)
expected = np.array(
[False, True, True, False, True, True, True, True, True, False, False]
)
np.testing.assert_array_equal(verifiable, expected)

# Edge cases
@@ -156,4 +178,45 @@
np.testing.assert_array_equal(verifiable, expected)

with self.assertRaises(ValueError):
num.within_ranges(np.arange(11), [(1, 2)], mode='array')
num.within_ranges(np.arange(11), [(1, 2)], mode="array")


class TestBincount2D(unittest.TestCase):
def test_bincount_2d(self):
# first test simple with indices
x = np.array([0, 1, 1, 2, 2, 3, 3, 3])
y = np.array([3, 2, 2, 1, 1, 0, 0, 0])
r, xscale, yscale = num.bincount2D(x, y, xbin=1, ybin=1)
r_ = np.zeros_like(r)
# sometimes life would have been simpler in c:
for ix, iy in zip(x, y):
r_[iy, ix] += 1
self.assertTrue(np.all(np.equal(r_, r)))
# test with negative values
y = np.array([3, 2, 2, 1, 1, 0, 0, 0]) - 5
r, xscale, yscale = num.bincount2D(x, y, xbin=1, ybin=1)
self.assertTrue(np.all(np.equal(r_, r)))
# test unequal bins
r, xscale, yscale = num.bincount2D(x / 2, y / 2, xbin=1, ybin=2)
r_ = np.zeros_like(r)
for ix, iy in zip(np.floor(x / 2), np.floor((y / 2 + 2.5) / 2)):
r_[int(iy), int(ix)] += 1
self.assertTrue(np.all(r_ == r))
# test with weights
w = np.ones_like(x) * 2
r, xscale, yscale = num.bincount2D(x / 2, y / 2, xbin=1, ybin=2, weights=w)
self.assertTrue(np.all(r_ * 2 == r))
# test aggregation instead of binning
x = np.array([0, 1, 1, 2, 2, 4, 4, 4])
y = np.array([4, 2, 2, 1, 1, 0, 0, 0])
r, xscale, yscale = num.bincount2D(x, y)
self.assertTrue(
np.all(xscale == yscale) and np.all(xscale == np.array([0, 1, 2, 4]))
)
# test aggregation on a fixed scale
r, xscale, yscale = num.bincount2D(
x + 10, y + 10, xbin=np.arange(5) + 10, ybin=np.arange(3) + 10
)
self.assertTrue(np.all(xscale == np.arange(5) + 10))
self.assertTrue(np.all(yscale == np.arange(3) + 10))
self.assertTrue(np.all(r.shape == (3, 5)))
