Optimize and test perf on small 2D arrays

nomonosound · Jan 11, 2024 · fbd78dd · fbd78dd
1 parent 21d9b0a
commit fbd78dd
Show file tree

Hide file tree

Showing 6 changed files with 78 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@ import numpy as np
 # Development
 
 * Install dev/build/test dependencies as denoted in setup.py
-* `pip install -e .`
+* `CC=clang pip install -e .`
 * `pytest`
 
 # Acknowledgements

diff --git a/numpy_minmax/__init__.py b/numpy_minmax/__init__.py
@@ -17,7 +17,7 @@ def minmax(a: NDArray) -> Tuple:
                 len(a),
             )
             return np.float32(result.min_val), np.float32(result.max_val)
-        elif a.ndim == 2 and a.shape[1] > 16:
+        elif a.ndim == 2:
             result = _numpy_minmax.lib.minmax_2d(
                 _numpy_minmax.ffi.cast("float *", a.ctypes.data),
                 a.shape[0],

diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c
@@ -79,7 +79,6 @@ MinMaxResult minmax_1d(float *a, size_t length) {
     if (length >= 16) {
         return minmax_avx2_1d(a, length);
     } else {
-        // TODO: test if this is faster than the numpy equivalent
         return minmax_pairwise_1d(a, length);
     }
 }
@@ -124,12 +123,42 @@ MinMaxResult minmax_avx2_2d(float *a, size_t shape_0, size_t shape_1) {
     return result;
 }
 
+// Scalar fallback: find the smallest and largest value in a row-major
+// shape_0 x shape_1 float buffer, comparing the elements of each row two at
+// a time.
+//
+//   a        pointer to the first element; all shape_0 * shape_1 floats are read
+//   shape_0  number of rows
+//   shape_1  number of elements per row
+//
+// Returns {0, 0} when either dimension is zero. Inputs containing NaN are not
+// special-cased: the plain < and > comparisons decide, so the result for such
+// inputs should not be relied upon.
+MinMaxResult minmax_pairwise_2d(float *a, size_t shape_0, size_t shape_1) {
+    // Return early for empty arrays
+    if (shape_0 == 0 || shape_1 == 0) {
+        return (MinMaxResult){0.0, 0.0};
+    }
+
+    // Seed both extrema with a real element rather than +/-FLT_MAX sentinels.
+    // With the sentinels, an input made only of +inf would report FLT_MAX as
+    // its minimum (inf < FLT_MAX is false), and likewise -FLT_MAX as the
+    // maximum of an all -inf input.
+    MinMaxResult result = { .min_val = a[0], .max_val = a[0] };
+
+    for (size_t row = 0; row < shape_0; ++row) {
+        float* row_ptr = a + (row * shape_1);
+
+        // A row with an odd number of elements leaves its last element
+        // without a partner; fold that one into the running result on its own.
+        if (shape_1 % 2 != 0) {
+            float last_elem = row_ptr[shape_1 - 1];
+            if (last_elem < result.min_val) result.min_val = last_elem;
+            if (last_elem > result.max_val) result.max_val = last_elem;
+        }
+
+        // Process the row in pairs. shape_1 >= 1 is guaranteed by the early
+        // return above, so shape_1 - 1 cannot wrap around.
+        for (size_t i = 0; i < shape_1 - 1; i += 2) {
+            const float first = row_ptr[i];
+            const float second = row_ptr[i + 1];
+            float smaller = first < second ? first : second;
+            float larger = first < second ? second : first;
+
+            if (smaller < result.min_val) result.min_val = smaller;
+            if (larger > result.max_val) result.max_val = larger;
+        }
+    }
+
+    return result;
+}
+
 MinMaxResult minmax_2d(float *a, size_t shape_0, size_t shape_1) {
-    return minmax_avx2_2d(a, shape_0, shape_1);
-    // TODO:
-//    if (shape_1 >= 16) {
-//        return minmax_avx2_2d(a, length);
-//    } else {
-//        return minmax_pairwise_2d(a, length);
-//    }
+    // Dispatch on row length: rows of at least 16 floats go to
+    // minmax_avx2_2d, shorter rows go to the scalar minmax_pairwise_2d.
+    if (shape_1 >= 16) {
+        return minmax_avx2_2d(a, shape_0, shape_1);
+    } else {
+        return minmax_pairwise_2d(a, shape_0, shape_1);
+    }
 }
diff --git a/packaging.md b/packaging.md
@@ -1,5 +1,5 @@
 * Bump version in `numpy_minmax/__init__.py`
-* `pip install -e . && pytest`
+* `CC=clang pip install -e . && pytest`
 * Update CHANGELOG.md
 * Commit and push the change with a commit message like this: "Release vx.y.z" (replace x.y.z with the package version)
 * Wait for build workflow in GitHub Actions to complete

diff --git a/scripts/perf_benchmark.py b/scripts/perf_benchmark.py
@@ -60,6 +60,27 @@ def perf_benchmark_many_small_1d_c_contiguous():
             min_val, max_val = numpy_minmax.minmax(a)
 
 
+def perf_benchmark_many_small_2d_c_contiguous():
+    """Time min/max extraction over 100,000 small (3, 9) float32 arrays.
+
+    The same list of arrays is run through numpy (``amin`` then ``amax``),
+    diplib and numpy_minmax, each inside its own ``timer`` block.
+    """
+    print("===\nperf_benchmark_many_small_2d_c_contiguous:")
+    arrays = [
+        np.random.uniform(low=-4.0, high=3.9, size=(3, 9)).astype(np.float32)
+        for _ in range(100_000)
+    ]
+
+    with timer("numpy.amax and numpy.amin sequentially"):
+        for a in arrays:
+            min_val = np.amin(a)
+            max_val = np.amax(a)
+
+    with timer("diplib"):
+        for a in arrays:
+            min_val, max_val = dip.MaximumAndMinimum(a)
+
+    with timer("minmax"):
+        for a in arrays:
+            min_val, max_val = numpy_minmax.minmax(a)
+
+
 def perf_benchmark_large_1d_c_contiguous():
     print("===\nperf_benchmark_large_1d_c_contiguous:")
     a = np.random.uniform(low=-4.0, high=3.9, size=(999_999_999,)).astype(np.float32)
@@ -158,6 +179,7 @@ def perf_benchmark_large_2d_not_c_contiguous():
 
 if __name__ == "__main__":
     perf_benchmark_many_small_1d_c_contiguous()
+    perf_benchmark_many_small_2d_c_contiguous()
     perf_benchmark_large_1d_c_contiguous()
     perf_benchmark_large_1d_not_c_contiguous()
     perf_benchmark_large_2d_c_contiguous()

diff --git a/tests/test_minmax.py b/tests/test_minmax.py
@@ -53,11 +53,23 @@ def test_minmax_float64_numpy_fallback(self):
         assert isinstance(min_val, np.float64)
         assert isinstance(max_val, np.float64)
 
-    def test_minmax_2d_shape(self):
-        arr = np.arange(16, dtype=np.float32).reshape((2, 8))
+    def test_minmax_2d_small1(self):
+        """A random (15, 2) float32 array must give the same min and max as numpy."""
+        arr = np.random.uniform(low=-6.0, high=3.0, size=(15, 2)).astype(np.float32)
         min_val, max_val = numpy_minmax.minmax(arr)
-        assert min_val == 0.0
-        assert max_val == 15.0
+        assert min_val == np.amin(arr)
+        assert max_val == np.amax(arr)
+
+    def test_minmax_2d_small2(self):
+        """A random (2, 15) float32 array must give the same min and max as numpy."""
+        data = np.random.uniform(low=-6.0, high=3.0, size=(2, 15)).astype(np.float32)
+        actual_min, actual_max = numpy_minmax.minmax(data)
+        expected_min, expected_max = np.amin(data), np.amax(data)
+        assert actual_min == expected_min
+        assert actual_max == expected_max
+
+    def test_minmax_2d_shape_large(self):
+        """A random (2, 999) float32 array must give the same min and max as numpy."""
+        data = np.random.uniform(low=-6.0, high=3.0, size=(2, 999)).astype(np.float32)
+        actual_min, actual_max = numpy_minmax.minmax(data)
+        expected_min, expected_max = np.amin(data), np.amax(data)
+        assert actual_min == expected_min
+        assert actual_max == expected_max
 
     @pytest.mark.parametrize("shape", [(0,), (0, 0)])
     def test_minmax_empty_array(self, shape):