From e59883d637dc8e9e8b1a1f3400c54b7a18befca1 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 15 Jul 2024 10:25:06 +0200 Subject: [PATCH 01/16] Add ARM support --- .github/workflows/build.yml | 3 ++- numpy_minmax/_minmax.c | 47 +++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 36df60d..a3be9fa 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,7 +11,8 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest] + #os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm64] + os: [ubuntu-22.04-arm64] python-version: [3.9] if: startsWith(github.event.head_commit.message, 'Release') || github.event_name == 'workflow_dispatch' diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 147740a..64a5837 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -1,7 +1,10 @@ #include -#include #include +#if defined(__x86_64__) || defined(_M_X64) +#include +#endif + #ifdef _MSC_VER #include // MSVC #else @@ -127,15 +130,20 @@ MinMaxResult minmax_contiguous(const float *a, size_t length) { if (length == 0) { return (MinMaxResult){0.0, 0.0}; } - if (length >= 16) { - if (system_supports_avx512()) { - return minmax_avx512(a, length); + + #if defined(__x86_64__) || defined(_M_X64) + if (length >= 16) { + if (system_supports_avx512()) { + return minmax_avx512(a, length); + } else { + return minmax_avx(a, length); + } } else { - return minmax_avx(a, length); + return minmax_pairwise(a, length); } - } else { + #else return minmax_pairwise(a, length); - } + #endif } // Takes the pairwise min/max on strided input. Strides are in number of bytes, @@ -220,18 +228,23 @@ MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { if (length == 0) { return (MinMaxResult){0.0, 0.0}; } - if (stride < 0){ - if (-stride == sizeof(float)){ - return minmax_contiguous(a - length + 1, length); + + #if defined(__x86_64__) || defined(_M_X64) + if (stride < 0){ + if (-stride == sizeof(float)){ + return minmax_contiguous(a - length + 1, length); + } + if (length < 16){ + return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + } + return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + } if (length < 16){ - return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + return minmax_pairwise_strided((Byte*)a, length, stride); } - return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); - - } - if (length < 16){ + return minmax_avx_strided((Byte*)a, length, stride); + #else return minmax_pairwise_strided((Byte*)a, length, stride); - } - return minmax_avx_strided((Byte*)a, length, stride); + #endif } From 5918647974675a3d2c053d417568f8d178d2914e Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 15 Jul 2024 10:33:20 +0200 Subject: [PATCH 02/16] Make AVX compile args conditional --- numpy_minmax/_minmax_cffi.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/numpy_minmax/_minmax_cffi.py b/numpy_minmax/_minmax_cffi.py index ee6ac21..aef9d86 100644 --- a/numpy_minmax/_minmax_cffi.py +++ b/numpy_minmax/_minmax_cffi.py @@ -1,7 +1,7 @@ import os +import platform from cffi import FFI - ffibuilder = FFI() ffibuilder.cdef(""" typedef struct { @@ -18,10 +18,14 @@ with open(c_file_path, "r") as file: c_code = file.read() -extra_compile_args = ["-mavx", "-mavx512f", "-O3", "-Wall"] +extra_compile_args = ["-O3", 
"-Wall"] if os.name == "posix": extra_compile_args.append("-Wextra") +# Detect architecture and set appropriate SIMD-related compile args +if platform.machine().lower() in ["x86_64", "amd64", "i386", "i686"]: + extra_compile_args.append("-mavx") + ffibuilder.set_source("_numpy_minmax", c_code, extra_compile_args=extra_compile_args) From f837b611017499e7a347a2c54898abb2c783f23a Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 15 Jul 2024 10:39:43 +0200 Subject: [PATCH 03/16] Don't include cpuid.h when compiling for ARM --- numpy_minmax/_minmax.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 64a5837..4fdd12b 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -2,19 +2,20 @@ #include #if defined(__x86_64__) || defined(_M_X64) -#include -#endif + #include -#ifdef _MSC_VER - #include // MSVC -#else - #include // GCC and Clang -#endif + #ifdef _MSC_VER + #include // MSVC + #else + #include // GCC and Clang + #endif -#ifndef bit_AVX512F -#define bit_AVX512F (1 << 16) + #ifndef bit_AVX512F + #define bit_AVX512F (1 << 16) + #endif #endif + typedef struct { float min_val; float max_val; From 438532d03cb386dfb1066252283b36d899cde963 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 15 Jul 2024 10:44:44 +0200 Subject: [PATCH 04/16] Conditionally exclude AVX-enabled functions --- numpy_minmax/_minmax.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 4fdd12b..a56e52a 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -23,6 +23,7 @@ typedef struct { typedef unsigned char Byte; +#if defined(__x86_64__) || defined(_M_X64) bool system_supports_avx512() { unsigned int eax, ebx, ecx, edx; @@ -40,6 +41,7 @@ bool system_supports_avx512() { // Check the AVX512F bit in EBX return (ebx & bit_AVX512F) != 0; } +#endif static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { // Initialize min and max with the last element of the array. @@ -61,6 +63,7 @@ static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { return result; } +#if defined(__x86_64__) || defined(_M_X64) static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_vals, MinMaxResult result) { float temp_min[8], temp_max[8]; _mm256_storeu_ps(temp_min, min_vals); @@ -72,6 +75,7 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ return result; } + MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -125,6 +129,7 @@ MinMaxResult minmax_avx512(const float *a, size_t length) { return reduce_result_from_mm512(min_vals, max_vals, result); } +#endif MinMaxResult minmax_contiguous(const float *a, size_t length) { // Return early for empty arrays @@ -147,6 +152,7 @@ MinMaxResult minmax_contiguous(const float *a, size_t length) { #endif } +#if defined(__x86_64__) || defined(_M_X64) // Takes the pairwise min/max on strided input. Strides are in number of bytes, // which is why the data pointer is Byte (i.e. 
unsigned char) MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) { @@ -222,6 +228,7 @@ MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { return reduce_result_from_mm256(min_vals, max_vals, result); } +#endif MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { From 810705663716024187cfdedf99036516d970dff5 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 13:15:14 +0200 Subject: [PATCH 05/16] Don't exclude minmax_pairwise_strided in arm builds --- numpy_minmax/_minmax.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index a56e52a..47f6099 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -74,8 +74,9 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ } return result; } +#endif - +#if defined(__x86_64__) || defined(_M_X64) MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -96,7 +97,9 @@ MinMaxResult minmax_avx(const float *a, size_t length) { return reduce_result_from_mm256(min_vals, max_vals, result); } +#endif +#if defined(__x86_64__) || defined(_M_X64) static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_vals, MinMaxResult result) { float temp_min[16], temp_max[16]; _mm512_storeu_ps(temp_min, min_vals); @@ -107,7 +110,9 @@ static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_ } return result; } +#endif +#if defined(__x86_64__) || defined(_M_X64) MinMaxResult minmax_avx512(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -152,7 +157,6 @@ MinMaxResult minmax_contiguous(const float *a, size_t length) { #endif } -#if defined(__x86_64__) || defined(_M_X64) // Takes the pairwise min/max on strided input. Strides are in number of bytes, // which is why the data pointer is Byte (i.e. unsigned char) MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) { @@ -185,6 +189,7 @@ MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) return result; } +#if defined(__x86_64__) || defined(_M_X64) // Takes the avx min/max on strided input. Strides are in number of bytes, // which is why the data pointer is Byte (i.e. 
unsigned char) MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { @@ -220,7 +225,6 @@ MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { max_vals = _mm256_max_ps(max_vals, vals); } - // Process remainder elements if (i < length*stride){ result = minmax_pairwise_strided(a + i, length - i / stride, stride); From d34566af98ec90b71107aed3f2ad4aa8596f30c0 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:00:57 +0200 Subject: [PATCH 06/16] Handle negative strides correctly on arm --- numpy_minmax/_minmax.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 47f6099..181d80f 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -241,17 +241,21 @@ MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { return (MinMaxResult){0.0, 0.0}; } - #if defined(__x86_64__) || defined(_M_X64) - if (stride < 0){ - if (-stride == sizeof(float)){ - return minmax_contiguous(a - length + 1, length); - } - if (length < 16){ - return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); - } + if (stride < 0){ + if (-stride == sizeof(float)){ + return minmax_contiguous(a - length + 1, length); + } + if (length < 16){ + return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + } + #if defined(__x86_64__) || defined(_M_X64) return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + #else + return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + #endif + } - } + #if defined(__x86_64__) || defined(_M_X64) if (length < 16){ return minmax_pairwise_strided((Byte*)a, length, stride); } From 06f7160c3ad32888529f599274e7e6f33cf57695 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:08:29 +0200 Subject: [PATCH 07/16] Dry the x86 check --- numpy_minmax/_minmax.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 181d80f..86bd89b 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -1,7 +1,9 @@ #include #include -#if defined(__x86_64__) || defined(_M_X64) +#define IS_X86_64 (defined(__x86_64__) || defined(_M_X64)) + +#if IS_X86_64 #include #ifdef _MSC_VER @@ -23,7 +25,7 @@ typedef struct { typedef unsigned char Byte; -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 bool system_supports_avx512() { unsigned int eax, ebx, ecx, edx; @@ -63,7 +65,7 @@ static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { return result; } -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_vals, MinMaxResult result) { float temp_min[8], temp_max[8]; _mm256_storeu_ps(temp_min, min_vals); @@ -76,7 +78,7 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ } #endif -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -99,7 +101,7 @@ MinMaxResult minmax_avx(const float *a, size_t length) { } #endif -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_vals, MinMaxResult result) { float temp_min[16], temp_max[16]; _mm512_storeu_ps(temp_min, min_vals); @@ -112,7 +114,7 @@ static inline 
MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_ } #endif -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 MinMaxResult minmax_avx512(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -142,7 +144,7 @@ MinMaxResult minmax_contiguous(const float *a, size_t length) { return (MinMaxResult){0.0, 0.0}; } - #if defined(__x86_64__) || defined(_M_X64) + #if IS_X86_64 if (length >= 16) { if (system_supports_avx512()) { return minmax_avx512(a, length); @@ -189,7 +191,7 @@ MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) return result; } -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 // Takes the avx min/max on strided input. Strides are in number of bytes, // which is why the data pointer is Byte (i.e. unsigned char) MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { @@ -248,14 +250,14 @@ MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { if (length < 16){ return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); } - #if defined(__x86_64__) || defined(_M_X64) + #if IS_X86_64 return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); #else return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); #endif } - #if defined(__x86_64__) || defined(_M_X64) + #if IS_X86_64 if (length < 16){ return minmax_pairwise_strided((Byte*)a, length, stride); } From 0f3bf76aa62e912de78d9129807962ecb4809ec4 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:17:09 +0200 Subject: [PATCH 08/16] Add back the missing -mavx512f compile arg --- numpy_minmax/_minmax_cffi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numpy_minmax/_minmax_cffi.py b/numpy_minmax/_minmax_cffi.py index aef9d86..39c1f9f 100644 --- a/numpy_minmax/_minmax_cffi.py +++ b/numpy_minmax/_minmax_cffi.py @@ -25,6 +25,7 @@ # Detect architecture and set appropriate SIMD-related compile args if platform.machine().lower() in ["x86_64", "amd64", "i386", "i686"]: extra_compile_args.append("-mavx") + extra_compile_args.append("-mavx512f") ffibuilder.set_source("_numpy_minmax", c_code, extra_compile_args=extra_compile_args) From 921b02ce8e0d2fcc82d45dbe7adbbdc0945d7bcf Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:17:21 +0200 Subject: [PATCH 09/16] Reindent preprocessor directives --- numpy_minmax/_minmax.c | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 86bd89b..4fac089 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -144,19 +144,19 @@ MinMaxResult minmax_contiguous(const float *a, size_t length) { return (MinMaxResult){0.0, 0.0}; } - #if IS_X86_64 - if (length >= 16) { - if (system_supports_avx512()) { - return minmax_avx512(a, length); - } else { - return minmax_avx(a, length); - } +#if IS_X86_64 + if (length >= 16) { + if (system_supports_avx512()) { + return minmax_avx512(a, length); } else { - return minmax_pairwise(a, length); + return minmax_avx(a, length); } - #else + } else { return minmax_pairwise(a, length); - #endif + } +#else + return minmax_pairwise(a, length); +#endif } // Takes the pairwise min/max on strided input. 
Strides are in number of bytes, @@ -250,19 +250,19 @@ MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { if (length < 16){ return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); } - #if IS_X86_64 - return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); - #else - return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); - #endif +#if IS_X86_64 + return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); +#else + return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); +#endif } - #if IS_X86_64 - if (length < 16){ - return minmax_pairwise_strided((Byte*)a, length, stride); - } - return minmax_avx_strided((Byte*)a, length, stride); - #else +#if IS_X86_64 + if (length < 16){ return minmax_pairwise_strided((Byte*)a, length, stride); - #endif + } + return minmax_avx_strided((Byte*)a, length, stride); +#else + return minmax_pairwise_strided((Byte*)a, length, stride); +#endif } From 064894b739165b960091ed382db3b2d8fbb31367 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:17:33 +0200 Subject: [PATCH 10/16] Compile for all OSes now --- .github/workflows/build.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a3be9fa..abd2c55 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,8 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - #os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm64] - os: [ubuntu-22.04-arm64] + os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm64] python-version: [3.9] if: startsWith(github.event.head_commit.message, 'Release') || github.event_name == 'workflow_dispatch' From 5bdde0924a8163e283264005a8e564b0ee3e1e8c Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:22:49 +0200 Subject: [PATCH 11/16] Limit numpy to <2 for now. Explicitly add cffi dependency. 
--- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7fa218a..78c8330 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,8 @@ name = "numpy-minmax" version = "0.2.1" description = "A fast python library for finding both min and max value in a NumPy array" dependencies = [ - "numpy>=1.21" + "cffi>=1.0.0", + "numpy>=1.21,<2" ] readme = "README.md" license = { file = "LICENSE" } From 1bb3801c9c831e49c06ceae48e017c47b3fd7a21 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:31:36 +0200 Subject: [PATCH 12/16] Distribute source --- MANIFEST.in | 1 + packaging.md | 1 + 2 files changed, 2 insertions(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..bfcb3e5 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +recursive-include numpy_minmax *.c diff --git a/packaging.md b/packaging.md index c9533eb..3ad4417 100644 --- a/packaging.md +++ b/packaging.md @@ -5,6 +5,7 @@ * Wait for build workflow in GitHub Actions to complete * Download wheels artifact from the build workflow * Place all the fresh whl files in dist/ +* `python setup.py sdist` * `python -m twine upload dist/*` * Add a tag with name "x.y.z" to the commit * Go to https://github.com/nomonosound/numpy-minmax/releases and create a release where you choose the new tag From bbbd3e06e33c2bd1e059f21bf860c834faedde42 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:39:34 +0200 Subject: [PATCH 13/16] Build for macOS (arm64) --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index abd2c55..50174c6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm64] + os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm64, macos-latest] python-version: [3.9] if: startsWith(github.event.head_commit.message, 'Release') || github.event_name == 'workflow_dispatch' From 04bf6d65ff6c4e45702d07372d79f0c32aedb202 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 15:37:44 +0200 Subject: [PATCH 14/16] Regroup the code by ISA --- numpy_minmax/_minmax.c | 156 +++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 83 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 4fac089..449c55b 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -3,6 +3,13 @@ #define IS_X86_64 (defined(__x86_64__) || defined(_M_X64)) +typedef struct { + float min_val; + float max_val; +} MinMaxResult; + +typedef unsigned char Byte; + #if IS_X86_64 #include @@ -15,17 +22,7 @@ #ifndef bit_AVX512F #define bit_AVX512F (1 << 16) #endif -#endif - - -typedef struct { - float min_val; - float max_val; -} MinMaxResult; - -typedef unsigned char Byte; -#if IS_X86_64 bool system_supports_avx512() { unsigned int eax, ebx, ecx, edx; @@ -43,29 +40,7 @@ bool system_supports_avx512() { // Check the AVX512F bit in EBX return (ebx & bit_AVX512F) != 0; } -#endif -static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { - // Initialize min and max with the last element of the array. - // This ensures that it works correctly for odd length arrays as well as even. 
- MinMaxResult result = {.min_val = a[length -1], .max_val = a[length-1]}; - - // Process elements in pairs - for (size_t i = 0; i < length - 1; i += 2) { - float smaller = a[i] < a[i + 1] ? a[i] : a[i + 1]; - float larger = a[i] < a[i + 1] ? a[i + 1] : a[i]; - - if (smaller < result.min_val) { - result.min_val = smaller; - } - if (larger > result.max_val) { - result.max_val = larger; - } - } - return result; -} - -#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_vals, MinMaxResult result) { float temp_min[8], temp_max[8]; _mm256_storeu_ps(temp_min, min_vals); @@ -76,9 +51,7 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ } return result; } -#endif -#if IS_X86_64 MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -99,9 +72,7 @@ MinMaxResult minmax_avx(const float *a, size_t length) { return reduce_result_from_mm256(min_vals, max_vals, result); } -#endif -#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_vals, MinMaxResult result) { float temp_min[16], temp_max[16]; _mm512_storeu_ps(temp_min, min_vals); @@ -112,9 +83,7 @@ static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_ } return result; } -#endif -#if IS_X86_64 MinMaxResult minmax_avx512(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -136,8 +105,74 @@ MinMaxResult minmax_avx512(const float *a, size_t length) { return reduce_result_from_mm512(min_vals, max_vals, result); } + +// Takes the avx min/max on strided input. Strides are in number of bytes, +// which is why the data pointer is Byte (i.e. unsigned char) +MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { + MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; + + // This is faster than intrinsic gather on tested platforms + __m256 min_vals = _mm256_set_ps( + *(float*)(a), + *(float*)(a + stride), + *(float*)(a + 2*stride), + *(float*)(a + 3*stride), + *(float*)(a + 4*stride), + *(float*)(a + 5*stride), + *(float*)(a + 6*stride), + *(float*)(a + 7*stride) + ); + __m256 max_vals = min_vals; + + // Process elements in chunks of eight + size_t i = 8*stride; + for (; i <= (length - 8)*stride; i += 8*stride) { + __m256 vals = _mm256_set_ps( + *(float*)(a + i), + *(float*)(a + i + stride), + *(float*)(a + i + 2*stride), + *(float*)(a + i + 3*stride), + *(float*)(a + i + 4*stride), + *(float*)(a + i + 5*stride), + *(float*)(a + i + 6*stride), + *(float*)(a + i + 7*stride) + ); + min_vals = _mm256_min_ps(min_vals, vals); + max_vals = _mm256_max_ps(max_vals, vals); + } + + // Process remainder elements + if (i < length*stride){ + result = minmax_pairwise_strided(a + i, length - i / stride, stride); + } + + return reduce_result_from_mm256(min_vals, max_vals, result); +} #endif + + +static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { + // Initialize min and max with the last element of the array. + // This ensures that it works correctly for odd length arrays as well as even. + MinMaxResult result = {.min_val = a[length -1], .max_val = a[length-1]}; + + // Process elements in pairs + for (size_t i = 0; i < length - 1; i += 2) { + float smaller = a[i] < a[i + 1] ? a[i] : a[i + 1]; + float larger = a[i] < a[i + 1] ? 
a[i + 1] : a[i]; + + if (smaller < result.min_val) { + result.min_val = smaller; + } + if (larger > result.max_val) { + result.max_val = larger; + } + } + return result; +} + + MinMaxResult minmax_contiguous(const float *a, size_t length) { // Return early for empty arrays if (length == 0) { @@ -191,51 +226,6 @@ MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) return result; } -#if IS_X86_64 -// Takes the avx min/max on strided input. Strides are in number of bytes, -// which is why the data pointer is Byte (i.e. unsigned char) -MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { - MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; - - // This is faster than intrinsic gather on tested platforms - __m256 min_vals = _mm256_set_ps( - *(float*)(a), - *(float*)(a + stride), - *(float*)(a + 2*stride), - *(float*)(a + 3*stride), - *(float*)(a + 4*stride), - *(float*)(a + 5*stride), - *(float*)(a + 6*stride), - *(float*)(a + 7*stride) - ); - __m256 max_vals = min_vals; - - // Process elements in chunks of eight - size_t i = 8*stride; - for (; i <= (length - 8)*stride; i += 8*stride) { - __m256 vals = _mm256_set_ps( - *(float*)(a + i), - *(float*)(a + i + stride), - *(float*)(a + i + 2*stride), - *(float*)(a + i + 3*stride), - *(float*)(a + i + 4*stride), - *(float*)(a + i + 5*stride), - *(float*)(a + i + 6*stride), - *(float*)(a + i + 7*stride) - ); - min_vals = _mm256_min_ps(min_vals, vals); - max_vals = _mm256_max_ps(max_vals, vals); - } - - // Process remainder elements - if (i < length*stride){ - result = minmax_pairwise_strided(a + i, length - i / stride, stride); - } - - return reduce_result_from_mm256(min_vals, max_vals, result); -} -#endif - MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { // Return early for empty arrays From 75db06466f77976f87a301267c970f9acb63c21a Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 15:44:44 +0200 Subject: [PATCH 15/16] Revert "Regroup the code by ISA" This reverts commit 04bf6d65ff6c4e45702d07372d79f0c32aedb202. --- numpy_minmax/_minmax.c | 156 ++++++++++++++++++++++------------------- 1 file changed, 83 insertions(+), 73 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 449c55b..4fac089 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -3,13 +3,6 @@ #define IS_X86_64 (defined(__x86_64__) || defined(_M_X64)) -typedef struct { - float min_val; - float max_val; -} MinMaxResult; - -typedef unsigned char Byte; - #if IS_X86_64 #include @@ -22,7 +15,17 @@ typedef unsigned char Byte; #ifndef bit_AVX512F #define bit_AVX512F (1 << 16) #endif +#endif + + +typedef struct { + float min_val; + float max_val; +} MinMaxResult; + +typedef unsigned char Byte; +#if IS_X86_64 bool system_supports_avx512() { unsigned int eax, ebx, ecx, edx; @@ -40,7 +43,29 @@ bool system_supports_avx512() { // Check the AVX512F bit in EBX return (ebx & bit_AVX512F) != 0; } +#endif +static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { + // Initialize min and max with the last element of the array. + // This ensures that it works correctly for odd length arrays as well as even. + MinMaxResult result = {.min_val = a[length -1], .max_val = a[length-1]}; + + // Process elements in pairs + for (size_t i = 0; i < length - 1; i += 2) { + float smaller = a[i] < a[i + 1] ? a[i] : a[i + 1]; + float larger = a[i] < a[i + 1] ? 
a[i + 1] : a[i]; + + if (smaller < result.min_val) { + result.min_val = smaller; + } + if (larger > result.max_val) { + result.max_val = larger; + } + } + return result; +} + +#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_vals, MinMaxResult result) { float temp_min[8], temp_max[8]; _mm256_storeu_ps(temp_min, min_vals); @@ -51,7 +76,9 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ } return result; } +#endif +#if IS_X86_64 MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -72,7 +99,9 @@ MinMaxResult minmax_avx(const float *a, size_t length) { return reduce_result_from_mm256(min_vals, max_vals, result); } +#endif +#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_vals, MinMaxResult result) { float temp_min[16], temp_max[16]; _mm512_storeu_ps(temp_min, min_vals); @@ -83,7 +112,9 @@ static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_ } return result; } +#endif +#if IS_X86_64 MinMaxResult minmax_avx512(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -105,74 +136,8 @@ MinMaxResult minmax_avx512(const float *a, size_t length) { return reduce_result_from_mm512(min_vals, max_vals, result); } - -// Takes the avx min/max on strided input. Strides are in number of bytes, -// which is why the data pointer is Byte (i.e. unsigned char) -MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { - MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; - - // This is faster than intrinsic gather on tested platforms - __m256 min_vals = _mm256_set_ps( - *(float*)(a), - *(float*)(a + stride), - *(float*)(a + 2*stride), - *(float*)(a + 3*stride), - *(float*)(a + 4*stride), - *(float*)(a + 5*stride), - *(float*)(a + 6*stride), - *(float*)(a + 7*stride) - ); - __m256 max_vals = min_vals; - - // Process elements in chunks of eight - size_t i = 8*stride; - for (; i <= (length - 8)*stride; i += 8*stride) { - __m256 vals = _mm256_set_ps( - *(float*)(a + i), - *(float*)(a + i + stride), - *(float*)(a + i + 2*stride), - *(float*)(a + i + 3*stride), - *(float*)(a + i + 4*stride), - *(float*)(a + i + 5*stride), - *(float*)(a + i + 6*stride), - *(float*)(a + i + 7*stride) - ); - min_vals = _mm256_min_ps(min_vals, vals); - max_vals = _mm256_max_ps(max_vals, vals); - } - - // Process remainder elements - if (i < length*stride){ - result = minmax_pairwise_strided(a + i, length - i / stride, stride); - } - - return reduce_result_from_mm256(min_vals, max_vals, result); -} #endif - - -static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { - // Initialize min and max with the last element of the array. - // This ensures that it works correctly for odd length arrays as well as even. - MinMaxResult result = {.min_val = a[length -1], .max_val = a[length-1]}; - - // Process elements in pairs - for (size_t i = 0; i < length - 1; i += 2) { - float smaller = a[i] < a[i + 1] ? a[i] : a[i + 1]; - float larger = a[i] < a[i + 1] ? 
a[i + 1] : a[i]; - - if (smaller < result.min_val) { - result.min_val = smaller; - } - if (larger > result.max_val) { - result.max_val = larger; - } - } - return result; -} - - MinMaxResult minmax_contiguous(const float *a, size_t length) { // Return early for empty arrays if (length == 0) { @@ -226,6 +191,51 @@ MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) return result; } +#if IS_X86_64 +// Takes the avx min/max on strided input. Strides are in number of bytes, +// which is why the data pointer is Byte (i.e. unsigned char) +MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { + MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; + + // This is faster than intrinsic gather on tested platforms + __m256 min_vals = _mm256_set_ps( + *(float*)(a), + *(float*)(a + stride), + *(float*)(a + 2*stride), + *(float*)(a + 3*stride), + *(float*)(a + 4*stride), + *(float*)(a + 5*stride), + *(float*)(a + 6*stride), + *(float*)(a + 7*stride) + ); + __m256 max_vals = min_vals; + + // Process elements in chunks of eight + size_t i = 8*stride; + for (; i <= (length - 8)*stride; i += 8*stride) { + __m256 vals = _mm256_set_ps( + *(float*)(a + i), + *(float*)(a + i + stride), + *(float*)(a + i + 2*stride), + *(float*)(a + i + 3*stride), + *(float*)(a + i + 4*stride), + *(float*)(a + i + 5*stride), + *(float*)(a + i + 6*stride), + *(float*)(a + i + 7*stride) + ); + min_vals = _mm256_min_ps(min_vals, vals); + max_vals = _mm256_max_ps(max_vals, vals); + } + + // Process remainder elements + if (i < length*stride){ + result = minmax_pairwise_strided(a + i, length - i / stride, stride); + } + + return reduce_result_from_mm256(min_vals, max_vals, result); +} +#endif + MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { // Return early for empty arrays From 1d6aba99ddad8b0e681843bd94d252f02bb5a679 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 15:46:13 +0200 Subject: [PATCH 16/16] Remove some redundant ifs --- numpy_minmax/_minmax.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 4fac089..f1d4f1b 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -76,9 +76,7 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ } return result; } -#endif -#if IS_X86_64 MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -99,9 +97,7 @@ MinMaxResult minmax_avx(const float *a, size_t length) { return reduce_result_from_mm256(min_vals, max_vals, result); } -#endif -#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_vals, MinMaxResult result) { float temp_min[16], temp_max[16]; _mm512_storeu_ps(temp_min, min_vals); @@ -112,9 +108,7 @@ static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_ } return result; } -#endif -#if IS_X86_64 MinMaxResult minmax_avx512(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX };
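
For reference, a minimal sketch of exercising the extension that _minmax_cffi.py builds (module name "_numpy_minmax", per the set_source call above). It assumes the cdef block, which is truncated in the hunk shown earlier, declares minmax_contiguous(float *, size_t) with the same signature as in _minmax.c; the actual package wrapper may differ:

    import numpy as np
    from _numpy_minmax import ffi, lib  # extension module produced by _minmax_cffi.py

    # The C code operates on contiguous float32 data.
    a = np.random.uniform(low=-6.0, high=3.0, size=257).astype(np.float32)

    # Hand the C function a raw float pointer and the element count.
    ptr = ffi.cast("float *", a.ctypes.data)
    result = lib.minmax_contiguous(ptr, len(a))  # returns a MinMaxResult struct by value

    print(result.min_val, result.max_val)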