From e59883d637dc8e9e8b1a1f3400c54b7a18befca1 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 15 Jul 2024 10:25:06 +0200 Subject: [PATCH 01/16] Add ARM support --- .github/workflows/build.yml | 3 ++- numpy_minmax/_minmax.c | 47 +++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 36df60d..a3be9fa 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,7 +11,8 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest] + #os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm64] + os: [ubuntu-22.04-arm64] python-version: [3.9] if: startsWith(github.event.head_commit.message, 'Release') || github.event_name == 'workflow_dispatch' diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 147740a..64a5837 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -1,7 +1,10 @@ #include -#include #include +#if defined(__x86_64__) || defined(_M_X64) +#include +#endif + #ifdef _MSC_VER #include // MSVC #else @@ -127,15 +130,20 @@ MinMaxResult minmax_contiguous(const float *a, size_t length) { if (length == 0) { return (MinMaxResult){0.0, 0.0}; } - if (length >= 16) { - if (system_supports_avx512()) { - return minmax_avx512(a, length); + + #if defined(__x86_64__) || defined(_M_X64) + if (length >= 16) { + if (system_supports_avx512()) { + return minmax_avx512(a, length); + } else { + return minmax_avx(a, length); + } } else { - return minmax_avx(a, length); + return minmax_pairwise(a, length); } - } else { + #else return minmax_pairwise(a, length); - } + #endif } // Takes the pairwise min/max on strided input. Strides are in number of bytes, @@ -220,18 +228,23 @@ MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { if (length == 0) { return (MinMaxResult){0.0, 0.0}; } - if (stride < 0){ - if (-stride == sizeof(float)){ - return minmax_contiguous(a - length + 1, length); + + #if defined(__x86_64__) || defined(_M_X64) + if (stride < 0){ + if (-stride == sizeof(float)){ + return minmax_contiguous(a - length + 1, length); + } + if (length < 16){ + return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + } + return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + } if (length < 16){ - return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + return minmax_pairwise_strided((Byte*)a, length, stride); } - return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); - - } - if (length < 16){ + return minmax_avx_strided((Byte*)a, length, stride); + #else return minmax_pairwise_strided((Byte*)a, length, stride); - } - return minmax_avx_strided((Byte*)a, length, stride); + #endif } From 5918647974675a3d2c053d417568f8d178d2914e Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 15 Jul 2024 10:33:20 +0200 Subject: [PATCH 02/16] Make AVX compile args conditional --- numpy_minmax/_minmax_cffi.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/numpy_minmax/_minmax_cffi.py b/numpy_minmax/_minmax_cffi.py index ee6ac21..aef9d86 100644 --- a/numpy_minmax/_minmax_cffi.py +++ b/numpy_minmax/_minmax_cffi.py @@ -1,7 +1,7 @@ import os +import platform from cffi import FFI - ffibuilder = FFI() ffibuilder.cdef(""" typedef struct { @@ -18,10 +18,14 @@ with open(c_file_path, "r") as file: c_code = file.read() -extra_compile_args = ["-mavx", "-mavx512f", "-O3", "-Wall"] +extra_compile_args = ["-O3", 
"-Wall"] if os.name == "posix": extra_compile_args.append("-Wextra") +# Detect architecture and set appropriate SIMD-related compile args +if platform.machine().lower() in ["x86_64", "amd64", "i386", "i686"]: + extra_compile_args.append("-mavx") + ffibuilder.set_source("_numpy_minmax", c_code, extra_compile_args=extra_compile_args) From f837b611017499e7a347a2c54898abb2c783f23a Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 15 Jul 2024 10:39:43 +0200 Subject: [PATCH 03/16] Don't include cpuid.h when compiling for ARM --- numpy_minmax/_minmax.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 64a5837..4fdd12b 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -2,19 +2,20 @@ #include #if defined(__x86_64__) || defined(_M_X64) -#include -#endif + #include -#ifdef _MSC_VER - #include // MSVC -#else - #include // GCC and Clang -#endif + #ifdef _MSC_VER + #include // MSVC + #else + #include // GCC and Clang + #endif -#ifndef bit_AVX512F -#define bit_AVX512F (1 << 16) + #ifndef bit_AVX512F + #define bit_AVX512F (1 << 16) + #endif #endif + typedef struct { float min_val; float max_val; From 438532d03cb386dfb1066252283b36d899cde963 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 15 Jul 2024 10:44:44 +0200 Subject: [PATCH 04/16] Conditionally exclude AVX-enabled functions --- numpy_minmax/_minmax.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 4fdd12b..a56e52a 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -23,6 +23,7 @@ typedef struct { typedef unsigned char Byte; +#if defined(__x86_64__) || defined(_M_X64) bool system_supports_avx512() { unsigned int eax, ebx, ecx, edx; @@ -40,6 +41,7 @@ bool system_supports_avx512() { // Check the AVX512F bit in EBX return (ebx & bit_AVX512F) != 0; } +#endif static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { // Initialize min and max with the last element of the array. @@ -61,6 +63,7 @@ static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { return result; } +#if defined(__x86_64__) || defined(_M_X64) static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_vals, MinMaxResult result) { float temp_min[8], temp_max[8]; _mm256_storeu_ps(temp_min, min_vals); @@ -72,6 +75,7 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ return result; } + MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -125,6 +129,7 @@ MinMaxResult minmax_avx512(const float *a, size_t length) { return reduce_result_from_mm512(min_vals, max_vals, result); } +#endif MinMaxResult minmax_contiguous(const float *a, size_t length) { // Return early for empty arrays @@ -147,6 +152,7 @@ MinMaxResult minmax_contiguous(const float *a, size_t length) { #endif } +#if defined(__x86_64__) || defined(_M_X64) // Takes the pairwise min/max on strided input. Strides are in number of bytes, // which is why the data pointer is Byte (i.e. 
unsigned char) MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) { @@ -222,6 +228,7 @@ MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { return reduce_result_from_mm256(min_vals, max_vals, result); } +#endif MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { From 810705663716024187cfdedf99036516d970dff5 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 13:15:14 +0200 Subject: [PATCH 05/16] Don't exclude minmax_pairwise_strided in arm builds --- numpy_minmax/_minmax.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index a56e52a..47f6099 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -74,8 +74,9 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ } return result; } +#endif - +#if defined(__x86_64__) || defined(_M_X64) MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -96,7 +97,9 @@ MinMaxResult minmax_avx(const float *a, size_t length) { return reduce_result_from_mm256(min_vals, max_vals, result); } +#endif +#if defined(__x86_64__) || defined(_M_X64) static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_vals, MinMaxResult result) { float temp_min[16], temp_max[16]; _mm512_storeu_ps(temp_min, min_vals); @@ -107,7 +110,9 @@ static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_ } return result; } +#endif +#if defined(__x86_64__) || defined(_M_X64) MinMaxResult minmax_avx512(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -152,7 +157,6 @@ MinMaxResult minmax_contiguous(const float *a, size_t length) { #endif } -#if defined(__x86_64__) || defined(_M_X64) // Takes the pairwise min/max on strided input. Strides are in number of bytes, // which is why the data pointer is Byte (i.e. unsigned char) MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) { @@ -185,6 +189,7 @@ MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) return result; } +#if defined(__x86_64__) || defined(_M_X64) // Takes the avx min/max on strided input. Strides are in number of bytes, // which is why the data pointer is Byte (i.e. 
unsigned char) MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { @@ -220,7 +225,6 @@ MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { max_vals = _mm256_max_ps(max_vals, vals); } - // Process remainder elements if (i < length*stride){ result = minmax_pairwise_strided(a + i, length - i / stride, stride); From d34566af98ec90b71107aed3f2ad4aa8596f30c0 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:00:57 +0200 Subject: [PATCH 06/16] Handle negative strides correctly on arm --- numpy_minmax/_minmax.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 47f6099..181d80f 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -241,17 +241,21 @@ MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { return (MinMaxResult){0.0, 0.0}; } - #if defined(__x86_64__) || defined(_M_X64) - if (stride < 0){ - if (-stride == sizeof(float)){ - return minmax_contiguous(a - length + 1, length); - } - if (length < 16){ - return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); - } + if (stride < 0){ + if (-stride == sizeof(float)){ + return minmax_contiguous(a - length + 1, length); + } + if (length < 16){ + return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + } + #if defined(__x86_64__) || defined(_M_X64) return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + #else + return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); + #endif + } - } + #if defined(__x86_64__) || defined(_M_X64) if (length < 16){ return minmax_pairwise_strided((Byte*)a, length, stride); } From 06f7160c3ad32888529f599274e7e6f33cf57695 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:08:29 +0200 Subject: [PATCH 07/16] Dry the x86 check --- numpy_minmax/_minmax.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 181d80f..86bd89b 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -1,7 +1,9 @@ #include #include -#if defined(__x86_64__) || defined(_M_X64) +#define IS_X86_64 (defined(__x86_64__) || defined(_M_X64)) + +#if IS_X86_64 #include #ifdef _MSC_VER @@ -23,7 +25,7 @@ typedef struct { typedef unsigned char Byte; -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 bool system_supports_avx512() { unsigned int eax, ebx, ecx, edx; @@ -63,7 +65,7 @@ static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { return result; } -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_vals, MinMaxResult result) { float temp_min[8], temp_max[8]; _mm256_storeu_ps(temp_min, min_vals); @@ -76,7 +78,7 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ } #endif -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -99,7 +101,7 @@ MinMaxResult minmax_avx(const float *a, size_t length) { } #endif -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_vals, MinMaxResult result) { float temp_min[16], temp_max[16]; _mm512_storeu_ps(temp_min, min_vals); @@ -112,7 +114,7 @@ static inline 
MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_ } #endif -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 MinMaxResult minmax_avx512(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -142,7 +144,7 @@ MinMaxResult minmax_contiguous(const float *a, size_t length) { return (MinMaxResult){0.0, 0.0}; } - #if defined(__x86_64__) || defined(_M_X64) + #if IS_X86_64 if (length >= 16) { if (system_supports_avx512()) { return minmax_avx512(a, length); @@ -189,7 +191,7 @@ MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) return result; } -#if defined(__x86_64__) || defined(_M_X64) +#if IS_X86_64 // Takes the avx min/max on strided input. Strides are in number of bytes, // which is why the data pointer is Byte (i.e. unsigned char) MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { @@ -248,14 +250,14 @@ MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { if (length < 16){ return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); } - #if defined(__x86_64__) || defined(_M_X64) + #if IS_X86_64 return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); #else return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); #endif } - #if defined(__x86_64__) || defined(_M_X64) + #if IS_X86_64 if (length < 16){ return minmax_pairwise_strided((Byte*)a, length, stride); } From 0f3bf76aa62e912de78d9129807962ecb4809ec4 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:17:09 +0200 Subject: [PATCH 08/16] Add back the missing -mavx512f compile arg --- numpy_minmax/_minmax_cffi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numpy_minmax/_minmax_cffi.py b/numpy_minmax/_minmax_cffi.py index aef9d86..39c1f9f 100644 --- a/numpy_minmax/_minmax_cffi.py +++ b/numpy_minmax/_minmax_cffi.py @@ -25,6 +25,7 @@ # Detect architecture and set appropriate SIMD-related compile args if platform.machine().lower() in ["x86_64", "amd64", "i386", "i686"]: extra_compile_args.append("-mavx") + extra_compile_args.append("-mavx512f") ffibuilder.set_source("_numpy_minmax", c_code, extra_compile_args=extra_compile_args) From 921b02ce8e0d2fcc82d45dbe7adbbdc0945d7bcf Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:17:21 +0200 Subject: [PATCH 09/16] Reindent preprocessor directives --- numpy_minmax/_minmax.c | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 86bd89b..4fac089 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -144,19 +144,19 @@ MinMaxResult minmax_contiguous(const float *a, size_t length) { return (MinMaxResult){0.0, 0.0}; } - #if IS_X86_64 - if (length >= 16) { - if (system_supports_avx512()) { - return minmax_avx512(a, length); - } else { - return minmax_avx(a, length); - } +#if IS_X86_64 + if (length >= 16) { + if (system_supports_avx512()) { + return minmax_avx512(a, length); } else { - return minmax_pairwise(a, length); + return minmax_avx(a, length); } - #else + } else { return minmax_pairwise(a, length); - #endif + } +#else + return minmax_pairwise(a, length); +#endif } // Takes the pairwise min/max on strided input. 
Strides are in number of bytes, @@ -250,19 +250,19 @@ MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { if (length < 16){ return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); } - #if IS_X86_64 - return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); - #else - return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); - #endif +#if IS_X86_64 + return minmax_avx_strided((Byte*)(a) + (length - 1)*stride, length, -stride); +#else + return minmax_pairwise_strided((Byte*)(a) + (length - 1)*stride, length, -stride); +#endif } - #if IS_X86_64 - if (length < 16){ - return minmax_pairwise_strided((Byte*)a, length, stride); - } - return minmax_avx_strided((Byte*)a, length, stride); - #else +#if IS_X86_64 + if (length < 16){ return minmax_pairwise_strided((Byte*)a, length, stride); - #endif + } + return minmax_avx_strided((Byte*)a, length, stride); +#else + return minmax_pairwise_strided((Byte*)a, length, stride); +#endif } From 064894b739165b960091ed382db3b2d8fbb31367 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:17:33 +0200 Subject: [PATCH 10/16] Compile for all OSes now --- .github/workflows/build.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a3be9fa..abd2c55 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,8 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - #os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm64] - os: [ubuntu-22.04-arm64] + os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm64] python-version: [3.9] if: startsWith(github.event.head_commit.message, 'Release') || github.event_name == 'workflow_dispatch' From 5bdde0924a8163e283264005a8e564b0ee3e1e8c Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:22:49 +0200 Subject: [PATCH 11/16] Limit numpy to <2 for now. Explicitly add cffi dependency. 
--- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7fa218a..78c8330 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,8 @@ name = "numpy-minmax" version = "0.2.1" description = "A fast python library for finding both min and max value in a NumPy array" dependencies = [ - "numpy>=1.21" + "cffi>=1.0.0", + "numpy>=1.21,<2" ] readme = "README.md" license = { file = "LICENSE" } From 1bb3801c9c831e49c06ceae48e017c47b3fd7a21 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:31:36 +0200 Subject: [PATCH 12/16] Distribute source --- MANIFEST.in | 1 + packaging.md | 1 + 2 files changed, 2 insertions(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..bfcb3e5 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +recursive-include numpy_minmax *.c diff --git a/packaging.md b/packaging.md index c9533eb..3ad4417 100644 --- a/packaging.md +++ b/packaging.md @@ -5,6 +5,7 @@ * Wait for build workflow in GitHub Actions to complete * Download wheels artifact from the build workflow * Place all the fresh whl files in dist/ +* `python setup.py sdist` * `python -m twine upload dist/*` * Add a tag with name "x.y.z" to the commit * Go to https://github.com/nomonosound/numpy-minmax/releases and create a release where you choose the new tag From bbbd3e06e33c2bd1e059f21bf860c834faedde42 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 14:39:34 +0200 Subject: [PATCH 13/16] Build for macOS (arm64) --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index abd2c55..50174c6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm64] + os: [ubuntu-latest, windows-latest, ubuntu-22.04-arm64, macos-latest] python-version: [3.9] if: startsWith(github.event.head_commit.message, 'Release') || github.event_name == 'workflow_dispatch' From 04bf6d65ff6c4e45702d07372d79f0c32aedb202 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 15:37:44 +0200 Subject: [PATCH 14/16] Regroup the code by ISA --- numpy_minmax/_minmax.c | 156 +++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 83 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 4fac089..449c55b 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -3,6 +3,13 @@ #define IS_X86_64 (defined(__x86_64__) || defined(_M_X64)) +typedef struct { + float min_val; + float max_val; +} MinMaxResult; + +typedef unsigned char Byte; + #if IS_X86_64 #include @@ -15,17 +22,7 @@ #ifndef bit_AVX512F #define bit_AVX512F (1 << 16) #endif -#endif - - -typedef struct { - float min_val; - float max_val; -} MinMaxResult; - -typedef unsigned char Byte; -#if IS_X86_64 bool system_supports_avx512() { unsigned int eax, ebx, ecx, edx; @@ -43,29 +40,7 @@ bool system_supports_avx512() { // Check the AVX512F bit in EBX return (ebx & bit_AVX512F) != 0; } -#endif -static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { - // Initialize min and max with the last element of the array. - // This ensures that it works correctly for odd length arrays as well as even. 
- MinMaxResult result = {.min_val = a[length -1], .max_val = a[length-1]}; - - // Process elements in pairs - for (size_t i = 0; i < length - 1; i += 2) { - float smaller = a[i] < a[i + 1] ? a[i] : a[i + 1]; - float larger = a[i] < a[i + 1] ? a[i + 1] : a[i]; - - if (smaller < result.min_val) { - result.min_val = smaller; - } - if (larger > result.max_val) { - result.max_val = larger; - } - } - return result; -} - -#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_vals, MinMaxResult result) { float temp_min[8], temp_max[8]; _mm256_storeu_ps(temp_min, min_vals); @@ -76,9 +51,7 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ } return result; } -#endif -#if IS_X86_64 MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -99,9 +72,7 @@ MinMaxResult minmax_avx(const float *a, size_t length) { return reduce_result_from_mm256(min_vals, max_vals, result); } -#endif -#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_vals, MinMaxResult result) { float temp_min[16], temp_max[16]; _mm512_storeu_ps(temp_min, min_vals); @@ -112,9 +83,7 @@ static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_ } return result; } -#endif -#if IS_X86_64 MinMaxResult minmax_avx512(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -136,8 +105,74 @@ MinMaxResult minmax_avx512(const float *a, size_t length) { return reduce_result_from_mm512(min_vals, max_vals, result); } + +// Takes the avx min/max on strided input. Strides are in number of bytes, +// which is why the data pointer is Byte (i.e. unsigned char) +MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { + MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; + + // This is faster than intrinsic gather on tested platforms + __m256 min_vals = _mm256_set_ps( + *(float*)(a), + *(float*)(a + stride), + *(float*)(a + 2*stride), + *(float*)(a + 3*stride), + *(float*)(a + 4*stride), + *(float*)(a + 5*stride), + *(float*)(a + 6*stride), + *(float*)(a + 7*stride) + ); + __m256 max_vals = min_vals; + + // Process elements in chunks of eight + size_t i = 8*stride; + for (; i <= (length - 8)*stride; i += 8*stride) { + __m256 vals = _mm256_set_ps( + *(float*)(a + i), + *(float*)(a + i + stride), + *(float*)(a + i + 2*stride), + *(float*)(a + i + 3*stride), + *(float*)(a + i + 4*stride), + *(float*)(a + i + 5*stride), + *(float*)(a + i + 6*stride), + *(float*)(a + i + 7*stride) + ); + min_vals = _mm256_min_ps(min_vals, vals); + max_vals = _mm256_max_ps(max_vals, vals); + } + + // Process remainder elements + if (i < length*stride){ + result = minmax_pairwise_strided(a + i, length - i / stride, stride); + } + + return reduce_result_from_mm256(min_vals, max_vals, result); +} #endif + + +static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { + // Initialize min and max with the last element of the array. + // This ensures that it works correctly for odd length arrays as well as even. + MinMaxResult result = {.min_val = a[length -1], .max_val = a[length-1]}; + + // Process elements in pairs + for (size_t i = 0; i < length - 1; i += 2) { + float smaller = a[i] < a[i + 1] ? a[i] : a[i + 1]; + float larger = a[i] < a[i + 1] ? 
a[i + 1] : a[i]; + + if (smaller < result.min_val) { + result.min_val = smaller; + } + if (larger > result.max_val) { + result.max_val = larger; + } + } + return result; +} + + MinMaxResult minmax_contiguous(const float *a, size_t length) { // Return early for empty arrays if (length == 0) { @@ -191,51 +226,6 @@ MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) return result; } -#if IS_X86_64 -// Takes the avx min/max on strided input. Strides are in number of bytes, -// which is why the data pointer is Byte (i.e. unsigned char) -MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { - MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; - - // This is faster than intrinsic gather on tested platforms - __m256 min_vals = _mm256_set_ps( - *(float*)(a), - *(float*)(a + stride), - *(float*)(a + 2*stride), - *(float*)(a + 3*stride), - *(float*)(a + 4*stride), - *(float*)(a + 5*stride), - *(float*)(a + 6*stride), - *(float*)(a + 7*stride) - ); - __m256 max_vals = min_vals; - - // Process elements in chunks of eight - size_t i = 8*stride; - for (; i <= (length - 8)*stride; i += 8*stride) { - __m256 vals = _mm256_set_ps( - *(float*)(a + i), - *(float*)(a + i + stride), - *(float*)(a + i + 2*stride), - *(float*)(a + i + 3*stride), - *(float*)(a + i + 4*stride), - *(float*)(a + i + 5*stride), - *(float*)(a + i + 6*stride), - *(float*)(a + i + 7*stride) - ); - min_vals = _mm256_min_ps(min_vals, vals); - max_vals = _mm256_max_ps(max_vals, vals); - } - - // Process remainder elements - if (i < length*stride){ - result = minmax_pairwise_strided(a + i, length - i / stride, stride); - } - - return reduce_result_from_mm256(min_vals, max_vals, result); -} -#endif - MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { // Return early for empty arrays From 75db06466f77976f87a301267c970f9acb63c21a Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 15:44:44 +0200 Subject: [PATCH 15/16] Revert "Regroup the code by ISA" This reverts commit 04bf6d65ff6c4e45702d07372d79f0c32aedb202. --- numpy_minmax/_minmax.c | 156 ++++++++++++++++++++++------------------- 1 file changed, 83 insertions(+), 73 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 449c55b..4fac089 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -3,13 +3,6 @@ #define IS_X86_64 (defined(__x86_64__) || defined(_M_X64)) -typedef struct { - float min_val; - float max_val; -} MinMaxResult; - -typedef unsigned char Byte; - #if IS_X86_64 #include @@ -22,7 +15,17 @@ typedef unsigned char Byte; #ifndef bit_AVX512F #define bit_AVX512F (1 << 16) #endif +#endif + + +typedef struct { + float min_val; + float max_val; +} MinMaxResult; + +typedef unsigned char Byte; +#if IS_X86_64 bool system_supports_avx512() { unsigned int eax, ebx, ecx, edx; @@ -40,7 +43,29 @@ bool system_supports_avx512() { // Check the AVX512F bit in EBX return (ebx & bit_AVX512F) != 0; } +#endif +static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { + // Initialize min and max with the last element of the array. + // This ensures that it works correctly for odd length arrays as well as even. + MinMaxResult result = {.min_val = a[length -1], .max_val = a[length-1]}; + + // Process elements in pairs + for (size_t i = 0; i < length - 1; i += 2) { + float smaller = a[i] < a[i + 1] ? a[i] : a[i + 1]; + float larger = a[i] < a[i + 1] ? 
a[i + 1] : a[i]; + + if (smaller < result.min_val) { + result.min_val = smaller; + } + if (larger > result.max_val) { + result.max_val = larger; + } + } + return result; +} + +#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_vals, MinMaxResult result) { float temp_min[8], temp_max[8]; _mm256_storeu_ps(temp_min, min_vals); @@ -51,7 +76,9 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ } return result; } +#endif +#if IS_X86_64 MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -72,7 +99,9 @@ MinMaxResult minmax_avx(const float *a, size_t length) { return reduce_result_from_mm256(min_vals, max_vals, result); } +#endif +#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_vals, MinMaxResult result) { float temp_min[16], temp_max[16]; _mm512_storeu_ps(temp_min, min_vals); @@ -83,7 +112,9 @@ static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_ } return result; } +#endif +#if IS_X86_64 MinMaxResult minmax_avx512(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -105,74 +136,8 @@ MinMaxResult minmax_avx512(const float *a, size_t length) { return reduce_result_from_mm512(min_vals, max_vals, result); } - -// Takes the avx min/max on strided input. Strides are in number of bytes, -// which is why the data pointer is Byte (i.e. unsigned char) -MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { - MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; - - // This is faster than intrinsic gather on tested platforms - __m256 min_vals = _mm256_set_ps( - *(float*)(a), - *(float*)(a + stride), - *(float*)(a + 2*stride), - *(float*)(a + 3*stride), - *(float*)(a + 4*stride), - *(float*)(a + 5*stride), - *(float*)(a + 6*stride), - *(float*)(a + 7*stride) - ); - __m256 max_vals = min_vals; - - // Process elements in chunks of eight - size_t i = 8*stride; - for (; i <= (length - 8)*stride; i += 8*stride) { - __m256 vals = _mm256_set_ps( - *(float*)(a + i), - *(float*)(a + i + stride), - *(float*)(a + i + 2*stride), - *(float*)(a + i + 3*stride), - *(float*)(a + i + 4*stride), - *(float*)(a + i + 5*stride), - *(float*)(a + i + 6*stride), - *(float*)(a + i + 7*stride) - ); - min_vals = _mm256_min_ps(min_vals, vals); - max_vals = _mm256_max_ps(max_vals, vals); - } - - // Process remainder elements - if (i < length*stride){ - result = minmax_pairwise_strided(a + i, length - i / stride, stride); - } - - return reduce_result_from_mm256(min_vals, max_vals, result); -} #endif - - -static inline MinMaxResult minmax_pairwise(const float *a, size_t length) { - // Initialize min and max with the last element of the array. - // This ensures that it works correctly for odd length arrays as well as even. - MinMaxResult result = {.min_val = a[length -1], .max_val = a[length-1]}; - - // Process elements in pairs - for (size_t i = 0; i < length - 1; i += 2) { - float smaller = a[i] < a[i + 1] ? a[i] : a[i + 1]; - float larger = a[i] < a[i + 1] ? 
a[i + 1] : a[i]; - - if (smaller < result.min_val) { - result.min_val = smaller; - } - if (larger > result.max_val) { - result.max_val = larger; - } - } - return result; -} - - MinMaxResult minmax_contiguous(const float *a, size_t length) { // Return early for empty arrays if (length == 0) { @@ -226,6 +191,51 @@ MinMaxResult minmax_pairwise_strided(const Byte *a, size_t length, long stride) return result; } +#if IS_X86_64 +// Takes the avx min/max on strided input. Strides are in number of bytes, +// which is why the data pointer is Byte (i.e. unsigned char) +MinMaxResult minmax_avx_strided(const Byte *a, size_t length, long stride) { + MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; + + // This is faster than intrinsic gather on tested platforms + __m256 min_vals = _mm256_set_ps( + *(float*)(a), + *(float*)(a + stride), + *(float*)(a + 2*stride), + *(float*)(a + 3*stride), + *(float*)(a + 4*stride), + *(float*)(a + 5*stride), + *(float*)(a + 6*stride), + *(float*)(a + 7*stride) + ); + __m256 max_vals = min_vals; + + // Process elements in chunks of eight + size_t i = 8*stride; + for (; i <= (length - 8)*stride; i += 8*stride) { + __m256 vals = _mm256_set_ps( + *(float*)(a + i), + *(float*)(a + i + stride), + *(float*)(a + i + 2*stride), + *(float*)(a + i + 3*stride), + *(float*)(a + i + 4*stride), + *(float*)(a + i + 5*stride), + *(float*)(a + i + 6*stride), + *(float*)(a + i + 7*stride) + ); + min_vals = _mm256_min_ps(min_vals, vals); + max_vals = _mm256_max_ps(max_vals, vals); + } + + // Process remainder elements + if (i < length*stride){ + result = minmax_pairwise_strided(a + i, length - i / stride, stride); + } + + return reduce_result_from_mm256(min_vals, max_vals, result); +} +#endif + MinMaxResult minmax_1d_strided(const float *a, size_t length, long stride) { // Return early for empty arrays From 1d6aba99ddad8b0e681843bd94d252f02bb5a679 Mon Sep 17 00:00:00 2001 From: iver56 Date: Mon, 29 Jul 2024 15:46:13 +0200 Subject: [PATCH 16/16] Remove some redundant ifs --- numpy_minmax/_minmax.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/numpy_minmax/_minmax.c b/numpy_minmax/_minmax.c index 4fac089..f1d4f1b 100644 --- a/numpy_minmax/_minmax.c +++ b/numpy_minmax/_minmax.c @@ -76,9 +76,7 @@ static inline MinMaxResult reduce_result_from_mm256(__m256 min_vals, __m256 max_ } return result; } -#endif -#if IS_X86_64 MinMaxResult minmax_avx(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX }; @@ -99,9 +97,7 @@ MinMaxResult minmax_avx(const float *a, size_t length) { return reduce_result_from_mm256(min_vals, max_vals, result); } -#endif -#if IS_X86_64 static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_vals, MinMaxResult result) { float temp_min[16], temp_max[16]; _mm512_storeu_ps(temp_min, min_vals); @@ -112,9 +108,7 @@ static inline MinMaxResult reduce_result_from_mm512(__m512 min_vals, __m512 max_ } return result; } -#endif -#if IS_X86_64 MinMaxResult minmax_avx512(const float *a, size_t length) { MinMaxResult result = { .min_val = FLT_MAX, .max_val = -FLT_MAX };
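
For reference, a minimal sketch of exercising the extension that _minmax_cffi.py builds (module name "_numpy_minmax", per the set_source call above). It assumes the cdef block, which is truncated in the hunk shown earlier, declares minmax_contiguous(float *, size_t) with the same signature as in _minmax.c; the actual package wrapper may differ:

    import numpy as np
    from _numpy_minmax import ffi, lib  # extension module produced by _minmax_cffi.py

    # The C code operates on contiguous float32 data.
    a = np.random.uniform(low=-6.0, high=3.0, size=257).astype(np.float32)

    # Hand the C function a raw float pointer and the element count.
    ptr = ffi.cast("float *", a.ctypes.data)
    result = lib.minmax_contiguous(ptr, len(a))  # returns a MinMaxResult struct by value

    print(result.min_val, result.max_val)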