Tired fixing this PR

AuburnSounds · Oct 23, 2024 · a5056c5 · a5056c5
1 parent 03925a5
commit a5056c5
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 3 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,15 +1,25 @@
 Advice:
 
+- Correctness with original semantics. Historically, contributions to this repository have given tons of work to the maintainer because of insufficient reading of this document and a general lack of care for correctness.
+  * If your work is not high-quality it will be auto-closed, because fixing it may take more time than creating it.
+  * Follow the rules here that stem from experience implementing SIMD intrinsics, a library that shall be very reliable.
+
+Here are the sources for semantics:
+  - Intel Intrinsics Guide
+  - When the guide and the instruction disagree, we look at what C++ compilers do 
+    for this intrinsic.
+
 - **GODBOLT EVERYTHING YOU COMMIT**
   * Use `godbolt-template.d` and modify it as you wish
   * GDC (version 12 or later) with -mavx -mavx2 (the template doesn't build without -mavx)
   * LDC (version 1.24+ or later) with -mtriple arm64, -O2, -O0, -mattr=+avx2, etc. 
 
 
-- Do intrinsics **one by one**, not all at once. This is **very** detailed work, it's not possible nor desirable to go fast while writing intrinsics. 
+- I implore you to do intrinsics **one by one**, not all at once. This is **very** detailed work, it's not possible nor desirable to go fast while writing intrinsics. 
    * Please don't go fast. 
    * Please make small PR because there is a lot of context to communicate.
    * Get pre-approval before working on something big.
+   * If you're a first time contributor, PR with only one intrinsic.
 
 - Add PERF comment anywhere you feel that something could be done faster in a supported combination: DMD D_SIMD, LDC x86_64, LDC arm64, LDC x86, GDC x86_64, with or without optimizations, with or without instruction support... 
   * If this is supposed to return a SIMD literal, does it inline?

diff --git a/source/inteli/avx2intrin.d b/source/inteli/avx2intrin.d
@@ -3796,10 +3796,41 @@ unittest
 /// Shift 128-bit lanes in `a` right by `bytes` bytes while shifting in zeroes.
 alias _mm256_srli_si256 = _mm256_bsrli_epi128;
 
-// TODO __m128i _mm_srlv_epi32 (__m128i a, __m128i count) pure @safe
+/// Shift packed 32-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes.
+/// Counts are compared as unsigned values: a lane whose count is >= 32 (which includes negative counts) yields 0.
+__m128i _mm_srlv_epi32(__m128i a, __m128i count) pure @trusted
+{
+    static if (GDC_with_AVX2 || LDC_with_AVX2)
+        return cast(__m128i)__builtin_ia32_psrlv4si(cast(byte16)a, cast(byte16)count);
+    else
+    {
+        // Shifting a 32-bit value by a negative amount or by >= 32 is not a defined operation,
+        // so each count is first reduced to 0..31. Lanes whose real count was out of range
+        // hold a meaningless value at this point; they are zeroed by the mask below.
+        __m128i R = _mm_setr_epi32(a.array[0] >>> (count.array[0] & 31),
+                                   a.array[1] >>> (count.array[1] & 31),
+                                   a.array[2] >>> (count.array[2] & 31),
+                                   a.array[3] >>> (count.array[3] & 31));
+
+        // Map large and negative shifts to 32 (unsigned minimum against 32)
+        __m128i mm32 = _mm_set1_epi32(32);
+        __m128i shift = _mm_min_epu32(count, mm32);
+
+        // Set to 0 where the shift is >= 32: `shift` is now in 0..32, so the signed
+        // comparison is safe and produces an all-ones mask only for in-range lanes.
+        R = R & _mm_cmplt_epi32(shift, mm32);
+        return R;
+    }
+}
+unittest
+{
+    // Lane 1 uses a negative count and lane 3 a count of exactly 32:
+    // both are out of range and must produce 0.
+    __m128i values = _mm_setr_epi32(-1,  1, 4, -4);
+    __m128i counts = _mm_setr_epi32( 2, -6, 1, 32);
+    int4 result = cast(int4) _mm_srlv_epi32(values, counts);
+    // Lane 0: all 32 bits set, shifted right by 2 with zero fill, gives 1073741823.
+    int[4] correct = [ 1073741823, 0, 2, 0 ];
+    assert(result.array == correct);
+}
+
+
 // TODO __m256i _mm256_srlv_epi32 (__m256i a, __m256i count) pure @safe
 
-/// Shift packed 64-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeroes.
+/// Shift packed 64-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes.
 __m128i _mm_srlv_epi64(__m128i a, __m128i count) pure @trusted
 {
     static if (GDC_or_LDC_with_AVX2)