diff --git a/libraries/lib-time-and-pitch/StaffPad/SimdComplexConversions_sse2.h b/libraries/lib-time-and-pitch/StaffPad/SimdComplexConversions_sse2.h index 787b8886efad..160191a321a8 100644 --- a/libraries/lib-time-and-pitch/StaffPad/SimdComplexConversions_sse2.h +++ b/libraries/lib-time-and-pitch/StaffPad/SimdComplexConversions_sse2.h @@ -299,32 +299,25 @@ template void perform_parallel_simd_aligned( const std::complex* input, float* output, int n, const fnc& f) { - // fnc& f needs to be a lambda of type [](auto &a, auto &b){}. - // the autos will be float_x4/float - constexpr int N = 4; - constexpr int byte_size = sizeof(float); - - for (int i = 0; i <= n - N; i += N) + for (int i = 0; i <= n - 4; i += 4) { - alignas(N * byte_size) float temp_real[N] { real(input[i + 0]), - real(input[i + 1]), - real(input[i + 2]), - real(input[i + 3]) }; - - alignas(N * byte_size) float temp_imag[N] { imag(input[i + 0]), - imag(input[i + 1]), - imag(input[i + 2]), - imag(input[i + 3]) }; - - auto rp = _mm_load_ps(temp_real); - auto ip = _mm_load_ps(temp_imag); + // Safe according to C++ standard + auto p1 = _mm_load_ps(reinterpret_cast(input + i)); + auto p2 = _mm_load_ps(reinterpret_cast(input + i + 2)); + + // p1 = {real(c1), imag(c1), real(c2), imag(c2)} + // p2 = {real(c3), imag(c3), real(c4), imag(c4)} + + auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0)); + auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1)); + __m128 out; f(rp, ip, out); _mm_store_ps(output + i, out); } // deal with last partial packet - for (int i = n & (~(N - 1)); i < n; ++i) + for (int i = n & (~3); i < n; ++i) { __m128 out; f(_mm_set_ss(real(input[i])), _mm_set_ss(imag(input[i])), out); @@ -336,10 +329,7 @@ inline void rotate_parallel_simd_aligned( const float* oldPhase, const float* newPhase, std::complex* output, int n) { - constexpr int N = 4; - constexpr int byte_size = sizeof(float); - - for (int i = 0; i <= n - N; i += N) + for (int i = 0; i <= n - 4; i += 4) { auto [sin, cos] = sincos_ps( _mm_sub_ps( @@ -347,16 +337,15 @@ inline void rotate_parallel_simd_aligned( _mm_load_ps(oldPhase + i))); - alignas( - 16) float temp_real[N] = { real(output[i + 0]), real(output[i + 1]), - real(output[i + 2]), real(output[i + 3]) }; + // Safe according to C++ standard + auto p1 = _mm_load_ps(reinterpret_cast(output + i)); + auto p2 = _mm_load_ps(reinterpret_cast(output + i + 2)); - alignas( - 16) float temp_imag[N] = { imag(output[i + 0]), imag(output[i + 1]), - imag(output[i + 2]), imag(output[i + 3]) }; + // p1 = {real(c1), imag(c1), real(c2), imag(c2)} + // p2 = {real(c3), imag(c3), real(c4), imag(c4)} - auto rp = _mm_load_ps(temp_real); - auto ip = _mm_load_ps(temp_imag); + auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0)); + auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1)); // We need to calculate (rp, ip) * (cos, sin) -> (rp*cos - ip*sin, rp*sin + ip*cos) @@ -364,14 +353,14 @@ inline void rotate_parallel_simd_aligned( auto out_rp = _mm_sub_ps(_mm_mul_ps(rp, cos), _mm_mul_ps(ip, sin)); auto out_ip = _mm_add_ps(_mm_mul_ps(rp, sin), _mm_mul_ps(ip, cos)); - _mm_store_ps(temp_real, out_rp); - _mm_store_ps(temp_imag, out_ip); + p1 = _mm_unpacklo_ps(out_rp, out_ip); + p2 = _mm_unpackhi_ps(out_rp, out_ip); - for (int j = 0; j < N; ++j) - output[i + j] = { temp_real[j], temp_imag[j] }; + _mm_store_ps(reinterpret_cast(output + i), p1); + _mm_store_ps(reinterpret_cast(output + i + 2), p2); } // deal with last partial packet - for (int i = n & (~(N - 1)); i < n; ++i) + for (int i = n & (~3); i < n; ++i) { auto theta = newPhase[i] - oldPhase[i]; output[i] *= std::complex(cosf(theta), sinf(theta));