Skip to content

Commit

Permalink
Use shuffling instead of temp memory storages
Browse files Browse the repository at this point in the history
  • Loading branch information
crsib committed Oct 4, 2023
1 parent b4d400f commit ce80ab5
Showing 1 changed file with 25 additions and 36 deletions.
61 changes: 25 additions & 36 deletions libraries/lib-time-and-pitch/StaffPad/SimdComplexConversions_sse2.h
Original file line number Diff line number Diff line change
Expand Up @@ -299,32 +299,25 @@ template <typename fnc>
void perform_parallel_simd_aligned(
const std::complex<float>* input, float* output, int n, const fnc& f)
{
// fnc& f needs to be a lambda of type [](auto &a, auto &b){}.
// the autos will be float_x4/float
constexpr int N = 4;
constexpr int byte_size = sizeof(float);

for (int i = 0; i <= n - N; i += N)
for (int i = 0; i <= n - 4; i += 4)
{
alignas(N * byte_size) float temp_real[N] { real(input[i + 0]),
real(input[i + 1]),
real(input[i + 2]),
real(input[i + 3]) };

alignas(N * byte_size) float temp_imag[N] { imag(input[i + 0]),
imag(input[i + 1]),
imag(input[i + 2]),
imag(input[i + 3]) };

auto rp = _mm_load_ps(temp_real);
auto ip = _mm_load_ps(temp_imag);
// Safe according to C++ standard
auto p1 = _mm_load_ps(reinterpret_cast<const float*>(input + i));
auto p2 = _mm_load_ps(reinterpret_cast<const float*>(input + i + 2));

// p1 = {real(c1), imag(c1), real(c2), imag(c2)}
// p2 = {real(c3), imag(c3), real(c4), imag(c4)}

auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0));
auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1));

__m128 out;
f(rp, ip, out);

_mm_store_ps(output + i, out);
}
// deal with last partial packet
for (int i = n & (~(N - 1)); i < n; ++i)
for (int i = n & (~3); i < n; ++i)
{
__m128 out;
f(_mm_set_ss(real(input[i])), _mm_set_ss(imag(input[i])), out);
Expand All @@ -336,42 +329,38 @@ inline void rotate_parallel_simd_aligned(
const float* oldPhase, const float* newPhase, std::complex<float>* output,
int n)
{
constexpr int N = 4;
constexpr int byte_size = sizeof(float);

for (int i = 0; i <= n - N; i += N)
for (int i = 0; i <= n - 4; i += 4)
{
auto [sin, cos] = sincos_ps(
_mm_sub_ps(
_mm_load_ps(newPhase + i),
_mm_load_ps(oldPhase + i)));


alignas(
16) float temp_real[N] = { real(output[i + 0]), real(output[i + 1]),
real(output[i + 2]), real(output[i + 3]) };
// Safe according to C++ standard
auto p1 = _mm_load_ps(reinterpret_cast<float*>(output + i));
auto p2 = _mm_load_ps(reinterpret_cast<float*>(output + i + 2));

alignas(
16) float temp_imag[N] = { imag(output[i + 0]), imag(output[i + 1]),
imag(output[i + 2]), imag(output[i + 3]) };
// p1 = {real(c1), imag(c1), real(c2), imag(c2)}
// p2 = {real(c3), imag(c3), real(c4), imag(c4)}

auto rp = _mm_load_ps(temp_real);
auto ip = _mm_load_ps(temp_imag);
auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0));
auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1));


// We need to calculate (rp, ip) * (cos, sin) -> (rp*cos - ip*sin, rp*sin + ip*cos)

auto out_rp = _mm_sub_ps(_mm_mul_ps(rp, cos), _mm_mul_ps(ip, sin));
auto out_ip = _mm_add_ps(_mm_mul_ps(rp, sin), _mm_mul_ps(ip, cos));

_mm_store_ps(temp_real, out_rp);
_mm_store_ps(temp_imag, out_ip);
p1 = _mm_unpacklo_ps(out_rp, out_ip);
p2 = _mm_unpackhi_ps(out_rp, out_ip);

for (int j = 0; j < N; ++j)
output[i + j] = { temp_real[j], temp_imag[j] };
_mm_store_ps(reinterpret_cast<float*>(output + i), p1);
_mm_store_ps(reinterpret_cast<float*>(output + i + 2), p2);
}
// deal with last partial packet
for (int i = n & (~(N - 1)); i < n; ++i)
for (int i = n & (~3); i < n; ++i)
{
auto theta = newPhase[i] - oldPhase[i];
output[i] *= std::complex<float>(cosf(theta), sinf(theta));
Expand Down

0 comments on commit ce80ab5

Please sign in to comment.