Skip to content

Commit

Permalink
Use faster SIMD for modulations
Browse files: browse the repository at this point in the history
  • Loading branch information
Andrew Evstyukhin committed Nov 18, 2022
1 parent fbb8124 commit d679b6e
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 27 deletions.
94 changes: 84 additions & 10 deletions src/Bc7CoreMode4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -869,17 +869,37 @@ namespace Mode4 {
static INLINED BlockError ComputeSubsetTable23(const Area& area, __m128i mc, uint64_t& indices2, uint64_t& indices3, const int rotation) noexcept
{
const __m128i mrot = GetRotationShuffleNarrow(rotation);
const __m128i mhalf = _mm_set1_epi16(32);

mc = _mm_packus_epi16(mc, mc);

const __m128i mmask3 = _mm_shuffle_epi8(_mm_set_epi16(-1, -1, -1, 0, -1, -1, -1, 0), mrot);

mc = _mm_shuffle_epi8(mc, mrot);
#if defined(OPTION_AVX2)
const __m256i vhalf = _mm256_set1_epi16(32);

__m256i vc = _mm256_broadcastq_epi64(mc);

const __m256i vmask3 = _mm256_broadcastq_epi64(mmask3);
#else
const __m128i mhalf = _mm_set1_epi16(32);
#endif

int errorAlpha = 0;
int error3;
{
Modulations state3;
#if defined(OPTION_AVX2)
__m256i vt = *(const __m256i*)gTableInterpolate2_U8;

vt = _mm256_maddubs_epi16(vc, vt);

vt = _mm256_add_epi16(vt, vhalf);

vt = _mm256_srli_epi16(vt, 6);

_mm256_store_si256((__m256i*)state3.Values_I16, _mm256_and_si256(vmask3, vt));
#else
__m128i mtx = gTableInterpolate2_U8[0];
__m128i mty = gTableInterpolate2_U8[1];

Expand All @@ -892,9 +912,9 @@ namespace Mode4 {
mtx = _mm_srli_epi16(mtx, 6);
mty = _mm_srli_epi16(mty, 6);

Modulations state3;
_mm_store_si128((__m128i*)&state3.Values_I16[0], _mm_and_si128(mmask3, mtx));
_mm_store_si128((__m128i*)state3.Values_I16, _mm_and_si128(mmask3, mtx));
_mm_store_si128((__m128i*)&state3.Values_I16[2], _mm_and_si128(mmask3, mty));
#endif

const __m128i mweights3 = _mm_and_si128(mmask3, gWeightsAGRB);

Expand Down Expand Up @@ -929,6 +949,23 @@ namespace Mode4 {

int error1;
{
Modulations state1;
#if defined(OPTION_AVX2)
__m256i vt0 = *(const __m256i*)&gTableInterpolate3_U8[0];
__m256i vt1 = *(const __m256i*)&gTableInterpolate3_U8[2];

vt0 = _mm256_maddubs_epi16(vc, vt0);
vt1 = _mm256_maddubs_epi16(vc, vt1);

vt0 = _mm256_add_epi16(vt0, vhalf);
vt1 = _mm256_add_epi16(vt1, vhalf);

vt0 = _mm256_srli_epi16(vt0, 6);
vt1 = _mm256_srli_epi16(vt1, 6);

_mm256_store_si256((__m256i*)state1.Values_I16, _mm256_andnot_si256(vmask3, vt0));
_mm256_store_si256((__m256i*)&state1.Values_I16[4], _mm256_andnot_si256(vmask3, vt1));
#else
__m128i mtx = gTableInterpolate3_U8[0];
__m128i mty = gTableInterpolate3_U8[1];
__m128i mtz = gTableInterpolate3_U8[2];
Expand All @@ -949,11 +986,11 @@ namespace Mode4 {
mtz = _mm_srli_epi16(mtz, 6);
mtw = _mm_srli_epi16(mtw, 6);

Modulations state1;
_mm_store_si128((__m128i*)&state1.Values_I16[0], _mm_andnot_si128(mmask3, mtx));
_mm_store_si128((__m128i*)state1.Values_I16, _mm_andnot_si128(mmask3, mtx));
_mm_store_si128((__m128i*)&state1.Values_I16[2], _mm_andnot_si128(mmask3, mty));
_mm_store_si128((__m128i*)&state1.Values_I16[4], _mm_andnot_si128(mmask3, mtz));
_mm_store_si128((__m128i*)&state1.Values_I16[6], _mm_andnot_si128(mmask3, mtw));
#endif

const __m128i mweights1 = _mm_andnot_si128(mmask3, gWeightsAGRB);

Expand Down Expand Up @@ -992,17 +1029,42 @@ namespace Mode4 {
static INLINED BlockError ComputeSubsetTable32(const Area& area, __m128i mc, uint64_t& indices2, uint64_t& indices3, const int rotation) noexcept
{
const __m128i mrot = GetRotationShuffleNarrow(rotation);
const __m128i mhalf = _mm_set1_epi16(32);

mc = _mm_packus_epi16(mc, mc);

const __m128i mmask3 = _mm_shuffle_epi8(_mm_set_epi16(-1, -1, -1, 0, -1, -1, -1, 0), mrot);

mc = _mm_shuffle_epi8(mc, mrot);
#if defined(OPTION_AVX2)
const __m256i vhalf = _mm256_set1_epi16(32);

__m256i vc = _mm256_broadcastq_epi64(mc);

const __m256i vmask3 = _mm256_broadcastq_epi64(mmask3);
#else
const __m128i mhalf = _mm_set1_epi16(32);
#endif

int errorAlpha = 0;
int error3;
{
Modulations state3;
#if defined(OPTION_AVX2)
__m256i vt0 = *(const __m256i*)&gTableInterpolate3_U8[0];
__m256i vt1 = *(const __m256i*)&gTableInterpolate3_U8[2];

vt0 = _mm256_maddubs_epi16(vc, vt0);
vt1 = _mm256_maddubs_epi16(vc, vt1);

vt0 = _mm256_add_epi16(vt0, vhalf);
vt1 = _mm256_add_epi16(vt1, vhalf);

vt0 = _mm256_srli_epi16(vt0, 6);
vt1 = _mm256_srli_epi16(vt1, 6);

_mm256_store_si256((__m256i*)state3.Values_I16, _mm256_and_si256(vmask3, vt0));
_mm256_store_si256((__m256i*)&state3.Values_I16[4], _mm256_and_si256(vmask3, vt1));
#else
__m128i mtx = gTableInterpolate3_U8[0];
__m128i mty = gTableInterpolate3_U8[1];
__m128i mtz = gTableInterpolate3_U8[2];
Expand All @@ -1023,11 +1085,11 @@ namespace Mode4 {
mtz = _mm_srli_epi16(mtz, 6);
mtw = _mm_srli_epi16(mtw, 6);

Modulations state3;
_mm_store_si128((__m128i*)&state3.Values_I16[0], _mm_and_si128(mmask3, mtx));
_mm_store_si128((__m128i*)state3.Values_I16, _mm_and_si128(mmask3, mtx));
_mm_store_si128((__m128i*)&state3.Values_I16[2], _mm_and_si128(mmask3, mty));
_mm_store_si128((__m128i*)&state3.Values_I16[4], _mm_and_si128(mmask3, mtz));
_mm_store_si128((__m128i*)&state3.Values_I16[6], _mm_and_si128(mmask3, mtw));
#endif

const __m128i mweights3 = _mm_and_si128(mmask3, gWeightsAGRB);

Expand Down Expand Up @@ -1062,6 +1124,18 @@ namespace Mode4 {

int error1;
{
Modulations state1;
#if defined(OPTION_AVX2)
__m256i vt = *(const __m256i*)gTableInterpolate2_U8;

vt = _mm256_maddubs_epi16(vc, vt);

vt = _mm256_add_epi16(vt, vhalf);

vt = _mm256_srli_epi16(vt, 6);

_mm256_store_si256((__m256i*)state1.Values_I16, _mm256_andnot_si256(vmask3, vt));
#else
__m128i mtx = gTableInterpolate2_U8[0];
__m128i mty = gTableInterpolate2_U8[1];

Expand All @@ -1074,9 +1148,9 @@ namespace Mode4 {
mtx = _mm_srli_epi16(mtx, 6);
mty = _mm_srli_epi16(mty, 6);

Modulations state1;
_mm_store_si128((__m128i*)&state1.Values_I16[0], _mm_andnot_si128(mmask3, mtx));
_mm_store_si128((__m128i*)state1.Values_I16, _mm_andnot_si128(mmask3, mtx));
_mm_store_si128((__m128i*)&state1.Values_I16[2], _mm_andnot_si128(mmask3, mty));
#endif

const __m128i mweights1 = _mm_andnot_si128(mmask3, gWeightsAGRB);

Expand Down
42 changes: 37 additions & 5 deletions src/Bc7CoreMode5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -430,17 +430,37 @@ namespace Mode5 {
static INLINED BlockError ComputeSubsetTable2(const Area& area, __m128i mc, uint64_t& indicesColor, uint64_t& indicesAlpha, const int rotation) noexcept
{
const __m128i mrot = GetRotationShuffleNarrow(rotation);
const __m128i mhalf = _mm_set1_epi16(32);

mc = _mm_packus_epi16(mc, mc);

const __m128i mmask3 = _mm_shuffle_epi8(_mm_set_epi16(-1, -1, -1, 0, -1, -1, -1, 0), mrot);

mc = _mm_shuffle_epi8(mc, mrot);
#if defined(OPTION_AVX2)
const __m256i vhalf = _mm256_set1_epi16(32);

__m256i vc = _mm256_broadcastq_epi64(mc);

const __m256i vmask3 = _mm256_broadcastq_epi64(mmask3);
#else
const __m128i mhalf = _mm_set1_epi16(32);
#endif

int errorAlpha = 0;
int error3;
{
Modulations state3;
#if defined(OPTION_AVX2)
__m256i vt = *(const __m256i*)gTableInterpolate2_U8;

vt = _mm256_maddubs_epi16(vc, vt);

vt = _mm256_add_epi16(vt, vhalf);

vt = _mm256_srli_epi16(vt, 6);

_mm256_store_si256((__m256i*)state3.Values_I16, _mm256_and_si256(vmask3, vt));
#else
__m128i mtx = gTableInterpolate2_U8[0];
__m128i mty = gTableInterpolate2_U8[1];

Expand All @@ -453,9 +473,9 @@ namespace Mode5 {
mtx = _mm_srli_epi16(mtx, 6);
mty = _mm_srli_epi16(mty, 6);

Modulations state3;
_mm_store_si128((__m128i*)&state3.Values_I16[0], _mm_and_si128(mmask3, mtx));
_mm_store_si128((__m128i*)state3.Values_I16, _mm_and_si128(mmask3, mtx));
_mm_store_si128((__m128i*)&state3.Values_I16[2], _mm_and_si128(mmask3, mty));
#endif

const __m128i mweights3 = _mm_and_si128(mmask3, gWeightsAGRB);

Expand Down Expand Up @@ -490,6 +510,18 @@ namespace Mode5 {

int error1;
{
Modulations state1;
#if defined(OPTION_AVX2)
__m256i vt = *(const __m256i*)gTableInterpolate2_U8;

vt = _mm256_maddubs_epi16(vc, vt);

vt = _mm256_add_epi16(vt, vhalf);

vt = _mm256_srli_epi16(vt, 6);

_mm256_store_si256((__m256i*)state1.Values_I16, _mm256_andnot_si256(vmask3, vt));
#else
__m128i mtx = gTableInterpolate2_U8[0];
__m128i mty = gTableInterpolate2_U8[1];

Expand All @@ -502,9 +534,9 @@ namespace Mode5 {
mtx = _mm_srli_epi16(mtx, 6);
mty = _mm_srli_epi16(mty, 6);

Modulations state1;
_mm_store_si128((__m128i*)&state1.Values_I16[0], _mm_andnot_si128(mmask3, mtx));
_mm_store_si128((__m128i*)state1.Values_I16, _mm_andnot_si128(mmask3, mtx));
_mm_store_si128((__m128i*)&state1.Values_I16[2], _mm_andnot_si128(mmask3, mty));
#endif

const __m128i mweights1 = _mm_andnot_si128(mmask3, gWeightsAGRB);

Expand Down
58 changes: 54 additions & 4 deletions src/Bc7CoreMode6.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -932,10 +932,60 @@ namespace Mode6 {

static INLINED BlockError ComputeSubsetTable4(const Area& area, __m128i mc, uint64_t& indices) noexcept
{
const __m128i mhalf = _mm_set1_epi16(32);

mc = _mm_packus_epi16(mc, mc);

Modulations state;
#if defined(OPTION_AVX512)
const __m512i whalf = _mm512_set1_epi16(32);

__m512i wc = _mm512_broadcastq_epi64(mc);

__m512i wt0 = *(const __m512i*)&gTableInterpolate4_U8[0];
__m512i wt1 = *(const __m512i*)&gTableInterpolate4_U8[4];

wt0 = _mm512_maddubs_epi16(wc, wt0);
wt1 = _mm512_maddubs_epi16(wc, wt1);

wt0 = _mm512_add_epi16(wt0, whalf);
wt1 = _mm512_add_epi16(wt1, whalf);

wt0 = _mm512_srli_epi16(wt0, 6);
wt1 = _mm512_srli_epi16(wt1, 6);

_mm512_store_epi32((__m256i*)state.Values_I16, wt0);
_mm512_store_epi32((__m256i*)&state.Values_I16[8], wt1);
#elif defined(OPTION_AVX2)
const __m256i vhalf = _mm256_set1_epi16(32);

__m256i vc = _mm256_broadcastq_epi64(mc);

__m256i vtx = *(const __m256i*)&gTableInterpolate4_U8[0];
__m256i vty = *(const __m256i*)&gTableInterpolate4_U8[2];
__m256i vtz = *(const __m256i*)&gTableInterpolate4_U8[4];
__m256i vtw = *(const __m256i*)&gTableInterpolate4_U8[6];

vtx = _mm256_maddubs_epi16(vc, vtx);
vty = _mm256_maddubs_epi16(vc, vty);
vtz = _mm256_maddubs_epi16(vc, vtz);
vtw = _mm256_maddubs_epi16(vc, vtw);

vtx = _mm256_add_epi16(vtx, vhalf);
vty = _mm256_add_epi16(vty, vhalf);
vtz = _mm256_add_epi16(vtz, vhalf);
vtw = _mm256_add_epi16(vtw, vhalf);

vtx = _mm256_srli_epi16(vtx, 6);
vty = _mm256_srli_epi16(vty, 6);
vtz = _mm256_srli_epi16(vtz, 6);
vtw = _mm256_srli_epi16(vtw, 6);

_mm256_store_si256((__m256i*)state.Values_I16, vtx);
_mm256_store_si256((__m256i*)&state.Values_I16[4], vty);
_mm256_store_si256((__m256i*)&state.Values_I16[8], vtz);
_mm256_store_si256((__m256i*)&state.Values_I16[12], vtw);
#else
const __m128i mhalf = _mm_set1_epi16(32);

__m128i mtx = gTableInterpolate4_U8[0];
__m128i mty = gTableInterpolate4_U8[1];
__m128i mtz = gTableInterpolate4_U8[2];
Expand Down Expand Up @@ -972,15 +1022,15 @@ namespace Mode6 {
mrz = _mm_srli_epi16(mrz, 6);
mrw = _mm_srli_epi16(mrw, 6);

Modulations state;
_mm_store_si128((__m128i*)&state.Values_I16[0], mtx);
_mm_store_si128((__m128i*)state.Values_I16, mtx);
_mm_store_si128((__m128i*)&state.Values_I16[2], mty);
_mm_store_si128((__m128i*)&state.Values_I16[4], mtz);
_mm_store_si128((__m128i*)&state.Values_I16[6], mtw);
_mm_store_si128((__m128i*)&state.Values_I16[8], mrx);
_mm_store_si128((__m128i*)&state.Values_I16[10], mry);
_mm_store_si128((__m128i*)&state.Values_I16[12], mrz);
_mm_store_si128((__m128i*)&state.Values_I16[14], mrw);
#endif

int error = ComputeSubsetTable4(area, gWeightsAGRB, state);

Expand Down
24 changes: 20 additions & 4 deletions src/SnippetComputeOpaqueSubset2.h
Original file line number Diff line number Diff line change
Expand Up @@ -613,10 +613,26 @@ static INLINED auto ComputeSubsetTable2(const Area& area, __m128i mc, uint64_t&
mc = _mm_or_si128(mc, _mm_set_epi16(0, 0, 0, 0, 0, 0, 255, 255));
}

const __m128i mhalf = _mm_set1_epi16(32);

mc = _mm_packus_epi16(mc, mc);

Modulations state;
#if defined(OPTION_AVX2)
const __m256i vhalf = _mm256_set1_epi16(32);

__m256i vc = _mm256_broadcastq_epi64(mc);

__m256i vt = *(const __m256i*)gTableInterpolate2_U8;

vt = _mm256_maddubs_epi16(vc, vt);

vt = _mm256_add_epi16(vt, vhalf);

vt = _mm256_srli_epi16(vt, 6);

_mm256_store_si256((__m256i*)state.Values_I16, vt);
#else
const __m128i mhalf = _mm_set1_epi16(32);

__m128i mtx = gTableInterpolate2_U8[0];
__m128i mty = gTableInterpolate2_U8[1];

Expand All @@ -629,9 +645,9 @@ static INLINED auto ComputeSubsetTable2(const Area& area, __m128i mc, uint64_t&
mtx = _mm_srli_epi16(mtx, 6);
mty = _mm_srli_epi16(mty, 6);

Modulations state;
_mm_store_si128((__m128i*)&state.Values_I16[0], mtx);
_mm_store_si128((__m128i*)state.Values_I16, mtx);
_mm_store_si128((__m128i*)&state.Values_I16[2], mty);
#endif

int error = ComputeSubsetTable2(area, gWeightsAGRB, state);

Expand Down
Loading

0 comments on commit d679b6e

Please sign in to comment.