Skip to content

Commit

Permalink
+add Base::Convert16bNchwGemm1x1; fix bugs.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Jul 24, 2024
1 parent bc3592c commit fbb71f8
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docs/2024.html
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ <h5>New features</h5>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function SynetRelu16b.</li>
<li>API of SynetAdd16b framework.</li>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of class SynetAdd16bUniform.</li>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of class SynetConvolution16bNchwGemm.</li>
<li>Base implementation, SSE4.1 optimizations of class SynetConvolution16bNchwGemm.</li>
</ul>
<h5>Improving</h5>
<ul>
Expand Down
181 changes: 178 additions & 3 deletions src/Simd/SimdBaseSynetConvolution16bNchwGemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,187 @@ namespace Simd
#if defined(SIMD_SYNET_ENABLE)
namespace Base
{
typedef Base::SynetConvolution16bNchwGemm::AlgParam AlgParam;
typedef Base::SynetConvolution16bNchwGemm::ConvolutionPtr Convolution;

//-----------------------------------------------------------------------------------------

// Converts a region of an FP32 NCHW tensor to BF16 and packs it for the 1x1 GEMM kernels.
//
// src8       - source tensor, read as float with channel-major (c, y, x) indexing.
// p          - convolution parameters; only srcH and srcW are read here.
// a          - algorithm parameters; F (columns per packed block), K (channel limit)
//              and microK (channel padding granularity) are read here.
// yBeg, yEnd - half-open range of source rows to pack.
// cBeg, cEnd - half-open range of source channels to pack; cEnd is clamped to a.K.
// dst        - output stream of 16-bit values, written sequentially.
//
// Output layout: the (yEnd - yBeg) * srcW spatial positions are cut into blocks of a.F columns.
// For every block the channels are emitted in pairs, and for each pair the a.F columns are
// written interleaved as (channel k, channel k + 1). An odd trailing channel is paired with
// zeros, missing columns of the last block are zero-filled, and zero pairs are appended until
// the channel index reaches AlignHi(K, a.microK).
// NOTE(review): the padding loop advances by 2, so this presumably expects an even a.microK - confirm.
static void Convert16bNchwGemm1x1(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst)
{
    const float* src = ((const float*)src8) + (cBeg * p.srcH + yBeg) * p.srcW;
    const size_t N = (yEnd - yBeg) * p.srcW, dS = p.srcH * p.srcW;
    const size_t K = Min(cEnd, a.K) - cBeg, K2 = AlignLo(K, 2), KH = AlignHi(K, a.microK);
    for (size_t j = 0; j < N; j += a.F)
    {
        // Start of this column block inside the first packed channel plane.
        const float* base = src + j;
        // Number of real columns in this block: a.F everywhere except possibly the last block.
        size_t cols = N - j, k = 0, f;
        if (cols > a.F)
            cols = a.F;
        // Complete channel pairs.
        for (; k < K2; k += 2)
        {
            const float* src0 = base + k * dS, * src1 = src0 + dS;
            for (f = 0; f < cols; ++f)
            {
                *dst++ = Float32ToBFloat16(src0[f]);
                *dst++ = Float32ToBFloat16(src1[f]);
            }
            for (; f < a.F; ++f)
            {
                *dst++ = 0;
                *dst++ = 0;
            }
        }
        // Odd trailing channel: its partner slot is zero. No pointer to the following
        // channel plane is formed, because that plane is outside the packed range.
        if (k < K)
        {
            const float* src0 = base + k * dS;
            for (f = 0; f < cols; ++f)
            {
                *dst++ = Float32ToBFloat16(src0[f]);
                *dst++ = 0;
            }
            for (; f < a.F; ++f)
            {
                *dst++ = 0;
                *dst++ = 0;
            }
            k += 2;
        }
        // Zero pairs up to the padded channel count.
        for (; k < KH; k += 2)
        {
            for (f = 0; f < a.F; ++f)
            {
                *dst++ = 0;
                *dst++ = 0;
            }
        }
    }
}

// Repacks a region of a 16-bit NCHW tensor into the layout used by the 1x1 GEMM kernels.
// Values are copied as-is; no numeric conversion is performed.
//
// src8       - source tensor, read as uint16_t with channel-major (c, y, x) indexing.
// p          - convolution parameters; only srcH and srcW are read here.
// a          - algorithm parameters; F (columns per packed block), K (channel limit)
//              and microK (channel padding granularity) are read here.
// yBeg, yEnd - half-open range of source rows to pack.
// cBeg, cEnd - half-open range of source channels to pack; cEnd is clamped to a.K.
// dst        - output stream of 16-bit values, written sequentially.
//
// Output layout: the (yEnd - yBeg) * srcW spatial positions are cut into blocks of a.F columns.
// For every block the channels are emitted in pairs, and for each pair the a.F columns are
// written interleaved as (channel k, channel k + 1). An odd trailing channel is paired with
// zeros, missing columns of the last block are zero-filled, and zero pairs are appended until
// the channel index reaches AlignHi(K, a.microK).
// NOTE(review): the padding loop advances by 2, so this presumably expects an even a.microK - confirm.
static void Reorder16bNchwGemm1x1(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst)
{
    const uint16_t* src = ((const uint16_t*)src8) + (cBeg * p.srcH + yBeg) * p.srcW;
    const size_t N = (yEnd - yBeg) * p.srcW, dS = p.srcH * p.srcW;
    const size_t K = Min(cEnd, a.K) - cBeg, K2 = AlignLo(K, 2), KH = AlignHi(K, a.microK);
    for (size_t j = 0; j < N; j += a.F)
    {
        // Start of this column block inside the first packed channel plane.
        const uint16_t* base = src + j;
        // Number of real columns in this block: a.F everywhere except possibly the last block.
        size_t cols = N - j, k = 0, f;
        if (cols > a.F)
            cols = a.F;
        // Complete channel pairs.
        for (; k < K2; k += 2)
        {
            const uint16_t* src0 = base + k * dS, * src1 = src0 + dS;
            for (f = 0; f < cols; ++f)
            {
                *dst++ = src0[f];
                *dst++ = src1[f];
            }
            for (; f < a.F; ++f)
            {
                *dst++ = 0;
                *dst++ = 0;
            }
        }
        // Odd trailing channel: its partner slot is zero. No pointer to the following
        // channel plane is formed, because that plane is outside the packed range.
        if (k < K)
        {
            const uint16_t* src0 = base + k * dS;
            for (f = 0; f < cols; ++f)
            {
                *dst++ = src0[f];
                *dst++ = 0;
            }
            for (; f < a.F; ++f)
            {
                *dst++ = 0;
                *dst++ = 0;
            }
            k += 2;
        }
        // Zero pairs up to the padded channel count.
        for (; k < KH; k += 2)
        {
            for (f = 0; f < a.F; ++f)
            {
                *dst++ = 0;
                *dst++ = 0;
            }
        }
    }
}

//-----------------------------------------------------------------------------------------

// Base-level constructor: clears the kernel slots and selects the input packing routine.
// For 1x1 convolutions a 16-bit source is repacked as-is (Reorder16bNchwGemm1x1) and any
// other source goes through the FP32 -> BF16 path (Convert16bNchwGemm1x1).
// For non-1x1 convolutions _convert stays 0; the original code only carried commented-out
// placeholders (Reorder16bNhwcGemm / Convert16bNhwcGemm) for that case.
SynetConvolution16bNchwGemm::SynetConvolution16bNchwGemm(const ConvParam& p)
    : SynetConvolution16b(p)
{
    _convolutions[0] = 0;
    _convolutions[1] = 0;
    _convert = 0;
    if (_is1x1)
        _convert = _src16b ? Reorder16bNchwGemm1x1 : Convert16bNchwGemm1x1;
}

String SynetConvolution16bNchwGemm::Desc() const
Expand Down Expand Up @@ -149,11 +324,11 @@ namespace Simd
size_t macroK = Simd::Min(a.bufK, mak + a.macroK) - mak;
if (_is1x1)
_convert(src, p, a, yBeg, yEnd, mak, mak + macroK, buf);
size_t bufOffs = _is1x1 ? mak * a.F : 0;
size_t bufOffs = _is1x1 ? 0 : mak * a.F;
for (size_t dc = 0; dc < p.dstC; dc += a.macroD)
{
size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc;
size_t sumOffs = a.macroK < a.bufK ? yBeg * p.dstW * a.macroD : 0;
size_t sumOffs = a.macroK < a.bufK ? (dc * p.dstH + yBeg) * p.dstW : 0;
size_t dstOffs = (dc * p.dstH + yBeg) * p.dstW * _elemD;
const uint16_t* weight = _weight.data + a.bufD * mak + dc * macroK;
if (mak + macroK == a.bufK)
Expand All @@ -170,7 +345,7 @@ namespace Simd

// Reports whether this implementation is a candidate for the given convolution parameters:
// the checks visible here require p.trans == 0, p.group == 1 and Is1x1(p).
bool SynetConvolution16bNchwGemm::Preferable(const ConvParam& p)
{
// NOTE(review): the two return statements below look like the removed and added lines of a
// diff hunk shown together; as written only the first one can execute. The first additionally
// requires p.srcT == SimdTensorData16b, the second has that check commented out - confirm
// which version is intended.
return p.trans == 0 && p.group == 1 && Is1x1(p) && p.srcT == SimdTensorData16b;
return p.trans == 0 && p.group == 1 && Is1x1(p);//&& p.srcT == SimdTensorData16b;
}
}
#endif
Expand Down
6 changes: 3 additions & 3 deletions src/Simd/SimdSse41SynetConvolution16bNchwGemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,10 @@ namespace Simd

static void Reorder16bNchwGemm1x1(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst)
{
const uint16_t* src = (uint16_t*)src8 + (cBeg * p.srcH + yBeg) * p.srcW;
const uint16_t* src = ((uint16_t*)src8) + (cBeg * p.srcH + yBeg) * p.srcW;
size_t N = (yEnd - yBeg) * p.srcW, NF = AlignLo(N, a.F), j, dS = p.srcH * p.srcW;
size_t K = Min(cEnd, a.K) - cBeg, K2 = AlignLo(K, 2), KH = AlignHi(K, a.microK), k;
for (j = 0; j < NF; j += F)
for (j = 0; j < NF; j += a.F)
{
for (k = 0; k < K2; k += 2)
{
Expand Down Expand Up @@ -170,7 +170,7 @@ namespace Simd
*dst++ = 0;
}
}
src += F;
src += a.F;
}
if(j < N)
{
Expand Down
7 changes: 4 additions & 3 deletions src/Test/TestSynetConvolution16b.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,21 +255,22 @@ namespace Test
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 321, 321, 16, _2, _1, _1, _0, _0, 1, aRe, tT, f32, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 16, 320, 320, 32, _2, _1, _1, _0, _1, 1, aRe, tT, b16, f32), c, f1, f2);
#endif
#if 0
#if 1
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 44, 44, 256, _1, _1, _1, _0, _0, 1, aSw, tF, b16, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 44, 44, 256, _1, _1, _1, _0, _0, 1, aSw, tF, f32, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 44, 44, 256, _1, _1, _1, _0, _0, 1, aSw, tF, b16, f32), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 44, 44, 256, _1, _1, _1, _0, _0, 1, aSw, tT, b16, b16), c, f1, f2);
#endif
#if 1
#if 0
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 2255, 55, 55, 155, _1, _1, _1, _0, _0, 1, aId, tF, b16, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 55, 15, 16, 55, _1, _1, _1, _0, _0, 1, aId, tF, b16, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 15, 15, 56, _1, _1, _1, _0, _0, 1, aId, tF, b16, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 55, 15, 15, 56, _1, _1, _1, _0, _0, 1, aId, tF, b16, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 55, 15, 15, 55, _1, _1, _1, _0, _0, 1, aId, tF, b16, b16), c, f1, f2);
#endif

#else
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 15, 3, 4, 16, _1, _1, _1, _0, _0, 1, aId, tF, b16, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 2156, 4, 4, 4, _1, _1, _1, _0, _0, 1, aId, tF, b16, b16), c, f1, f2);
//result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 88, 88, 128, _3, _1, _2, _1, _1, 1, aRe, tT, f32, b16), c, f1, f2);
//result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 88, 88, 128, _3, _1, _2, _1, _1, 1, aRe, tT, f32, b16), c, f1, f2);
//result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 88, 88, 64, _3, _1, _1, _1, _1, 1, aSw, tT, b16, f32), c, f1, f2);
Expand Down

0 comments on commit fbb71f8

Please sign in to comment.