Skip to content

Commit

Permalink
Merge pull request opencv#3248 from ilya-lavrenov:arm_sum
Browse files Browse the repository at this point in the history
  • Loading branch information
vpisarev committed Sep 22, 2014
2 parents 4889201 + 27b933b commit 1c0b946
Showing 1 changed file with 173 additions and 6 deletions.
179 changes: 173 additions & 6 deletions modules/core/src/stat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,181 @@ template<typename T> static inline Scalar rawToScalar(const T& v)
* sum *
\****************************************************************************************/

// Generic fallback for (T, ST) pairs that have no vectorized specialization.
// It touches none of its arguments and reports that zero items were handled;
// sum_ uses the returned count as the starting index of its scalar loops.
template <typename T, typename ST>
struct Sum_SIMD
{
    int operator () (const T * /*src0*/, const uchar * /*mask*/, ST * /*dst*/,
                     int /*len*/, int /*cn*/) const
    {
        return 0;
    }
};

#if CV_NEON

// NEON partial sum of 8-bit unsigned elements into per-channel int totals.
//
// src0 - interleaved input elements
// mask - must be null; a non-null mask makes the functor bail out
// dst  - per-channel accumulators (cn entries), added to in place
// len  - number of cn-element groups (pixels) available in src0
// cn   - channel count; only 1, 2 and 4 are handled
//
// Returns the number of groups that were consumed from the start of src0
// (0 when the input is not supported); the caller sums the remainder.
template <>
struct Sum_SIMD<uchar, int>
{
    int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        // Lane k of the accumulator collects elements whose index is k mod 4,
        // which maps onto channel (k mod cn) only when cn divides 4.
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        // len counts groups of cn elements, while the loops below step through
        // individual elements; without this scaling only 1/cn of a
        // multi-channel buffer would be vectorized.
        len *= cn;

        int x = 0;
        uint32x4_t v_sum = vdupq_n_u32(0u);

        // 16 elements per iteration: widen u8 -> u16 -> u32 and accumulate.
        for ( ; x <= len - 16; x += 16)
        {
            uint8x16_t v_src = vld1q_u8(src0 + x);
            uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));

            v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_half)));
            v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_half)));

            v_half = vmovl_u8(vget_high_u8(v_src));
            v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_half)));
            v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_half)));
        }

        // At most one leftover chunk of 8 elements.
        for ( ; x <= len - 8; x += 8)
        {
            uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x));

            v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_src)));
            v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_src)));
        }

        // Spill the four lane totals and fold them into the channel sums.
        unsigned int CV_DECL_ALIGNED(16) ar[4];
        vst1q_u32(ar, v_sum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        // x is a multiple of 8 and cn is 1, 2 or 4, so the division is exact.
        return x / cn;
    }
};

// NEON partial sum of 8-bit signed elements into per-channel int totals.
//
// src0 - interleaved input elements
// mask - must be null; a non-null mask makes the functor bail out
// dst  - per-channel accumulators (cn entries), added to in place
// len  - number of cn-element groups (pixels) available in src0
// cn   - channel count; only 1, 2 and 4 are handled
//
// Returns the number of groups that were consumed from the start of src0
// (0 when the input is not supported); the caller sums the remainder.
template <>
struct Sum_SIMD<schar, int>
{
    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        // Lane k of the accumulator collects elements whose index is k mod 4,
        // which maps onto channel (k mod cn) only when cn divides 4.
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        // len counts groups of cn elements, while the loops below step through
        // individual elements; without this scaling only 1/cn of a
        // multi-channel buffer would be vectorized.
        len *= cn;

        int x = 0;
        int32x4_t v_sum = vdupq_n_s32(0);

        // 16 elements per iteration: widen s8 -> s16 -> s32 and accumulate.
        for ( ; x <= len - 16; x += 16)
        {
            int8x16_t v_src = vld1q_s8(src0 + x);
            int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));

            v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_half)));
            v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_half)));

            v_half = vmovl_s8(vget_high_s8(v_src));
            v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_half)));
            v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_half)));
        }

        // At most one leftover chunk of 8 elements.
        for ( ; x <= len - 8; x += 8)
        {
            int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x));

            v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_src)));
            v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_src)));
        }

        // Spill the four lane totals and fold them into the channel sums.
        int CV_DECL_ALIGNED(16) ar[4];
        vst1q_s32(ar, v_sum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        // x is a multiple of 8 and cn is 1, 2 or 4, so the division is exact.
        return x / cn;
    }
};

// NEON partial sum of 16-bit unsigned elements into per-channel int totals.
//
// src0 - interleaved input elements
// mask - must be null; a non-null mask makes the functor bail out
// dst  - per-channel accumulators (cn entries), added to in place
// len  - number of cn-element groups (pixels) available in src0
// cn   - channel count; only 1, 2 and 4 are handled
//
// Returns the number of groups that were consumed from the start of src0
// (0 when the input is not supported); the caller sums the remainder.
template <>
struct Sum_SIMD<ushort, int>
{
    int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        // Lane k of the accumulator collects elements whose index is k mod 4,
        // which maps onto channel (k mod cn) only when cn divides 4.
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        // len counts groups of cn elements, while the loops below step through
        // individual elements; without this scaling only 1/cn of a
        // multi-channel buffer would be vectorized.
        len *= cn;

        int x = 0;
        uint32x4_t v_sum = vdupq_n_u32(0u);

        // 8 elements per iteration: widen u16 -> u32 and accumulate.
        for ( ; x <= len - 8; x += 8)
        {
            uint16x8_t v_src = vld1q_u16(src0 + x);

            v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_src)));
            v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_src)));
        }

        // At most one leftover chunk of 4 elements.
        for ( ; x <= len - 4; x += 4)
            v_sum = vaddq_u32(v_sum, vmovl_u16(vld1_u16(src0 + x)));

        // Spill the four lane totals and fold them into the channel sums.
        unsigned int CV_DECL_ALIGNED(16) ar[4];
        vst1q_u32(ar, v_sum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        // x is a multiple of 4 and cn is 1, 2 or 4, so the division is exact.
        return x / cn;
    }
};

// NEON partial sum of 16-bit signed elements into per-channel int totals.
//
// src0 - interleaved input elements
// mask - must be null; a non-null mask makes the functor bail out
// dst  - per-channel accumulators (cn entries), added to in place
// len  - number of cn-element groups (pixels) available in src0
// cn   - channel count; only 1, 2 and 4 are handled
//
// Returns the number of groups that were consumed from the start of src0
// (0 when the input is not supported); the caller sums the remainder.
template <>
struct Sum_SIMD<short, int>
{
    int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        // Lane k of the accumulator collects elements whose index is k mod 4,
        // which maps onto channel (k mod cn) only when cn divides 4.
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        // len counts groups of cn elements, while the loops below step through
        // individual elements; without this scaling only 1/cn of a
        // multi-channel buffer would be vectorized.
        len *= cn;

        int x = 0;
        // Signed zero literal, matching the signed intrinsic.
        int32x4_t v_sum = vdupq_n_s32(0);

        // 8 elements per iteration: widen s16 -> s32 and accumulate.
        for ( ; x <= len - 8; x += 8)
        {
            int16x8_t v_src = vld1q_s16(src0 + x);

            v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_src)));
            v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_src)));
        }

        // At most one leftover chunk of 4 elements.
        for ( ; x <= len - 4; x += 4)
            v_sum = vaddq_s32(v_sum, vmovl_s16(vld1_s16(src0 + x)));

        // Spill the four lane totals and fold them into the channel sums.
        int CV_DECL_ALIGNED(16) ar[4];
        vst1q_s32(ar, v_sum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        // x is a multiple of 4 and cn is 1, 2 or 4, so the division is exact.
        return x / cn;
    }
};

#endif

template<typename T, typename ST>
static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
{
const T* src = src0;
if( !mask )
{
int i=0;
int k = cn % 4;
Sum_SIMD<T, ST> vop;
int i = vop(src0, mask, dst, len, cn), k = cn % 4;
src += i * cn;

if( k == 1 )
{
ST s0 = dst[0];
Expand All @@ -86,7 +253,7 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
else if( k == 2 )
{
ST s0 = dst[0], s1 = dst[1];
for( i = 0; i < len; i++, src += cn )
for( ; i < len; i++, src += cn )
{
s0 += src[0];
s1 += src[1];
Expand All @@ -97,7 +264,7 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
else if( k == 3 )
{
ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
for( i = 0; i < len; i++, src += cn )
for( ; i < len; i++, src += cn )
{
s0 += src[0];
s1 += src[1];
Expand All @@ -110,9 +277,9 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )

for( ; k < cn; k += 4 )
{
src = src0 + k;
src = src0 + i*cn + k;
ST s0 = dst[k], s1 = dst[k+1], s2 = dst[k+2], s3 = dst[k+3];
for( i = 0; i < len; i++, src += cn )
for( ; i < len; i++, src += cn )
{
s0 += src[0]; s1 += src[1];
s2 += src[2]; s3 += src[3];
Expand Down

0 comments on commit 1c0b946

Please sign in to comment.