diff --git a/neo/idlib/math/Simd_AVX.cpp b/neo/idlib/math/Simd_AVX.cpp index 659e01ff6..4ef11728f 100644 --- a/neo/idlib/math/Simd_AVX.cpp +++ b/neo/idlib/math/Simd_AVX.cpp @@ -35,6 +35,10 @@ If you have questions concerning this license or the applicable additional terms // //=============================================================== +// Revelator: these work whether in gcc clang or msvc in x86 or x64 (no inline assembly used) +#if defined(_MSC_VER) && ( defined(_M_X64) || defined(_M_IX86) ) || \ + defined(__GNUC__) && ( defined(__i386__) || defined (__x86_64__) ) && defined(__AVX__) + #include #include "idlib/geometry/DrawVert.h" @@ -122,3 +126,5 @@ void VPCALL idSIMD_AVX::CullByFrustum2( idDrawVert *verts, const int numVerts, c } _mm256_zeroupper(); } + +#endif /* _MSC_VER */ diff --git a/neo/idlib/math/Simd_AVX.h b/neo/idlib/math/Simd_AVX.h index 6b8b9bdc3..a8030bb2f 100644 --- a/neo/idlib/math/Simd_AVX.h +++ b/neo/idlib/math/Simd_AVX.h @@ -28,10 +28,15 @@ class idSIMD_AVX : public idSIMD_SSE3 { public: - // Revelator: these work whether in gcc clang or msvc x86 or x64 (no inline assembly used) +// Revelator: these work whether in gcc clang or msvc x86 or x64 (no inline assembly used) +#if defined(_MSC_VER) && ( defined(_M_X64) || defined(_M_IX86) ) || \ + defined(__GNUC__) && ( defined(__i386__) || defined (__x86_64__) ) && defined(__AVX__) + virtual const char *VPCALL GetName( void ) const; virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ); virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ); + +#endif /* _MSC_VER */ }; #endif /* !__MATH_SIMD_AVX_H__ */ diff --git a/neo/idlib/math/Simd_AVX2.cpp b/neo/idlib/math/Simd_AVX2.cpp index c338027f3..3b6f9b1cb 100644 --- a/neo/idlib/math/Simd_AVX2.cpp +++ b/neo/idlib/math/Simd_AVX2.cpp @@ -26,7 +26,7 @@ If you have questions concerning this license or the applicable additional terms =========================================================================== */ -#include "sys/platform.h" +#include "sys/platform.h" #include "Simd_AVX2.h" //=============================================================== @@ -35,6 +35,10 @@ If you have questions concerning this license or the applicable additional terms // //=============================================================== +// Revelator: these work whether in gcc clang or msvc in x86 or x64 (no inline assembly used) +#if defined(_MSC_VER) && ( defined(_M_X64) || defined(_M_IX86) ) || \ + defined(__GNUC__) && ( defined(__i386__) || defined (__x86_64__) ) && defined(__AVX2__) + #include #include "idlib/geometry/DrawVert.h" @@ -62,20 +66,17 @@ void VPCALL idSIMD_AVX2::CullByFrustum( idDrawVert *verts, const int numVerts, c const __m256 fC = _mm256_set_ps( 0, 0, frustum[5][2], frustum[4][2], frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); const __m256 eps = _mm256_set1_ps( epsilon ); - const byte mask6 = (1 << 6) - 1; + const byte mask6 = ( 1 << 6 ) - 1; for ( int j = 0; j < numVerts; j++ ) { idVec3 &vec = verts[j].xyz; __m256 vX = _mm256_set1_ps( vec.x ); __m256 vY = _mm256_set1_ps( vec.y ); __m256 vZ = _mm256_set1_ps( vec.z ); - __m256 d = _mm256_fmadd_ps( fA, vX, - _mm256_fmadd_ps( fB, vY, - _mm256_fmadd_ps( fC, vZ, fD ) - ) - ); + __m256 d = _mm256_fmadd_ps( fA, vX, _mm256_fmadd_ps( fB, vY, + _mm256_fmadd_ps( fC, vZ, fD ) ) ); int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) ); - pointCull[j] = (byte)mask_lo & mask6; + pointCull[j] = ( byte )mask_lo & mask6; } _mm256_zeroupper(); } @@ -92,21 +93,20 @@ void VPCALL idSIMD_AVX2::CullByFrustum2( idDrawVert *verts, const int numVerts, const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); const __m256 eps = _mm256_set1_ps( epsilon ); static const __m256 epsM = _mm256_set1_ps( -epsilon ); - const short mask6 = (1 << 6) - 1; + const short mask6 = ( 1 << 6 ) - 1; for ( int j = 0; j < numVerts; j++ ) { idVec3 &vec = verts[j].xyz; __m256 vX = _mm256_set1_ps( vec.x ); __m256 vY = _mm256_set1_ps( vec.y ); __m256 vZ = _mm256_set1_ps( vec.z ); - __m256 d = _mm256_fmadd_ps( fA, vX, - _mm256_fmadd_ps( fB, vY, - _mm256_fmadd_ps( fC, vZ, fD ) - ) - ); + __m256 d = _mm256_fmadd_ps( fA, vX, _mm256_fmadd_ps( fB, vY, + _mm256_fmadd_ps( fC, vZ, fD ) ) ); int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) ); int mask_hi = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_GT_OQ ) ); - pointCull[j] = (unsigned short)(mask_lo & mask6 | (mask_hi & mask6) << 6); + pointCull[j] = ( unsigned short )( mask_lo & mask6 | ( mask_hi & mask6 ) << 6 ); } _mm256_zeroupper(); } + +#endif diff --git a/neo/idlib/math/Simd_AVX2.h b/neo/idlib/math/Simd_AVX2.h index 764f1ee8d..1698363ec 100644 --- a/neo/idlib/math/Simd_AVX2.h +++ b/neo/idlib/math/Simd_AVX2.h @@ -28,10 +28,15 @@ class idSIMD_AVX2 : public idSIMD_AVX { public: - // Revelator: these work whether gcc clang or msvc in x86 or x64 (no inline assembly used) +// Revelator: these work whether in gcc clang or msvc in x86 or x64 (no inline assembly used) +#if defined(_MSC_VER) && ( defined(_M_X64) || defined(_M_IX86) ) || \ + defined(__GNUC__) && ( defined(__i386__) || defined (__x86_64__) ) && defined(__AVX2__) + virtual const char *VPCALL GetName( void ) const; virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ); virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ); + +#endif /* _MSC_VER */ }; #endif /* !__MATH_SIMD_AVX2_H__ */ diff --git a/neo/idlib/math/Simd_SSE.cpp b/neo/idlib/math/Simd_SSE.cpp index ae0bc6f08..7e3a6b6b4 100644 --- a/neo/idlib/math/Simd_SSE.cpp +++ b/neo/idlib/math/Simd_SSE.cpp @@ -35,8 +35,6 @@ If you have questions concerning this license or the applicable additional terms // E //=============================================================== -#include - #include "idlib/geometry/DrawVert.h" #include "idlib/geometry/JointTransform.h" #include "idlib/math/Vector.h" @@ -55,6 +53,8 @@ If you have questions concerning this license or the applicable additional terms #if defined(__GNUC__) && defined(__SSE__) +#include + #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 )) #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 )) @@ -90,12 +90,12 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // Declare 8 xmm registers. int count_l4 = count; // count_l4 = eax int count_l1 = count; // count_l1 = edx - char *constant_p = (char *)&constant; // constant_p = edi - char *src_p = (char *) src; // src_p = esi - char *dst_p = (char *) dst; // dst_p = ecx + char *constant_p = ( char * )&constant; // constant_p = edi + char *src_p = ( char * ) src; // src_p = esi + char *dst_p = ( char * ) dst; // dst_p = ecx assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( ptrdiff_t(&src->xyz) - ptrdiff_t(src) == DRAWVERT_XYZ_OFFSET ); + assert( ptrdiff_t( &src->xyz ) - ptrdiff_t( src ) == DRAWVERT_XYZ_OFFSET ); /* and eax, ~3 @@ -109,100 +109,100 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) */ count_l4 = count_l4 & ~3; - xmm4 = _mm_load_ss((float *) (constant_p)); - xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )); - xmm5 = _mm_load_ss((float *) (constant_p + 4)); - xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )); - xmm6 = _mm_load_ss((float *) (constant_p + 8)); - xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )); - xmm7 = _mm_load_ss((float *) (constant_p + 12)); - xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )); + xmm4 = _mm_load_ss( ( float * )( constant_p ) ); + xmm4 = _mm_shuffle_ps( xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) ); + xmm5 = _mm_load_ss( ( float * )( constant_p + 4 ) ); + xmm5 = _mm_shuffle_ps( xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) ); + xmm6 = _mm_load_ss( ( float * )( constant_p + 8 ) ); + xmm6 = _mm_shuffle_ps( xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) ); + xmm7 = _mm_load_ss( ( float * )( constant_p + 12 ) ); + xmm7 = _mm_shuffle_ps( xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) ); /* jz startVert1 */ - if(count_l4 != 0) { - /* - imul eax, DRAWVERT_SIZE - add esi, eax - neg eax - */ + if ( count_l4 != 0 ) { + /* + imul eax, DRAWVERT_SIZE + add esi, eax + neg eax + */ count_l4 = count_l4 * DRAWVERT_SIZE; src_p = src_p + count_l4; count_l4 = -count_l4; - /* - loopVert4: - */ + /* + loopVert4: + */ do { - /* - movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X - movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X - movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1 - movaps xmm1, xmm0 // 3, X, 0, 1 - */ - xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, X, X - xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 2, X, X, X - xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, 0, 1 + /* + movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X + movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X + movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1 + movaps xmm1, xmm0 // 3, X, 0, 1 + */ + xmm0 = _mm_load_ss( ( float * )( src_p + count_l4 + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0 ) ); // 3, X, X, X + xmm2 = _mm_load_ss( ( float * )( src_p + count_l4 + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8 ) ); // 2, X, X, X + xmm0 = _mm_loadh_pi( xmm0, ( __m64 * )( src_p + count_l4 + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0 ) ); // 3, X, 0, 1 xmm1 = xmm0; // 3, X, 0, 1 - /* - movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1 - shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5 - */ - xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 4, 5, 0, 1 - xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )); // 2, X, 4, 5 - - /* - movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X - movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7 - shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9 - */ - xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, X, X - xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, 6, 7 - xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )); // 0, 3, 6, 9 - /* - movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7 - shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10 - */ - xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 10, 11, 6, 7 - xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )); // 1, 4, 7, 10 - /* - movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X - shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11 - */ - xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 10, 11, 8, X - xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )); // 2, 5, 8, 11 - - /* - add ecx, 16 - add eax, 4*DRAWVERT_SIZE - */ + /* + movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1 + shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5 + */ + xmm1 = _mm_loadl_pi( xmm1, ( __m64 * )( src_p + count_l4 + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4 ) ); // 4, 5, 0, 1 + xmm2 = _mm_shuffle_ps( xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) ); // 2, X, 4, 5 + + /* + movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X + movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7 + shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9 + */ + xmm3 = _mm_load_ss( ( float * )( src_p + count_l4 + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0 ) ); // 9, X, X, X + xmm3 = _mm_loadh_pi( xmm3, ( __m64 * )( src_p + count_l4 + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0 ) ); // 9, X, 6, 7 + xmm0 = _mm_shuffle_ps( xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) ); // 0, 3, 6, 9 + /* + movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7 + shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10 + */ + xmm3 = _mm_loadl_pi( xmm3, ( __m64 * )( src_p + count_l4 + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4 ) ); // 10, 11, 6, 7 + xmm1 = _mm_shuffle_ps( xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) ); // 1, 4, 7, 10 + /* + movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X + shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11 + */ + xmm3 = _mm_loadh_pi( xmm3, ( __m64 * )( src_p + count_l4 + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8 ) ); // 10, 11, 8, X + xmm2 = _mm_shuffle_ps( xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) ); // 2, 5, 8, 11 + + /* + add ecx, 16 + add eax, 4*DRAWVERT_SIZE + */ dst_p = dst_p + 16; - count_l4 = count_l4 + 4*DRAWVERT_SIZE; + count_l4 = count_l4 + 4 * DRAWVERT_SIZE; - /* - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - addps xmm0, xmm7 - addps xmm0, xmm1 - addps xmm0, xmm2 - */ - xmm0 = _mm_mul_ps(xmm0, xmm4); - xmm1 = _mm_mul_ps(xmm1, xmm5); - xmm2 = _mm_mul_ps(xmm2, xmm6); - xmm0 = _mm_add_ps(xmm0, xmm7); - xmm0 = _mm_add_ps(xmm0, xmm1); - xmm0 = _mm_add_ps(xmm0, xmm2); - - /* - movlps [ecx-16+0], xmm0 - movhps [ecx-16+8], xmm0 - jl loopVert4 - */ - _mm_storel_pi((__m64 *) (dst_p-16+0), xmm0); - _mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0); - } while(count_l4 < 0); + /* + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + addps xmm0, xmm7 + addps xmm0, xmm1 + addps xmm0, xmm2 + */ + xmm0 = _mm_mul_ps( xmm0, xmm4 ); + xmm1 = _mm_mul_ps( xmm1, xmm5 ); + xmm2 = _mm_mul_ps( xmm2, xmm6 ); + xmm0 = _mm_add_ps( xmm0, xmm7 ); + xmm0 = _mm_add_ps( xmm0, xmm1 ); + xmm0 = _mm_add_ps( xmm0, xmm2 ); + + /* + movlps [ecx-16+0], xmm0 + movhps [ecx-16+8], xmm0 + jl loopVert4 + */ + _mm_storel_pi( ( __m64 * )( dst_p - 16 + 0 ), xmm0 ); + _mm_storeh_pi( ( __m64 * )( dst_p - 16 + 8 ), xmm0 ); + } while ( count_l4 < 0 ); } /* @@ -211,39 +211,39 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe jz done */ count_l1 = count_l1 & 3; - if(count_l1 != 0) { - /* - loopVert1: - movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0] - movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4] - movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8] - mulss xmm0, xmm4 - mulss xmm1, xmm5 - mulss xmm2, xmm6 - addss xmm0, xmm7 - add ecx, 4 - addss xmm0, xmm1 - add eax, DRAWVERT_SIZE - addss xmm0, xmm2 - dec edx - movss [ecx-4], xmm0 - jnz loopVert1 - */ + if ( count_l1 != 0 ) { + /* + loopVert1: + movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0] + movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4] + movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8] + mulss xmm0, xmm4 + mulss xmm1, xmm5 + mulss xmm2, xmm6 + addss xmm0, xmm7 + add ecx, 4 + addss xmm0, xmm1 + add eax, DRAWVERT_SIZE + addss xmm0, xmm2 + dec edx + movss [ecx-4], xmm0 + jnz loopVert1 + */ do { - xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0)); - xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4)); - xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8)); - xmm0 = _mm_mul_ss(xmm0, xmm4); - xmm1 = _mm_mul_ss(xmm1, xmm5); - xmm2 = _mm_mul_ss(xmm2, xmm6); - xmm0 = _mm_add_ss(xmm0, xmm7); + xmm0 = _mm_load_ss( ( float * )( src_p + count_l4 + DRAWVERT_XYZ_OFFSET + 0 ) ); + xmm1 = _mm_load_ss( ( float * )( src_p + count_l4 + DRAWVERT_XYZ_OFFSET + 4 ) ); + xmm2 = _mm_load_ss( ( float * )( src_p + count_l4 + DRAWVERT_XYZ_OFFSET + 8 ) ); + xmm0 = _mm_mul_ss( xmm0, xmm4 ); + xmm1 = _mm_mul_ss( xmm1, xmm5 ); + xmm2 = _mm_mul_ss( xmm2, xmm6 ); + xmm0 = _mm_add_ss( xmm0, xmm7 ); dst_p = dst_p + 4; - xmm0 = _mm_add_ss(xmm0, xmm1); + xmm0 = _mm_add_ss( xmm0, xmm1 ); count_l4 = count_l4 + DRAWVERT_SIZE; - xmm0 = _mm_add_ss(xmm0, xmm2); + xmm0 = _mm_add_ss( xmm0, xmm2 ); count_l1 = count_l1 - 1; - _mm_store_ss((float *) (dst_p-4), xmm0); - } while( count_l1 != 0); + _mm_store_ss( ( float * )( dst_p - 4 ), xmm0 ); + } while ( count_l1 != 0 ); } /* done: @@ -258,7 +258,7 @@ idSIMD_SSE::MinMax void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( ptrdiff_t(&src->xyz) - ptrdiff_t(src) == DRAWVERT_XYZ_OFFSET ); + assert( ptrdiff_t( &src->xyz ) - ptrdiff_t( src ) == DRAWVERT_XYZ_OFFSET ); __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; char *indexes_p; @@ -276,13 +276,13 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, movaps xmm2, xmm0 movaps xmm3, xmm1 */ - xmm0 = _mm_load_ss(&idMath::INFINITY); - // To satisfy the compiler use xmm0 instead. - xmm1 = _mm_xor_ps(xmm0, xmm0); - xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )); - xmm1 = _mm_sub_ps(xmm1, xmm0); - xmm2 = xmm0; - xmm3 = xmm1; + xmm0 = _mm_load_ss( &idMath::INFINITY ); + // To satisfy the compiler use xmm0 instead. + xmm1 = _mm_xor_ps( xmm0, xmm0 ); + xmm0 = _mm_shuffle_ps( xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) ); + xmm1 = _mm_sub_ps( xmm1, xmm0 ); + xmm2 = xmm0; + xmm3 = xmm1; /* mov edi, indexes @@ -291,91 +291,91 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, and eax, ~3 jz done4 */ - indexes_p = (char *) indexes; - src_p = (char *) src; - count_l = count; - count_l = count_l & ~3; - if(count_l != 0) { - /* - shl eax, 2 - add edi, eax - neg eax - */ - count_l = count_l << 2; - indexes_p = indexes_p + count_l; - count_l = -count_l; - /* - loop4: -// prefetchnta [edi+128] -// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] - */ + indexes_p = ( char * ) indexes; + src_p = ( char * ) src; + count_l = count; + count_l = count_l & ~3; + if ( count_l != 0 ) { + /* + shl eax, 2 + add edi, eax + neg eax + */ + count_l = count_l << 2; + indexes_p = indexes_p + count_l; + count_l = -count_l; + /* + loop4: + // prefetchnta [edi+128] + // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] + */ do { - /* - mov edx, [edi+eax+0] - imul edx, DRAWVERT_SIZE - movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8] - movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0] - minps xmm0, xmm4 - maxps xmm1, xmm4 - */ - edx = *((int*)(indexes_p+count_l+0)); + /* + mov edx, [edi+eax+0] + imul edx, DRAWVERT_SIZE + movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8] + movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0] + minps xmm0, xmm4 + maxps xmm1, xmm4 + */ + edx = *( ( int * )( indexes_p + count_l + 0 ) ); edx = edx * DRAWVERT_SIZE; - xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8)); - xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) ); - xmm0 = _mm_min_ps(xmm0, xmm4); - xmm1 = _mm_max_ps(xmm1, xmm4); - - /* - mov edx, [edi+eax+4] - imul edx, DRAWVERT_SIZE - movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0] - movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4] - minps xmm2, xmm5 - maxps xmm3, xmm5 - */ - edx = *((int*)(indexes_p+count_l+4)); + xmm4 = _mm_load_ss( ( float * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 8 ) ); + xmm4 = _mm_loadh_pi( xmm4, ( __m64 * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 0 ) ); + xmm0 = _mm_min_ps( xmm0, xmm4 ); + xmm1 = _mm_max_ps( xmm1, xmm4 ); + + /* + mov edx, [edi+eax+4] + imul edx, DRAWVERT_SIZE + movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0] + movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4] + minps xmm2, xmm5 + maxps xmm3, xmm5 + */ + edx = *( ( int * )( indexes_p + count_l + 4 ) ); edx = edx * DRAWVERT_SIZE; - xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0)); - xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) ); - xmm2 = _mm_min_ps(xmm2, xmm5); - xmm3 = _mm_max_ps(xmm3, xmm5); - - /* - mov edx, [edi+eax+8] - imul edx, DRAWVERT_SIZE - movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8] - movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] - minps xmm0, xmm6 - maxps xmm1, xmm6 - */ - edx = *((int*)(indexes_p+count_l+8)); + xmm5 = _mm_load_ss( ( float * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 0 ) ); + xmm5 = _mm_loadh_pi( xmm5, ( __m64 * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 4 ) ); + xmm2 = _mm_min_ps( xmm2, xmm5 ); + xmm3 = _mm_max_ps( xmm3, xmm5 ); + + /* + mov edx, [edi+eax+8] + imul edx, DRAWVERT_SIZE + movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8] + movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] + minps xmm0, xmm6 + maxps xmm1, xmm6 + */ + edx = *( ( int * )( indexes_p + count_l + 8 ) ); edx = edx * DRAWVERT_SIZE; - xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8)); - xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) ); - xmm0 = _mm_min_ps(xmm0, xmm6); - xmm1 = _mm_max_ps(xmm1, xmm6); - - /* - mov edx, [edi+eax+12] - imul edx, DRAWVERT_SIZE - movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0] - movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4] - minps xmm2, xmm7 - maxps xmm3, xmm7 - */ - edx = *((int*)(indexes_p+count_l+12)); + xmm6 = _mm_load_ss( ( float * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 8 ) ); + xmm6 = _mm_loadh_pi( xmm6, ( __m64 * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 0 ) ); + xmm0 = _mm_min_ps( xmm0, xmm6 ); + xmm1 = _mm_max_ps( xmm1, xmm6 ); + + /* + mov edx, [edi+eax+12] + imul edx, DRAWVERT_SIZE + movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0] + movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4] + minps xmm2, xmm7 + maxps xmm3, xmm7 + */ + edx = *( ( int * )( indexes_p + count_l + 12 ) ); edx = edx * DRAWVERT_SIZE; - xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0)); - xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) ); - xmm2 = _mm_min_ps(xmm2, xmm7); - xmm3 = _mm_max_ps(xmm3, xmm7); - - /* - add eax, 4*4 - jl loop4 - */ - count_l = count_l + 4*4; - } while (count_l < 0); + xmm7 = _mm_load_ss( ( float * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 0 ) ); + xmm7 = _mm_loadh_pi( xmm7, ( __m64 * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 4 ) ); + xmm2 = _mm_min_ps( xmm2, xmm7 ); + xmm3 = _mm_max_ps( xmm3, xmm7 ); + + /* + add eax, 4*4 + jl loop4 + */ + count_l = count_l + 4 * 4; + } while ( count_l < 0 ); } /* done4: @@ -385,40 +385,40 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, */ count_l = count; count_l = count_l & 3; - if(count_l != 0) { - /* - shl eax, 2 - add edi, eax - neg eax - */ + if ( count_l != 0 ) { + /* + shl eax, 2 + add edi, eax + neg eax + */ count_l = count_l << 2; indexes_p = indexes_p + count_l; count_l = -count_l; - /* - loop1: - */ - do{ - /* - mov edx, [edi+eax+0] - imul edx, DRAWVERT_SIZE; - movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8] - movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0] - minps xmm0, xmm4 - maxps xmm1, xmm4 - */ - edx = *((int*)(indexes_p+count_l+0)); + /* + loop1: + */ + do { + /* + mov edx, [edi+eax+0] + imul edx, DRAWVERT_SIZE; + movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8] + movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0] + minps xmm0, xmm4 + maxps xmm1, xmm4 + */ + edx = *( ( int * )( indexes_p + count_l + 0 ) ); edx = edx * DRAWVERT_SIZE; - xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8)); - xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) ); - xmm0 = _mm_min_ps(xmm0, xmm4); - xmm1 = _mm_max_ps(xmm1, xmm4); - - /* - add eax, 4 - jl loop1 - */ + xmm4 = _mm_load_ss( ( float * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 8 ) ); + xmm4 = _mm_loadh_pi( xmm4, ( __m64 * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 0 ) ); + xmm0 = _mm_min_ps( xmm0, xmm4 ); + xmm1 = _mm_max_ps( xmm1, xmm4 ); + + /* + add eax, 4 + jl loop1 + */ count_l = count_l + 4; - } while (count_l < 0); + } while ( count_l < 0 ); } @@ -435,16 +435,16 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, movhps [edi], xmm1 movss [edi+8], xmm1 */ - xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )); - xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )); - xmm0 = _mm_min_ps(xmm0, xmm2); - xmm1 = _mm_max_ps(xmm1, xmm3); - min_p = (char *) &min; - _mm_storeh_pi((__m64 *)(min_p), xmm0); - _mm_store_ss((float *)(min_p+8), xmm0); - max_p = (char *) &max; - _mm_storeh_pi((__m64 *)(max_p), xmm1); - _mm_store_ss((float *)(max_p+8), xmm1); + xmm2 = _mm_shuffle_ps( xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ) ); + xmm3 = _mm_shuffle_ps( xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ) ); + xmm0 = _mm_min_ps( xmm0, xmm2 ); + xmm1 = _mm_max_ps( xmm1, xmm3 ); + min_p = ( char * ) &min; + _mm_storeh_pi( ( __m64 * )( min_p ), xmm0 ); + _mm_store_ss( ( float * )( min_p + 8 ), xmm0 ); + max_p = ( char * ) &max; + _mm_storeh_pi( ( __m64 * )( max_p ), xmm1 ); + _mm_store_ss( ( float * )( max_p + 8 ), xmm1 ); } /* @@ -479,10 +479,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane * and eax, ~3 */ count_l4 = count; - constant_p = (char *) &constant; + constant_p = ( char * ) &constant; count_l1 = count_l4; - src_p = (char *) src; - dst_p = (char *) dst; + src_p = ( char * ) src; + dst_p = ( char * ) dst; count_l4 = count_l4 & ~3; /* @@ -493,91 +493,91 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane * movss xmm7, [edi+8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) */ - xmm5 = _mm_load_ss((float *) (constant_p+0)); - xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )); - xmm6 = _mm_load_ss((float *) (constant_p+4)); - xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )); - xmm7 = _mm_load_ss((float *) (constant_p+8)); - xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )); + xmm5 = _mm_load_ss( ( float * )( constant_p + 0 ) ); + xmm5 = _mm_shuffle_ps( xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) ); + xmm6 = _mm_load_ss( ( float * )( constant_p + 4 ) ); + xmm6 = _mm_shuffle_ps( xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) ); + xmm7 = _mm_load_ss( ( float * )( constant_p + 8 ) ); + xmm7 = _mm_shuffle_ps( xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) ); /* jz startVert1 */ - if (count_l4 != 0) { - /* - imul eax, 16 - add esi, eax - neg eax - */ + if ( count_l4 != 0 ) { + /* + imul eax, 16 + add esi, eax + neg eax + */ count_l4 = count_l4 * 16; src_p = src_p + count_l4; count_l4 = -count_l4; - /* - loopVert4: - */ + /* + loopVert4: + */ do { - /* - movlps xmm1, [esi+eax+ 0] - movlps xmm3, [esi+eax+ 8] - movhps xmm1, [esi+eax+16] - movhps xmm3, [esi+eax+24] - movlps xmm2, [esi+eax+32] - movlps xmm4, [esi+eax+40] - movhps xmm2, [esi+eax+48] - movhps xmm4, [esi+eax+56] - movaps xmm0, xmm1 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) - movaps xmm2, xmm3 - shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) - */ - xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0)); - xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8)); - xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16)); - xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24)); - xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32)); - xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40)); - xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48)); - xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56)); + /* + movlps xmm1, [esi+eax+ 0] + movlps xmm3, [esi+eax+ 8] + movhps xmm1, [esi+eax+16] + movhps xmm3, [esi+eax+24] + movlps xmm2, [esi+eax+32] + movlps xmm4, [esi+eax+40] + movhps xmm2, [esi+eax+48] + movhps xmm4, [esi+eax+56] + movaps xmm0, xmm1 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) + movaps xmm2, xmm3 + shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) + */ + xmm1 = _mm_loadl_pi( xmm1, ( __m64 * )( src_p + count_l4 + 0 ) ); + xmm3 = _mm_loadl_pi( xmm3, ( __m64 * )( src_p + count_l4 + 8 ) ); + xmm1 = _mm_loadh_pi( xmm1, ( __m64 * )( src_p + count_l4 + 16 ) ); + xmm3 = _mm_loadh_pi( xmm3, ( __m64 * )( src_p + count_l4 + 24 ) ); + xmm2 = _mm_loadl_pi( xmm2, ( __m64 * )( src_p + count_l4 + 32 ) ); + xmm4 = _mm_loadl_pi( xmm4, ( __m64 * )( src_p + count_l4 + 40 ) ); + xmm2 = _mm_loadh_pi( xmm2, ( __m64 * )( src_p + count_l4 + 48 ) ); + xmm4 = _mm_loadh_pi( xmm4, ( __m64 * )( src_p + count_l4 + 56 ) ); xmm0 = xmm1; - xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )); - xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )); + xmm0 = _mm_shuffle_ps( xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) ); + xmm1 = _mm_shuffle_ps( xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) ); xmm2 = xmm3; - xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )); - xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )); + xmm2 = _mm_shuffle_ps( xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) ); + xmm3 = _mm_shuffle_ps( xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) ); - /* - add ecx, 16 - add eax, 4*16 - */ + /* + add ecx, 16 + add eax, 4*16 + */ dst_p = dst_p + 16; - count_l4 = count_l4 + 4*16; - - /* - mulps xmm0, xmm5 - mulps xmm1, xmm6 - mulps xmm2, xmm7 - addps xmm0, xmm3 - addps xmm0, xmm1 - addps xmm0, xmm2 - */ - xmm0 = _mm_mul_ps(xmm0, xmm5); - xmm1 = _mm_mul_ps(xmm1, xmm6); - xmm2 = _mm_mul_ps(xmm2, xmm7); - xmm0 = _mm_add_ps(xmm0, xmm3); - xmm0 = _mm_add_ps(xmm0, xmm1); - xmm0 = _mm_add_ps(xmm0, xmm2); + count_l4 = count_l4 + 4 * 16; - /* - movlps [ecx-16+0], xmm0 - movhps [ecx-16+8], xmm0 - jl loopVert4 - */ - _mm_storel_pi((__m64 *) (dst_p-16+0), xmm0); - _mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0); - } while (count_l4 < 0); + /* + mulps xmm0, xmm5 + mulps xmm1, xmm6 + mulps xmm2, xmm7 + addps xmm0, xmm3 + addps xmm0, xmm1 + addps xmm0, xmm2 + */ + xmm0 = _mm_mul_ps( xmm0, xmm5 ); + xmm1 = _mm_mul_ps( xmm1, xmm6 ); + xmm2 = _mm_mul_ps( xmm2, xmm7 ); + xmm0 = _mm_add_ps( xmm0, xmm3 ); + xmm0 = _mm_add_ps( xmm0, xmm1 ); + xmm0 = _mm_add_ps( xmm0, xmm2 ); + + /* + movlps [ecx-16+0], xmm0 + movhps [ecx-16+8], xmm0 + jl loopVert4 + */ + _mm_storel_pi( ( __m64 * )( dst_p - 16 + 0 ), xmm0 ); + _mm_storeh_pi( ( __m64 * )( dst_p - 16 + 8 ), xmm0 ); + } while ( count_l4 < 0 ); } /* @@ -587,74 +587,178 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane * */ count_l1 = count_l1 & 3; - if(count_l1 != 0) { - /* - loopVert1: - */ + if ( count_l1 != 0 ) { + /* + loopVert1: + */ do { - /* - movss xmm0, [esi+eax+0] - movss xmm1, [esi+eax+4] - movss xmm2, [esi+eax+8] - mulss xmm0, xmm5 - mulss xmm1, xmm6 - mulss xmm2, xmm7 - addss xmm0, [esi+eax+12] - add ecx, 4 - addss xmm0, xmm1 - add eax, 16 - addss xmm0, xmm2 - dec edx - movss [ecx-4], xmm0 - jnz loopVert1 - */ - xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0)); - xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4)); - xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8)); - xmm3 = _mm_load_ss((float *) (src_p+count_l4+12)); - - xmm0 = _mm_mul_ss(xmm0, xmm5); - xmm1 = _mm_mul_ss(xmm1, xmm6); - xmm2 = _mm_mul_ss(xmm2, xmm7); - - xmm0 = _mm_add_ss(xmm0, xmm3); + /* + movss xmm0, [esi+eax+0] + movss xmm1, [esi+eax+4] + movss xmm2, [esi+eax+8] + mulss xmm0, xmm5 + mulss xmm1, xmm6 + mulss xmm2, xmm7 + addss xmm0, [esi+eax+12] + add ecx, 4 + addss xmm0, xmm1 + add eax, 16 + addss xmm0, xmm2 + dec edx + movss [ecx-4], xmm0 + jnz loopVert1 + */ + xmm0 = _mm_load_ss( ( float * )( src_p + count_l4 + 0 ) ); + xmm1 = _mm_load_ss( ( float * )( src_p + count_l4 + 4 ) ); + xmm2 = _mm_load_ss( ( float * )( src_p + count_l4 + 8 ) ); + xmm3 = _mm_load_ss( ( float * )( src_p + count_l4 + 12 ) ); + + xmm0 = _mm_mul_ss( xmm0, xmm5 ); + xmm1 = _mm_mul_ss( xmm1, xmm6 ); + xmm2 = _mm_mul_ss( xmm2, xmm7 ); + + xmm0 = _mm_add_ss( xmm0, xmm3 ); dst_p = dst_p + 4; - xmm0 = _mm_add_ss(xmm0, xmm1); + xmm0 = _mm_add_ss( xmm0, xmm1 ); count_l4 = count_l4 + 16; - xmm0 = _mm_add_ss(xmm0, xmm2); + xmm0 = _mm_add_ss( xmm0, xmm2 ); count_l1 = count_l1 - 1; - _mm_store_ss((float *) (dst_p-4), xmm0); - } while (count_l1 != 0); + _mm_store_ss( ( float * )( dst_p - 4 ), xmm0 ); + } while ( count_l1 != 0 ); } /* done: */ } -#elif defined(_MSC_VER) && defined(_M_IX86) +/* +============ +idSIMD_SSE::CullByFrustum +============ +*/ +void VPCALL idSIMD_SSE::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) { + __m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] ); + __m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] ); + __m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] ); + __m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] ); + __m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); + __m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] ); + __m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); + __m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] ); -#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 )) -#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 )) + for ( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + __m128 vX = _mm_set1_ps( vec.x ); + __m128 vY = _mm_set1_ps( vec.y ); + __m128 vZ = _mm_set1_ps( vec.z ); + __m128 d14 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA14, vX ), + _mm_mul_ps( fB14, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC14, vZ ), + fD14 + ) + ); + __m128 d56 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA56, vX ), + _mm_mul_ps( fB56, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC56, vZ ), + fD56 + ) + ); + const short mask6 = ( 1 << 6 ) - 1; + __m128 eps = _mm_set1_ps( epsilon ); + int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) ); + int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) ); + int mask_lo = mask_lo14 | mask_lo56 << 4; + pointCull[j] = mask_lo & mask6; + } +} -// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary) -#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 ) \ - __asm movaps reg4, reg2 /* reg4 = 8, 9, 10, 11 */ \ - __asm unpcklps reg2, reg3 /* reg2 = 8, 12, 9, 13 */ \ - __asm unpckhps reg4, reg3 /* reg4 = 10, 14, 11, 15 */ \ - __asm movaps reg3, reg0 /* reg3 = 0, 1, 2, 3 */ \ - __asm unpcklps reg0, reg1 /* reg0 = 0, 4, 1, 5 */ \ - __asm unpckhps reg3, reg1 /* reg3 = 2, 6, 3, 7 */ \ - __asm movaps reg1, reg0 /* reg1 = 0, 4, 1, 5 */ \ - __asm shufps reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg0 = 0, 4, 8, 12 */ \ - __asm shufps reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg1 = 1, 5, 9, 13 */ \ - __asm movaps reg2, reg3 /* reg2 = 2, 6, 3, 7 */ \ - __asm shufps reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg2 = 2, 6, 10, 14 */ \ - __asm shufps reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg3 = 3, 7, 11, 15 */ +/* +============ +idSIMD_SSE::CullByFrustum2 +============ +*/ +void VPCALL idSIMD_SSE::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) { + __m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] ); + __m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] ); + __m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] ); + __m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] ); + __m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); + __m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] ); + __m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); + __m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] ); -// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary) -#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \ - __asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \ - __asm movlps reg3, [address+ 8] /* reg3 = 2, 3, X, X */ \ + for ( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + __m128 vX = _mm_set1_ps( vec.x ); + __m128 vY = _mm_set1_ps( vec.y ); + __m128 vZ = _mm_set1_ps( vec.z ); + __m128 d14 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA14, vX ), + _mm_mul_ps( fB14, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC14, vZ ), + fD14 + ) + ); + __m128 d56 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA56, vX ), + _mm_mul_ps( fB56, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC56, vZ ), + fD56 + ) + ); + const short mask6 = ( 1 << 6 ) - 1; + __m128 eps = _mm_set1_ps( epsilon ); + int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) ); + int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) ); + eps = _mm_set1_ps( -epsilon ); + int mask_hi14 = _mm_movemask_ps( _mm_cmpgt_ps( d14, eps ) ); + int mask_hi56 = _mm_movemask_ps( _mm_cmpgt_ps( d56, eps ) ); + int mask_lo = mask_lo14 | mask_lo56 << 4; + int mask_hi = mask_hi14 | mask_hi56 << 4; + pointCull[j] = mask_lo & mask6 | ( mask_hi & mask6 ) << 6; + } +} + +#elif defined(_MSC_VER) && defined(_M_IX86) + +#include + +#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 )) +#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 )) + +// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary) +#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 ) \ + __asm movaps reg4, reg2 /* reg4 = 8, 9, 10, 11 */ \ + __asm unpcklps reg2, reg3 /* reg2 = 8, 12, 9, 13 */ \ + __asm unpckhps reg4, reg3 /* reg4 = 10, 14, 11, 15 */ \ + __asm movaps reg3, reg0 /* reg3 = 0, 1, 2, 3 */ \ + __asm unpcklps reg0, reg1 /* reg0 = 0, 4, 1, 5 */ \ + __asm unpckhps reg3, reg1 /* reg3 = 2, 6, 3, 7 */ \ + __asm movaps reg1, reg0 /* reg1 = 0, 4, 1, 5 */ \ + __asm shufps reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg0 = 0, 4, 8, 12 */ \ + __asm shufps reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg1 = 1, 5, 9, 13 */ \ + __asm movaps reg2, reg3 /* reg2 = 2, 6, 3, 7 */ \ + __asm shufps reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg2 = 2, 6, 10, 14 */ \ + __asm shufps reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg3 = 3, 7, 11, 15 */ + +// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary) +#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \ + __asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \ + __asm movlps reg3, [address+ 8] /* reg3 = 2, 3, X, X */ \ __asm movhps reg1, [address+16] /* reg1 = 0, 1, 4, 5 */ \ __asm movhps reg3, [address+24] /* reg3 = 2, 3, 6, 7 */ \ __asm movlps reg2, [address+32] /* reg2 = 8, 9, X, X */ \ @@ -1017,17 +1121,17 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane * #define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT } ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 ); -ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 ); +ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1 << 15 ); -ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) ); -ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) ); -ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) ); -ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) ); +ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle0, ( 3 << 0 ) | ( 2 << 8 ) | ( 1 << 16 ) | ( 0 << 24 ) ); +ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle1, ( 0 << 0 ) | ( 1 << 8 ) | ( 2 << 16 ) | ( 3 << 24 ) ); +ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle2, ( 1 << 0 ) | ( 0 << 8 ) | ( 3 << 16 ) | ( 2 << 24 ) ); +ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle3, ( 2 << 0 ) | ( 3 << 8 ) | ( 0 << 16 ) | ( 1 << 24 ) ); -ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, (unsigned int) ( 1 << 31 ), 0, 0, 0 ); -ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, (unsigned int) ( 1 << 31 ) ); -ALIGN4_INIT1( unsigned int SIMD_SP_absMask, (unsigned int) ~( 1 << 31 ) ); -ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, (unsigned int) ~( 1 << 23 ) ); +ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, ( unsigned int )( 1 << 31 ), 0, 0, 0 ); +ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, ( unsigned int )( 1 << 31 ) ); +ALIGN4_INIT1( unsigned int SIMD_SP_absMask, ( unsigned int ) ~( 1 << 31 ) ); +ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, ( unsigned int ) ~( 1 << 23 ) ); ALIGN4_INIT1( unsigned int SIMD_SP_not, 0xFFFFFFFF ); ALIGN4_INIT1( float SIMD_SP_zero, 0.0f ); @@ -1036,7 +1140,7 @@ ALIGN4_INIT1( float SIMD_SP_one, 1.0f ); ALIGN4_INIT1( float SIMD_SP_two, 2.0f ); ALIGN4_INIT1( float SIMD_SP_three, 3.0f ); ALIGN4_INIT1( float SIMD_SP_four, 4.0f ); -ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) ); +ALIGN4_INIT1( float SIMD_SP_maxShort, ( 1 << 15 ) ); ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f ); ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI ); ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI ); @@ -1047,7 +1151,7 @@ ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f ); ALIGN4_INIT1( float SIMD_SP_rsqrt_c0, 3.0f ); ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f ); -ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f ); +ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f * 0.5f ); ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f ); ALIGN4_INIT1( float SIMD_SP_sin_c1, 2.7526e-06f ); @@ -1952,7 +2056,7 @@ float SSE_ATan( float y, float x ) { if ( fabs( y ) > fabs( x ) ) { a = -x / y; d = idMath::HALF_PI; - *((unsigned int *)&d) ^= ( *((unsigned int *)&x) ^ *((unsigned int *)&y) ) & (1<<31); + *( ( unsigned int * )&d ) ^= ( *( ( unsigned int * )&x ) ^ * ( ( unsigned int * )&y ) ) & ( 1 << 31 ); } else { a = y / x; d = 0.0f; @@ -2179,63 +2283,62 @@ void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, int pre, post; // 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x)); - __asm - { - movss xmm1,constant - shufps xmm1,xmm1,0 + __asm { + movss xmm1, constant + shufps xmm1, xmm1, 0 KFLOATINITDS( dst, src, count, pre, post ) - and eax,15 + and eax, 15 jne lpNA jmp lpA align 16 -lpA: - movaps xmm2,[edx+ebx] - movaps xmm3,[edx+ebx+16] - rcpps xmm4,xmm2 - rcpps xmm5,xmm3 - prefetchnta [edx+ebx+64] - mulps xmm2,xmm4 - mulps xmm2,xmm4 - mulps xmm3,xmm5 - mulps xmm3,xmm5 - addps xmm4,xmm4 - addps xmm5,xmm5 - subps xmm4,xmm2 - subps xmm5,xmm3 - mulps xmm4,xmm1 - mulps xmm5,xmm1 - movaps [edi+ebx],xmm4 - movaps [edi+ebx+16],xmm5 - add ebx,16*2 + lpA: + movaps xmm2, [edx + ebx] + movaps xmm3, [edx + ebx + 16] + rcpps xmm4, xmm2 + rcpps xmm5, xmm3 + prefetchnta [edx + ebx + 64] + mulps xmm2, xmm4 + mulps xmm2, xmm4 + mulps xmm3, xmm5 + mulps xmm3, xmm5 + addps xmm4, xmm4 + addps xmm5, xmm5 + subps xmm4, xmm2 + subps xmm5, xmm3 + mulps xmm4, xmm1 + mulps xmm5, xmm1 + movaps [edi + ebx], xmm4 + movaps [edi + ebx + 16], xmm5 + add ebx, 16 * 2 jl lpA jmp done align 16 -lpNA: - movups xmm2,[edx+ebx] - movups xmm3,[edx+ebx+16] - rcpps xmm4,xmm2 - rcpps xmm5,xmm3 - prefetchnta [edx+ebx+64] - mulps xmm2,xmm4 - mulps xmm2,xmm4 - mulps xmm3,xmm5 - mulps xmm3,xmm5 - addps xmm4,xmm4 - addps xmm5,xmm5 - subps xmm4,xmm2 - subps xmm5,xmm3 - mulps xmm4,xmm1 - mulps xmm5,xmm1 - movaps [edi+ebx],xmm4 - movaps [edi+ebx+16],xmm5 - add ebx,16*2 + lpNA: + movups xmm2, [edx + ebx] + movups xmm3, [edx + ebx + 16] + rcpps xmm4, xmm2 + rcpps xmm5, xmm3 + prefetchnta [edx + ebx + 64] + mulps xmm2, xmm4 + mulps xmm2, xmm4 + mulps xmm3, xmm5 + mulps xmm3, xmm5 + addps xmm4, xmm4 + addps xmm5, xmm5 + subps xmm4, xmm2 + subps xmm5, xmm3 + mulps xmm4, xmm1 + mulps xmm5, xmm1 + movaps [edi + ebx], xmm4 + movaps [edi + ebx + 16], xmm5 + add ebx, 16 * 2 jl lpNA -done: - mov edx,src - mov edi,dst - KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ), - KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count ) + done: + mov edx, src + mov edi, dst + KFLOATOPER( KDIVDSS1( [edi + ebx], xmm1, [edx + ebx] ), + KDIVDSS4( [edi + ebx], xmm1, [edx + ebx] ), count ) } } @@ -2247,66 +2350,65 @@ idSIMD_SSE::Div ============ */ void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) { - int pre,post; + int pre, post; // 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x)); - __asm - { + __asm { KFLOATINITDSS( dst, src0, src1, count, pre, post ) - and eax,15 + and eax, 15 jne lpNA jmp lpA align 16 -lpA: - movaps xmm2,[esi+ebx] - movaps xmm3,[esi+ebx+16] - rcpps xmm4,xmm2 - rcpps xmm5,xmm3 - prefetchnta [esi+ebx+64] - mulps xmm2,xmm4 - mulps xmm2,xmm4 - mulps xmm3,xmm5 - mulps xmm3,xmm5 - addps xmm4,xmm4 - addps xmm5,xmm5 - subps xmm4,xmm2 - subps xmm5,xmm3 - mulps xmm4,[edx+ebx] - mulps xmm5,[edx+ebx+16] - movaps [edi+ebx],xmm4 - movaps [edi+ebx+16],xmm5 - add ebx,16*2 + lpA: + movaps xmm2, [esi + ebx] + movaps xmm3, [esi + ebx + 16] + rcpps xmm4, xmm2 + rcpps xmm5, xmm3 + prefetchnta [esi + ebx + 64] + mulps xmm2, xmm4 + mulps xmm2, xmm4 + mulps xmm3, xmm5 + mulps xmm3, xmm5 + addps xmm4, xmm4 + addps xmm5, xmm5 + subps xmm4, xmm2 + subps xmm5, xmm3 + mulps xmm4, [edx + ebx] + mulps xmm5, [edx + ebx + 16] + movaps [edi + ebx], xmm4 + movaps [edi + ebx + 16], xmm5 + add ebx, 16 * 2 jl lpA jmp done align 16 -lpNA: - movups xmm2,[esi+ebx] - movups xmm3,[esi+ebx+16] - rcpps xmm4,xmm2 - rcpps xmm5,xmm3 - prefetchnta [esi+ebx+64] - mulps xmm2,xmm4 - mulps xmm2,xmm4 - mulps xmm3,xmm5 - mulps xmm3,xmm5 - addps xmm4,xmm4 - addps xmm5,xmm5 - subps xmm4,xmm2 - subps xmm5,xmm3 - movups xmm2,[edx+ebx] - movups xmm3,[edx+ebx+16] - mulps xmm4,xmm2 - mulps xmm5,xmm3 - movaps [edi+ebx],xmm4 - movaps [edi+ebx+16],xmm5 - add ebx,16*2 + lpNA: + movups xmm2, [esi + ebx] + movups xmm3, [esi + ebx + 16] + rcpps xmm4, xmm2 + rcpps xmm5, xmm3 + prefetchnta [esi + ebx + 64] + mulps xmm2, xmm4 + mulps xmm2, xmm4 + mulps xmm3, xmm5 + mulps xmm3, xmm5 + addps xmm4, xmm4 + addps xmm5, xmm5 + subps xmm4, xmm2 + subps xmm5, xmm3 + movups xmm2, [edx + ebx] + movups xmm3, [edx + ebx + 16] + mulps xmm4, xmm2 + mulps xmm5, xmm3 + movaps [edi + ebx], xmm4 + movaps [edi + ebx + 16], xmm5 + add ebx, 16 * 2 jl lpNA -done: - mov edx,src0 - mov esi,src1 - mov edi,dst - KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ), - KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count ) + done: + mov edx, src0 + mov esi, src1 + mov edi, dst + KFLOATOPER( KDIVDSS1( [edi + ebx], [edx + ebx], [esi + ebx] ), + KDIVDSS4( [edi + ebx], [edx + ebx], [esi + ebx] ), count ) } } /* @@ -2527,8 +2629,7 @@ idSIMD_SSE::Dot ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) { - __asm - { + __asm { mov eax, count mov edi, constant mov edx, eax @@ -2538,9 +2639,9 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *s movss xmm4, [edi+0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm5, [edi+4] + movss xmm5, [edi + 4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm6, [edi+8] + movss xmm6, [edi + 8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) jz done4 @@ -2548,37 +2649,37 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *s add esi, eax neg eax - loop4: - movlps xmm1, [esi+eax+ 0] - movlps xmm2, [esi+eax+ 8] - movlps xmm3, [esi+eax+16] - movhps xmm1, [esi+eax+24] - movhps xmm2, [esi+eax+32] - movhps xmm3, [esi+eax+40] + loop4: + movlps xmm1, [esi + eax + 0] + movlps xmm2, [esi + eax + 8] + movlps xmm3, [esi + eax + 16] + movhps xmm1, [esi + eax + 24] + movhps xmm2, [esi + eax + 32] + movhps xmm3, [esi + eax + 40] movaps xmm0, xmm1 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 ) shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) add ecx, 16 - add eax, 4*12 + add eax, 4 * 12 mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 ) - movlps [ecx-16+0], xmm0 - movhps [ecx-16+8], xmm0 + movlps [ecx - 16 + 0], xmm0 + movhps [ecx - 16 + 8], xmm0 jl loop4 - done4: + done4: and edx, 3 jz done1 - loop1: - movss xmm0, [esi+eax+0] - movss xmm1, [esi+eax+4] - movss xmm2, [esi+eax+8] + loop1: + movss xmm0, [esi + eax + 0] + movss xmm1, [esi + eax + 4] + movss xmm2, [esi + eax + 8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 @@ -2587,10 +2688,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *s add eax, 12 addss xmm0, xmm2 dec edx - movss [ecx-4], xmm0 + movss [ecx - 4], xmm0 jnz loop1 - done1: + done1: } } @@ -2612,9 +2713,9 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane * movss xmm5, [edi+0] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm6, [edi+4] + movss xmm6, [edi + 4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm7, [edi+8] + movss xmm7, [edi + 8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) jz startVert1 @@ -2622,16 +2723,16 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane * add esi, eax neg eax - loopVert4: + loopVert4: - movlps xmm1, [esi+eax+ 0] - movlps xmm3, [esi+eax+ 8] - movhps xmm1, [esi+eax+16] - movhps xmm3, [esi+eax+24] - movlps xmm2, [esi+eax+32] - movlps xmm4, [esi+eax+40] - movhps xmm2, [esi+eax+48] - movhps xmm4, [esi+eax+56] + movlps xmm1, [esi + eax + 0] + movlps xmm3, [esi + eax + 8] + movhps xmm1, [esi + eax + 16] + movhps xmm3, [esi + eax + 24] + movlps xmm2, [esi + eax + 32] + movlps xmm4, [esi + eax + 40] + movhps xmm2, [esi + eax + 48] + movhps xmm4, [esi + eax + 56] movaps xmm0, xmm1 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) @@ -2640,7 +2741,7 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane * shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) add ecx, 16 - add eax, 4*16 + add eax, 4 * 16 mulps xmm0, xmm5 mulps xmm1, xmm6 @@ -2649,31 +2750,31 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane * addps xmm0, xmm1 addps xmm0, xmm2 - movlps [ecx-16+0], xmm0 - movhps [ecx-16+8], xmm0 + movlps [ecx - 16 + 0], xmm0 + movhps [ecx - 16 + 8], xmm0 jl loopVert4 - startVert1: + startVert1: and edx, 3 jz done - loopVert1: - movss xmm0, [esi+eax+0] - movss xmm1, [esi+eax+4] - movss xmm2, [esi+eax+8] + loopVert1: + movss xmm0, [esi + eax + 0] + movss xmm1, [esi + eax + 4] + movss xmm2, [esi + eax + 8] mulss xmm0, xmm5 mulss xmm1, xmm6 mulss xmm2, xmm7 - addss xmm0, [esi+eax+12] + addss xmm0, [esi + eax + 12] add ecx, 4 addss xmm0, xmm1 add eax, 16 addss xmm0, xmm2 dec edx - movss [ecx-4], xmm0 + movss [ecx - 4], xmm0 jnz loopVert1 - done: + done: } } @@ -2687,7 +2788,7 @@ idSIMD_SSE::Dot void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); // 0, 1, 2 // 3, 4, 5 @@ -2704,9 +2805,9 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVer movss xmm4, [edi+0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm5, [edi+4] + movss xmm5, [edi + 4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm6, [edi+8] + movss xmm6, [edi + 8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) jz startVert1 @@ -2714,27 +2815,27 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVer add esi, eax neg eax - loopVert4: - movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X - movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X - movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1 + loopVert4: + movss xmm0, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] // 3, X, X, X + movss xmm2, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] // 2, X, X, X + movhps xmm0, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] // 3, X, 0, 1 movaps xmm1, xmm0 // 3, X, 0, 1 - movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1 + movlps xmm1, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] // 4, 5, 0, 1 shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5 - movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X - movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7 + movss xmm3, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] // 9, X, X, X + movhps xmm3, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] // 9, X, 6, 7 shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9 - movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7 + movlps xmm3, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] // 10, 11, 6, 7 shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10 - movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X + movhps xmm3, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] // 10, 11, 8, X shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11 add ecx, 16 - add eax, 4*DRAWVERT_SIZE + add eax, 4 * DRAWVERT_SIZE mulps xmm0, xmm4 mulps xmm1, xmm5 @@ -2742,18 +2843,18 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVer addps xmm0, xmm1 addps xmm0, xmm2 - movlps [ecx-16+0], xmm0 - movhps [ecx-16+8], xmm0 + movlps [ecx - 16 + 0], xmm0 + movhps [ecx - 16 + 8], xmm0 jl loopVert4 - startVert1: + startVert1: and edx, 3 jz done - loopVert1: - movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0] - movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4] - movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8] + loopVert1: + movss xmm0, [esi + eax + DRAWVERT_XYZ_OFFSET + 0] + movss xmm1, [esi + eax + DRAWVERT_XYZ_OFFSET + 4] + movss xmm2, [esi + eax + DRAWVERT_XYZ_OFFSET + 8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 @@ -2762,10 +2863,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVer add eax, DRAWVERT_SIZE addss xmm0, xmm2 dec edx - movss [ecx-4], xmm0 + movss [ecx - 4], xmm0 jnz loopVert1 - done: + done: } } @@ -2777,8 +2878,7 @@ idSIMD_SSE::Dot ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) { - __asm - { + __asm { mov eax, count mov edi, constant mov edx, eax @@ -2788,11 +2888,11 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 * movss xmm4, [edi+0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm5, [edi+4] + movss xmm5, [edi + 4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm6, [edi+8] + movss xmm6, [edi + 8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm7, [edi+12] + movss xmm7, [edi + 12] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) jz done4 @@ -2800,20 +2900,20 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 * add esi, eax neg eax - loop4: - movlps xmm1, [esi+eax+ 0] - movlps xmm2, [esi+eax+ 8] - movlps xmm3, [esi+eax+16] - movhps xmm1, [esi+eax+24] - movhps xmm2, [esi+eax+32] - movhps xmm3, [esi+eax+40] + loop4: + movlps xmm1, [esi + eax + 0] + movlps xmm2, [esi + eax + 8] + movlps xmm3, [esi + eax + 16] + movhps xmm1, [esi + eax + 24] + movhps xmm2, [esi + eax + 32] + movhps xmm3, [esi + eax + 40] movaps xmm0, xmm1 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 ) shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) add ecx, 16 - add eax, 4*12 + add eax, 4 * 12 mulps xmm0, xmm4 mulps xmm1, xmm5 @@ -2823,18 +2923,18 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 * addps xmm0, xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 ) - movlps [ecx-16+0], xmm0 - movhps [ecx-16+8], xmm0 + movlps [ecx - 16 + 0], xmm0 + movhps [ecx - 16 + 8], xmm0 jl loop4 - done4: + done4: and edx, 3 jz done1 - loop1: - movss xmm0, [esi+eax+0] - movss xmm1, [esi+eax+4] - movss xmm2, [esi+eax+8] + loop1: + movss xmm0, [esi + eax + 0] + movss xmm1, [esi + eax + 4] + movss xmm2, [esi + eax + 8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 @@ -2844,10 +2944,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 * add eax, 12 addss xmm0, xmm2 dec edx - movss [ecx-4], xmm0 + movss [ecx - 4], xmm0 jnz loop1 - done1: + done1: } } @@ -2895,22 +2995,22 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane mov ecx, count movlps xmm4, [ebx] - shufps xmm4, xmm4, SHUFFLEPS(1,0,1,0) - movlps xmm5, [ebx+8] - shufps xmm5, xmm5, SHUFFLEPS(1,0,1,0) + shufps xmm4, xmm4, SHUFFLEPS( 1, 0, 1, 0 ) + movlps xmm5, [ebx + 8] + shufps xmm5, xmm5, SHUFFLEPS( 1, 0, 1, 0 ) xorps xmm0, xmm0 xorps xmm1, xmm1 - _lpAlignDest: + _lpAlignDest: test edx, 0x0f jz _destAligned - SINGLE_OP(eax,edx) + SINGLE_OP( eax, edx ) dec ecx jnz _lpAlignDest jmp _vpExit - _destAligned: + _destAligned: push ecx cmp ecx, 4 @@ -2918,30 +3018,30 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane and ecx, ~3 shl ecx, 2 - lea eax, [eax+ecx*4] + lea eax, [eax + ecx * 4] add edx, ecx neg ecx - movlps xmm0, [eax+ecx*4] - movhps xmm0, [eax+ecx*4+16] - movlps xmm2, [eax+ecx*4+32] - movhps xmm2, [eax+ecx*4+48] + movlps xmm0, [eax + ecx * 4] + movhps xmm0, [eax + ecx * 4 + 16] + movlps xmm2, [eax + ecx * 4 + 32] + movhps xmm2, [eax + ecx * 4 + 48] jmp _lpStart align 16 - _lp: - prefetchnta [eax+ecx*4+128] + _lp: + prefetchnta [eax + ecx * 4 + 128] addps xmm1, xmm0 - movlps xmm0, [eax+ecx*4] - movhps xmm0, [eax+ecx*4+16] - movlps xmm2, [eax+ecx*4+32] - movhps xmm2, [eax+ecx*4+48] - movaps [edx+ecx-16],xmm1 - _lpStart: - movlps xmm1, [eax+ecx*4+8] - movhps xmm1, [eax+ecx*4+24] - movlps xmm3, [eax+ecx*4+40] - movhps xmm3, [eax+ecx*4+56] + movlps xmm0, [eax + ecx * 4] + movhps xmm0, [eax + ecx * 4 + 16] + movlps xmm2, [eax + ecx * 4 + 32] + movhps xmm2, [eax + ecx * 4 + 48] + movaps [edx + ecx - 16], xmm1 + _lpStart: + movlps xmm1, [eax + ecx * 4 + 8] + movhps xmm1, [eax + ecx * 4 + 24] + movlps xmm3, [eax + ecx * 4 + 40] + movhps xmm3, [eax + ecx * 4 + 56] add ecx, 16 mulps xmm1, xmm5 mulps xmm2, xmm4 @@ -2950,23 +3050,23 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane mulps xmm0, xmm4 addps xmm0, xmm1 // y1+w1 x1+z1 y0+w0 x0+z0 movaps xmm1, xmm0 - shufps xmm0, xmm2, SHUFFLEPS(2,0,2,0) // x3+z3 x2+z2 x1+z1 x0+z0 - shufps xmm1, xmm2, SHUFFLEPS(3,1,3,1) // y3+w3 y2+w2 y1+w1 y0+w0 + shufps xmm0, xmm2, SHUFFLEPS( 2, 0, 2, 0 ) // x3+z3 x2+z2 x1+z1 x0+z0 + shufps xmm1, xmm2, SHUFFLEPS( 3, 1, 3, 1 ) // y3+w3 y2+w2 y1+w1 y0+w0 js _lp addps xmm1, xmm0 - movaps [edx+ecx-16], xmm1 - _post: + movaps [edx + ecx - 16], xmm1 + _post: pop ecx and ecx, 0x3 cmp ecx, 2 jl _post1 - DUAL_OP(eax,edx) + DUAL_OP( eax, edx ) sub ecx, 2 - _post1: + _post1: cmp ecx, 1 jne _vpExit - SINGLE_OP(eax,edx) - _vpExit: + SINGLE_OP( eax, edx ) + _vpExit: } #undef DUAL_OP @@ -2984,7 +3084,7 @@ idSIMD_SSE::Dot void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); // 0, 1, 2 // 3, 4, 5 @@ -3001,11 +3101,11 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe movss xmm4, [edi+0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm5, [edi+4] + movss xmm5, [edi + 4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm6, [edi+8] + movss xmm6, [edi + 8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm7, [edi+12] + movss xmm7, [edi + 12] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) jz startVert1 @@ -3013,27 +3113,27 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe add esi, eax neg eax - loopVert4: - movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X - movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X - movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1 + loopVert4: + movss xmm0, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] // 3, X, X, X + movss xmm2, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] // 2, X, X, X + movhps xmm0, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] // 3, X, 0, 1 movaps xmm1, xmm0 // 3, X, 0, 1 - movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1 + movlps xmm1, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] // 4, 5, 0, 1 shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5 - movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X - movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7 + movss xmm3, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] // 9, X, X, X + movhps xmm3, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] // 9, X, 6, 7 shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9 - movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7 + movlps xmm3, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] // 10, 11, 6, 7 shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10 - movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X + movhps xmm3, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] // 10, 11, 8, X shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11 add ecx, 16 - add eax, 4*DRAWVERT_SIZE + add eax, 4 * DRAWVERT_SIZE mulps xmm0, xmm4 mulps xmm1, xmm5 @@ -3042,18 +3142,18 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe addps xmm0, xmm1 addps xmm0, xmm2 - movlps [ecx-16+0], xmm0 - movhps [ecx-16+8], xmm0 + movlps [ecx - 16 + 0], xmm0 + movhps [ecx - 16 + 8], xmm0 jl loopVert4 - startVert1: + startVert1: and edx, 3 jz done - loopVert1: - movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0] - movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4] - movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8] + loopVert1: + movss xmm0, [esi + eax + DRAWVERT_XYZ_OFFSET + 0] + movss xmm1, [esi + eax + DRAWVERT_XYZ_OFFSET + 4] + movss xmm2, [esi + eax + DRAWVERT_XYZ_OFFSET + 8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 @@ -3063,10 +3163,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe add eax, DRAWVERT_SIZE addss xmm0, xmm2 dec edx - movss [ecx-4], xmm0 + movss [ecx - 4], xmm0 jnz loopVert1 - done: + done: } } @@ -3078,8 +3178,7 @@ idSIMD_SSE::Dot ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) { - __asm - { + __asm { mov eax, count mov edi, src0 mov edx, eax @@ -3093,7 +3192,7 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, add esi, eax neg eax - loop4: + loop4: movlps xmm0, [esi+eax] // 0, 1, X, X movlps xmm3, [edi+eax] // 0, 1, X, X movlps xmm1, [esi+eax+8] // 2, 3, X, X @@ -3121,21 +3220,21 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, addps xmm7, xmm1 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 ) - movlps [ecx-16+0], xmm7 - movhps [ecx-16+8], xmm7 + movlps [ecx - 16 + 0], xmm7 + movhps [ecx - 16 + 8], xmm7 jl loop4 - done4: + done4: and edx, 3 jz done1 - loop1: - movss xmm0, [esi+eax+0] - movss xmm3, [edi+eax+0] - movss xmm1, [esi+eax+4] - movss xmm4, [edi+eax+4] - movss xmm2, [esi+eax+8] - movss xmm5, [edi+eax+8] + loop1: + movss xmm0, [esi + eax + 0] + movss xmm3, [edi + eax + 0] + movss xmm1, [esi + eax + 4] + movss xmm4, [edi + eax + 4] + movss xmm2, [esi + eax + 8] + movss xmm5, [edi + eax + 8] mulss xmm0, xmm3 mulss xmm1, xmm4 mulss xmm2, xmm5 @@ -3144,10 +3243,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, add eax, 12 addss xmm0, xmm2 dec edx - movss [ecx-4], xmm0 + movss [ecx - 4], xmm0 jnz loop1 - done1: + done1: } } @@ -3159,108 +3258,108 @@ idSIMD_SSE::Dot ============ */ void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) { - switch( count ) { - case 0: - dot = 0.0f; - return; - case 1: - dot = src1[0] * src2[0]; - return; - case 2: - dot = src1[0] * src2[0] + src1[1] * src2[1]; - return; - case 3: - dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2]; - return; - default: - __asm { - mov ecx, src1 - mov edx, src2 - mov eax, ecx - or eax, edx - and eax, 15 - jz alignedDot - // unaligned - mov eax, count - shr eax, 2 - shl eax, 4 - add ecx, eax - add edx, eax - neg eax - movups xmm0, [ecx+eax] - movups xmm1, [edx+eax] - mulps xmm0, xmm1 - add eax, 16 - jz doneDot + switch ( count ) { + case 0: + dot = 0.0f; + return; + case 1: + dot = src1[0] * src2[0]; + return; + case 2: + dot = src1[0] * src2[0] + src1[1] * src2[1]; + return; + case 3: + dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2]; + return; + default: + __asm { + mov ecx, src1 + mov edx, src2 + mov eax, ecx + or eax, edx + and eax, 15 + jz alignedDot + // unaligned + mov eax, count + shr eax, 2 + shl eax, 4 + add ecx, eax + add edx, eax + neg eax + movups xmm0, [ecx+eax] + movups xmm1, [edx+eax] + mulps xmm0, xmm1 + add eax, 16 + jz doneDot loopUnalignedDot: - movups xmm1, [ecx+eax] - movups xmm2, [edx+eax] - mulps xmm1, xmm2 - addps xmm0, xmm1 - add eax, 16 - jl loopUnalignedDot - jmp doneDot - // aligned + movups xmm1, [ecx+eax] + movups xmm2, [edx+eax] + mulps xmm1, xmm2 + addps xmm0, xmm1 + add eax, 16 + jl loopUnalignedDot + jmp doneDot + // aligned alignedDot: - mov eax, count - shr eax, 2 - shl eax, 4 - add ecx, eax - add edx, eax - neg eax - movaps xmm0, [ecx+eax] - movaps xmm1, [edx+eax] - mulps xmm0, xmm1 - add eax, 16 - jz doneDot + mov eax, count + shr eax, 2 + shl eax, 4 + add ecx, eax + add edx, eax + neg eax + movaps xmm0, [ecx+eax] + movaps xmm1, [edx+eax] + mulps xmm0, xmm1 + add eax, 16 + jz doneDot loopAlignedDot: - movaps xmm1, [ecx+eax] - movaps xmm2, [edx+eax] - mulps xmm1, xmm2 - addps xmm0, xmm1 - add eax, 16 - jl loopAlignedDot + movaps xmm1, [ecx+eax] + movaps xmm2, [edx+eax] + mulps xmm1, xmm2 + addps xmm0, xmm1 + add eax, 16 + jl loopAlignedDot doneDot: + } + switch ( count & 3 ) { + case 1: + __asm { + movss xmm1, [ecx] + movss xmm2, [edx] + mulss xmm1, xmm2 + addss xmm0, xmm1 } - switch( count & 3 ) { - case 1: - __asm { - movss xmm1, [ecx] - movss xmm2, [edx] - mulss xmm1, xmm2 - addss xmm0, xmm1 - } - break; - case 2: - __asm { - xorps xmm2, xmm2 - movlps xmm1, [ecx] - movlps xmm2, [edx] - mulps xmm1, xmm2 - addps xmm0, xmm1 - } - break; - case 3: - __asm { - movss xmm1, [ecx] - movhps xmm1, [ecx+4] - movss xmm2, [edx] - movhps xmm2, [edx+4] - mulps xmm1, xmm2 - addps xmm0, xmm1 - } - break; + break; + case 2: + __asm { + xorps xmm2, xmm2 + movlps xmm1, [ecx] + movlps xmm2, [edx] + mulps xmm1, xmm2 + addps xmm0, xmm1 } + break; + case 3: __asm { - movhlps xmm1, xmm0 - addps xmm0, xmm1 - movaps xmm1, xmm0 - shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) - addss xmm0, xmm1 - mov eax, dot - movss [eax], xmm0 + movss xmm1, [ecx] + movhps xmm1, [ecx+4] + movss xmm2, [edx] + movhps xmm2, [edx+4] + mulps xmm1, xmm2 + addps xmm0, xmm1 } - return; + break; + } + __asm { + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movaps xmm1, xmm0 + shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) + addss xmm0, xmm1 + mov eax, dot + movss [eax], xmm0 + } + return; } } @@ -3558,10 +3657,10 @@ idSIMD_SSE::MinMax void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) { int i, pre, post; - min = idMath::INFINITY; max = -idMath::INFINITY; + min = idMath::INFINITY; + max = -idMath::INFINITY; - __asm - { + __asm { push ebx mov eax, min mov ebx, max @@ -3575,30 +3674,30 @@ void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const jz lpA jmp lpNA align 16 -lpNA: - movups xmm2, [edx+ebx] - movups xmm3, [edx+ebx+16] + lpNA: + movups xmm2, [edx + ebx] + movups xmm3, [edx + ebx + 16] minps xmm0, xmm2 maxps xmm1, xmm2 - prefetchnta [edx+ebx+64] + prefetchnta [edx + ebx + 64] minps xmm0, xmm3 maxps xmm1, xmm3 - add ebx, 16*2 + add ebx, 16 * 2 jl lpNA jmp done2 -lpA: - movaps xmm2, [edx+ebx] - movaps xmm3, [edx+ebx+16] + lpA: + movaps xmm2, [edx + ebx] + movaps xmm3, [edx + ebx + 16] minps xmm0, xmm2 maxps xmm1, xmm2 - prefetchnta [edx+ebx+64] + prefetchnta [edx + ebx + 64] minps xmm0, xmm3 maxps xmm1, xmm3 - add ebx, 16*2 + add ebx, 16 * 2 jl lpA jmp done2 align 16 -done2: + done2: movaps xmm2, xmm0 movaps xmm3, xmm1 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) @@ -3617,7 +3716,7 @@ void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const mov ebx, max movss [eax], xmm0 movss [ebx], xmm1 -done: + done: pop ebx } @@ -3662,21 +3761,21 @@ void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, con movlps xmm2, [esi] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) dec eax - add esi, 2*4 + add esi, 2 * 4 minps xmm0, xmm2 maxps xmm1, xmm2 - startLoop: - imul eax, 2*4 + startLoop: + imul eax, 2 * 4 add esi, eax neg eax - loopVert: - movlps xmm2, [esi+eax] - movhps xmm2, [esi+eax+8] - add eax, 4*4 + loopVert: + movlps xmm2, [esi + eax] + movhps xmm2, [esi + eax + 8] + add eax, 4 * 4 minps xmm0, xmm2 maxps xmm1, xmm2 jl loopVert - done: + done: movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 ) minps xmm0, xmm2 @@ -3713,33 +3812,33 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, con add esi, eax neg eax - loop4: -// prefetchnta [esi+4*12] + loop4: + // prefetchnta [esi+4*12] - movss xmm4, [esi+eax+0*12+8] - movhps xmm4, [esi+eax+0*12+0] + movss xmm4, [esi + eax + 0 * 12 + 8] + movhps xmm4, [esi + eax + 0 * 12 + 0] minps xmm0, xmm4 maxps xmm1, xmm4 - movss xmm5, [esi+eax+1*12+0] - movhps xmm5, [esi+eax+1*12+4] + movss xmm5, [esi + eax + 1 * 12 + 0] + movhps xmm5, [esi + eax + 1 * 12 + 4] minps xmm2, xmm5 maxps xmm3, xmm5 - movss xmm6, [esi+eax+2*12+8] - movhps xmm6, [esi+eax+2*12+0] + movss xmm6, [esi + eax + 2 * 12 + 8] + movhps xmm6, [esi + eax + 2 * 12 + 0] minps xmm0, xmm6 maxps xmm1, xmm6 - movss xmm7, [esi+eax+3*12+0] - movhps xmm7, [esi+eax+3*12+4] + movss xmm7, [esi + eax + 3 * 12 + 0] + movhps xmm7, [esi + eax + 3 * 12 + 4] minps xmm2, xmm7 maxps xmm3, xmm7 - add eax, 4*12 + add eax, 4 * 12 jl loop4 - done4: + done4: mov eax, count and eax, 3 jz done1 @@ -3747,26 +3846,26 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, con add esi, eax neg eax - loop1: - movss xmm4, [esi+eax+0*12+8] - movhps xmm4, [esi+eax+0*12+0] + loop1: + movss xmm4, [esi + eax + 0 * 12 + 8] + movhps xmm4, [esi + eax + 0 * 12 + 0] minps xmm0, xmm4 maxps xmm1, xmm4 add eax, 12 jl loop1 - done1: + done1: shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ) shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ) minps xmm0, xmm2 maxps xmm1, xmm3 mov esi, min movhps [esi], xmm0 - movss [esi+8], xmm0 + movss [esi + 8], xmm0 mov edi, max movhps [edi], xmm1 - movss [edi+8], xmm1 + movss [edi + 8], xmm1 } } @@ -3778,7 +3877,7 @@ idSIMD_SSE::MinMax void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); __asm { @@ -3797,33 +3896,33 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, add esi, eax neg eax - loop4: -// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] + loop4: + // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] - movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] - movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + movss xmm4, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm4, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] minps xmm0, xmm4 maxps xmm1, xmm4 - movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] - movhps xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] + movss xmm5, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] + movhps xmm5, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] minps xmm2, xmm5 maxps xmm3, xmm5 - movss xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] - movhps xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + movss xmm6, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm6, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] minps xmm0, xmm6 maxps xmm1, xmm6 - movss xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] - movhps xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] + movss xmm7, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] + movhps xmm7, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] minps xmm2, xmm7 maxps xmm3, xmm7 - add eax, 4*DRAWVERT_SIZE + add eax, 4 * DRAWVERT_SIZE jl loop4 - done4: + done4: mov eax, count and eax, 3 jz done1 @@ -3831,26 +3930,26 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, add esi, eax neg eax - loop1: - movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] - movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + loop1: + movss xmm4, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm4, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] minps xmm0, xmm4 maxps xmm1, xmm4 add eax, DRAWVERT_SIZE jl loop1 - done1: + done1: shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ) shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ) minps xmm0, xmm2 maxps xmm1, xmm3 mov esi, min movhps [esi], xmm0 - movss [esi+8], xmm0 + movss [esi + 8], xmm0 mov edi, max movhps [edi], xmm1 - movss [edi+8], xmm1 + movss [edi + 8], xmm1 } } @@ -3862,7 +3961,7 @@ idSIMD_SSE::MinMax void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); __asm { @@ -3882,42 +3981,42 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, add edi, eax neg eax - loop4: -// prefetchnta [edi+128] -// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] + loop4: + // prefetchnta [edi+128] + // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] - mov edx, [edi+eax+0] + mov edx, [edi + eax + 0] imul edx, DRAWVERT_SIZE - movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8] - movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0] + movss xmm4, [esi + edx + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm4, [esi + edx + DRAWVERT_XYZ_OFFSET + 0] minps xmm0, xmm4 maxps xmm1, xmm4 - mov edx, [edi+eax+4] + mov edx, [edi + eax + 4] imul edx, DRAWVERT_SIZE - movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0] - movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4] + movss xmm5, [esi + edx + DRAWVERT_XYZ_OFFSET + 0] + movhps xmm5, [esi + edx + DRAWVERT_XYZ_OFFSET + 4] minps xmm2, xmm5 maxps xmm3, xmm5 - mov edx, [edi+eax+8] + mov edx, [edi + eax + 8] imul edx, DRAWVERT_SIZE - movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8] - movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] + movss xmm6, [esi + edx + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm6, [esi + edx + DRAWVERT_XYZ_OFFSET + 0] minps xmm0, xmm6 maxps xmm1, xmm6 - mov edx, [edi+eax+12] + mov edx, [edi + eax + 12] imul edx, DRAWVERT_SIZE - movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0] - movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4] + movss xmm7, [esi + edx + DRAWVERT_XYZ_OFFSET + 0] + movhps xmm7, [esi + edx + DRAWVERT_XYZ_OFFSET + 4] minps xmm2, xmm7 maxps xmm3, xmm7 - add eax, 4*4 + add eax, 4 * 4 jl loop4 - done4: + done4: mov eax, count and eax, 3 jz done1 @@ -3925,28 +4024,28 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, add edi, eax neg eax - loop1: - mov edx, [edi+eax+0] + loop1: + mov edx, [edi + eax + 0] imul edx, DRAWVERT_SIZE; - movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8] - movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0] + movss xmm4, [esi + edx + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm4, [esi + edx + DRAWVERT_XYZ_OFFSET + 0] minps xmm0, xmm4 maxps xmm1, xmm4 add eax, 4 jl loop1 - done1: + done1: shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ) shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ) minps xmm0, xmm2 maxps xmm1, xmm3 mov esi, min movhps [esi], xmm0 - movss [esi+8], xmm0 + movss [esi + 8], xmm0 mov edi, max movhps [edi], xmm1 - movss [edi+8], xmm1 + movss [edi + 8], xmm1 } } @@ -3958,64 +4057,63 @@ idSIMD_SSE::Clamp void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) { int i, pre, post; - __asm - { - movss xmm0,min - movss xmm1,max - shufps xmm0,xmm0,0 - shufps xmm1,xmm1,0 + __asm { + movss xmm0, min + movss xmm1, max + shufps xmm0, xmm0, 0 + shufps xmm1, xmm1, 0 KFLOATINITDS( dst, src, count, pre, post ) - and eax,15 + and eax, 15 jne lpNA jmp lpA align 16 -lpA: - movaps xmm2,[edx+ebx] - movaps xmm3,[edx+ebx+16] - maxps xmm2,xmm0 - maxps xmm3,xmm0 - prefetchnta [edx+ebx+64] - minps xmm2,xmm1 - minps xmm3,xmm1 - movaps [edi+ebx],xmm2 - movaps [edi+ebx+16],xmm3 - add ebx,16*2 + lpA: + movaps xmm2, [edx + ebx] + movaps xmm3, [edx + ebx + 16] + maxps xmm2, xmm0 + maxps xmm3, xmm0 + prefetchnta [edx + ebx + 64] + minps xmm2, xmm1 + minps xmm3, xmm1 + movaps [edi + ebx], xmm2 + movaps [edi + ebx + 16], xmm3 + add ebx, 16 * 2 jl lpA jmp done align 16 -lpNA: - movups xmm2,[edx+ebx] - movups xmm3,[edx+ebx+16] - maxps xmm2,xmm0 - maxps xmm3,xmm0 - prefetchnta [edx+ebx+64] - minps xmm2,xmm1 - minps xmm3,xmm1 - movaps [edi+ebx],xmm2 - movaps [edi+ebx+16],xmm3 - add ebx,16*2 + lpNA: + movups xmm2, [edx + ebx] + movups xmm3, [edx + ebx + 16] + maxps xmm2, xmm0 + maxps xmm3, xmm0 + prefetchnta [edx + ebx + 64] + minps xmm2, xmm1 + minps xmm3, xmm1 + movaps [edi + ebx], xmm2 + movaps [edi + ebx + 16], xmm3 + add ebx, 16 * 2 jl lpNA -done: + done: } for ( i = 0; i < pre; i++ ) { if ( src[i] < min ) - dst[i] = min; + { dst[i] = min; } else if ( src[i] > max ) - dst[i] = max; + { dst[i] = max; } else - dst[i] = src[i]; + { dst[i] = src[i]; } } - for( i = count - post; i < count; i++ ) { + for ( i = count - post; i < count; i++ ) { if ( src[i] < min ) - dst[i] = min; + { dst[i] = min; } else if ( src[i] > max ) - dst[i] = max; + { dst[i] = max; } else - dst[i] = src[i]; + { dst[i] = src[i]; } } } @@ -4027,53 +4125,52 @@ idSIMD_SSE::ClampMin void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) { int i, pre, post; - __asm - { - movss xmm0,min - shufps xmm0,xmm0,0 + __asm { + movss xmm0, min + shufps xmm0, xmm0, 0 KFLOATINITDS( dst, src, count, pre, post ) - and eax,15 + and eax, 15 jne lpNA jmp lpA align 16 -lpA: - movaps xmm2,[edx+ebx] - movaps xmm3,[edx+ebx+16] - maxps xmm2,xmm0 - prefetchnta [edx+ebx+64] - maxps xmm3,xmm0 - movaps [edi+ebx],xmm2 - movaps [edi+ebx+16],xmm3 - add ebx,16*2 + lpA: + movaps xmm2, [edx + ebx] + movaps xmm3, [edx + ebx + 16] + maxps xmm2, xmm0 + prefetchnta [edx + ebx + 64] + maxps xmm3, xmm0 + movaps [edi + ebx], xmm2 + movaps [edi + ebx + 16], xmm3 + add ebx, 16 * 2 jl lpA jmp done align 16 -lpNA: - movups xmm2,[edx+ebx] - movups xmm3,[edx+ebx+16] - maxps xmm2,xmm0 - prefetchnta [edx+ebx+64] - maxps xmm3,xmm0 - movaps [edi+ebx],xmm2 - movaps [edi+ebx+16],xmm3 - add ebx,16*2 + lpNA: + movups xmm2, [edx + ebx] + movups xmm3, [edx + ebx + 16] + maxps xmm2, xmm0 + prefetchnta [edx + ebx + 64] + maxps xmm3, xmm0 + movaps [edi + ebx], xmm2 + movaps [edi + ebx + 16], xmm3 + add ebx, 16 * 2 jl lpNA -done: + done: } - for( i = 0; i < pre; i++ ) { + for ( i = 0; i < pre; i++ ) { if ( src[i] < min ) - dst[i] = min; + { dst[i] = min; } else - dst[i] = src[i]; + { dst[i] = src[i]; } } - for( i = count - post; i < count; i++ ) { + for ( i = count - post; i < count; i++ ) { if ( src[i] < min ) - dst[i] = min; + { dst[i] = min; } else - dst[i] = src[i]; + { dst[i] = src[i]; } } } @@ -4085,54 +4182,53 @@ idSIMD_SSE::ClampMax void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) { int i, pre, post; - __asm - { - movss xmm1,max - shufps xmm1,xmm1,0 + __asm { + movss xmm1, max + shufps xmm1, xmm1, 0 KFLOATINITDS( dst, src, count, pre, post ) - and eax,15 + and eax, 15 jne lpNA jmp lpA align 16 -lpA: - movaps xmm2,[edx+ebx] - movaps xmm3,[edx+ebx+16] - minps xmm2,xmm1 - prefetchnta [edx+ebx+64] - minps xmm3,xmm1 - movaps [edi+ebx],xmm2 - movaps [edi+ebx+16],xmm3 - add ebx,16*2 + lpA: + movaps xmm2, [edx + ebx] + movaps xmm3, [edx + ebx + 16] + minps xmm2, xmm1 + prefetchnta [edx + ebx + 64] + minps xmm3, xmm1 + movaps [edi + ebx], xmm2 + movaps [edi + ebx + 16], xmm3 + add ebx, 16 * 2 jl lpA jmp done align 16 -lpNA: - movups xmm2,[edx+ebx] - movups xmm3,[edx+ebx+16] - minps xmm2,xmm1 - prefetchnta [edx+ebx+64] - minps xmm3,xmm1 - movaps [edi+ebx],xmm2 - movaps [edi+ebx+16],xmm3 - add ebx,16*2 + lpNA: + movups xmm2, [edx + ebx] + movups xmm3, [edx + ebx + 16] + minps xmm2, xmm1 + prefetchnta [edx + ebx + 64] + minps xmm3, xmm1 + movaps [edi + ebx], xmm2 + movaps [edi + ebx + 16], xmm3 + add ebx, 16 * 2 jl lpNA -done: + done: } - for( i = 0; i < pre; i++ ) { + for ( i = 0; i < pre; i++ ) { if ( src[i] > max ) - dst[i] = max; + { dst[i] = max; } else - dst[i] = src[i]; + { dst[i] = src[i]; } } - for( i = count - post; i < count; i++ ) { + for ( i = count - post; i < count; i++ ) { if ( src[i] > max ) - dst[i] = max; + { dst[i] = max; } else - dst[i] = src[i]; + { dst[i] = src[i]; } } } @@ -4152,11 +4248,11 @@ void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) { add edx, eax neg eax xorps xmm0, xmm0 - loopZero16: + loopZero16: movaps [edx+eax], xmm0 add eax, 16 jl loopZero16 - doneZero16: + doneZero16: } } @@ -4177,13 +4273,13 @@ void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) { neg eax movss xmm0, SIMD_SP_signBitMask shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - loopNegate16: - movaps xmm1, [edx+eax] + loopNegate16: + movaps xmm1, [edx + eax] xorps xmm1, xmm0 - movaps [edx+eax], xmm1 + movaps [edx + eax], xmm1 add eax, 16 jl loopNegate16 - doneNegate16: + doneNegate16: } } @@ -4204,12 +4300,12 @@ void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count ) add ecx, eax add edx, eax neg eax - loopCopy16: + loopCopy16: movaps xmm0, [ecx+eax] movaps [edx+eax], xmm0 add eax, 16 jl loopCopy16 - doneCopy16: + doneCopy16: } } @@ -4232,13 +4328,13 @@ void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2, add ecx, eax add edx, eax neg eax - loopAdd16: + loopAdd16: movaps xmm0, [ecx+eax] addps xmm0, [edx+eax] movaps [esi+eax], xmm0 add eax, 16 jl loopAdd16 - doneAdd16: + doneAdd16: } } @@ -4261,13 +4357,13 @@ void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2, add ecx, eax add edx, eax neg eax - loopSub16: + loopSub16: movaps xmm0, [ecx+eax] subps xmm0, [edx+eax] movaps [esi+eax], xmm0 add eax, 16 jl loopSub16 - doneSub16: + doneSub16: } } @@ -4290,13 +4386,13 @@ void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float consta add edx, eax neg eax shufps xmm1, xmm1, 0x00 - loopMulScalar16: + loopMulScalar16: movaps xmm0, [edx+eax] mulps xmm0, xmm1 movaps [ecx+eax], xmm0 add eax, 16 jl loopMulScalar16 - doneMulScalar16: + doneMulScalar16: } } @@ -4317,13 +4413,13 @@ void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int cou add ecx, eax add edx, eax neg eax - loopAddAssign16: + loopAddAssign16: movaps xmm0, [ecx+eax] addps xmm0, [edx+eax] movaps [ecx+eax], xmm0 add eax, 16 jl loopAddAssign16 - doneAddAssign16: + doneAddAssign16: } } @@ -4344,13 +4440,13 @@ void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int cou add ecx, eax add edx, eax neg eax - loopSubAssign16: + loopSubAssign16: movaps xmm0, [ecx+eax] subps xmm0, [edx+eax] movaps [ecx+eax], xmm0 add eax, 16 jl loopSubAssign16 - doneSubAssign16: + doneSubAssign16: } } @@ -4371,13 +4467,13 @@ void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int add ecx, eax neg eax shufps xmm1, xmm1, 0x00 - loopMulAssign16: + loopMulAssign16: movaps xmm0, [ecx+eax] mulps xmm0, xmm1 movaps [ecx+eax], xmm0 add eax, 16 jl loopMulAssign16 - doneMulAssign16: + doneMulAssign16: } } @@ -4417,1444 +4513,698 @@ void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numRows = mat.GetNumRows(); - switch( mat.GetNumColumns() ) { - case 1: { - switch( numRows ) { - case 1: { // 1x1 * 1x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - mulss xmm0, [edi] - STORE1( 0, xmm0, xmm1 ) - } - return; - } - case 6: { // 6x1 * 1x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm1, xmm0 - mulps xmm0, [edi] - mulps xmm1, [edi+16] - STORE4( 0, xmm0, xmm2 ) - STORE2LO( 16, xmm1, xmm2 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0]; - mPtr++; - } - return; - } + switch ( mat.GetNumColumns() ) { + case 1: { + switch ( numRows ) { + case 1: { // 1x1 * 1x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + mulss xmm0, [edi] + STORE1( 0, xmm0, xmm1 ) } - break; + return; } - case 2: { - switch( numRows ) { - case 2: { // 2x2 * 2x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - movss xmm1, [esi+4] - movss xmm2, [edi] - mulss xmm2, xmm0 - movss xmm3, [edi+4] - mulss xmm3, xmm1 - addss xmm2, xmm3 - STORE1( 0, xmm2, xmm4 ) - mulss xmm0, [edi+8] - mulss xmm1, [edi+8+4] - addss xmm0, xmm1 - STORE1( 4, xmm0, xmm4 ) - } - return; - } - case 6: { // 6x2 * 2x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm7, [esi] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movaps xmm0, [edi] - mulps xmm0, xmm7 - movaps xmm1, [edi+16] - mulps xmm1, xmm7 - movaps xmm2, xmm0 - shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - movaps xmm3, [edi+32] - addps xmm0, xmm2 - mulps xmm3, xmm7 - STORE4( 0, xmm0, xmm4 ) - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm1, xmm3 - addps xmm3, xmm1 - STORE2LO( 16, xmm3, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; - mPtr += 2; - } - return; - } + case 6: { // 6x1 * 1x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm1, xmm0 + mulps xmm0, [edi] + mulps xmm1, [edi + 16] + STORE4( 0, xmm0, xmm2 ) + STORE2LO( 16, xmm1, xmm2 ) } - break; + return; } - case 3: { - switch( numRows ) { - case 3: { // 3x3 * 3x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - movss xmm4, [edi] - mulss xmm4, xmm0 - movss xmm1, [esi+4] - movss xmm5, [edi+4] - mulss xmm5, xmm1 - addss xmm4, xmm5 - movss xmm2, [esi+8] - movss xmm6, [edi+8] - mulss xmm6, xmm2 - addss xmm4, xmm6 - movss xmm3, [edi+12] - mulss xmm3, xmm0 - STORE1( 0, xmm4, xmm7 ); - movss xmm5, [edi+12+4] - mulss xmm5, xmm1 - addss xmm3, xmm5 - movss xmm6, [edi+12+8] - mulss xmm6, xmm2 - addss xmm3, xmm6 - mulss xmm0, [edi+24] - mulss xmm1, [edi+24+4] - STORE1( 4, xmm3, xmm7 ); - addss xmm0, xmm1 - mulss xmm2, [edi+24+8] - addss xmm0, xmm2 - STORE1( 8, xmm0, xmm7 ); - } - return; - } - case 6: { // 6x3 * 3x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm5, [esi] - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm6, [esi+4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm7, [esi+8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 - movlps xmm1, [edi+4*4] - shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 - movlps xmm2, [edi+6*4] - movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 - mulps xmm0, xmm5 - movlps xmm3, [edi+10*4] - shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 - movaps xmm3, xmm1 - shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 - mulps xmm1, xmm6 - shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 - mulps xmm3, xmm7 - addps xmm0, xmm1 - addps xmm0, xmm3 - STORE4( 0, xmm0, xmm4 ) - movss xmm1, [edi+12*4] - mulss xmm1, xmm5 - movss xmm2, [edi+13*4] - mulss xmm2, xmm6 - movss xmm3, [edi+14*4] - mulss xmm3, xmm7 - addss xmm1, xmm2 - addss xmm1, xmm3 - STORE1( 16, xmm1, xmm4 ) - mulss xmm5, [edi+15*4] - mulss xmm6, [edi+16*4] - mulss xmm7, [edi+17*4] - addss xmm5, xmm6 - addss xmm5, xmm7 - STORE1( 20, xmm5, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; - mPtr += 3; - } - return; - } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0]; + mPtr++; } - break; + return; } - case 4: { - switch( numRows ) { - case 4: { // 4x4 * 4x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, qword ptr [esi ] - movlps xmm0, qword ptr [edi ] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm0, qword ptr [edi+16] - mulps xmm0, xmm6 - movlps xmm7, qword ptr [esi+ 8] - movlps xmm2, qword ptr [edi+ 8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm2, qword ptr [edi+24] - mulps xmm2, xmm7 - movlps xmm1, qword ptr [edi+32] - movhps xmm1, qword ptr [edi+48] - mulps xmm1, xmm6 - movlps xmm3, qword ptr [edi+40] - addps xmm0, xmm2 - movhps xmm3, qword ptr [edi+56] - mulps xmm3, xmm7 - movaps xmm4, xmm0 - addps xmm1, xmm3 - shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm4 - STORE4( 0, xmm0, xmm2 ) - } - return; - } - case 6: { // 6x4 * 4x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, qword ptr [esi+ 0] - movlps xmm0, qword ptr [edi+ 0] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm0, qword ptr [edi+16] - mulps xmm0, xmm6 - movlps xmm7, qword ptr [esi+ 8] - movlps xmm2, qword ptr [edi+ 8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm2, qword ptr [edi+24] - mulps xmm2, xmm7 - movlps xmm1, qword ptr [edi+32] - movhps xmm1, qword ptr [edi+48] - mulps xmm1, xmm6 - movlps xmm3, qword ptr [edi+40] - addps xmm0, xmm2 - movhps xmm3, qword ptr [edi+56] - mulps xmm3, xmm7 - movaps xmm4, xmm0 - addps xmm1, xmm3 - shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm4 - movlps xmm1, qword ptr [edi+64] - movhps xmm1, qword ptr [edi+80] - STORE4( 0, xmm0, xmm4 ) - mulps xmm1, xmm6 - movlps xmm2, qword ptr [edi+72] - movhps xmm2, qword ptr [edi+88] - mulps xmm2, xmm7 - addps xmm1, xmm2 - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm3, xmm1 - addps xmm1, xmm3 - STORE2LO( 16, xmm1, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; - mPtr += 4; - } - return; - } - } - break; } - case 5: { - switch( numRows ) { - case 5: { // 5x5 * 5x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X - movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1 - movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X - movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11 - movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1 - shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15 - movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1 - movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11 - movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1 - shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16 - movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3 - movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13 - movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3 - shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17 - movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3 - movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13 - movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X - movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9 - shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18 - movhps xmm5, [edi+14*4] // xmm6 = 18, 19, 14, 15 - shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19 - movss xmm7, [esi+0*4] - shufps xmm7, xmm7, 0 - mulps xmm0, xmm7 - movss xmm5, [esi+1*4] - shufps xmm5, xmm5, 0 - mulps xmm1, xmm5 - addps xmm0, xmm1 - movss xmm6, [esi+2*4] - shufps xmm6, xmm6, 0 - mulps xmm2, xmm6 - addps xmm0, xmm2 - movss xmm1, [esi+3*4] - shufps xmm1, xmm1, 0 - mulps xmm3, xmm1 - addps xmm0, xmm3 - movss xmm2, [esi+4*4] - shufps xmm2, xmm2, 0 - mulps xmm4, xmm2 - addps xmm0, xmm4 - mulss xmm7, [edi+20*4] - mulss xmm5, [edi+21*4] - addps xmm7, xmm5 - mulss xmm6, [edi+22*4] - addps xmm7, xmm6 - mulss xmm1, [edi+23*4] - addps xmm7, xmm1 - mulss xmm2, [edi+24*4] - addps xmm7, xmm2 - STORE4( 0, xmm0, xmm3 ) - STORE1( 16, xmm7, xmm4 ) - } - return; - } - case 6: { // 6x5 * 5x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, [esi] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) - movlps xmm7, [esi+8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movlps xmm0, [edi] - movhps xmm3, [edi+8] - movaps xmm1, [edi+16] - movlps xmm2, [edi+32] - shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6 - shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 - shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8 - mulps xmm0, xmm6 - mulps xmm3, xmm7 - movlps xmm2, [edi+40] - addps xmm0, xmm3 // xmm0 + xmm1 - movhps xmm5, [edi+40+8] - movlps xmm3, [edi+40+16] - movhps xmm3, [edi+40+24] - movlps xmm4, [edi+40+32] - shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 - shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 - shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 - mulps xmm2, xmm6 - mulps xmm5, xmm7 - addps xmm2, xmm5 // xmm2 + xmm3 - movss xmm5, [esi+16] - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm4, xmm0 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) - shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 ) - addps xmm0, xmm4 - mulps xmm1, xmm5 - addps xmm0, xmm1 - STORE4( 0, xmm0, xmm2 ) - movlps xmm4, [edi+80] - movhps xmm3, [edi+80+8] - movaps xmm1, [edi+80+16] - movlps xmm2, [edi+80+32] - shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 - shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 - shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 - mulps xmm4, xmm6 - mulps xmm3, xmm7 - mulps xmm1, xmm5 - addps xmm4, xmm3 // xmm4 + xmm1 - shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 ) - shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 ) - addps xmm4, xmm1 - shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 ) - addps xmm4, xmm1 - STORE2LO( 16, xmm4, xmm2 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; - mPtr += 5; - } - return; - } + break; + } + case 2: { + switch ( numRows ) { + case 2: { // 2x2 * 2x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + movss xmm1, [esi+4] + movss xmm2, [edi] + mulss xmm2, xmm0 + movss xmm3, [edi+4] + mulss xmm3, xmm1 + addss xmm2, xmm3 + STORE1( 0, xmm2, xmm4 ) + mulss xmm0, [edi + 8] + mulss xmm1, [edi + 8 + 4] + addss xmm0, xmm1 + STORE1( 4, xmm0, xmm4 ) } - break; + return; } - case 6: { - switch( numRows ) { - case 1: { // 1x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - mulss xmm0, [edi] - movss xmm1, [esi+4] - mulss xmm1, [edi+4] - movss xmm2, [esi+8] - addss xmm0, xmm1 - mulss xmm2, [edi+8] - movss xmm3, [esi+12] - addss xmm0, xmm2 - mulss xmm3, [edi+12] - movss xmm4, [esi+16] - addss xmm0, xmm3 - mulss xmm4, [edi+16] - movss xmm5, [esi+20] - addss xmm0, xmm4 - mulss xmm5, [edi+20] - movss xmm6, [esi+24] - addss xmm0, xmm5 - mulss xmm6, [edi+24] - addss xmm0, xmm6 - STORE1( 0, xmm0, xmm7 ) - } - return; - } - case 2: { // 2x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm0, xmm1 - addps xmm0, xmm1 - STORE2LO( 0, xmm0, xmm3 ) - } - return; - } - case 3: { // 3x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm0, xmm1 - addps xmm0, xmm1 - STORE2LO( 0, xmm0, xmm3 ) - // row 2 - movaps xmm0, [edi+48] - movaps xmm1, [edi+48+16] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - addps xmm0, xmm1 - movhlps xmm1, xmm0 - addps xmm0, xmm1 - movaps xmm1, xmm0 - shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) - addss xmm0, xmm1 - STORE1( 8, xmm0, xmm3 ) - } - return; - } - case 4: { // 4x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm7, xmm0 - movlhps xmm7, xmm2 - addps xmm7, xmm1 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm7, xmm0 - // row 2 and 3 - movaps xmm0, [edi+48] - movaps xmm1, [edi+48+16] - movaps xmm2, [edi+48+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - // last 4 additions for the first 4 rows and store result - movaps xmm0, xmm7 - shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm7 - STORE4( 0, xmm0, xmm4 ) - } - return; - } - case 5: { // 5x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm7, xmm0 - movlhps xmm7, xmm2 - addps xmm7, xmm1 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm7, xmm0 - // row 2 and 3 - movaps xmm0, [edi+48] - movaps xmm1, [edi+48+16] - movaps xmm2, [edi+48+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - // last 4 additions for the first 4 rows and store result - movaps xmm0, xmm7 - shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm7 - STORE4( 0, xmm0, xmm3 ) - // row 5 - movaps xmm0, [edi+96] - movaps xmm1, [edi+96+16] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - addps xmm0, xmm1 - movhlps xmm1, xmm0 - addps xmm0, xmm1 - movaps xmm1, xmm0 - shufps xmm1, xmm1, 0x01 - addss xmm0, xmm1 - STORE1( 16, xmm0, xmm3 ) - } - return; - } - case 6: { // 6x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm7, qword ptr [esi] - movlps xmm6, qword ptr [esi+8] - shufps xmm7, xmm7, 0x44 - shufps xmm6, xmm6, 0x44 - movlps xmm0, qword ptr [edi ] - movhps xmm0, qword ptr [edi+ 24] - mulps xmm0, xmm7 - movlps xmm3, qword ptr [edi+ 8] - movhps xmm3, qword ptr [edi+ 32] - mulps xmm3, xmm6 - movlps xmm1, qword ptr [edi+ 48] - movhps xmm1, qword ptr [edi+ 72] - mulps xmm1, xmm7 - movlps xmm2, qword ptr [edi+ 96] - movhps xmm2, qword ptr [edi+120] - mulps xmm2, xmm7 - movlps xmm4, qword ptr [edi+ 56] - movhps xmm4, qword ptr [edi+ 80] - movlps xmm5, qword ptr [edi+104] - movhps xmm5, qword ptr [edi+128] - mulps xmm4, xmm6 - movlps xmm7, qword ptr [esi+16] - addps xmm0, xmm3 - shufps xmm7, xmm7, 0x44 - mulps xmm5, xmm6 - addps xmm1, xmm4 - movlps xmm3, qword ptr [edi+ 16] - movhps xmm3, qword ptr [edi+ 40] - addps xmm2, xmm5 - movlps xmm4, qword ptr [edi+ 64] - movhps xmm4, qword ptr [edi+ 88] - mulps xmm3, xmm7 - movlps xmm5, qword ptr [edi+112] - movhps xmm5, qword ptr [edi+136] - addps xmm0, xmm3 - mulps xmm4, xmm7 - mulps xmm5, xmm7 - addps xmm1, xmm4 - addps xmm2, xmm5 - movaps xmm6, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm6, xmm1, 0xDD - movaps xmm7, xmm2 - shufps xmm7, xmm2, 0x88 - shufps xmm2, xmm2, 0xDD - addps xmm0, xmm6 - addps xmm2, xmm7 - STORE4( 0, xmm0, xmm3 ) - STORE2LO( 16, xmm2, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + - mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; - mPtr += 6; - } - return; - } + case 6: { // 6x2 * 2x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm7, [esi] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movaps xmm0, [edi] + mulps xmm0, xmm7 + movaps xmm1, [edi + 16] + mulps xmm1, xmm7 + movaps xmm2, xmm0 + shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + movaps xmm3, [edi + 32] + addps xmm0, xmm2 + mulps xmm3, xmm7 + STORE4( 0, xmm0, xmm4 ) + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm1, xmm3 + addps xmm3, xmm1 + STORE2LO( 16, xmm3, xmm4 ) } - break; + return; } default: { - int numColumns = mat.GetNumColumns(); for ( int i = 0; i < numRows; i++ ) { - float sum = mPtr[0] * vPtr[0]; - for ( int j = 1; j < numColumns; j++ ) { - sum += mPtr[j] * vPtr[j]; - } - dstPtr[i] STOREC sum; - mPtr += numColumns; + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; + mPtr += 2; } - break; + return; + } } + break; } - -#undef STOREC -#undef STORE4 -#undef STORE2HI -#undef STORE2LO -#undef STORE1 -} - -/* -============ -idSIMD_SSE::MatX_MultiplyAddVecX - - optimizes the following matrix multiplications: - - NxN * Nx1 - Nx6 * 6x1 - 6xN * Nx1 - - with N in the range [1-6] -============ -*/ -void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { -#define STORE1( offset, reg1, reg2 ) \ - __asm movss reg2, [eax+offset] \ - __asm addss reg2, reg1 \ - __asm movss [eax+offset], reg2 -#define STORE2LO( offset, reg1, reg2 ) \ - __asm movlps reg2, [eax+offset] \ - __asm addps reg2, reg1 \ - __asm movlps [eax+offset], reg2 -#define STORE2HI( offset, reg1, reg2 ) \ - __asm movhps reg2, [eax+offset] \ - __asm addps reg2, reg1 \ - __asm movhps [eax+offset], reg2 -#define STORE4( offset, reg1, reg2 ) \ - __asm movlps reg2, [eax+offset] \ - __asm movhps reg2, [eax+offset+8] \ - __asm addps reg2, reg1 \ - __asm movlps [eax+offset], reg2 \ - __asm movhps [eax+offset+8], reg2 -#define STOREC += - - int numRows; - const float *mPtr, *vPtr; - float *dstPtr; - - assert( vec.GetSize() >= mat.GetNumColumns() ); - assert( dst.GetSize() >= mat.GetNumRows() ); - - mPtr = mat.ToFloatPtr(); - vPtr = vec.ToFloatPtr(); - dstPtr = dst.ToFloatPtr(); - numRows = mat.GetNumRows(); - switch( mat.GetNumColumns() ) { - case 1: { - switch( numRows ) { - case 1: { // 1x1 * 1x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - mulss xmm0, [edi] - STORE1( 0, xmm0, xmm1 ) - } - return; - } - case 6: { // 6x1 * 1x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm1, xmm0 - mulps xmm0, [edi] - mulps xmm1, [edi+16] - STORE4( 0, xmm0, xmm2 ) - STORE2LO( 16, xmm1, xmm2 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0]; - mPtr++; - } - return; - } + case 3: { + switch ( numRows ) { + case 3: { // 3x3 * 3x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + movss xmm4, [edi] + mulss xmm4, xmm0 + movss xmm1, [esi+4] + movss xmm5, [edi+4] + mulss xmm5, xmm1 + addss xmm4, xmm5 + movss xmm2, [esi+8] + movss xmm6, [edi+8] + mulss xmm6, xmm2 + addss xmm4, xmm6 + movss xmm3, [edi+12] + mulss xmm3, xmm0 + STORE1( 0, xmm4, xmm7 ); + movss xmm5, [edi + 12 + 4] + mulss xmm5, xmm1 + addss xmm3, xmm5 + movss xmm6, [edi + 12 + 8] + mulss xmm6, xmm2 + addss xmm3, xmm6 + mulss xmm0, [edi + 24] + mulss xmm1, [edi + 24 + 4] + STORE1( 4, xmm3, xmm7 ); + addss xmm0, xmm1 + mulss xmm2, [edi + 24 + 8] + addss xmm0, xmm2 + STORE1( 8, xmm0, xmm7 ); } - break; + return; } - case 2: { - switch( numRows ) { - case 2: { // 2x2 * 2x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - movss xmm1, [esi+4] - movss xmm2, [edi] - mulss xmm2, xmm0 - movss xmm3, [edi+4] - mulss xmm3, xmm1 - addss xmm2, xmm3 - STORE1( 0, xmm2, xmm4 ) - mulss xmm0, [edi+8] - mulss xmm1, [edi+8+4] - addss xmm0, xmm1 - STORE1( 4, xmm0, xmm4 ) - } - return; - } - case 6: { // 6x2 * 2x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm7, [esi] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movaps xmm0, [edi] - mulps xmm0, xmm7 - movaps xmm1, [edi+16] - mulps xmm1, xmm7 - movaps xmm2, xmm0 - shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - movaps xmm3, [edi+32] - addps xmm0, xmm2 - mulps xmm3, xmm7 - STORE4( 0, xmm0, xmm4 ) - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm1, xmm3 - addps xmm3, xmm1 - STORE2LO( 16, xmm3, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; - mPtr += 2; - } - return; - } + case 6: { // 6x3 * 3x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm5, [esi] + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) + movss xmm6, [esi + 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + movss xmm7, [esi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 + movlps xmm1, [edi + 4 * 4] + shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 + movlps xmm2, [edi + 6 * 4] + movhps xmm2, [edi + 8 * 4] // xmm2 = 6, 7, 8, 9 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 + mulps xmm0, xmm5 + movlps xmm3, [edi + 10 * 4] + shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 + movaps xmm3, xmm1 + shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 + mulps xmm1, xmm6 + shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 + mulps xmm3, xmm7 + addps xmm0, xmm1 + addps xmm0, xmm3 + STORE4( 0, xmm0, xmm4 ) + movss xmm1, [edi + 12 * 4] + mulss xmm1, xmm5 + movss xmm2, [edi + 13 * 4] + mulss xmm2, xmm6 + movss xmm3, [edi + 14 * 4] + mulss xmm3, xmm7 + addss xmm1, xmm2 + addss xmm1, xmm3 + STORE1( 16, xmm1, xmm4 ) + mulss xmm5, [edi + 15 * 4] + mulss xmm6, [edi + 16 * 4] + mulss xmm7, [edi + 17 * 4] + addss xmm5, xmm6 + addss xmm5, xmm7 + STORE1( 20, xmm5, xmm4 ) } - break; + return; } - case 3: { - switch( numRows ) { - case 3: { // 3x3 * 3x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - movss xmm4, [edi] - mulss xmm4, xmm0 - movss xmm1, [esi+4] - movss xmm5, [edi+4] - mulss xmm5, xmm1 - addss xmm4, xmm5 - movss xmm2, [esi+8] - movss xmm6, [edi+8] - mulss xmm6, xmm2 - addss xmm4, xmm6 - movss xmm3, [edi+12] - mulss xmm3, xmm0 - STORE1( 0, xmm4, xmm7 ); - movss xmm5, [edi+12+4] - mulss xmm5, xmm1 - addss xmm3, xmm5 - movss xmm6, [edi+12+8] - mulss xmm6, xmm2 - addss xmm3, xmm6 - mulss xmm0, [edi+24] - mulss xmm1, [edi+24+4] - STORE1( 4, xmm3, xmm7 ); - addss xmm0, xmm1 - mulss xmm2, [edi+24+8] - addss xmm0, xmm2 - STORE1( 8, xmm0, xmm7 ); - } - return; - } - case 6: { // 6x3 * 3x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm5, [esi] - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm6, [esi+4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm7, [esi+8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 - movlps xmm1, [edi+4*4] - shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 - movlps xmm2, [edi+6*4] - movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 - mulps xmm0, xmm5 - movlps xmm3, [edi+10*4] - shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 - movaps xmm3, xmm1 - shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 - mulps xmm1, xmm6 - shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 - mulps xmm3, xmm7 - addps xmm0, xmm1 - addps xmm0, xmm3 - STORE4( 0, xmm0, xmm4 ) - movss xmm1, [edi+12*4] - mulss xmm1, xmm5 - movss xmm2, [edi+13*4] - mulss xmm2, xmm6 - movss xmm3, [edi+14*4] - mulss xmm3, xmm7 - addss xmm1, xmm2 - addss xmm1, xmm3 - STORE1( 16, xmm1, xmm4 ) - mulss xmm5, [edi+15*4] - mulss xmm6, [edi+16*4] - mulss xmm7, [edi+17*4] - addss xmm5, xmm6 - addss xmm5, xmm7 - STORE1( 20, xmm5, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; - mPtr += 3; - } - return; - } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; + mPtr += 3; } - break; + return; } - case 4: { - switch( numRows ) { - case 4: { // 4x4 * 4x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, qword ptr [esi ] - movlps xmm0, qword ptr [edi ] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm0, qword ptr [edi+16] - mulps xmm0, xmm6 - movlps xmm7, qword ptr [esi+ 8] - movlps xmm2, qword ptr [edi+ 8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm2, qword ptr [edi+24] - mulps xmm2, xmm7 - movlps xmm1, qword ptr [edi+32] - movhps xmm1, qword ptr [edi+48] - mulps xmm1, xmm6 - movlps xmm3, qword ptr [edi+40] - addps xmm0, xmm2 - movhps xmm3, qword ptr [edi+56] - mulps xmm3, xmm7 - movaps xmm4, xmm0 - addps xmm1, xmm3 - shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm4 - STORE4( 0, xmm0, xmm2 ) - } - return; - } - case 6: { // 6x4 * 4x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, qword ptr [esi+ 0] - movlps xmm0, qword ptr [edi+ 0] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm0, qword ptr [edi+16] - mulps xmm0, xmm6 - movlps xmm7, qword ptr [esi+ 8] - movlps xmm2, qword ptr [edi+ 8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm2, qword ptr [edi+24] - mulps xmm2, xmm7 - movlps xmm1, qword ptr [edi+32] - movhps xmm1, qword ptr [edi+48] - mulps xmm1, xmm6 - movlps xmm3, qword ptr [edi+40] - addps xmm0, xmm2 - movhps xmm3, qword ptr [edi+56] - mulps xmm3, xmm7 - movaps xmm4, xmm0 - addps xmm1, xmm3 - shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm4 - movlps xmm1, qword ptr [edi+64] - movhps xmm1, qword ptr [edi+80] - STORE4( 0, xmm0, xmm4 ) - mulps xmm1, xmm6 - movlps xmm2, qword ptr [edi+72] - movhps xmm2, qword ptr [edi+88] - mulps xmm2, xmm7 - addps xmm1, xmm2 - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm3, xmm1 - addps xmm1, xmm3 - STORE2LO( 16, xmm1, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; - mPtr += 4; - } - return; - } + } + break; + } + case 4: { + switch ( numRows ) { + case 4: { // 4x4 * 4x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, qword ptr [esi ] + movlps xmm0, qword ptr [edi ] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm0, qword ptr [edi + 16] + mulps xmm0, xmm6 + movlps xmm7, qword ptr [esi + 8] + movlps xmm2, qword ptr [edi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm2, qword ptr [edi + 24] + mulps xmm2, xmm7 + movlps xmm1, qword ptr [edi + 32] + movhps xmm1, qword ptr [edi + 48] + mulps xmm1, xmm6 + movlps xmm3, qword ptr [edi + 40] + addps xmm0, xmm2 + movhps xmm3, qword ptr [edi + 56] + mulps xmm3, xmm7 + movaps xmm4, xmm0 + addps xmm1, xmm3 + shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm4 + STORE4( 0, xmm0, xmm2 ) } - break; + return; } - case 5: { - switch( numRows ) { - case 5: { // 5x5 * 5x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X - movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1 - movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X - movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11 - movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1 - shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15 - movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1 - movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11 - movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1 - shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16 - movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3 - movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13 - movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3 - shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17 - movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3 - movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13 - movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X - movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9 - shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18 - movhps xmm5, [edi+14*4] // xmm6 = 18, 19, 14, 15 - shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19 - movss xmm7, [esi+0*4] - shufps xmm7, xmm7, 0 - mulps xmm0, xmm7 - movss xmm5, [esi+1*4] - shufps xmm5, xmm5, 0 - mulps xmm1, xmm5 - addps xmm0, xmm1 - movss xmm6, [esi+2*4] - shufps xmm6, xmm6, 0 - mulps xmm2, xmm6 - addps xmm0, xmm2 - movss xmm1, [esi+3*4] - shufps xmm1, xmm1, 0 - mulps xmm3, xmm1 - addps xmm0, xmm3 - movss xmm2, [esi+4*4] - shufps xmm2, xmm2, 0 - mulps xmm4, xmm2 - addps xmm0, xmm4 - mulss xmm7, [edi+20*4] - mulss xmm5, [edi+21*4] - addps xmm7, xmm5 - mulss xmm6, [edi+22*4] - addps xmm7, xmm6 - mulss xmm1, [edi+23*4] - addps xmm7, xmm1 - mulss xmm2, [edi+24*4] - addps xmm7, xmm2 - STORE4( 0, xmm0, xmm3 ) - STORE1( 16, xmm7, xmm4 ) - } - return; - } - case 6: { // 6x5 * 5x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, [esi] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) - movlps xmm7, [esi+8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movlps xmm0, [edi] - movhps xmm3, [edi+8] - movaps xmm1, [edi+16] - movlps xmm2, [edi+32] - shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6 - shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 - shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8 - mulps xmm0, xmm6 - mulps xmm3, xmm7 - movlps xmm2, [edi+40] - addps xmm0, xmm3 // xmm0 + xmm1 - movhps xmm5, [edi+40+8] - movlps xmm3, [edi+40+16] - movhps xmm3, [edi+40+24] - movlps xmm4, [edi+40+32] - shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 - shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 - shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 - mulps xmm2, xmm6 - mulps xmm5, xmm7 - addps xmm2, xmm5 // xmm2 + xmm3 - movss xmm5, [esi+16] - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm4, xmm0 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) - shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 ) - addps xmm0, xmm4 - mulps xmm1, xmm5 - addps xmm0, xmm1 - STORE4( 0, xmm0, xmm2 ) - movlps xmm4, [edi+80] - movhps xmm3, [edi+80+8] - movaps xmm1, [edi+80+16] - movlps xmm2, [edi+80+32] - shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 - shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 - shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 - mulps xmm4, xmm6 - mulps xmm3, xmm7 - mulps xmm1, xmm5 - addps xmm4, xmm3 // xmm4 + xmm1 - shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 ) - shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 ) - addps xmm4, xmm1 - shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 ) - addps xmm4, xmm1 - STORE2LO( 16, xmm4, xmm2 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; - mPtr += 5; - } - return; - } + case 6: { // 6x4 * 4x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, qword ptr [esi+ 0] + movlps xmm0, qword ptr [edi+ 0] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm0, qword ptr [edi + 16] + mulps xmm0, xmm6 + movlps xmm7, qword ptr [esi + 8] + movlps xmm2, qword ptr [edi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm2, qword ptr [edi + 24] + mulps xmm2, xmm7 + movlps xmm1, qword ptr [edi + 32] + movhps xmm1, qword ptr [edi + 48] + mulps xmm1, xmm6 + movlps xmm3, qword ptr [edi + 40] + addps xmm0, xmm2 + movhps xmm3, qword ptr [edi + 56] + mulps xmm3, xmm7 + movaps xmm4, xmm0 + addps xmm1, xmm3 + shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm4 + movlps xmm1, qword ptr [edi + 64] + movhps xmm1, qword ptr [edi + 80] + STORE4( 0, xmm0, xmm4 ) + mulps xmm1, xmm6 + movlps xmm2, qword ptr [edi + 72] + movhps xmm2, qword ptr [edi + 88] + mulps xmm2, xmm7 + addps xmm1, xmm2 + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm3, xmm1 + addps xmm1, xmm3 + STORE2LO( 16, xmm1, xmm4 ) } - break; + return; } - case 6: { - switch( numRows ) { - case 1: { // 1x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - mulss xmm0, [edi] - movss xmm1, [esi+4] - mulss xmm1, [edi+4] - movss xmm2, [esi+8] - addss xmm0, xmm1 - mulss xmm2, [edi+8] - movss xmm3, [esi+12] - addss xmm0, xmm2 - mulss xmm3, [edi+12] - movss xmm4, [esi+16] - addss xmm0, xmm3 - mulss xmm4, [edi+16] - movss xmm5, [esi+20] - addss xmm0, xmm4 - mulss xmm5, [edi+20] - movss xmm6, [esi+24] - addss xmm0, xmm5 - mulss xmm6, [edi+24] - addss xmm0, xmm6 - STORE1( 0, xmm0, xmm7 ) - } - return; - } - case 2: { // 2x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm0, xmm1 - addps xmm0, xmm1 - STORE2LO( 0, xmm0, xmm3 ) - } - return; - } - case 3: { // 3x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm0, xmm1 - addps xmm0, xmm1 - STORE2LO( 0, xmm0, xmm3 ) - // row 2 - movaps xmm0, [edi+48] - movaps xmm1, [edi+48+16] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - addps xmm0, xmm1 - movhlps xmm1, xmm0 - addps xmm0, xmm1 - movaps xmm1, xmm0 - shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) - addss xmm0, xmm1 - STORE1( 8, xmm0, xmm3 ) - } - return; - } - case 4: { // 4x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm7, xmm0 - movlhps xmm7, xmm2 - addps xmm7, xmm1 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm7, xmm0 - // row 2 and 3 - movaps xmm0, [edi+48] - movaps xmm1, [edi+48+16] - movaps xmm2, [edi+48+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - // last 4 additions for the first 4 rows and store result - movaps xmm0, xmm7 - shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm7 - STORE4( 0, xmm0, xmm4 ) - } - return; - } - case 5: { // 5x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm7, xmm0 - movlhps xmm7, xmm2 - addps xmm7, xmm1 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm7, xmm0 - // row 2 and 3 - movaps xmm0, [edi+48] - movaps xmm1, [edi+48+16] - movaps xmm2, [edi+48+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - // last 4 additions for the first 4 rows and store result - movaps xmm0, xmm7 - shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm7 - STORE4( 0, xmm0, xmm3 ) - // row 5 - movaps xmm0, [edi+96] - movaps xmm1, [edi+96+16] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - addps xmm0, xmm1 - movhlps xmm1, xmm0 - addps xmm0, xmm1 - movaps xmm1, xmm0 - shufps xmm1, xmm1, 0x01 - addss xmm0, xmm1 - STORE1( 16, xmm0, xmm3 ) - } - return; - } - case 6: { // 6x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm7, qword ptr [esi] - movlps xmm6, qword ptr [esi+8] - shufps xmm7, xmm7, 0x44 - shufps xmm6, xmm6, 0x44 - movlps xmm0, qword ptr [edi ] - movhps xmm0, qword ptr [edi+ 24] - mulps xmm0, xmm7 - movlps xmm3, qword ptr [edi+ 8] - movhps xmm3, qword ptr [edi+ 32] - mulps xmm3, xmm6 - movlps xmm1, qword ptr [edi+ 48] - movhps xmm1, qword ptr [edi+ 72] - mulps xmm1, xmm7 - movlps xmm2, qword ptr [edi+ 96] - movhps xmm2, qword ptr [edi+120] - mulps xmm2, xmm7 - movlps xmm4, qword ptr [edi+ 56] - movhps xmm4, qword ptr [edi+ 80] - movlps xmm5, qword ptr [edi+104] - movhps xmm5, qword ptr [edi+128] - mulps xmm4, xmm6 - movlps xmm7, qword ptr [esi+16] - addps xmm0, xmm3 - shufps xmm7, xmm7, 0x44 - mulps xmm5, xmm6 - addps xmm1, xmm4 - movlps xmm3, qword ptr [edi+ 16] - movhps xmm3, qword ptr [edi+ 40] - addps xmm2, xmm5 - movlps xmm4, qword ptr [edi+ 64] - movhps xmm4, qword ptr [edi+ 88] - mulps xmm3, xmm7 - movlps xmm5, qword ptr [edi+112] - movhps xmm5, qword ptr [edi+136] - addps xmm0, xmm3 - mulps xmm4, xmm7 - mulps xmm5, xmm7 - addps xmm1, xmm4 - addps xmm2, xmm5 - movaps xmm6, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm6, xmm1, 0xDD - movaps xmm7, xmm2 - shufps xmm7, xmm2, 0x88 - shufps xmm2, xmm2, 0xDD - addps xmm0, xmm6 - addps xmm2, xmm7 - STORE4( 0, xmm0, xmm3 ) - STORE2LO( 16, xmm2, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + - mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; - mPtr += 6; - } - return; - } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; + mPtr += 4; } - break; + return; + } + } + break; + } + case 5: { + switch ( numRows ) { + case 5: { // 5x5 * 5x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X + movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1 + movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X + movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11 + movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1 + shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15 + movlps xmm1, [edi + 6 * 4] // xmm1 = 6, 7, 0, 1 + movlps xmm5, [edi + 16 * 4] // xmm5 = 16, 17, 10, 11 + movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1 + shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16 + movhps xmm2, [edi + 2 * 4] // xmm2 = 6, 7, 2, 3 + movhps xmm5, [edi + 12 * 4] // xmm5 = 16, 17, 12, 13 + movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3 + shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17 + movlps xmm3, [edi + 8 * 4] // xmm3 = 8, 9, 2, 3 + movlps xmm5, [edi + 18 * 4] // xmm5 = 18, 19, 12, 13 + movss xmm4, [edi + 4 * 4] // xmm4 = 4, X, X, X + movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9 + shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18 + movhps xmm5, [edi + 14 * 4] // xmm6 = 18, 19, 14, 15 + shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19 + movss xmm7, [esi + 0 * 4] + shufps xmm7, xmm7, 0 + mulps xmm0, xmm7 + movss xmm5, [esi + 1 * 4] + shufps xmm5, xmm5, 0 + mulps xmm1, xmm5 + addps xmm0, xmm1 + movss xmm6, [esi + 2 * 4] + shufps xmm6, xmm6, 0 + mulps xmm2, xmm6 + addps xmm0, xmm2 + movss xmm1, [esi + 3 * 4] + shufps xmm1, xmm1, 0 + mulps xmm3, xmm1 + addps xmm0, xmm3 + movss xmm2, [esi + 4 * 4] + shufps xmm2, xmm2, 0 + mulps xmm4, xmm2 + addps xmm0, xmm4 + mulss xmm7, [edi + 20 * 4] + mulss xmm5, [edi + 21 * 4] + addps xmm7, xmm5 + mulss xmm6, [edi + 22 * 4] + addps xmm7, xmm6 + mulss xmm1, [edi + 23 * 4] + addps xmm7, xmm1 + mulss xmm2, [edi + 24 * 4] + addps xmm7, xmm2 + STORE4( 0, xmm0, xmm3 ) + STORE1( 16, xmm7, xmm4 ) + } + return; + } + case 6: { // 6x5 * 5x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, [esi] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) + movlps xmm7, [esi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movlps xmm0, [edi] + movhps xmm3, [edi + 8] + movaps xmm1, [edi + 16] + movlps xmm2, [edi + 32] + shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6 + shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 + shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8 + mulps xmm0, xmm6 + mulps xmm3, xmm7 + movlps xmm2, [edi + 40] + addps xmm0, xmm3 // xmm0 + xmm1 + movhps xmm5, [edi + 40 + 8] + movlps xmm3, [edi + 40 + 16] + movhps xmm3, [edi + 40 + 24] + movlps xmm4, [edi + 40 + 32] + shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 + shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 + shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 + mulps xmm2, xmm6 + mulps xmm5, xmm7 + addps xmm2, xmm5 // xmm2 + xmm3 + movss xmm5, [esi + 16] + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm4, xmm0 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) + shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 ) + addps xmm0, xmm4 + mulps xmm1, xmm5 + addps xmm0, xmm1 + STORE4( 0, xmm0, xmm2 ) + movlps xmm4, [edi + 80] + movhps xmm3, [edi + 80 + 8] + movaps xmm1, [edi + 80 + 16] + movlps xmm2, [edi + 80 + 32] + shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 + shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 + shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 + mulps xmm4, xmm6 + mulps xmm3, xmm7 + mulps xmm1, xmm5 + addps xmm4, xmm3 // xmm4 + xmm1 + shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 ) + shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 ) + addps xmm4, xmm1 + shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 ) + addps xmm4, xmm1 + STORE2LO( 16, xmm4, xmm2 ) + } + return; } default: { - int numColumns = mat.GetNumColumns(); for ( int i = 0; i < numRows; i++ ) { - float sum = mPtr[0] * vPtr[0]; - for ( int j = 1; j < numColumns; j++ ) { - sum += mPtr[j] * vPtr[j]; - } - dstPtr[i] STOREC sum; - mPtr += numColumns; + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; + mPtr += 5; } - break; + return; + } + } + break; + } + case 6: { + switch ( numRows ) { + case 1: { // 1x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + mulss xmm0, [edi] + movss xmm1, [esi+4] + mulss xmm1, [edi+4] + movss xmm2, [esi+8] + addss xmm0, xmm1 + mulss xmm2, [edi+8] + movss xmm3, [esi+12] + addss xmm0, xmm2 + mulss xmm3, [edi+12] + movss xmm4, [esi+16] + addss xmm0, xmm3 + mulss xmm4, [edi+16] + movss xmm5, [esi+20] + addss xmm0, xmm4 + mulss xmm5, [edi+20] + movss xmm6, [esi+24] + addss xmm0, xmm5 + mulss xmm6, [edi+24] + addss xmm0, xmm6 + STORE1( 0, xmm0, xmm7 ) + } + return; + } + case 2: { // 2x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm0, xmm1 + addps xmm0, xmm1 + STORE2LO( 0, xmm0, xmm3 ) + } + return; + } + case 3: { // 3x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm0, xmm1 + addps xmm0, xmm1 + STORE2LO( 0, xmm0, xmm3 ) + // row 2 + movaps xmm0, [edi + 48] + movaps xmm1, [edi + 48 + 16] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + addps xmm0, xmm1 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movaps xmm1, xmm0 + shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) + addss xmm0, xmm1 + STORE1( 8, xmm0, xmm3 ) + } + return; + } + case 4: { // 4x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm7, xmm0 + movlhps xmm7, xmm2 + addps xmm7, xmm1 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm7, xmm0 + // row 2 and 3 + movaps xmm0, [edi + 48] + movaps xmm1, [edi + 48 + 16] + movaps xmm2, [edi + 48 + 32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + // last 4 additions for the first 4 rows and store result + movaps xmm0, xmm7 + shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm7 + STORE4( 0, xmm0, xmm4 ) + } + return; + } + case 5: { // 5x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm7, xmm0 + movlhps xmm7, xmm2 + addps xmm7, xmm1 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm7, xmm0 + // row 2 and 3 + movaps xmm0, [edi + 48] + movaps xmm1, [edi + 48 + 16] + movaps xmm2, [edi + 48 + 32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + // last 4 additions for the first 4 rows and store result + movaps xmm0, xmm7 + shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm7 + STORE4( 0, xmm0, xmm3 ) + // row 5 + movaps xmm0, [edi + 96] + movaps xmm1, [edi + 96 + 16] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + addps xmm0, xmm1 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movaps xmm1, xmm0 + shufps xmm1, xmm1, 0x01 + addss xmm0, xmm1 + STORE1( 16, xmm0, xmm3 ) + } + return; + } + case 6: { // 6x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm7, qword ptr [esi] + movlps xmm6, qword ptr [esi+8] + shufps xmm7, xmm7, 0x44 + shufps xmm6, xmm6, 0x44 + movlps xmm0, qword ptr [edi ] + movhps xmm0, qword ptr [edi+ 24] + mulps xmm0, xmm7 + movlps xmm3, qword ptr [edi+ 8] + movhps xmm3, qword ptr [edi+ 32] + mulps xmm3, xmm6 + movlps xmm1, qword ptr [edi+ 48] + movhps xmm1, qword ptr [edi+ 72] + mulps xmm1, xmm7 + movlps xmm2, qword ptr [edi+ 96] + movhps xmm2, qword ptr [edi+120] + mulps xmm2, xmm7 + movlps xmm4, qword ptr [edi+ 56] + movhps xmm4, qword ptr [edi+ 80] + movlps xmm5, qword ptr [edi+104] + movhps xmm5, qword ptr [edi+128] + mulps xmm4, xmm6 + movlps xmm7, qword ptr [esi+16] + addps xmm0, xmm3 + shufps xmm7, xmm7, 0x44 + mulps xmm5, xmm6 + addps xmm1, xmm4 + movlps xmm3, qword ptr [edi+ 16] + movhps xmm3, qword ptr [edi+ 40] + addps xmm2, xmm5 + movlps xmm4, qword ptr [edi+ 64] + movhps xmm4, qword ptr [edi+ 88] + mulps xmm3, xmm7 + movlps xmm5, qword ptr [edi+112] + movhps xmm5, qword ptr [edi+136] + addps xmm0, xmm3 + mulps xmm4, xmm7 + mulps xmm5, xmm7 + addps xmm1, xmm4 + addps xmm2, xmm5 + movaps xmm6, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm6, xmm1, 0xDD + movaps xmm7, xmm2 + shufps xmm7, xmm2, 0x88 + shufps xmm2, xmm2, 0xDD + addps xmm0, xmm6 + addps xmm2, xmm7 + STORE4( 0, xmm0, xmm3 ) + STORE2LO( 16, xmm2, xmm4 ) + } + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; + mPtr += 6; + } + return; + } + } + break; + } + default: { + int numColumns = mat.GetNumColumns(); + for ( int i = 0; i < numRows; i++ ) { + float sum = mPtr[0] * vPtr[0]; + for ( int j = 1; j < numColumns; j++ ) { + sum += mPtr[j] * vPtr[j]; + } + dstPtr[i] STOREC sum; + mPtr += numColumns; } + break; + } } #undef STOREC @@ -5866,7 +5216,7 @@ void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, co /* ============ -idSIMD_SSE::MatX_MultiplySubVecX +idSIMD_SSE::MatX_MultiplyAddVecX optimizes the following matrix multiplications: @@ -5877,26 +5227,26 @@ idSIMD_SSE::MatX_MultiplySubVecX with N in the range [1-6] ============ */ -void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { +void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss reg2, [eax+offset] \ - __asm subss reg2, reg1 \ + __asm addss reg2, reg1 \ __asm movss [eax+offset], reg2 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ - __asm subps reg2, reg1 \ + __asm addps reg2, reg1 \ __asm movlps [eax+offset], reg2 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps reg2, [eax+offset] \ - __asm subps reg2, reg1 \ + __asm addps reg2, reg1 \ __asm movhps [eax+offset], reg2 #define STORE4( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm movhps reg2, [eax+offset+8] \ - __asm subps reg2, reg1 \ + __asm addps reg2, reg1 \ __asm movlps [eax+offset], reg2 \ __asm movhps [eax+offset+8], reg2 -#define STOREC -= +#define STOREC += int numRows; const float *mPtr, *vPtr; @@ -5909,1817 +5259,698 @@ void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, co vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numRows = mat.GetNumRows(); - switch( mat.GetNumColumns() ) { - case 1: { - switch( numRows ) { - case 1: { // 1x1 * 1x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - mulss xmm0, [edi] - STORE1( 0, xmm0, xmm1 ) - } - return; - } - case 6: { // 6x1 * 1x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm1, xmm0 - mulps xmm0, [edi] - mulps xmm1, [edi+16] - STORE4( 0, xmm0, xmm2 ) - STORE2LO( 16, xmm1, xmm2 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0]; - mPtr++; - } - return; - } + switch ( mat.GetNumColumns() ) { + case 1: { + switch ( numRows ) { + case 1: { // 1x1 * 1x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + mulss xmm0, [edi] + STORE1( 0, xmm0, xmm1 ) } - break; + return; } - case 2: { - switch( numRows ) { - case 2: { // 2x2 * 2x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - movss xmm1, [esi+4] - movss xmm2, [edi] - mulss xmm2, xmm0 - movss xmm3, [edi+4] - mulss xmm3, xmm1 - addss xmm2, xmm3 - STORE1( 0, xmm2, xmm4 ) - mulss xmm0, [edi+8] - mulss xmm1, [edi+8+4] - addss xmm0, xmm1 - STORE1( 4, xmm0, xmm4 ) - } - return; - } - case 6: { // 6x2 * 2x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm7, [esi] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movaps xmm0, [edi] - mulps xmm0, xmm7 - movaps xmm1, [edi+16] - mulps xmm1, xmm7 - movaps xmm2, xmm0 - shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - movaps xmm3, [edi+32] - addps xmm0, xmm2 - mulps xmm3, xmm7 - STORE4( 0, xmm0, xmm4 ) - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm1, xmm3 - addps xmm3, xmm1 - STORE2LO( 16, xmm3, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; - mPtr += 2; - } - return; - } + case 6: { // 6x1 * 1x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm1, xmm0 + mulps xmm0, [edi] + mulps xmm1, [edi + 16] + STORE4( 0, xmm0, xmm2 ) + STORE2LO( 16, xmm1, xmm2 ) } - break; + return; } - case 3: { - switch( numRows ) { - case 3: { // 3x3 * 3x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - movss xmm4, [edi] - mulss xmm4, xmm0 - movss xmm1, [esi+4] - movss xmm5, [edi+4] - mulss xmm5, xmm1 - addss xmm4, xmm5 - movss xmm2, [esi+8] - movss xmm6, [edi+8] - mulss xmm6, xmm2 - addss xmm4, xmm6 - movss xmm3, [edi+12] - mulss xmm3, xmm0 - STORE1( 0, xmm4, xmm7 ); - movss xmm5, [edi+12+4] - mulss xmm5, xmm1 - addss xmm3, xmm5 - movss xmm6, [edi+12+8] - mulss xmm6, xmm2 - addss xmm3, xmm6 - mulss xmm0, [edi+24] - mulss xmm1, [edi+24+4] - STORE1( 4, xmm3, xmm7 ); - addss xmm0, xmm1 - mulss xmm2, [edi+24+8] - addss xmm0, xmm2 - STORE1( 8, xmm0, xmm7 ); - } - return; - } - case 6: { // 6x3 * 3x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm5, [esi] - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm6, [esi+4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm7, [esi+8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 - movlps xmm1, [edi+4*4] - shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 - movlps xmm2, [edi+6*4] - movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 - mulps xmm0, xmm5 - movlps xmm3, [edi+10*4] - shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 - movaps xmm3, xmm1 - shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 - mulps xmm1, xmm6 - shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 - mulps xmm3, xmm7 - addps xmm0, xmm1 - addps xmm0, xmm3 - STORE4( 0, xmm0, xmm4 ) - movss xmm1, [edi+12*4] - mulss xmm1, xmm5 - movss xmm2, [edi+13*4] - mulss xmm2, xmm6 - movss xmm3, [edi+14*4] - mulss xmm3, xmm7 - addss xmm1, xmm2 - addss xmm1, xmm3 - STORE1( 16, xmm1, xmm4 ) - mulss xmm5, [edi+15*4] - mulss xmm6, [edi+16*4] - mulss xmm7, [edi+17*4] - addss xmm5, xmm6 - addss xmm5, xmm7 - STORE1( 20, xmm5, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; - mPtr += 3; - } - return; - } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0]; + mPtr++; } - break; + return; } - case 4: { - switch( numRows ) { - case 4: { // 4x4 * 4x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, qword ptr [esi ] - movlps xmm0, qword ptr [edi ] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm0, qword ptr [edi+16] - mulps xmm0, xmm6 - movlps xmm7, qword ptr [esi+ 8] - movlps xmm2, qword ptr [edi+ 8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm2, qword ptr [edi+24] - mulps xmm2, xmm7 - movlps xmm1, qword ptr [edi+32] - movhps xmm1, qword ptr [edi+48] - mulps xmm1, xmm6 - movlps xmm3, qword ptr [edi+40] - addps xmm0, xmm2 - movhps xmm3, qword ptr [edi+56] - mulps xmm3, xmm7 - movaps xmm4, xmm0 - addps xmm1, xmm3 - shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm4 - STORE4( 0, xmm0, xmm2 ) - } - return; - } - case 6: { // 6x4 * 4x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, qword ptr [esi+ 0] - movlps xmm0, qword ptr [edi+ 0] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm0, qword ptr [edi+16] - mulps xmm0, xmm6 - movlps xmm7, qword ptr [esi+ 8] - movlps xmm2, qword ptr [edi+ 8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movhps xmm2, qword ptr [edi+24] - mulps xmm2, xmm7 - movlps xmm1, qword ptr [edi+32] - movhps xmm1, qword ptr [edi+48] - mulps xmm1, xmm6 - movlps xmm3, qword ptr [edi+40] - addps xmm0, xmm2 - movhps xmm3, qword ptr [edi+56] - mulps xmm3, xmm7 - movaps xmm4, xmm0 - addps xmm1, xmm3 - shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm4 - movlps xmm1, qword ptr [edi+64] - movhps xmm1, qword ptr [edi+80] - STORE4( 0, xmm0, xmm4 ) - mulps xmm1, xmm6 - movlps xmm2, qword ptr [edi+72] - movhps xmm2, qword ptr [edi+88] - mulps xmm2, xmm7 - addps xmm1, xmm2 - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm3, xmm1 - addps xmm1, xmm3 - STORE2LO( 16, xmm1, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; - mPtr += 4; - } - return; - } - } - break; } - case 5: { - switch( numRows ) { - case 5: { // 5x5 * 5x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X - movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1 - movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X - movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11 - movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1 - shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15 - movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1 - movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11 - movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1 - shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16 - movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3 - movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13 - movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3 - shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17 - movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3 - movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13 - movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X - movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9 - shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18 - movhps xmm5, [edi+14*4] // xmm6 = 18, 19, 14, 15 - shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19 - movss xmm7, [esi+0*4] - shufps xmm7, xmm7, 0 - mulps xmm0, xmm7 - movss xmm5, [esi+1*4] - shufps xmm5, xmm5, 0 - mulps xmm1, xmm5 - addps xmm0, xmm1 - movss xmm6, [esi+2*4] - shufps xmm6, xmm6, 0 - mulps xmm2, xmm6 - addps xmm0, xmm2 - movss xmm1, [esi+3*4] - shufps xmm1, xmm1, 0 - mulps xmm3, xmm1 - addps xmm0, xmm3 - movss xmm2, [esi+4*4] - shufps xmm2, xmm2, 0 - mulps xmm4, xmm2 - addps xmm0, xmm4 - mulss xmm7, [edi+20*4] - mulss xmm5, [edi+21*4] - addps xmm7, xmm5 - mulss xmm6, [edi+22*4] - addps xmm7, xmm6 - mulss xmm1, [edi+23*4] - addps xmm7, xmm1 - mulss xmm2, [edi+24*4] - addps xmm7, xmm2 - STORE4( 0, xmm0, xmm3 ) - STORE1( 16, xmm7, xmm4 ) - } - return; - } - case 6: { // 6x5 * 5x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, [esi] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) - movlps xmm7, [esi+8] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) - movlps xmm0, [edi] - movhps xmm3, [edi+8] - movaps xmm1, [edi+16] - movlps xmm2, [edi+32] - shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6 - shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 - shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8 - mulps xmm0, xmm6 - mulps xmm3, xmm7 - movlps xmm2, [edi+40] - addps xmm0, xmm3 // xmm0 + xmm1 - movhps xmm5, [edi+40+8] - movlps xmm3, [edi+40+16] - movhps xmm3, [edi+40+24] - movlps xmm4, [edi+40+32] - shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 - shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 - shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 - mulps xmm2, xmm6 - mulps xmm5, xmm7 - addps xmm2, xmm5 // xmm2 + xmm3 - movss xmm5, [esi+16] - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm4, xmm0 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) - shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 ) - addps xmm0, xmm4 - mulps xmm1, xmm5 - addps xmm0, xmm1 - STORE4( 0, xmm0, xmm2 ) - movlps xmm4, [edi+80] - movhps xmm3, [edi+80+8] - movaps xmm1, [edi+80+16] - movlps xmm2, [edi+80+32] - shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 - shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 - shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 - mulps xmm4, xmm6 - mulps xmm3, xmm7 - mulps xmm1, xmm5 - addps xmm4, xmm3 // xmm4 + xmm1 - shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 ) - shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 ) - addps xmm4, xmm1 - shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 ) - addps xmm4, xmm1 - STORE2LO( 16, xmm4, xmm2 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; - mPtr += 5; - } - return; - } + break; + } + case 2: { + switch ( numRows ) { + case 2: { // 2x2 * 2x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + movss xmm1, [esi+4] + movss xmm2, [edi] + mulss xmm2, xmm0 + movss xmm3, [edi+4] + mulss xmm3, xmm1 + addss xmm2, xmm3 + STORE1( 0, xmm2, xmm4 ) + mulss xmm0, [edi + 8] + mulss xmm1, [edi + 8 + 4] + addss xmm0, xmm1 + STORE1( 4, xmm0, xmm4 ) } - break; + return; } - case 6: { - switch( numRows ) { - case 1: { // 1x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - mulss xmm0, [edi] - movss xmm1, [esi+4] - mulss xmm1, [edi+4] - movss xmm2, [esi+8] - addss xmm0, xmm1 - mulss xmm2, [edi+8] - movss xmm3, [esi+12] - addss xmm0, xmm2 - mulss xmm3, [edi+12] - movss xmm4, [esi+16] - addss xmm0, xmm3 - mulss xmm4, [edi+16] - movss xmm5, [esi+20] - addss xmm0, xmm4 - mulss xmm5, [edi+20] - movss xmm6, [esi+24] - addss xmm0, xmm5 - mulss xmm6, [edi+24] - addss xmm0, xmm6 - STORE1( 0, xmm0, xmm7 ) - } - return; - } - case 2: { // 2x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm0, xmm1 - addps xmm0, xmm1 - STORE2LO( 0, xmm0, xmm3 ) - } - return; - } - case 3: { // 3x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) - movhlps xmm0, xmm1 - addps xmm0, xmm1 - STORE2LO( 0, xmm0, xmm3 ) - // row 2 - movaps xmm0, [edi+48] - movaps xmm1, [edi+48+16] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - addps xmm0, xmm1 - movhlps xmm1, xmm0 - addps xmm0, xmm1 - movaps xmm1, xmm0 - shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) - addss xmm0, xmm1 - STORE1( 8, xmm0, xmm3 ) - } - return; - } - case 4: { // 4x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm7, xmm0 - movlhps xmm7, xmm2 - addps xmm7, xmm1 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm7, xmm0 - // row 2 and 3 - movaps xmm0, [edi+48] - movaps xmm1, [edi+48+16] - movaps xmm2, [edi+48+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - // last 4 additions for the first 4 rows and store result - movaps xmm0, xmm7 - shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm7 - STORE4( 0, xmm0, xmm4 ) - } - return; - } - case 5: { // 5x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - // load idVecX - movlps xmm4, [esi] - movhps xmm4, [esi+8] - movlps xmm5, [esi+16] - movlhps xmm5, xmm4 - movhlps xmm6, xmm4 - movlhps xmm6, xmm5 - // row 0 and 1 - movaps xmm0, [edi] - movaps xmm1, [edi+16] - movaps xmm2, [edi+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm7, xmm0 - movlhps xmm7, xmm2 - addps xmm7, xmm1 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm7, xmm0 - // row 2 and 3 - movaps xmm0, [edi+48] - movaps xmm1, [edi+48+16] - movaps xmm2, [edi+48+32] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - mulps xmm2, xmm6 - movhlps xmm3, xmm0 - movlhps xmm3, xmm2 - addps xmm1, xmm3 - shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) - addps xmm1, xmm0 - // last 4 additions for the first 4 rows and store result - movaps xmm0, xmm7 - shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) - shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) - addps xmm0, xmm7 - STORE4( 0, xmm0, xmm3 ) - // row 5 - movaps xmm0, [edi+96] - movaps xmm1, [edi+96+16] - mulps xmm0, xmm4 - mulps xmm1, xmm5 - addps xmm0, xmm1 - movhlps xmm1, xmm0 - addps xmm0, xmm1 - movaps xmm1, xmm0 - shufps xmm1, xmm1, 0x01 - addss xmm0, xmm1 - STORE1( 16, xmm0, xmm3 ) - } - return; - } - case 6: { // 6x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm7, qword ptr [esi] - movlps xmm6, qword ptr [esi+8] - shufps xmm7, xmm7, 0x44 - shufps xmm6, xmm6, 0x44 - movlps xmm0, qword ptr [edi ] - movhps xmm0, qword ptr [edi+ 24] - mulps xmm0, xmm7 - movlps xmm3, qword ptr [edi+ 8] - movhps xmm3, qword ptr [edi+ 32] - mulps xmm3, xmm6 - movlps xmm1, qword ptr [edi+ 48] - movhps xmm1, qword ptr [edi+ 72] - mulps xmm1, xmm7 - movlps xmm2, qword ptr [edi+ 96] - movhps xmm2, qword ptr [edi+120] - mulps xmm2, xmm7 - movlps xmm4, qword ptr [edi+ 56] - movhps xmm4, qword ptr [edi+ 80] - movlps xmm5, qword ptr [edi+104] - movhps xmm5, qword ptr [edi+128] - mulps xmm4, xmm6 - movlps xmm7, qword ptr [esi+16] - addps xmm0, xmm3 - shufps xmm7, xmm7, 0x44 - mulps xmm5, xmm6 - addps xmm1, xmm4 - movlps xmm3, qword ptr [edi+ 16] - movhps xmm3, qword ptr [edi+ 40] - addps xmm2, xmm5 - movlps xmm4, qword ptr [edi+ 64] - movhps xmm4, qword ptr [edi+ 88] - mulps xmm3, xmm7 - movlps xmm5, qword ptr [edi+112] - movhps xmm5, qword ptr [edi+136] - addps xmm0, xmm3 - mulps xmm4, xmm7 - mulps xmm5, xmm7 - addps xmm1, xmm4 - addps xmm2, xmm5 - movaps xmm6, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm6, xmm1, 0xDD - movaps xmm7, xmm2 - shufps xmm7, xmm2, 0x88 - shufps xmm2, xmm2, 0xDD - addps xmm0, xmm6 - addps xmm2, xmm7 - STORE4( 0, xmm0, xmm3 ) - STORE2LO( 16, xmm2, xmm4 ) - } - return; - } - default: { - for ( int i = 0; i < numRows; i++ ) { - dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + - mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; - mPtr += 6; - } - return; - } + case 6: { // 6x2 * 2x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm7, [esi] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movaps xmm0, [edi] + mulps xmm0, xmm7 + movaps xmm1, [edi + 16] + mulps xmm1, xmm7 + movaps xmm2, xmm0 + shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + movaps xmm3, [edi + 32] + addps xmm0, xmm2 + mulps xmm3, xmm7 + STORE4( 0, xmm0, xmm4 ) + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm1, xmm3 + addps xmm3, xmm1 + STORE2LO( 16, xmm3, xmm4 ) } - break; + return; } default: { - int numColumns = mat.GetNumColumns(); for ( int i = 0; i < numRows; i++ ) { - float sum = mPtr[0] * vPtr[0]; - for ( int j = 1; j < numColumns; j++ ) { - sum += mPtr[j] * vPtr[j]; - } - dstPtr[i] STOREC sum; - mPtr += numColumns; + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; + mPtr += 2; } - break; + return; + } } + break; } - -#undef STOREC -#undef STORE4 -#undef STORE2HI -#undef STORE2LO -#undef STORE1 -} - -/* -============ -idSIMD_SSE::MatX_TransposeMultiplyVecX - - optimizes the following matrix multiplications: - - Nx6 * Nx1 - 6xN * 6x1 - - with N in the range [1-6] -============ -*/ -void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { -#define STORE1( offset, reg1, reg2 ) \ - __asm movss [eax+offset], reg1 -#define STORE2LO( offset, reg1, reg2 ) \ - __asm movlps [eax+offset], reg1 -#define STORE2HI( offset, reg1, reg2 ) \ - __asm movhps [eax+offset], reg1 -#define STORE4( offset, reg1, reg2 ) \ - __asm movlps [eax+offset], reg1 \ - __asm movhps [eax+offset+8], reg1 -#define STOREC = - - int numColumns; - const float *mPtr, *vPtr; - float *dstPtr; - - assert( vec.GetSize() >= mat.GetNumRows() ); - assert( dst.GetSize() >= mat.GetNumColumns() ); - - mPtr = mat.ToFloatPtr(); - vPtr = vec.ToFloatPtr(); - dstPtr = dst.ToFloatPtr(); - numColumns = mat.GetNumColumns(); - switch( mat.GetNumRows() ) { - case 1: - switch( numColumns ) { - case 6: { // 1x6 * 1x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm1, xmm0 - mulps xmm0, [edi] - mulps xmm1, [edi+16] - STORE4( 0, xmm0, xmm2 ) - STORE2LO( 16, xmm1, xmm3 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0]; - mPtr++; - } - return; - } + case 3: { + switch ( numRows ) { + case 3: { // 3x3 * 3x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + movss xmm4, [edi] + mulss xmm4, xmm0 + movss xmm1, [esi+4] + movss xmm5, [edi+4] + mulss xmm5, xmm1 + addss xmm4, xmm5 + movss xmm2, [esi+8] + movss xmm6, [edi+8] + mulss xmm6, xmm2 + addss xmm4, xmm6 + movss xmm3, [edi+12] + mulss xmm3, xmm0 + STORE1( 0, xmm4, xmm7 ); + movss xmm5, [edi + 12 + 4] + mulss xmm5, xmm1 + addss xmm3, xmm5 + movss xmm6, [edi + 12 + 8] + mulss xmm6, xmm2 + addss xmm3, xmm6 + mulss xmm0, [edi + 24] + mulss xmm1, [edi + 24 + 4] + STORE1( 4, xmm3, xmm7 ); + addss xmm0, xmm1 + mulss xmm2, [edi + 24 + 8] + addss xmm0, xmm2 + STORE1( 8, xmm0, xmm7 ); } - break; - case 2: - switch( numColumns ) { - case 6: { // 2x6 * 2x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi] - movaps xmm1, xmm0 - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 ) - movaps xmm2, [edi] - mulps xmm2, xmm0 - movlps xmm3, [edi+24] - movhps xmm3, [edi+32] - mulps xmm3, xmm1 - addps xmm2, xmm3 - shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps xmm4, [edi+16] - movhps xmm4, [edi+40] - mulps xmm4, xmm0 - movhlps xmm3, xmm4 - addps xmm3, xmm4 - STORE4( 0, xmm2, xmm5 ) - STORE2LO( 16, xmm3, xmm6 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1]; - mPtr++; - } - return; - } + return; + } + case 6: { // 6x3 * 3x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm5, [esi] + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) + movss xmm6, [esi + 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + movss xmm7, [esi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 + movlps xmm1, [edi + 4 * 4] + shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 + movlps xmm2, [edi + 6 * 4] + movhps xmm2, [edi + 8 * 4] // xmm2 = 6, 7, 8, 9 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 + mulps xmm0, xmm5 + movlps xmm3, [edi + 10 * 4] + shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 + movaps xmm3, xmm1 + shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 + mulps xmm1, xmm6 + shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 + mulps xmm3, xmm7 + addps xmm0, xmm1 + addps xmm0, xmm3 + STORE4( 0, xmm0, xmm4 ) + movss xmm1, [edi + 12 * 4] + mulss xmm1, xmm5 + movss xmm2, [edi + 13 * 4] + mulss xmm2, xmm6 + movss xmm3, [edi + 14 * 4] + mulss xmm3, xmm7 + addss xmm1, xmm2 + addss xmm1, xmm3 + STORE1( 16, xmm1, xmm4 ) + mulss xmm5, [edi + 15 * 4] + mulss xmm6, [edi + 16 * 4] + mulss xmm7, [edi + 17 * 4] + addss xmm5, xmm6 + addss xmm5, xmm7 + STORE1( 20, xmm5, xmm4 ) } - break; - case 3: - switch( numColumns ) { - case 6: { // 3x6 * 3x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movss xmm1, [esi+2*4] - movlps xmm3, [edi+(0*6+0)*4] - movhps xmm3, [edi+(0*6+2)*4] - movaps xmm4, xmm0 - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, xmm4 - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(2*6+0)*4] - movhps xmm4, [edi+(2*6+2)*4] - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm1 - addps xmm3, xmm4 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - movlps xmm5, [edi+(2*6+4)*4] - mulps xmm5, xmm1 - addps xmm3, xmm5 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2]; - mPtr++; - } - return; - } + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; + mPtr += 3; } - break; - case 4: - switch( numColumns ) { - case 6: { // 4x6 * 4x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movlps xmm1, [esi+2*4] - movaps xmm3, xmm0 - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, [edi+(0*6+0)*4] - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(2*6+0)*4] - movhps xmm4, [edi+(2*6+2)*4] - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm6 - addps xmm3, xmm4 - movlps xmm5, [edi+(3*6+0)*4] - movhps xmm5, [edi+(3*6+2)*4] - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movlps xmm4, [edi+(2*6+4)*4] - movhps xmm4, [edi+(3*6+4)*4] - mulps xmm4, xmm1 - addps xmm3, xmm4 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + - *(mPtr+3*numColumns) * vPtr[3]; - mPtr++; - } - return; - } + return; + } + } + break; + } + case 4: { + switch ( numRows ) { + case 4: { // 4x4 * 4x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, qword ptr [esi ] + movlps xmm0, qword ptr [edi ] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm0, qword ptr [edi + 16] + mulps xmm0, xmm6 + movlps xmm7, qword ptr [esi + 8] + movlps xmm2, qword ptr [edi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm2, qword ptr [edi + 24] + mulps xmm2, xmm7 + movlps xmm1, qword ptr [edi + 32] + movhps xmm1, qword ptr [edi + 48] + mulps xmm1, xmm6 + movlps xmm3, qword ptr [edi + 40] + addps xmm0, xmm2 + movhps xmm3, qword ptr [edi + 56] + mulps xmm3, xmm7 + movaps xmm4, xmm0 + addps xmm1, xmm3 + shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm4 + STORE4( 0, xmm0, xmm2 ) } - break; - case 5: - switch( numColumns ) { - case 6: { // 5x6 * 5x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movlps xmm1, [esi+2*4] - movss xmm2, [esi+4*4] - movaps xmm3, xmm0 - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, [edi+(0*6+0)*4] - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, [edi+(2*6+0)*4] - addps xmm3, xmm6 - movlps xmm5, [edi+(3*6+0)*4] - movhps xmm5, [edi+(3*6+2)*4] - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm4, xmm2 - mulps xmm4, [edi+(4*6+0)*4] - addps xmm3, xmm4 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movlps xmm4, [edi+(2*6+4)*4] - movhps xmm4, [edi+(3*6+4)*4] - mulps xmm4, xmm1 - addps xmm3, xmm4 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - movlps xmm5, [edi+(4*6+4)*4] - mulps xmm5, xmm2 - addps xmm3, xmm5 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + - *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4]; - mPtr++; - } - return; - } + return; + } + case 6: { // 6x4 * 4x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, qword ptr [esi+ 0] + movlps xmm0, qword ptr [edi+ 0] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm0, qword ptr [edi + 16] + mulps xmm0, xmm6 + movlps xmm7, qword ptr [esi + 8] + movlps xmm2, qword ptr [edi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm2, qword ptr [edi + 24] + mulps xmm2, xmm7 + movlps xmm1, qword ptr [edi + 32] + movhps xmm1, qword ptr [edi + 48] + mulps xmm1, xmm6 + movlps xmm3, qword ptr [edi + 40] + addps xmm0, xmm2 + movhps xmm3, qword ptr [edi + 56] + mulps xmm3, xmm7 + movaps xmm4, xmm0 + addps xmm1, xmm3 + shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm4 + movlps xmm1, qword ptr [edi + 64] + movhps xmm1, qword ptr [edi + 80] + STORE4( 0, xmm0, xmm4 ) + mulps xmm1, xmm6 + movlps xmm2, qword ptr [edi + 72] + movhps xmm2, qword ptr [edi + 88] + mulps xmm2, xmm7 + addps xmm1, xmm2 + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm3, xmm1 + addps xmm1, xmm3 + STORE2LO( 16, xmm1, xmm4 ) } - break; - case 6: - switch( numColumns ) { - case 1: { // 6x1 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi] - movhps xmm0, [esi+8] - movlps xmm1, [esi+16] - mulps xmm0, [edi] - mulps xmm1, [edi+16] - shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 ) - addps xmm0, xmm1 - movhlps xmm2, xmm0 - addss xmm2, xmm0 - shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 ) - addss xmm2, xmm0 - STORE1( 0, xmm2, xmm3 ) - } - return; - } - case 2: { // 6x2 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - movaps xmm6, [edi+0*4] - mulps xmm6, xmm0 - movlps xmm1, [esi+2*4] - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - movaps xmm7, [edi+4*4] - mulps xmm7, xmm1 - addps xmm6, xmm7 - movlps xmm2, [esi+4*4] - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) - movaps xmm7, [edi+8*4] - mulps xmm7, xmm2 - addps xmm6, xmm7 - movhlps xmm3, xmm6 - addps xmm3, xmm6 - STORE2LO( 0, xmm3, xmm7 ) - } - return; - } - case 3: { // 6x3 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [edi+(0*3+2)*4] - movhps xmm0, [edi+(0*3+0)*4] - shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 ) - movss xmm6, [esi+0*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, xmm0 - movss xmm1, [edi+(1*3+0)*4] - movhps xmm1, [edi+(1*3+1)*4] - movss xmm7, [esi+1*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm1 - addps xmm6, xmm7 - movss xmm2, [edi+(2*3+2)*4] - movhps xmm2, [edi+(2*3+0)*4] - shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 ) - movss xmm7, [esi+2*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm2 - addps xmm6, xmm7 - movss xmm3, [edi+(3*3+0)*4] - movhps xmm3, [edi+(3*3+1)*4] - movss xmm7, [esi+3*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm3 - addps xmm6, xmm7 - movss xmm4, [edi+(4*3+2)*4] - movhps xmm4, [edi+(4*3+0)*4] - shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 ) - movss xmm7, [esi+4*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm4 - addps xmm6, xmm7 - movss xmm5, [edi+(5*3+0)*4] - movhps xmm5, [edi+(5*3+1)*4] - movss xmm7, [esi+5*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm5 - addps xmm6, xmm7 - STORE1( 0, xmm6, xmm7 ) - STORE2HI( 4, xmm6, xmm7 ) - } - return; - } - case 4: { // 6x4 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm3, [edi+(0*4+0)*4] - movhps xmm3, [edi+(0*4+2)*4] - movss xmm4, [esi+0*4] - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, xmm4 - movlps xmm5, [edi+(1*4+0)*4] - movhps xmm5, [edi+(1*4+2)*4] - movss xmm6, [esi+1*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(2*4+0)*4] - movhps xmm4, [edi+(2*4+2)*4] - movss xmm6, [esi+2*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm6 - addps xmm3, xmm4 - movlps xmm5, [edi+(3*4+0)*4] - movhps xmm5, [edi+(3*4+2)*4] - movss xmm6, [esi+3*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(4*4+0)*4] - movhps xmm4, [edi+(4*4+2)*4] - movss xmm6, [esi+4*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm6 - addps xmm3, xmm4 - movlps xmm5, [edi+(5*4+0)*4] - movhps xmm5, [edi+(5*4+2)*4] - movss xmm6, [esi+5*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - STORE4( 0, xmm3, xmm7 ) - } - return; - } - case 5: { // 6x5 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, [edi+(0*5+0)*4] - movhps xmm6, [edi+(0*5+2)*4] - movss xmm0, [esi+0*4] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, xmm0 - movlps xmm7, [edi+(1*5+0)*4] - movhps xmm7, [edi+(1*5+2)*4] - movss xmm1, [esi+1*4] - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm1 - addps xmm6, xmm7 - movlps xmm7, [edi+(2*5+0)*4] - movhps xmm7, [edi+(2*5+2)*4] - movss xmm2, [esi+2*4] - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm2 - addps xmm6, xmm7 - movlps xmm7, [edi+(3*5+0)*4] - movhps xmm7, [edi+(3*5+2)*4] - movss xmm3, [esi+3*4] - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm3 - addps xmm6, xmm7 - movlps xmm7, [edi+(4*5+0)*4] - movhps xmm7, [edi+(4*5+2)*4] - movss xmm4, [esi+4*4] - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm4 - addps xmm6, xmm7 - movlps xmm7, [edi+(5*5+0)*4] - movhps xmm7, [edi+(5*5+2)*4] - movss xmm5, [esi+5*4] - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm5 - addps xmm6, xmm7 - STORE4( 0, xmm6, xmm7 ) - movss xmm6, [edi+(0*5+4)*4] - mulss xmm6, xmm0 - movss xmm7, [edi+(1*5+4)*4] - mulss xmm7, xmm1 - addss xmm6, xmm7 - movss xmm7, [edi+(2*5+4)*4] - mulss xmm7, xmm2 - addss xmm6, xmm7 - movss xmm7, [edi+(3*5+4)*4] - mulss xmm7, xmm3 - addss xmm6, xmm7 - movss xmm7, [edi+(4*5+4)*4] - mulss xmm7, xmm4 - addss xmm6, xmm7 - movss xmm7, [edi+(5*5+4)*4] - mulss xmm7, xmm5 - addss xmm6, xmm7 - STORE1( 16, xmm6, xmm7 ) - } - return; - } - case 6: { // 6x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movlps xmm1, [esi+2*4] - movlps xmm2, [esi+4*4] - movaps xmm3, xmm0 - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, [edi+(0*6+0)*4] - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, [edi+(2*6+0)*4] - addps xmm3, xmm6 - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - movlps xmm5, [edi+(3*6+0)*4] - movhps xmm5, [edi+(3*6+2)*4] - mulps xmm5, xmm6 - addps xmm3, xmm5 - movaps xmm6, xmm2 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, [edi+(4*6+0)*4] - addps xmm3, xmm6 - movaps xmm6, xmm2 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - movlps xmm5, [edi+(5*6+0)*4] - movhps xmm5, [edi+(5*6+2)*4] - mulps xmm5, xmm6 - addps xmm3, xmm5 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movlps xmm4, [edi+(2*6+4)*4] - movhps xmm4, [edi+(3*6+4)*4] - mulps xmm4, xmm1 - addps xmm3, xmm4 - movlps xmm5, [edi+(4*6+4)*4] - movhps xmm5, [edi+(5*6+4)*4] - mulps xmm5, xmm2 - addps xmm3, xmm5 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + - *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5]; - mPtr++; - } - return; - } + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; + mPtr += 4; } - break; - default: - int numRows = mat.GetNumRows(); - for ( int i = 0; i < numColumns; i++ ) { - mPtr = mat.ToFloatPtr() + i; - float sum = mPtr[0] * vPtr[0]; - for ( int j = 1; j < numRows; j++ ) { - mPtr += numColumns; - sum += mPtr[0] * vPtr[j]; - } - dstPtr[i] STOREC sum; + return; + } + } + break; + } + case 5: { + switch ( numRows ) { + case 5: { // 5x5 * 5x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X + movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1 + movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X + movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11 + movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1 + shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15 + movlps xmm1, [edi + 6 * 4] // xmm1 = 6, 7, 0, 1 + movlps xmm5, [edi + 16 * 4] // xmm5 = 16, 17, 10, 11 + movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1 + shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16 + movhps xmm2, [edi + 2 * 4] // xmm2 = 6, 7, 2, 3 + movhps xmm5, [edi + 12 * 4] // xmm5 = 16, 17, 12, 13 + movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3 + shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17 + movlps xmm3, [edi + 8 * 4] // xmm3 = 8, 9, 2, 3 + movlps xmm5, [edi + 18 * 4] // xmm5 = 18, 19, 12, 13 + movss xmm4, [edi + 4 * 4] // xmm4 = 4, X, X, X + movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9 + shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18 + movhps xmm5, [edi + 14 * 4] // xmm6 = 18, 19, 14, 15 + shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19 + movss xmm7, [esi + 0 * 4] + shufps xmm7, xmm7, 0 + mulps xmm0, xmm7 + movss xmm5, [esi + 1 * 4] + shufps xmm5, xmm5, 0 + mulps xmm1, xmm5 + addps xmm0, xmm1 + movss xmm6, [esi + 2 * 4] + shufps xmm6, xmm6, 0 + mulps xmm2, xmm6 + addps xmm0, xmm2 + movss xmm1, [esi + 3 * 4] + shufps xmm1, xmm1, 0 + mulps xmm3, xmm1 + addps xmm0, xmm3 + movss xmm2, [esi + 4 * 4] + shufps xmm2, xmm2, 0 + mulps xmm4, xmm2 + addps xmm0, xmm4 + mulss xmm7, [edi + 20 * 4] + mulss xmm5, [edi + 21 * 4] + addps xmm7, xmm5 + mulss xmm6, [edi + 22 * 4] + addps xmm7, xmm6 + mulss xmm1, [edi + 23 * 4] + addps xmm7, xmm1 + mulss xmm2, [edi + 24 * 4] + addps xmm7, xmm2 + STORE4( 0, xmm0, xmm3 ) + STORE1( 16, xmm7, xmm4 ) } - break; + return; + } + case 6: { // 6x5 * 5x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, [esi] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) + movlps xmm7, [esi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movlps xmm0, [edi] + movhps xmm3, [edi + 8] + movaps xmm1, [edi + 16] + movlps xmm2, [edi + 32] + shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6 + shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 + shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8 + mulps xmm0, xmm6 + mulps xmm3, xmm7 + movlps xmm2, [edi + 40] + addps xmm0, xmm3 // xmm0 + xmm1 + movhps xmm5, [edi + 40 + 8] + movlps xmm3, [edi + 40 + 16] + movhps xmm3, [edi + 40 + 24] + movlps xmm4, [edi + 40 + 32] + shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 + shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 + shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 + mulps xmm2, xmm6 + mulps xmm5, xmm7 + addps xmm2, xmm5 // xmm2 + xmm3 + movss xmm5, [esi + 16] + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm4, xmm0 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) + shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 ) + addps xmm0, xmm4 + mulps xmm1, xmm5 + addps xmm0, xmm1 + STORE4( 0, xmm0, xmm2 ) + movlps xmm4, [edi + 80] + movhps xmm3, [edi + 80 + 8] + movaps xmm1, [edi + 80 + 16] + movlps xmm2, [edi + 80 + 32] + shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 + shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 + shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 + mulps xmm4, xmm6 + mulps xmm3, xmm7 + mulps xmm1, xmm5 + addps xmm4, xmm3 // xmm4 + xmm1 + shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 ) + shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 ) + addps xmm4, xmm1 + shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 ) + addps xmm4, xmm1 + STORE2LO( 16, xmm4, xmm2 ) + } + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; + mPtr += 5; + } + return; + } + } + break; } - -#undef STOREC -#undef STORE4 -#undef STORE2HI -#undef STORE2LO -#undef STORE1 -} - -/* -============ -idSIMD_SSE::MatX_TransposeMultiplyAddVecX - - optimizes the following matrix multiplications: - - Nx6 * Nx1 - 6xN * 6x1 - - with N in the range [1-6] -============ -*/ -void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { -#define STORE1( offset, reg1, reg2 ) \ - __asm movss reg2, [eax+offset] \ - __asm addss reg2, reg1 \ - __asm movss [eax+offset], reg2 -#define STORE2LO( offset, reg1, reg2 ) \ - __asm movlps reg2, [eax+offset] \ - __asm addps reg2, reg1 \ - __asm movlps [eax+offset], reg2 -#define STORE2HI( offset, reg1, reg2 ) \ - __asm movhps reg2, [eax+offset] \ - __asm addps reg2, reg1 \ - __asm movhps [eax+offset], reg2 -#define STORE4( offset, reg1, reg2 ) \ - __asm movlps reg2, [eax+offset] \ - __asm movhps reg2, [eax+offset+8] \ - __asm addps reg2, reg1 \ - __asm movlps [eax+offset], reg2 \ - __asm movhps [eax+offset+8], reg2 -#define STOREC += - - int numColumns; - const float *mPtr, *vPtr; - float *dstPtr; - - assert( vec.GetSize() >= mat.GetNumRows() ); - assert( dst.GetSize() >= mat.GetNumColumns() ); - - mPtr = mat.ToFloatPtr(); - vPtr = vec.ToFloatPtr(); - dstPtr = dst.ToFloatPtr(); - numColumns = mat.GetNumColumns(); - switch( mat.GetNumRows() ) { - case 1: - switch( numColumns ) { - case 6: { // 1x6 * 1x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm1, xmm0 - mulps xmm0, [edi] - mulps xmm1, [edi+16] - STORE4( 0, xmm0, xmm2 ) - STORE2LO( 16, xmm1, xmm3 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0]; - mPtr++; - } - return; - } + case 6: { + switch ( numRows ) { + case 1: { // 1x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + mulss xmm0, [edi] + movss xmm1, [esi+4] + mulss xmm1, [edi+4] + movss xmm2, [esi+8] + addss xmm0, xmm1 + mulss xmm2, [edi+8] + movss xmm3, [esi+12] + addss xmm0, xmm2 + mulss xmm3, [edi+12] + movss xmm4, [esi+16] + addss xmm0, xmm3 + mulss xmm4, [edi+16] + movss xmm5, [esi+20] + addss xmm0, xmm4 + mulss xmm5, [edi+20] + movss xmm6, [esi+24] + addss xmm0, xmm5 + mulss xmm6, [edi+24] + addss xmm0, xmm6 + STORE1( 0, xmm0, xmm7 ) } - break; - case 2: - switch( numColumns ) { - case 6: { // 2x6 * 2x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi] - movaps xmm1, xmm0 - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 ) - movaps xmm2, [edi] - mulps xmm2, xmm0 - movlps xmm3, [edi+24] - movhps xmm3, [edi+32] - mulps xmm3, xmm1 - addps xmm2, xmm3 - shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps xmm4, [edi+16] - movhps xmm4, [edi+40] - mulps xmm4, xmm0 - movhlps xmm3, xmm4 - addps xmm3, xmm4 - STORE4( 0, xmm2, xmm5 ) - STORE2LO( 16, xmm3, xmm6 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1]; - mPtr++; - } - return; - } + return; + } + case 2: { // 2x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm0, xmm1 + addps xmm0, xmm1 + STORE2LO( 0, xmm0, xmm3 ) } - break; - case 3: - switch( numColumns ) { - case 6: { // 3x6 * 3x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movss xmm1, [esi+2*4] - movlps xmm3, [edi+(0*6+0)*4] - movhps xmm3, [edi+(0*6+2)*4] - movaps xmm4, xmm0 - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, xmm4 - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(2*6+0)*4] - movhps xmm4, [edi+(2*6+2)*4] - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm1 - addps xmm3, xmm4 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - movlps xmm5, [edi+(2*6+4)*4] - mulps xmm5, xmm1 - addps xmm3, xmm5 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2]; - mPtr++; - } - return; - } + return; + } + case 3: { // 3x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm0, xmm1 + addps xmm0, xmm1 + STORE2LO( 0, xmm0, xmm3 ) + // row 2 + movaps xmm0, [edi + 48] + movaps xmm1, [edi + 48 + 16] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + addps xmm0, xmm1 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movaps xmm1, xmm0 + shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) + addss xmm0, xmm1 + STORE1( 8, xmm0, xmm3 ) } - break; - case 4: - switch( numColumns ) { - case 6: { // 4x6 * 4x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movlps xmm1, [esi+2*4] - movaps xmm3, xmm0 - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, [edi+(0*6+0)*4] - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(2*6+0)*4] - movhps xmm4, [edi+(2*6+2)*4] - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm6 - addps xmm3, xmm4 - movlps xmm5, [edi+(3*6+0)*4] - movhps xmm5, [edi+(3*6+2)*4] - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movlps xmm4, [edi+(2*6+4)*4] - movhps xmm4, [edi+(3*6+4)*4] - mulps xmm4, xmm1 - addps xmm3, xmm4 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + - *(mPtr+3*numColumns) * vPtr[3]; - mPtr++; - } - return; - } + return; + } + case 4: { // 4x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm7, xmm0 + movlhps xmm7, xmm2 + addps xmm7, xmm1 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm7, xmm0 + // row 2 and 3 + movaps xmm0, [edi + 48] + movaps xmm1, [edi + 48 + 16] + movaps xmm2, [edi + 48 + 32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + // last 4 additions for the first 4 rows and store result + movaps xmm0, xmm7 + shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm7 + STORE4( 0, xmm0, xmm4 ) } - break; - case 5: - switch( numColumns ) { - case 6: { // 5x6 * 5x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movlps xmm1, [esi+2*4] - movss xmm2, [esi+4*4] - movaps xmm3, xmm0 - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, [edi+(0*6+0)*4] - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, [edi+(2*6+0)*4] - addps xmm3, xmm6 - movlps xmm5, [edi+(3*6+0)*4] - movhps xmm5, [edi+(3*6+2)*4] - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm4, xmm2 - mulps xmm4, [edi+(4*6+0)*4] - addps xmm3, xmm4 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movlps xmm4, [edi+(2*6+4)*4] - movhps xmm4, [edi+(3*6+4)*4] - mulps xmm4, xmm1 - addps xmm3, xmm4 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - movlps xmm5, [edi+(4*6+4)*4] - mulps xmm5, xmm2 - addps xmm3, xmm5 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + - *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4]; - mPtr++; - } - return; - } + return; + } + case 5: { // 5x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm7, xmm0 + movlhps xmm7, xmm2 + addps xmm7, xmm1 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm7, xmm0 + // row 2 and 3 + movaps xmm0, [edi + 48] + movaps xmm1, [edi + 48 + 16] + movaps xmm2, [edi + 48 + 32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + // last 4 additions for the first 4 rows and store result + movaps xmm0, xmm7 + shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm7 + STORE4( 0, xmm0, xmm3 ) + // row 5 + movaps xmm0, [edi + 96] + movaps xmm1, [edi + 96 + 16] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + addps xmm0, xmm1 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movaps xmm1, xmm0 + shufps xmm1, xmm1, 0x01 + addss xmm0, xmm1 + STORE1( 16, xmm0, xmm3 ) } - break; - case 6: - switch( numColumns ) { - case 1: { // 6x1 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi] - movhps xmm0, [esi+8] - movlps xmm1, [esi+16] - mulps xmm0, [edi] - mulps xmm1, [edi+16] - shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 ) - addps xmm0, xmm1 - movhlps xmm2, xmm0 - addss xmm2, xmm0 - shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 ) - addss xmm2, xmm0 - STORE1( 0, xmm2, xmm3 ) - } - return; - } - case 2: { // 6x2 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - movaps xmm6, [edi+0*4] - mulps xmm6, xmm0 - movlps xmm1, [esi+2*4] - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - movaps xmm7, [edi+4*4] - mulps xmm7, xmm1 - addps xmm6, xmm7 - movlps xmm2, [esi+4*4] - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) - movaps xmm7, [edi+8*4] - mulps xmm7, xmm2 - addps xmm6, xmm7 - movhlps xmm3, xmm6 - addps xmm3, xmm6 - STORE2LO( 0, xmm3, xmm7 ) - } - return; - } - case 3: { // 6x3 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [edi+(0*3+2)*4] - movhps xmm0, [edi+(0*3+0)*4] - shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 ) - movss xmm6, [esi+0*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, xmm0 - movss xmm1, [edi+(1*3+0)*4] - movhps xmm1, [edi+(1*3+1)*4] - movss xmm7, [esi+1*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm1 - addps xmm6, xmm7 - movss xmm2, [edi+(2*3+2)*4] - movhps xmm2, [edi+(2*3+0)*4] - shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 ) - movss xmm7, [esi+2*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm2 - addps xmm6, xmm7 - movss xmm3, [edi+(3*3+0)*4] - movhps xmm3, [edi+(3*3+1)*4] - movss xmm7, [esi+3*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm3 - addps xmm6, xmm7 - movss xmm4, [edi+(4*3+2)*4] - movhps xmm4, [edi+(4*3+0)*4] - shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 ) - movss xmm7, [esi+4*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm4 - addps xmm6, xmm7 - movss xmm5, [edi+(5*3+0)*4] - movhps xmm5, [edi+(5*3+1)*4] - movss xmm7, [esi+5*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm5 - addps xmm6, xmm7 - STORE1( 0, xmm6, xmm7 ) - STORE2HI( 4, xmm6, xmm7 ) - } - return; - } - case 4: { // 6x4 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm3, [edi+(0*4+0)*4] - movhps xmm3, [edi+(0*4+2)*4] - movss xmm4, [esi+0*4] - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, xmm4 - movlps xmm5, [edi+(1*4+0)*4] - movhps xmm5, [edi+(1*4+2)*4] - movss xmm6, [esi+1*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(2*4+0)*4] - movhps xmm4, [edi+(2*4+2)*4] - movss xmm6, [esi+2*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm6 - addps xmm3, xmm4 - movlps xmm5, [edi+(3*4+0)*4] - movhps xmm5, [edi+(3*4+2)*4] - movss xmm6, [esi+3*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(4*4+0)*4] - movhps xmm4, [edi+(4*4+2)*4] - movss xmm6, [esi+4*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm6 - addps xmm3, xmm4 - movlps xmm5, [edi+(5*4+0)*4] - movhps xmm5, [edi+(5*4+2)*4] - movss xmm6, [esi+5*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - STORE4( 0, xmm3, xmm7 ) - } - return; - } - case 5: { // 6x5 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, [edi+(0*5+0)*4] - movhps xmm6, [edi+(0*5+2)*4] - movss xmm0, [esi+0*4] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, xmm0 - movlps xmm7, [edi+(1*5+0)*4] - movhps xmm7, [edi+(1*5+2)*4] - movss xmm1, [esi+1*4] - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm1 - addps xmm6, xmm7 - movlps xmm7, [edi+(2*5+0)*4] - movhps xmm7, [edi+(2*5+2)*4] - movss xmm2, [esi+2*4] - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm2 - addps xmm6, xmm7 - movlps xmm7, [edi+(3*5+0)*4] - movhps xmm7, [edi+(3*5+2)*4] - movss xmm3, [esi+3*4] - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm3 - addps xmm6, xmm7 - movlps xmm7, [edi+(4*5+0)*4] - movhps xmm7, [edi+(4*5+2)*4] - movss xmm4, [esi+4*4] - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm4 - addps xmm6, xmm7 - movlps xmm7, [edi+(5*5+0)*4] - movhps xmm7, [edi+(5*5+2)*4] - movss xmm5, [esi+5*4] - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm5 - addps xmm6, xmm7 - STORE4( 0, xmm6, xmm7 ) - movss xmm6, [edi+(0*5+4)*4] - mulss xmm6, xmm0 - movss xmm7, [edi+(1*5+4)*4] - mulss xmm7, xmm1 - addss xmm6, xmm7 - movss xmm7, [edi+(2*5+4)*4] - mulss xmm7, xmm2 - addss xmm6, xmm7 - movss xmm7, [edi+(3*5+4)*4] - mulss xmm7, xmm3 - addss xmm6, xmm7 - movss xmm7, [edi+(4*5+4)*4] - mulss xmm7, xmm4 - addss xmm6, xmm7 - movss xmm7, [edi+(5*5+4)*4] - mulss xmm7, xmm5 - addss xmm6, xmm7 - STORE1( 16, xmm6, xmm7 ) - } - return; - } - case 6: { // 6x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movlps xmm1, [esi+2*4] - movlps xmm2, [esi+4*4] - movaps xmm3, xmm0 - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, [edi+(0*6+0)*4] - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, [edi+(2*6+0)*4] - addps xmm3, xmm6 - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - movlps xmm5, [edi+(3*6+0)*4] - movhps xmm5, [edi+(3*6+2)*4] - mulps xmm5, xmm6 - addps xmm3, xmm5 - movaps xmm6, xmm2 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, [edi+(4*6+0)*4] - addps xmm3, xmm6 - movaps xmm6, xmm2 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - movlps xmm5, [edi+(5*6+0)*4] - movhps xmm5, [edi+(5*6+2)*4] - mulps xmm5, xmm6 - addps xmm3, xmm5 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movlps xmm4, [edi+(2*6+4)*4] - movhps xmm4, [edi+(3*6+4)*4] - mulps xmm4, xmm1 - addps xmm3, xmm4 - movlps xmm5, [edi+(4*6+4)*4] - movhps xmm5, [edi+(5*6+4)*4] - mulps xmm5, xmm2 - addps xmm3, xmm5 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + - *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5]; - mPtr++; - } - return; - } + return; + } + case 6: { // 6x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm7, qword ptr [esi] + movlps xmm6, qword ptr [esi+8] + shufps xmm7, xmm7, 0x44 + shufps xmm6, xmm6, 0x44 + movlps xmm0, qword ptr [edi ] + movhps xmm0, qword ptr [edi+ 24] + mulps xmm0, xmm7 + movlps xmm3, qword ptr [edi+ 8] + movhps xmm3, qword ptr [edi+ 32] + mulps xmm3, xmm6 + movlps xmm1, qword ptr [edi+ 48] + movhps xmm1, qword ptr [edi+ 72] + mulps xmm1, xmm7 + movlps xmm2, qword ptr [edi+ 96] + movhps xmm2, qword ptr [edi+120] + mulps xmm2, xmm7 + movlps xmm4, qword ptr [edi+ 56] + movhps xmm4, qword ptr [edi+ 80] + movlps xmm5, qword ptr [edi+104] + movhps xmm5, qword ptr [edi+128] + mulps xmm4, xmm6 + movlps xmm7, qword ptr [esi+16] + addps xmm0, xmm3 + shufps xmm7, xmm7, 0x44 + mulps xmm5, xmm6 + addps xmm1, xmm4 + movlps xmm3, qword ptr [edi+ 16] + movhps xmm3, qword ptr [edi+ 40] + addps xmm2, xmm5 + movlps xmm4, qword ptr [edi+ 64] + movhps xmm4, qword ptr [edi+ 88] + mulps xmm3, xmm7 + movlps xmm5, qword ptr [edi+112] + movhps xmm5, qword ptr [edi+136] + addps xmm0, xmm3 + mulps xmm4, xmm7 + mulps xmm5, xmm7 + addps xmm1, xmm4 + addps xmm2, xmm5 + movaps xmm6, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm6, xmm1, 0xDD + movaps xmm7, xmm2 + shufps xmm7, xmm2, 0x88 + shufps xmm2, xmm2, 0xDD + addps xmm0, xmm6 + addps xmm2, xmm7 + STORE4( 0, xmm0, xmm3 ) + STORE2LO( 16, xmm2, xmm4 ) } - break; - default: - int numRows = mat.GetNumRows(); - for ( int i = 0; i < numColumns; i++ ) { - mPtr = mat.ToFloatPtr() + i; - float sum = mPtr[0] * vPtr[0]; - for ( int j = 1; j < numRows; j++ ) { - mPtr += numColumns; - sum += mPtr[0] * vPtr[j]; - } - dstPtr[i] STOREC sum; + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; + mPtr += 6; } - break; + return; + } + } + break; + } + default: { + int numColumns = mat.GetNumColumns(); + for ( int i = 0; i < numRows; i++ ) { + float sum = mPtr[0] * vPtr[0]; + for ( int j = 1; j < numColumns; j++ ) { + sum += mPtr[j] * vPtr[j]; + } + dstPtr[i] STOREC sum; + mPtr += numColumns; + } + break; + } } #undef STOREC @@ -7731,17 +5962,18 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX /* ============ -void idSIMD_SSE::MatX_TransposeMultiplySubVecX +idSIMD_SSE::MatX_MultiplySubVecX optimizes the following matrix multiplications: - Nx6 * Nx1 - 6xN * 6x1 + NxN * Nx1 + Nx6 * 6x1 + 6xN * Nx1 with N in the range [1-6] ============ */ -void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { +void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss reg2, [eax+offset] \ __asm subss reg2, reg1 \ @@ -7762,528 +5994,2392 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX __asm movhps [eax+offset+8], reg2 #define STOREC -= - int numColumns; + int numRows; const float *mPtr, *vPtr; float *dstPtr; - assert( vec.GetSize() >= mat.GetNumRows() ); - assert( dst.GetSize() >= mat.GetNumColumns() ); + assert( vec.GetSize() >= mat.GetNumColumns() ); + assert( dst.GetSize() >= mat.GetNumRows() ); + + mPtr = mat.ToFloatPtr(); + vPtr = vec.ToFloatPtr(); + dstPtr = dst.ToFloatPtr(); + numRows = mat.GetNumRows(); + switch ( mat.GetNumColumns() ) { + case 1: { + switch ( numRows ) { + case 1: { // 1x1 * 1x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + mulss xmm0, [edi] + STORE1( 0, xmm0, xmm1 ) + } + return; + } + case 6: { // 6x1 * 1x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm1, xmm0 + mulps xmm0, [edi] + mulps xmm1, [edi + 16] + STORE4( 0, xmm0, xmm2 ) + STORE2LO( 16, xmm1, xmm2 ) + } + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0]; + mPtr++; + } + return; + } + } + break; + } + case 2: { + switch ( numRows ) { + case 2: { // 2x2 * 2x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + movss xmm1, [esi+4] + movss xmm2, [edi] + mulss xmm2, xmm0 + movss xmm3, [edi+4] + mulss xmm3, xmm1 + addss xmm2, xmm3 + STORE1( 0, xmm2, xmm4 ) + mulss xmm0, [edi + 8] + mulss xmm1, [edi + 8 + 4] + addss xmm0, xmm1 + STORE1( 4, xmm0, xmm4 ) + } + return; + } + case 6: { // 6x2 * 2x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm7, [esi] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movaps xmm0, [edi] + mulps xmm0, xmm7 + movaps xmm1, [edi + 16] + mulps xmm1, xmm7 + movaps xmm2, xmm0 + shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + movaps xmm3, [edi + 32] + addps xmm0, xmm2 + mulps xmm3, xmm7 + STORE4( 0, xmm0, xmm4 ) + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm1, xmm3 + addps xmm3, xmm1 + STORE2LO( 16, xmm3, xmm4 ) + } + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; + mPtr += 2; + } + return; + } + } + break; + } + case 3: { + switch ( numRows ) { + case 3: { // 3x3 * 3x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + movss xmm4, [edi] + mulss xmm4, xmm0 + movss xmm1, [esi+4] + movss xmm5, [edi+4] + mulss xmm5, xmm1 + addss xmm4, xmm5 + movss xmm2, [esi+8] + movss xmm6, [edi+8] + mulss xmm6, xmm2 + addss xmm4, xmm6 + movss xmm3, [edi+12] + mulss xmm3, xmm0 + STORE1( 0, xmm4, xmm7 ); + movss xmm5, [edi + 12 + 4] + mulss xmm5, xmm1 + addss xmm3, xmm5 + movss xmm6, [edi + 12 + 8] + mulss xmm6, xmm2 + addss xmm3, xmm6 + mulss xmm0, [edi + 24] + mulss xmm1, [edi + 24 + 4] + STORE1( 4, xmm3, xmm7 ); + addss xmm0, xmm1 + mulss xmm2, [edi + 24 + 8] + addss xmm0, xmm2 + STORE1( 8, xmm0, xmm7 ); + } + return; + } + case 6: { // 6x3 * 3x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm5, [esi] + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) + movss xmm6, [esi + 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + movss xmm7, [esi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 + movlps xmm1, [edi + 4 * 4] + shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 + movlps xmm2, [edi + 6 * 4] + movhps xmm2, [edi + 8 * 4] // xmm2 = 6, 7, 8, 9 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 + mulps xmm0, xmm5 + movlps xmm3, [edi + 10 * 4] + shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 + movaps xmm3, xmm1 + shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 + mulps xmm1, xmm6 + shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 + mulps xmm3, xmm7 + addps xmm0, xmm1 + addps xmm0, xmm3 + STORE4( 0, xmm0, xmm4 ) + movss xmm1, [edi + 12 * 4] + mulss xmm1, xmm5 + movss xmm2, [edi + 13 * 4] + mulss xmm2, xmm6 + movss xmm3, [edi + 14 * 4] + mulss xmm3, xmm7 + addss xmm1, xmm2 + addss xmm1, xmm3 + STORE1( 16, xmm1, xmm4 ) + mulss xmm5, [edi + 15 * 4] + mulss xmm6, [edi + 16 * 4] + mulss xmm7, [edi + 17 * 4] + addss xmm5, xmm6 + addss xmm5, xmm7 + STORE1( 20, xmm5, xmm4 ) + } + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; + mPtr += 3; + } + return; + } + } + break; + } + case 4: { + switch ( numRows ) { + case 4: { // 4x4 * 4x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, qword ptr [esi ] + movlps xmm0, qword ptr [edi ] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm0, qword ptr [edi + 16] + mulps xmm0, xmm6 + movlps xmm7, qword ptr [esi + 8] + movlps xmm2, qword ptr [edi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm2, qword ptr [edi + 24] + mulps xmm2, xmm7 + movlps xmm1, qword ptr [edi + 32] + movhps xmm1, qword ptr [edi + 48] + mulps xmm1, xmm6 + movlps xmm3, qword ptr [edi + 40] + addps xmm0, xmm2 + movhps xmm3, qword ptr [edi + 56] + mulps xmm3, xmm7 + movaps xmm4, xmm0 + addps xmm1, xmm3 + shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm4 + STORE4( 0, xmm0, xmm2 ) + } + return; + } + case 6: { // 6x4 * 4x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, qword ptr [esi+ 0] + movlps xmm0, qword ptr [edi+ 0] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm0, qword ptr [edi + 16] + mulps xmm0, xmm6 + movlps xmm7, qword ptr [esi + 8] + movlps xmm2, qword ptr [edi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movhps xmm2, qword ptr [edi + 24] + mulps xmm2, xmm7 + movlps xmm1, qword ptr [edi + 32] + movhps xmm1, qword ptr [edi + 48] + mulps xmm1, xmm6 + movlps xmm3, qword ptr [edi + 40] + addps xmm0, xmm2 + movhps xmm3, qword ptr [edi + 56] + mulps xmm3, xmm7 + movaps xmm4, xmm0 + addps xmm1, xmm3 + shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm4 + movlps xmm1, qword ptr [edi + 64] + movhps xmm1, qword ptr [edi + 80] + STORE4( 0, xmm0, xmm4 ) + mulps xmm1, xmm6 + movlps xmm2, qword ptr [edi + 72] + movhps xmm2, qword ptr [edi + 88] + mulps xmm2, xmm7 + addps xmm1, xmm2 + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm3, xmm1 + addps xmm1, xmm3 + STORE2LO( 16, xmm1, xmm4 ) + } + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; + mPtr += 4; + } + return; + } + } + break; + } + case 5: { + switch ( numRows ) { + case 5: { // 5x5 * 5x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X + movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1 + movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X + movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11 + movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1 + shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15 + movlps xmm1, [edi + 6 * 4] // xmm1 = 6, 7, 0, 1 + movlps xmm5, [edi + 16 * 4] // xmm5 = 16, 17, 10, 11 + movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1 + shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16 + movhps xmm2, [edi + 2 * 4] // xmm2 = 6, 7, 2, 3 + movhps xmm5, [edi + 12 * 4] // xmm5 = 16, 17, 12, 13 + movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3 + shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17 + movlps xmm3, [edi + 8 * 4] // xmm3 = 8, 9, 2, 3 + movlps xmm5, [edi + 18 * 4] // xmm5 = 18, 19, 12, 13 + movss xmm4, [edi + 4 * 4] // xmm4 = 4, X, X, X + movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9 + shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18 + movhps xmm5, [edi + 14 * 4] // xmm6 = 18, 19, 14, 15 + shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19 + movss xmm7, [esi + 0 * 4] + shufps xmm7, xmm7, 0 + mulps xmm0, xmm7 + movss xmm5, [esi + 1 * 4] + shufps xmm5, xmm5, 0 + mulps xmm1, xmm5 + addps xmm0, xmm1 + movss xmm6, [esi + 2 * 4] + shufps xmm6, xmm6, 0 + mulps xmm2, xmm6 + addps xmm0, xmm2 + movss xmm1, [esi + 3 * 4] + shufps xmm1, xmm1, 0 + mulps xmm3, xmm1 + addps xmm0, xmm3 + movss xmm2, [esi + 4 * 4] + shufps xmm2, xmm2, 0 + mulps xmm4, xmm2 + addps xmm0, xmm4 + mulss xmm7, [edi + 20 * 4] + mulss xmm5, [edi + 21 * 4] + addps xmm7, xmm5 + mulss xmm6, [edi + 22 * 4] + addps xmm7, xmm6 + mulss xmm1, [edi + 23 * 4] + addps xmm7, xmm1 + mulss xmm2, [edi + 24 * 4] + addps xmm7, xmm2 + STORE4( 0, xmm0, xmm3 ) + STORE1( 16, xmm7, xmm4 ) + } + return; + } + case 6: { // 6x5 * 5x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, [esi] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) + movlps xmm7, [esi + 8] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) + movlps xmm0, [edi] + movhps xmm3, [edi + 8] + movaps xmm1, [edi + 16] + movlps xmm2, [edi + 32] + shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6 + shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 + shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8 + mulps xmm0, xmm6 + mulps xmm3, xmm7 + movlps xmm2, [edi + 40] + addps xmm0, xmm3 // xmm0 + xmm1 + movhps xmm5, [edi + 40 + 8] + movlps xmm3, [edi + 40 + 16] + movhps xmm3, [edi + 40 + 24] + movlps xmm4, [edi + 40 + 32] + shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 + shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 + shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 + mulps xmm2, xmm6 + mulps xmm5, xmm7 + addps xmm2, xmm5 // xmm2 + xmm3 + movss xmm5, [esi + 16] + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm4, xmm0 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) + shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 ) + addps xmm0, xmm4 + mulps xmm1, xmm5 + addps xmm0, xmm1 + STORE4( 0, xmm0, xmm2 ) + movlps xmm4, [edi + 80] + movhps xmm3, [edi + 80 + 8] + movaps xmm1, [edi + 80 + 16] + movlps xmm2, [edi + 80 + 32] + shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 + shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 + shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 + mulps xmm4, xmm6 + mulps xmm3, xmm7 + mulps xmm1, xmm5 + addps xmm4, xmm3 // xmm4 + xmm1 + shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 ) + shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 ) + addps xmm4, xmm1 + shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 ) + addps xmm4, xmm1 + STORE2LO( 16, xmm4, xmm2 ) + } + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; + mPtr += 5; + } + return; + } + } + break; + } + case 6: { + switch ( numRows ) { + case 1: { // 1x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + mulss xmm0, [edi] + movss xmm1, [esi+4] + mulss xmm1, [edi+4] + movss xmm2, [esi+8] + addss xmm0, xmm1 + mulss xmm2, [edi+8] + movss xmm3, [esi+12] + addss xmm0, xmm2 + mulss xmm3, [edi+12] + movss xmm4, [esi+16] + addss xmm0, xmm3 + mulss xmm4, [edi+16] + movss xmm5, [esi+20] + addss xmm0, xmm4 + mulss xmm5, [edi+20] + movss xmm6, [esi+24] + addss xmm0, xmm5 + mulss xmm6, [edi+24] + addss xmm0, xmm6 + STORE1( 0, xmm0, xmm7 ) + } + return; + } + case 2: { // 2x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm0, xmm1 + addps xmm0, xmm1 + STORE2LO( 0, xmm0, xmm3 ) + } + return; + } + case 3: { // 3x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) + movhlps xmm0, xmm1 + addps xmm0, xmm1 + STORE2LO( 0, xmm0, xmm3 ) + // row 2 + movaps xmm0, [edi + 48] + movaps xmm1, [edi + 48 + 16] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + addps xmm0, xmm1 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movaps xmm1, xmm0 + shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) + addss xmm0, xmm1 + STORE1( 8, xmm0, xmm3 ) + } + return; + } + case 4: { // 4x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm7, xmm0 + movlhps xmm7, xmm2 + addps xmm7, xmm1 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm7, xmm0 + // row 2 and 3 + movaps xmm0, [edi + 48] + movaps xmm1, [edi + 48 + 16] + movaps xmm2, [edi + 48 + 32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + // last 4 additions for the first 4 rows and store result + movaps xmm0, xmm7 + shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm7 + STORE4( 0, xmm0, xmm4 ) + } + return; + } + case 5: { // 5x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + // load idVecX + movlps xmm4, [esi] + movhps xmm4, [esi+8] + movlps xmm5, [esi+16] + movlhps xmm5, xmm4 + movhlps xmm6, xmm4 + movlhps xmm6, xmm5 + // row 0 and 1 + movaps xmm0, [edi] + movaps xmm1, [edi+16] + movaps xmm2, [edi+32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm7, xmm0 + movlhps xmm7, xmm2 + addps xmm7, xmm1 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm7, xmm0 + // row 2 and 3 + movaps xmm0, [edi + 48] + movaps xmm1, [edi + 48 + 16] + movaps xmm2, [edi + 48 + 32] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + mulps xmm2, xmm6 + movhlps xmm3, xmm0 + movlhps xmm3, xmm2 + addps xmm1, xmm3 + shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) + addps xmm1, xmm0 + // last 4 additions for the first 4 rows and store result + movaps xmm0, xmm7 + shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) + shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) + addps xmm0, xmm7 + STORE4( 0, xmm0, xmm3 ) + // row 5 + movaps xmm0, [edi + 96] + movaps xmm1, [edi + 96 + 16] + mulps xmm0, xmm4 + mulps xmm1, xmm5 + addps xmm0, xmm1 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movaps xmm1, xmm0 + shufps xmm1, xmm1, 0x01 + addss xmm0, xmm1 + STORE1( 16, xmm0, xmm3 ) + } + return; + } + case 6: { // 6x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm7, qword ptr [esi] + movlps xmm6, qword ptr [esi+8] + shufps xmm7, xmm7, 0x44 + shufps xmm6, xmm6, 0x44 + movlps xmm0, qword ptr [edi ] + movhps xmm0, qword ptr [edi+ 24] + mulps xmm0, xmm7 + movlps xmm3, qword ptr [edi+ 8] + movhps xmm3, qword ptr [edi+ 32] + mulps xmm3, xmm6 + movlps xmm1, qword ptr [edi+ 48] + movhps xmm1, qword ptr [edi+ 72] + mulps xmm1, xmm7 + movlps xmm2, qword ptr [edi+ 96] + movhps xmm2, qword ptr [edi+120] + mulps xmm2, xmm7 + movlps xmm4, qword ptr [edi+ 56] + movhps xmm4, qword ptr [edi+ 80] + movlps xmm5, qword ptr [edi+104] + movhps xmm5, qword ptr [edi+128] + mulps xmm4, xmm6 + movlps xmm7, qword ptr [esi+16] + addps xmm0, xmm3 + shufps xmm7, xmm7, 0x44 + mulps xmm5, xmm6 + addps xmm1, xmm4 + movlps xmm3, qword ptr [edi+ 16] + movhps xmm3, qword ptr [edi+ 40] + addps xmm2, xmm5 + movlps xmm4, qword ptr [edi+ 64] + movhps xmm4, qword ptr [edi+ 88] + mulps xmm3, xmm7 + movlps xmm5, qword ptr [edi+112] + movhps xmm5, qword ptr [edi+136] + addps xmm0, xmm3 + mulps xmm4, xmm7 + mulps xmm5, xmm7 + addps xmm1, xmm4 + addps xmm2, xmm5 + movaps xmm6, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm6, xmm1, 0xDD + movaps xmm7, xmm2 + shufps xmm7, xmm2, 0x88 + shufps xmm2, xmm2, 0xDD + addps xmm0, xmm6 + addps xmm2, xmm7 + STORE4( 0, xmm0, xmm3 ) + STORE2LO( 16, xmm2, xmm4 ) + } + return; + } + default: { + for ( int i = 0; i < numRows; i++ ) { + dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; + mPtr += 6; + } + return; + } + } + break; + } + default: { + int numColumns = mat.GetNumColumns(); + for ( int i = 0; i < numRows; i++ ) { + float sum = mPtr[0] * vPtr[0]; + for ( int j = 1; j < numColumns; j++ ) { + sum += mPtr[j] * vPtr[j]; + } + dstPtr[i] STOREC sum; + mPtr += numColumns; + } + break; + } + } + +#undef STOREC +#undef STORE4 +#undef STORE2HI +#undef STORE2LO +#undef STORE1 +} + +/* +============ +idSIMD_SSE::MatX_TransposeMultiplyVecX + + optimizes the following matrix multiplications: + + Nx6 * Nx1 + 6xN * 6x1 + + with N in the range [1-6] +============ +*/ +void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { +#define STORE1( offset, reg1, reg2 ) \ + __asm movss [eax+offset], reg1 +#define STORE2LO( offset, reg1, reg2 ) \ + __asm movlps [eax+offset], reg1 +#define STORE2HI( offset, reg1, reg2 ) \ + __asm movhps [eax+offset], reg1 +#define STORE4( offset, reg1, reg2 ) \ + __asm movlps [eax+offset], reg1 \ + __asm movhps [eax+offset+8], reg1 +#define STOREC = + + int numColumns; + const float *mPtr, *vPtr; + float *dstPtr; + + assert( vec.GetSize() >= mat.GetNumRows() ); + assert( dst.GetSize() >= mat.GetNumColumns() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numColumns = mat.GetNumColumns(); - switch( mat.GetNumRows() ) { - case 1: - switch( numColumns ) { - case 6: { // 1x6 * 1x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [esi] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm1, xmm0 - mulps xmm0, [edi] - mulps xmm1, [edi+16] - STORE4( 0, xmm0, xmm2 ) - STORE2LO( 16, xmm1, xmm3 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0]; - mPtr++; - } - return; - } + switch ( mat.GetNumRows() ) { + case 1: + switch ( numColumns ) { + case 6: { // 1x6 * 1x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm1, xmm0 + mulps xmm0, [edi] + mulps xmm1, [edi + 16] + STORE4( 0, xmm0, xmm2 ) + STORE2LO( 16, xmm1, xmm3 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0]; + mPtr++; + } + return; + } + } + break; + case 2: + switch ( numColumns ) { + case 6: { // 2x6 * 2x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi] + movaps xmm1, xmm0 + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 ) + movaps xmm2, [edi] + mulps xmm2, xmm0 + movlps xmm3, [edi + 24] + movhps xmm3, [edi + 32] + mulps xmm3, xmm1 + addps xmm2, xmm3 + shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) + movlps xmm4, [edi + 16] + movhps xmm4, [edi + 40] + mulps xmm4, xmm0 + movhlps xmm3, xmm4 + addps xmm3, xmm4 + STORE4( 0, xmm2, xmm5 ) + STORE2LO( 16, xmm3, xmm6 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1]; + mPtr++; + } + return; + } + } + break; + case 3: + switch ( numColumns ) { + case 6: { // 3x6 * 3x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movss xmm1, [esi+2*4] + movlps xmm3, [edi+( 0*6+0 )*4] + movhps xmm3, [edi+( 0*6+2 )*4] + movaps xmm4, xmm0 + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, xmm4 + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 2 * 6 + 0 ) * 4] + movhps xmm4, [edi + ( 2 * 6 + 2 ) * 4] + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm1 + addps xmm3, xmm4 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 2 * 6 + 4 ) * 4] + mulps xmm5, xmm1 + addps xmm3, xmm5 + STORE2LO( 16, xmm3, xmm7 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2]; + mPtr++; + } + return; + } + } + break; + case 4: + switch ( numColumns ) { + case 6: { // 4x6 * 4x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movlps xmm1, [esi+2*4] + movaps xmm3, xmm0 + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, [edi + ( 0 * 6 + 0 ) * 4] + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 2 * 6 + 0 ) * 4] + movhps xmm4, [edi + ( 2 * 6 + 2 ) * 4] + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm6 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 3 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 6 + 2 ) * 4] + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movlps xmm4, [edi + ( 2 * 6 + 4 ) * 4] + movhps xmm4, [edi + ( 3 * 6 + 4 ) * 4] + mulps xmm4, xmm1 + addps xmm3, xmm4 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + STORE2LO( 16, xmm3, xmm7 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] + + *( mPtr + 3 * numColumns ) * vPtr[3]; + mPtr++; + } + return; + } + } + break; + case 5: + switch ( numColumns ) { + case 6: { // 5x6 * 5x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movlps xmm1, [esi+2*4] + movss xmm2, [esi+4*4] + movaps xmm3, xmm0 + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, [edi + ( 0 * 6 + 0 ) * 4] + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, [edi + ( 2 * 6 + 0 ) * 4] + addps xmm3, xmm6 + movlps xmm5, [edi + ( 3 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 6 + 2 ) * 4] + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm4, xmm2 + mulps xmm4, [edi + ( 4 * 6 + 0 ) * 4] + addps xmm3, xmm4 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movlps xmm4, [edi + ( 2 * 6 + 4 ) * 4] + movhps xmm4, [edi + ( 3 * 6 + 4 ) * 4] + mulps xmm4, xmm1 + addps xmm3, xmm4 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 4 * 6 + 4 ) * 4] + mulps xmm5, xmm2 + addps xmm3, xmm5 + STORE2LO( 16, xmm3, xmm7 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] + + *( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4]; + mPtr++; + } + return; + } + } + break; + case 6: + switch ( numColumns ) { + case 1: { // 6x1 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi] + movhps xmm0, [esi+8] + movlps xmm1, [esi+16] + mulps xmm0, [edi] + mulps xmm1, [edi+16] + shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 ) + addps xmm0, xmm1 + movhlps xmm2, xmm0 + addss xmm2, xmm0 + shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 ) + addss xmm2, xmm0 + STORE1( 0, xmm2, xmm3 ) + } + return; + } + case 2: { // 6x2 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + movaps xmm6, [edi + 0 * 4] + mulps xmm6, xmm0 + movlps xmm1, [esi + 2 * 4] + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + movaps xmm7, [edi + 4 * 4] + mulps xmm7, xmm1 + addps xmm6, xmm7 + movlps xmm2, [esi + 4 * 4] + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) + movaps xmm7, [edi + 8 * 4] + mulps xmm7, xmm2 + addps xmm6, xmm7 + movhlps xmm3, xmm6 + addps xmm3, xmm6 + STORE2LO( 0, xmm3, xmm7 ) + } + return; + } + case 3: { // 6x3 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [edi+( 0*3+2 )*4] + movhps xmm0, [edi+( 0*3+0 )*4] + shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 ) + movss xmm6, [esi + 0 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, xmm0 + movss xmm1, [edi + ( 1 * 3 + 0 ) * 4] + movhps xmm1, [edi + ( 1 * 3 + 1 ) * 4] + movss xmm7, [esi + 1 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm1 + addps xmm6, xmm7 + movss xmm2, [edi + ( 2 * 3 + 2 ) * 4] + movhps xmm2, [edi + ( 2 * 3 + 0 ) * 4] + shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 ) + movss xmm7, [esi + 2 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm2 + addps xmm6, xmm7 + movss xmm3, [edi + ( 3 * 3 + 0 ) * 4] + movhps xmm3, [edi + ( 3 * 3 + 1 ) * 4] + movss xmm7, [esi + 3 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm3 + addps xmm6, xmm7 + movss xmm4, [edi + ( 4 * 3 + 2 ) * 4] + movhps xmm4, [edi + ( 4 * 3 + 0 ) * 4] + shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 ) + movss xmm7, [esi + 4 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm4 + addps xmm6, xmm7 + movss xmm5, [edi + ( 5 * 3 + 0 ) * 4] + movhps xmm5, [edi + ( 5 * 3 + 1 ) * 4] + movss xmm7, [esi + 5 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm5 + addps xmm6, xmm7 + STORE1( 0, xmm6, xmm7 ) + STORE2HI( 4, xmm6, xmm7 ) + } + return; + } + case 4: { // 6x4 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm3, [edi+( 0*4+0 )*4] + movhps xmm3, [edi+( 0*4+2 )*4] + movss xmm4, [esi+0*4] + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, xmm4 + movlps xmm5, [edi + ( 1 * 4 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 4 + 2 ) * 4] + movss xmm6, [esi + 1 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 2 * 4 + 0 ) * 4] + movhps xmm4, [edi + ( 2 * 4 + 2 ) * 4] + movss xmm6, [esi + 2 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm6 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 3 * 4 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 4 + 2 ) * 4] + movss xmm6, [esi + 3 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 4 * 4 + 0 ) * 4] + movhps xmm4, [edi + ( 4 * 4 + 2 ) * 4] + movss xmm6, [esi + 4 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm6 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 5 * 4 + 0 ) * 4] + movhps xmm5, [edi + ( 5 * 4 + 2 ) * 4] + movss xmm6, [esi + 5 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + STORE4( 0, xmm3, xmm7 ) + } + return; + } + case 5: { // 6x5 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, [edi+( 0*5+0 )*4] + movhps xmm6, [edi+( 0*5+2 )*4] + movss xmm0, [esi+0*4] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, xmm0 + movlps xmm7, [edi + ( 1 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 1 * 5 + 2 ) * 4] + movss xmm1, [esi + 1 * 4] + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm1 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 2 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 2 * 5 + 2 ) * 4] + movss xmm2, [esi + 2 * 4] + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm2 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 3 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 3 * 5 + 2 ) * 4] + movss xmm3, [esi + 3 * 4] + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm3 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 4 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 4 * 5 + 2 ) * 4] + movss xmm4, [esi + 4 * 4] + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm4 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 5 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 5 * 5 + 2 ) * 4] + movss xmm5, [esi + 5 * 4] + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm5 + addps xmm6, xmm7 + STORE4( 0, xmm6, xmm7 ) + movss xmm6, [edi + ( 0 * 5 + 4 ) * 4] + mulss xmm6, xmm0 + movss xmm7, [edi + ( 1 * 5 + 4 ) * 4] + mulss xmm7, xmm1 + addss xmm6, xmm7 + movss xmm7, [edi + ( 2 * 5 + 4 ) * 4] + mulss xmm7, xmm2 + addss xmm6, xmm7 + movss xmm7, [edi + ( 3 * 5 + 4 ) * 4] + mulss xmm7, xmm3 + addss xmm6, xmm7 + movss xmm7, [edi + ( 4 * 5 + 4 ) * 4] + mulss xmm7, xmm4 + addss xmm6, xmm7 + movss xmm7, [edi + ( 5 * 5 + 4 ) * 4] + mulss xmm7, xmm5 + addss xmm6, xmm7 + STORE1( 16, xmm6, xmm7 ) + } + return; + } + case 6: { // 6x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movlps xmm1, [esi+2*4] + movlps xmm2, [esi+4*4] + movaps xmm3, xmm0 + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, [edi + ( 0 * 6 + 0 ) * 4] + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, [edi + ( 2 * 6 + 0 ) * 4] + addps xmm3, xmm6 + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + movlps xmm5, [edi + ( 3 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 6 + 2 ) * 4] + mulps xmm5, xmm6 + addps xmm3, xmm5 + movaps xmm6, xmm2 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, [edi + ( 4 * 6 + 0 ) * 4] + addps xmm3, xmm6 + movaps xmm6, xmm2 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + movlps xmm5, [edi + ( 5 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 5 * 6 + 2 ) * 4] + mulps xmm5, xmm6 + addps xmm3, xmm5 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movlps xmm4, [edi + ( 2 * 6 + 4 ) * 4] + movhps xmm4, [edi + ( 3 * 6 + 4 ) * 4] + mulps xmm4, xmm1 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 4 * 6 + 4 ) * 4] + movhps xmm5, [edi + ( 5 * 6 + 4 ) * 4] + mulps xmm5, xmm2 + addps xmm3, xmm5 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + STORE2LO( 16, xmm3, xmm7 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] + + *( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4] + *( mPtr + 5 * numColumns ) * vPtr[5]; + mPtr++; + } + return; + } + } + break; + default: + int numRows = mat.GetNumRows(); + for ( int i = 0; i < numColumns; i++ ) { + mPtr = mat.ToFloatPtr() + i; + float sum = mPtr[0] * vPtr[0]; + for ( int j = 1; j < numRows; j++ ) { + mPtr += numColumns; + sum += mPtr[0] * vPtr[j]; + } + dstPtr[i] STOREC sum; + } + break; + } + +#undef STOREC +#undef STORE4 +#undef STORE2HI +#undef STORE2LO +#undef STORE1 +} + +/* +============ +idSIMD_SSE::MatX_TransposeMultiplyAddVecX + + optimizes the following matrix multiplications: + + Nx6 * Nx1 + 6xN * 6x1 + + with N in the range [1-6] +============ +*/ +void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { +#define STORE1( offset, reg1, reg2 ) \ + __asm movss reg2, [eax+offset] \ + __asm addss reg2, reg1 \ + __asm movss [eax+offset], reg2 +#define STORE2LO( offset, reg1, reg2 ) \ + __asm movlps reg2, [eax+offset] \ + __asm addps reg2, reg1 \ + __asm movlps [eax+offset], reg2 +#define STORE2HI( offset, reg1, reg2 ) \ + __asm movhps reg2, [eax+offset] \ + __asm addps reg2, reg1 \ + __asm movhps [eax+offset], reg2 +#define STORE4( offset, reg1, reg2 ) \ + __asm movlps reg2, [eax+offset] \ + __asm movhps reg2, [eax+offset+8] \ + __asm addps reg2, reg1 \ + __asm movlps [eax+offset], reg2 \ + __asm movhps [eax+offset+8], reg2 +#define STOREC += + + int numColumns; + const float *mPtr, *vPtr; + float *dstPtr; + + assert( vec.GetSize() >= mat.GetNumRows() ); + assert( dst.GetSize() >= mat.GetNumColumns() ); + + mPtr = mat.ToFloatPtr(); + vPtr = vec.ToFloatPtr(); + dstPtr = dst.ToFloatPtr(); + numColumns = mat.GetNumColumns(); + switch ( mat.GetNumRows() ) { + case 1: + switch ( numColumns ) { + case 6: { // 1x6 * 1x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm1, xmm0 + mulps xmm0, [edi] + mulps xmm1, [edi + 16] + STORE4( 0, xmm0, xmm2 ) + STORE2LO( 16, xmm1, xmm3 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0]; + mPtr++; + } + return; + } + } + break; + case 2: + switch ( numColumns ) { + case 6: { // 2x6 * 2x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi] + movaps xmm1, xmm0 + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 ) + movaps xmm2, [edi] + mulps xmm2, xmm0 + movlps xmm3, [edi + 24] + movhps xmm3, [edi + 32] + mulps xmm3, xmm1 + addps xmm2, xmm3 + shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) + movlps xmm4, [edi + 16] + movhps xmm4, [edi + 40] + mulps xmm4, xmm0 + movhlps xmm3, xmm4 + addps xmm3, xmm4 + STORE4( 0, xmm2, xmm5 ) + STORE2LO( 16, xmm3, xmm6 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1]; + mPtr++; + } + return; + } + } + break; + case 3: + switch ( numColumns ) { + case 6: { // 3x6 * 3x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movss xmm1, [esi+2*4] + movlps xmm3, [edi+( 0*6+0 )*4] + movhps xmm3, [edi+( 0*6+2 )*4] + movaps xmm4, xmm0 + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, xmm4 + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 2 * 6 + 0 ) * 4] + movhps xmm4, [edi + ( 2 * 6 + 2 ) * 4] + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm1 + addps xmm3, xmm4 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 2 * 6 + 4 ) * 4] + mulps xmm5, xmm1 + addps xmm3, xmm5 + STORE2LO( 16, xmm3, xmm7 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2]; + mPtr++; + } + return; + } + } + break; + case 4: + switch ( numColumns ) { + case 6: { // 4x6 * 4x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movlps xmm1, [esi+2*4] + movaps xmm3, xmm0 + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, [edi + ( 0 * 6 + 0 ) * 4] + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 2 * 6 + 0 ) * 4] + movhps xmm4, [edi + ( 2 * 6 + 2 ) * 4] + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm6 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 3 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 6 + 2 ) * 4] + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movlps xmm4, [edi + ( 2 * 6 + 4 ) * 4] + movhps xmm4, [edi + ( 3 * 6 + 4 ) * 4] + mulps xmm4, xmm1 + addps xmm3, xmm4 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + STORE2LO( 16, xmm3, xmm7 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] + + *( mPtr + 3 * numColumns ) * vPtr[3]; + mPtr++; + } + return; + } + } + break; + case 5: + switch ( numColumns ) { + case 6: { // 5x6 * 5x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movlps xmm1, [esi+2*4] + movss xmm2, [esi+4*4] + movaps xmm3, xmm0 + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, [edi + ( 0 * 6 + 0 ) * 4] + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, [edi + ( 2 * 6 + 0 ) * 4] + addps xmm3, xmm6 + movlps xmm5, [edi + ( 3 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 6 + 2 ) * 4] + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm4, xmm2 + mulps xmm4, [edi + ( 4 * 6 + 0 ) * 4] + addps xmm3, xmm4 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movlps xmm4, [edi + ( 2 * 6 + 4 ) * 4] + movhps xmm4, [edi + ( 3 * 6 + 4 ) * 4] + mulps xmm4, xmm1 + addps xmm3, xmm4 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 4 * 6 + 4 ) * 4] + mulps xmm5, xmm2 + addps xmm3, xmm5 + STORE2LO( 16, xmm3, xmm7 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] + + *( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4]; + mPtr++; + } + return; + } + } + break; + case 6: + switch ( numColumns ) { + case 1: { // 6x1 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi] + movhps xmm0, [esi+8] + movlps xmm1, [esi+16] + mulps xmm0, [edi] + mulps xmm1, [edi+16] + shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 ) + addps xmm0, xmm1 + movhlps xmm2, xmm0 + addss xmm2, xmm0 + shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 ) + addss xmm2, xmm0 + STORE1( 0, xmm2, xmm3 ) + } + return; + } + case 2: { // 6x2 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + movaps xmm6, [edi + 0 * 4] + mulps xmm6, xmm0 + movlps xmm1, [esi + 2 * 4] + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + movaps xmm7, [edi + 4 * 4] + mulps xmm7, xmm1 + addps xmm6, xmm7 + movlps xmm2, [esi + 4 * 4] + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) + movaps xmm7, [edi + 8 * 4] + mulps xmm7, xmm2 + addps xmm6, xmm7 + movhlps xmm3, xmm6 + addps xmm3, xmm6 + STORE2LO( 0, xmm3, xmm7 ) + } + return; + } + case 3: { // 6x3 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [edi+( 0*3+2 )*4] + movhps xmm0, [edi+( 0*3+0 )*4] + shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 ) + movss xmm6, [esi + 0 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, xmm0 + movss xmm1, [edi + ( 1 * 3 + 0 ) * 4] + movhps xmm1, [edi + ( 1 * 3 + 1 ) * 4] + movss xmm7, [esi + 1 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm1 + addps xmm6, xmm7 + movss xmm2, [edi + ( 2 * 3 + 2 ) * 4] + movhps xmm2, [edi + ( 2 * 3 + 0 ) * 4] + shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 ) + movss xmm7, [esi + 2 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm2 + addps xmm6, xmm7 + movss xmm3, [edi + ( 3 * 3 + 0 ) * 4] + movhps xmm3, [edi + ( 3 * 3 + 1 ) * 4] + movss xmm7, [esi + 3 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm3 + addps xmm6, xmm7 + movss xmm4, [edi + ( 4 * 3 + 2 ) * 4] + movhps xmm4, [edi + ( 4 * 3 + 0 ) * 4] + shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 ) + movss xmm7, [esi + 4 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm4 + addps xmm6, xmm7 + movss xmm5, [edi + ( 5 * 3 + 0 ) * 4] + movhps xmm5, [edi + ( 5 * 3 + 1 ) * 4] + movss xmm7, [esi + 5 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm5 + addps xmm6, xmm7 + STORE1( 0, xmm6, xmm7 ) + STORE2HI( 4, xmm6, xmm7 ) + } + return; + } + case 4: { // 6x4 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm3, [edi+( 0*4+0 )*4] + movhps xmm3, [edi+( 0*4+2 )*4] + movss xmm4, [esi+0*4] + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, xmm4 + movlps xmm5, [edi + ( 1 * 4 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 4 + 2 ) * 4] + movss xmm6, [esi + 1 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 2 * 4 + 0 ) * 4] + movhps xmm4, [edi + ( 2 * 4 + 2 ) * 4] + movss xmm6, [esi + 2 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm6 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 3 * 4 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 4 + 2 ) * 4] + movss xmm6, [esi + 3 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 4 * 4 + 0 ) * 4] + movhps xmm4, [edi + ( 4 * 4 + 2 ) * 4] + movss xmm6, [esi + 4 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm6 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 5 * 4 + 0 ) * 4] + movhps xmm5, [edi + ( 5 * 4 + 2 ) * 4] + movss xmm6, [esi + 5 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + STORE4( 0, xmm3, xmm7 ) + } + return; + } + case 5: { // 6x5 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, [edi+( 0*5+0 )*4] + movhps xmm6, [edi+( 0*5+2 )*4] + movss xmm0, [esi+0*4] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, xmm0 + movlps xmm7, [edi + ( 1 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 1 * 5 + 2 ) * 4] + movss xmm1, [esi + 1 * 4] + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm1 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 2 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 2 * 5 + 2 ) * 4] + movss xmm2, [esi + 2 * 4] + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm2 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 3 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 3 * 5 + 2 ) * 4] + movss xmm3, [esi + 3 * 4] + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm3 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 4 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 4 * 5 + 2 ) * 4] + movss xmm4, [esi + 4 * 4] + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm4 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 5 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 5 * 5 + 2 ) * 4] + movss xmm5, [esi + 5 * 4] + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm5 + addps xmm6, xmm7 + STORE4( 0, xmm6, xmm7 ) + movss xmm6, [edi + ( 0 * 5 + 4 ) * 4] + mulss xmm6, xmm0 + movss xmm7, [edi + ( 1 * 5 + 4 ) * 4] + mulss xmm7, xmm1 + addss xmm6, xmm7 + movss xmm7, [edi + ( 2 * 5 + 4 ) * 4] + mulss xmm7, xmm2 + addss xmm6, xmm7 + movss xmm7, [edi + ( 3 * 5 + 4 ) * 4] + mulss xmm7, xmm3 + addss xmm6, xmm7 + movss xmm7, [edi + ( 4 * 5 + 4 ) * 4] + mulss xmm7, xmm4 + addss xmm6, xmm7 + movss xmm7, [edi + ( 5 * 5 + 4 ) * 4] + mulss xmm7, xmm5 + addss xmm6, xmm7 + STORE1( 16, xmm6, xmm7 ) + } + return; + } + case 6: { // 6x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movlps xmm1, [esi+2*4] + movlps xmm2, [esi+4*4] + movaps xmm3, xmm0 + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, [edi + ( 0 * 6 + 0 ) * 4] + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, [edi + ( 2 * 6 + 0 ) * 4] + addps xmm3, xmm6 + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + movlps xmm5, [edi + ( 3 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 6 + 2 ) * 4] + mulps xmm5, xmm6 + addps xmm3, xmm5 + movaps xmm6, xmm2 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, [edi + ( 4 * 6 + 0 ) * 4] + addps xmm3, xmm6 + movaps xmm6, xmm2 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + movlps xmm5, [edi + ( 5 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 5 * 6 + 2 ) * 4] + mulps xmm5, xmm6 + addps xmm3, xmm5 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movlps xmm4, [edi + ( 2 * 6 + 4 ) * 4] + movhps xmm4, [edi + ( 3 * 6 + 4 ) * 4] + mulps xmm4, xmm1 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 4 * 6 + 4 ) * 4] + movhps xmm5, [edi + ( 5 * 6 + 4 ) * 4] + mulps xmm5, xmm2 + addps xmm3, xmm5 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + STORE2LO( 16, xmm3, xmm7 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] + + *( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4] + *( mPtr + 5 * numColumns ) * vPtr[5]; + mPtr++; + } + return; + } + } + break; + default: + int numRows = mat.GetNumRows(); + for ( int i = 0; i < numColumns; i++ ) { + mPtr = mat.ToFloatPtr() + i; + float sum = mPtr[0] * vPtr[0]; + for ( int j = 1; j < numRows; j++ ) { + mPtr += numColumns; + sum += mPtr[0] * vPtr[j]; + } + dstPtr[i] STOREC sum; + } + break; + } + +#undef STOREC +#undef STORE4 +#undef STORE2HI +#undef STORE2LO +#undef STORE1 +} + +/* +============ +void idSIMD_SSE::MatX_TransposeMultiplySubVecX + + optimizes the following matrix multiplications: + + Nx6 * Nx1 + 6xN * 6x1 + + with N in the range [1-6] +============ +*/ +void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { +#define STORE1( offset, reg1, reg2 ) \ + __asm movss reg2, [eax+offset] \ + __asm subss reg2, reg1 \ + __asm movss [eax+offset], reg2 +#define STORE2LO( offset, reg1, reg2 ) \ + __asm movlps reg2, [eax+offset] \ + __asm subps reg2, reg1 \ + __asm movlps [eax+offset], reg2 +#define STORE2HI( offset, reg1, reg2 ) \ + __asm movhps reg2, [eax+offset] \ + __asm subps reg2, reg1 \ + __asm movhps [eax+offset], reg2 +#define STORE4( offset, reg1, reg2 ) \ + __asm movlps reg2, [eax+offset] \ + __asm movhps reg2, [eax+offset+8] \ + __asm subps reg2, reg1 \ + __asm movlps [eax+offset], reg2 \ + __asm movhps [eax+offset+8], reg2 +#define STOREC -= + + int numColumns; + const float *mPtr, *vPtr; + float *dstPtr; + + assert( vec.GetSize() >= mat.GetNumRows() ); + assert( dst.GetSize() >= mat.GetNumColumns() ); + + mPtr = mat.ToFloatPtr(); + vPtr = vec.ToFloatPtr(); + dstPtr = dst.ToFloatPtr(); + numColumns = mat.GetNumColumns(); + switch ( mat.GetNumRows() ) { + case 1: + switch ( numColumns ) { + case 6: { // 1x6 * 1x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [esi] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm1, xmm0 + mulps xmm0, [edi] + mulps xmm1, [edi + 16] + STORE4( 0, xmm0, xmm2 ) + STORE2LO( 16, xmm1, xmm3 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0]; + mPtr++; } - break; - case 2: - switch( numColumns ) { - case 6: { // 2x6 * 2x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi] - movaps xmm1, xmm0 - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 ) - movaps xmm2, [edi] - mulps xmm2, xmm0 - movlps xmm3, [edi+24] - movhps xmm3, [edi+32] - mulps xmm3, xmm1 - addps xmm2, xmm3 - shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps xmm4, [edi+16] - movhps xmm4, [edi+40] - mulps xmm4, xmm0 - movhlps xmm3, xmm4 - addps xmm3, xmm4 - STORE4( 0, xmm2, xmm5 ) - STORE2LO( 16, xmm3, xmm6 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1]; - mPtr++; - } - return; - } + return; + } + } + break; + case 2: + switch ( numColumns ) { + case 6: { // 2x6 * 2x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi] + movaps xmm1, xmm0 + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 ) + movaps xmm2, [edi] + mulps xmm2, xmm0 + movlps xmm3, [edi + 24] + movhps xmm3, [edi + 32] + mulps xmm3, xmm1 + addps xmm2, xmm3 + shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) + movlps xmm4, [edi + 16] + movhps xmm4, [edi + 40] + mulps xmm4, xmm0 + movhlps xmm3, xmm4 + addps xmm3, xmm4 + STORE4( 0, xmm2, xmm5 ) + STORE2LO( 16, xmm3, xmm6 ) } - break; - case 3: - switch( numColumns ) { - case 6: { // 3x6 * 3x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movss xmm1, [esi+2*4] - movlps xmm3, [edi+(0*6+0)*4] - movhps xmm3, [edi+(0*6+2)*4] - movaps xmm4, xmm0 - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, xmm4 - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(2*6+0)*4] - movhps xmm4, [edi+(2*6+2)*4] - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm1 - addps xmm3, xmm4 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - movlps xmm5, [edi+(2*6+4)*4] - mulps xmm5, xmm1 - addps xmm3, xmm5 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2]; - mPtr++; - } - return; - } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1]; + mPtr++; } - break; - case 4: - switch( numColumns ) { - case 6: { // 4x6 * 4x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movlps xmm1, [esi+2*4] - movaps xmm3, xmm0 - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, [edi+(0*6+0)*4] - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(2*6+0)*4] - movhps xmm4, [edi+(2*6+2)*4] - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm6 - addps xmm3, xmm4 - movlps xmm5, [edi+(3*6+0)*4] - movhps xmm5, [edi+(3*6+2)*4] - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movlps xmm4, [edi+(2*6+4)*4] - movhps xmm4, [edi+(3*6+4)*4] - mulps xmm4, xmm1 - addps xmm3, xmm4 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + - *(mPtr+3*numColumns) * vPtr[3]; - mPtr++; - } - return; - } + return; + } + } + break; + case 3: + switch ( numColumns ) { + case 6: { // 3x6 * 3x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movss xmm1, [esi+2*4] + movlps xmm3, [edi+( 0*6+0 )*4] + movhps xmm3, [edi+( 0*6+2 )*4] + movaps xmm4, xmm0 + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, xmm4 + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 2 * 6 + 0 ) * 4] + movhps xmm4, [edi + ( 2 * 6 + 2 ) * 4] + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm1 + addps xmm3, xmm4 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 2 * 6 + 4 ) * 4] + mulps xmm5, xmm1 + addps xmm3, xmm5 + STORE2LO( 16, xmm3, xmm7 ) } - break; - case 5: - switch( numColumns ) { - case 6: { // 5x6 * 5x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movlps xmm1, [esi+2*4] - movss xmm2, [esi+4*4] - movaps xmm3, xmm0 - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, [edi+(0*6+0)*4] - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, [edi+(2*6+0)*4] - addps xmm3, xmm6 - movlps xmm5, [edi+(3*6+0)*4] - movhps xmm5, [edi+(3*6+2)*4] - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm4, xmm2 - mulps xmm4, [edi+(4*6+0)*4] - addps xmm3, xmm4 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movlps xmm4, [edi+(2*6+4)*4] - movhps xmm4, [edi+(3*6+4)*4] - mulps xmm4, xmm1 - addps xmm3, xmm4 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - movlps xmm5, [edi+(4*6+4)*4] - mulps xmm5, xmm2 - addps xmm3, xmm5 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + - *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4]; - mPtr++; - } - return; - } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2]; + mPtr++; } - break; - case 6: - switch( numColumns ) { - case 1: { // 6x1 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi] - movhps xmm0, [esi+8] - movlps xmm1, [esi+16] - mulps xmm0, [edi] - mulps xmm1, [edi+16] - shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 ) - addps xmm0, xmm1 - movhlps xmm2, xmm0 - addss xmm2, xmm0 - shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 ) - addss xmm2, xmm0 - STORE1( 0, xmm2, xmm3 ) - } - return; - } - case 2: { // 6x2 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - movaps xmm6, [edi+0*4] - mulps xmm6, xmm0 - movlps xmm1, [esi+2*4] - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - movaps xmm7, [edi+4*4] - mulps xmm7, xmm1 - addps xmm6, xmm7 - movlps xmm2, [esi+4*4] - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) - movaps xmm7, [edi+8*4] - mulps xmm7, xmm2 - addps xmm6, xmm7 - movhlps xmm3, xmm6 - addps xmm3, xmm6 - STORE2LO( 0, xmm3, xmm7 ) - } - return; - } - case 3: { // 6x3 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movss xmm0, [edi+(0*3+2)*4] - movhps xmm0, [edi+(0*3+0)*4] - shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 ) - movss xmm6, [esi+0*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, xmm0 - movss xmm1, [edi+(1*3+0)*4] - movhps xmm1, [edi+(1*3+1)*4] - movss xmm7, [esi+1*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm1 - addps xmm6, xmm7 - movss xmm2, [edi+(2*3+2)*4] - movhps xmm2, [edi+(2*3+0)*4] - shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 ) - movss xmm7, [esi+2*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm2 - addps xmm6, xmm7 - movss xmm3, [edi+(3*3+0)*4] - movhps xmm3, [edi+(3*3+1)*4] - movss xmm7, [esi+3*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm3 - addps xmm6, xmm7 - movss xmm4, [edi+(4*3+2)*4] - movhps xmm4, [edi+(4*3+0)*4] - shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 ) - movss xmm7, [esi+4*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm4 - addps xmm6, xmm7 - movss xmm5, [edi+(5*3+0)*4] - movhps xmm5, [edi+(5*3+1)*4] - movss xmm7, [esi+5*4] - shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm5 - addps xmm6, xmm7 - STORE1( 0, xmm6, xmm7 ) - STORE2HI( 4, xmm6, xmm7 ) - } - return; - } - case 4: { // 6x4 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm3, [edi+(0*4+0)*4] - movhps xmm3, [edi+(0*4+2)*4] - movss xmm4, [esi+0*4] - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, xmm4 - movlps xmm5, [edi+(1*4+0)*4] - movhps xmm5, [edi+(1*4+2)*4] - movss xmm6, [esi+1*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(2*4+0)*4] - movhps xmm4, [edi+(2*4+2)*4] - movss xmm6, [esi+2*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm6 - addps xmm3, xmm4 - movlps xmm5, [edi+(3*4+0)*4] - movhps xmm5, [edi+(3*4+2)*4] - movss xmm6, [esi+3*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movlps xmm4, [edi+(4*4+0)*4] - movhps xmm4, [edi+(4*4+2)*4] - movss xmm6, [esi+4*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm4, xmm6 - addps xmm3, xmm4 - movlps xmm5, [edi+(5*4+0)*4] - movhps xmm5, [edi+(5*4+2)*4] - movss xmm6, [esi+5*4] - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - STORE4( 0, xmm3, xmm7 ) - } - return; - } - case 5: { // 6x5 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm6, [edi+(0*5+0)*4] - movhps xmm6, [edi+(0*5+2)*4] - movss xmm0, [esi+0*4] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, xmm0 - movlps xmm7, [edi+(1*5+0)*4] - movhps xmm7, [edi+(1*5+2)*4] - movss xmm1, [esi+1*4] - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm1 - addps xmm6, xmm7 - movlps xmm7, [edi+(2*5+0)*4] - movhps xmm7, [edi+(2*5+2)*4] - movss xmm2, [esi+2*4] - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm2 - addps xmm6, xmm7 - movlps xmm7, [edi+(3*5+0)*4] - movhps xmm7, [edi+(3*5+2)*4] - movss xmm3, [esi+3*4] - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm3 - addps xmm6, xmm7 - movlps xmm7, [edi+(4*5+0)*4] - movhps xmm7, [edi+(4*5+2)*4] - movss xmm4, [esi+4*4] - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm4 - addps xmm6, xmm7 - movlps xmm7, [edi+(5*5+0)*4] - movhps xmm7, [edi+(5*5+2)*4] - movss xmm5, [esi+5*4] - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm7, xmm5 - addps xmm6, xmm7 - STORE4( 0, xmm6, xmm7 ) - movss xmm6, [edi+(0*5+4)*4] - mulss xmm6, xmm0 - movss xmm7, [edi+(1*5+4)*4] - mulss xmm7, xmm1 - addss xmm6, xmm7 - movss xmm7, [edi+(2*5+4)*4] - mulss xmm7, xmm2 - addss xmm6, xmm7 - movss xmm7, [edi+(3*5+4)*4] - mulss xmm7, xmm3 - addss xmm6, xmm7 - movss xmm7, [edi+(4*5+4)*4] - mulss xmm7, xmm4 - addss xmm6, xmm7 - movss xmm7, [edi+(5*5+4)*4] - mulss xmm7, xmm5 - addss xmm6, xmm7 - STORE1( 16, xmm6, xmm7 ) - } - return; - } - case 6: { // 6x6 * 6x1 - __asm { - mov esi, vPtr - mov edi, mPtr - mov eax, dstPtr - movlps xmm0, [esi+0*4] - movlps xmm1, [esi+2*4] - movlps xmm2, [esi+4*4] - movaps xmm3, xmm0 - shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm3, [edi+(0*6+0)*4] - movlps xmm5, [edi+(1*6+0)*4] - movhps xmm5, [edi+(1*6+2)*4] - movaps xmm6, xmm0 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm5, xmm6 - addps xmm3, xmm5 - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, [edi+(2*6+0)*4] - addps xmm3, xmm6 - movaps xmm6, xmm1 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - movlps xmm5, [edi+(3*6+0)*4] - movhps xmm5, [edi+(3*6+2)*4] - mulps xmm5, xmm6 - addps xmm3, xmm5 - movaps xmm6, xmm2 - shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) - mulps xmm6, [edi+(4*6+0)*4] - addps xmm3, xmm6 - movaps xmm6, xmm2 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - movlps xmm5, [edi+(5*6+0)*4] - movhps xmm5, [edi+(5*6+2)*4] - mulps xmm5, xmm6 - addps xmm3, xmm5 - STORE4( 0, xmm3, xmm7 ) - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) - movlps xmm3, [edi+(0*6+4)*4] - movhps xmm3, [edi+(1*6+4)*4] - mulps xmm3, xmm0 - movlps xmm4, [edi+(2*6+4)*4] - movhps xmm4, [edi+(3*6+4)*4] - mulps xmm4, xmm1 - addps xmm3, xmm4 - movlps xmm5, [edi+(4*6+4)*4] - movhps xmm5, [edi+(5*6+4)*4] - mulps xmm5, xmm2 - addps xmm3, xmm5 - movhlps xmm4, xmm3 - addps xmm3, xmm4 - STORE2LO( 16, xmm3, xmm7 ) - } - return; - } - default: { - for ( int i = 0; i < numColumns; i++ ) { - dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + - *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5]; - mPtr++; - } - return; - } + return; + } + } + break; + case 4: + switch ( numColumns ) { + case 6: { // 4x6 * 4x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movlps xmm1, [esi+2*4] + movaps xmm3, xmm0 + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, [edi + ( 0 * 6 + 0 ) * 4] + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 2 * 6 + 0 ) * 4] + movhps xmm4, [edi + ( 2 * 6 + 2 ) * 4] + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm6 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 3 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 6 + 2 ) * 4] + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movlps xmm4, [edi + ( 2 * 6 + 4 ) * 4] + movhps xmm4, [edi + ( 3 * 6 + 4 ) * 4] + mulps xmm4, xmm1 + addps xmm3, xmm4 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + STORE2LO( 16, xmm3, xmm7 ) } - break; - default: - int numRows = mat.GetNumRows(); + return; + } + default: { for ( int i = 0; i < numColumns; i++ ) { - mPtr = mat.ToFloatPtr() + i; - float sum = mPtr[0] * vPtr[0]; - for ( int j = 1; j < numRows; j++ ) { - mPtr += numColumns; - sum += mPtr[0] * vPtr[j]; - } - dstPtr[i] STOREC sum; + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] + + *( mPtr + 3 * numColumns ) * vPtr[3]; + mPtr++; } - break; + return; + } + } + break; + case 5: + switch ( numColumns ) { + case 6: { // 5x6 * 5x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movlps xmm1, [esi+2*4] + movss xmm2, [esi+4*4] + movaps xmm3, xmm0 + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, [edi + ( 0 * 6 + 0 ) * 4] + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, [edi + ( 2 * 6 + 0 ) * 4] + addps xmm3, xmm6 + movlps xmm5, [edi + ( 3 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 6 + 2 ) * 4] + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm4, xmm2 + mulps xmm4, [edi + ( 4 * 6 + 0 ) * 4] + addps xmm3, xmm4 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movlps xmm4, [edi + ( 2 * 6 + 4 ) * 4] + movhps xmm4, [edi + ( 3 * 6 + 4 ) * 4] + mulps xmm4, xmm1 + addps xmm3, xmm4 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 4 * 6 + 4 ) * 4] + mulps xmm5, xmm2 + addps xmm3, xmm5 + STORE2LO( 16, xmm3, xmm7 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] + + *( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4]; + mPtr++; + } + return; + } + } + break; + case 6: + switch ( numColumns ) { + case 1: { // 6x1 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi] + movhps xmm0, [esi+8] + movlps xmm1, [esi+16] + mulps xmm0, [edi] + mulps xmm1, [edi+16] + shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 ) + addps xmm0, xmm1 + movhlps xmm2, xmm0 + addss xmm2, xmm0 + shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 ) + addss xmm2, xmm0 + STORE1( 0, xmm2, xmm3 ) + } + return; + } + case 2: { // 6x2 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + movaps xmm6, [edi + 0 * 4] + mulps xmm6, xmm0 + movlps xmm1, [esi + 2 * 4] + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + movaps xmm7, [edi + 4 * 4] + mulps xmm7, xmm1 + addps xmm6, xmm7 + movlps xmm2, [esi + 4 * 4] + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) + movaps xmm7, [edi + 8 * 4] + mulps xmm7, xmm2 + addps xmm6, xmm7 + movhlps xmm3, xmm6 + addps xmm3, xmm6 + STORE2LO( 0, xmm3, xmm7 ) + } + return; + } + case 3: { // 6x3 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movss xmm0, [edi+( 0*3+2 )*4] + movhps xmm0, [edi+( 0*3+0 )*4] + shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 ) + movss xmm6, [esi + 0 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, xmm0 + movss xmm1, [edi + ( 1 * 3 + 0 ) * 4] + movhps xmm1, [edi + ( 1 * 3 + 1 ) * 4] + movss xmm7, [esi + 1 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm1 + addps xmm6, xmm7 + movss xmm2, [edi + ( 2 * 3 + 2 ) * 4] + movhps xmm2, [edi + ( 2 * 3 + 0 ) * 4] + shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 ) + movss xmm7, [esi + 2 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm2 + addps xmm6, xmm7 + movss xmm3, [edi + ( 3 * 3 + 0 ) * 4] + movhps xmm3, [edi + ( 3 * 3 + 1 ) * 4] + movss xmm7, [esi + 3 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm3 + addps xmm6, xmm7 + movss xmm4, [edi + ( 4 * 3 + 2 ) * 4] + movhps xmm4, [edi + ( 4 * 3 + 0 ) * 4] + shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 ) + movss xmm7, [esi + 4 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm4 + addps xmm6, xmm7 + movss xmm5, [edi + ( 5 * 3 + 0 ) * 4] + movhps xmm5, [edi + ( 5 * 3 + 1 ) * 4] + movss xmm7, [esi + 5 * 4] + shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm5 + addps xmm6, xmm7 + STORE1( 0, xmm6, xmm7 ) + STORE2HI( 4, xmm6, xmm7 ) + } + return; + } + case 4: { // 6x4 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm3, [edi+( 0*4+0 )*4] + movhps xmm3, [edi+( 0*4+2 )*4] + movss xmm4, [esi+0*4] + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, xmm4 + movlps xmm5, [edi + ( 1 * 4 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 4 + 2 ) * 4] + movss xmm6, [esi + 1 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 2 * 4 + 0 ) * 4] + movhps xmm4, [edi + ( 2 * 4 + 2 ) * 4] + movss xmm6, [esi + 2 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm6 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 3 * 4 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 4 + 2 ) * 4] + movss xmm6, [esi + 3 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movlps xmm4, [edi + ( 4 * 4 + 0 ) * 4] + movhps xmm4, [edi + ( 4 * 4 + 2 ) * 4] + movss xmm6, [esi + 4 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm4, xmm6 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 5 * 4 + 0 ) * 4] + movhps xmm5, [edi + ( 5 * 4 + 2 ) * 4] + movss xmm6, [esi + 5 * 4] + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + STORE4( 0, xmm3, xmm7 ) + } + return; + } + case 5: { // 6x5 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm6, [edi+( 0*5+0 )*4] + movhps xmm6, [edi+( 0*5+2 )*4] + movss xmm0, [esi+0*4] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, xmm0 + movlps xmm7, [edi + ( 1 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 1 * 5 + 2 ) * 4] + movss xmm1, [esi + 1 * 4] + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm1 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 2 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 2 * 5 + 2 ) * 4] + movss xmm2, [esi + 2 * 4] + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm2 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 3 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 3 * 5 + 2 ) * 4] + movss xmm3, [esi + 3 * 4] + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm3 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 4 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 4 * 5 + 2 ) * 4] + movss xmm4, [esi + 4 * 4] + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm4 + addps xmm6, xmm7 + movlps xmm7, [edi + ( 5 * 5 + 0 ) * 4] + movhps xmm7, [edi + ( 5 * 5 + 2 ) * 4] + movss xmm5, [esi + 5 * 4] + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm7, xmm5 + addps xmm6, xmm7 + STORE4( 0, xmm6, xmm7 ) + movss xmm6, [edi + ( 0 * 5 + 4 ) * 4] + mulss xmm6, xmm0 + movss xmm7, [edi + ( 1 * 5 + 4 ) * 4] + mulss xmm7, xmm1 + addss xmm6, xmm7 + movss xmm7, [edi + ( 2 * 5 + 4 ) * 4] + mulss xmm7, xmm2 + addss xmm6, xmm7 + movss xmm7, [edi + ( 3 * 5 + 4 ) * 4] + mulss xmm7, xmm3 + addss xmm6, xmm7 + movss xmm7, [edi + ( 4 * 5 + 4 ) * 4] + mulss xmm7, xmm4 + addss xmm6, xmm7 + movss xmm7, [edi + ( 5 * 5 + 4 ) * 4] + mulss xmm7, xmm5 + addss xmm6, xmm7 + STORE1( 16, xmm6, xmm7 ) + } + return; + } + case 6: { // 6x6 * 6x1 + __asm { + mov esi, vPtr + mov edi, mPtr + mov eax, dstPtr + movlps xmm0, [esi+0*4] + movlps xmm1, [esi+2*4] + movlps xmm2, [esi+4*4] + movaps xmm3, xmm0 + shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm3, [edi + ( 0 * 6 + 0 ) * 4] + movlps xmm5, [edi + ( 1 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 1 * 6 + 2 ) * 4] + movaps xmm6, xmm0 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm5, xmm6 + addps xmm3, xmm5 + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, [edi + ( 2 * 6 + 0 ) * 4] + addps xmm3, xmm6 + movaps xmm6, xmm1 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + movlps xmm5, [edi + ( 3 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 3 * 6 + 2 ) * 4] + mulps xmm5, xmm6 + addps xmm3, xmm5 + movaps xmm6, xmm2 + shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) + mulps xmm6, [edi + ( 4 * 6 + 0 ) * 4] + addps xmm3, xmm6 + movaps xmm6, xmm2 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + movlps xmm5, [edi + ( 5 * 6 + 0 ) * 4] + movhps xmm5, [edi + ( 5 * 6 + 2 ) * 4] + mulps xmm5, xmm6 + addps xmm3, xmm5 + STORE4( 0, xmm3, xmm7 ) + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) + movlps xmm3, [edi + ( 0 * 6 + 4 ) * 4] + movhps xmm3, [edi + ( 1 * 6 + 4 ) * 4] + mulps xmm3, xmm0 + movlps xmm4, [edi + ( 2 * 6 + 4 ) * 4] + movhps xmm4, [edi + ( 3 * 6 + 4 ) * 4] + mulps xmm4, xmm1 + addps xmm3, xmm4 + movlps xmm5, [edi + ( 4 * 6 + 4 ) * 4] + movhps xmm5, [edi + ( 5 * 6 + 4 ) * 4] + mulps xmm5, xmm2 + addps xmm3, xmm5 + movhlps xmm4, xmm3 + addps xmm3, xmm4 + STORE2LO( 16, xmm3, xmm7 ) + } + return; + } + default: { + for ( int i = 0; i < numColumns; i++ ) { + dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] + + *( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4] + *( mPtr + 5 * numColumns ) * vPtr[5]; + mPtr++; + } + return; + } + } + break; + default: + int numRows = mat.GetNumRows(); + for ( int i = 0; i < numColumns; i++ ) { + mPtr = mat.ToFloatPtr() + i; + float sum = mPtr[0] * vPtr[0]; + for ( int j = 1; j < numRows; j++ ) { + mPtr += numColumns; + sum += mPtr[0] * vPtr[j]; + } + dstPtr[i] STOREC sum; + } + break; } #undef STOREC @@ -8317,1280 +8413,1280 @@ void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const const float *m1Ptr, *m2Ptr; double sum; - assert( m1.GetNumColumns() == m2.GetNumRows() ); + assert( m1.GetNumColumns() == m2.GetNumRows() ); + + dstPtr = dst.ToFloatPtr(); + m1Ptr = m1.ToFloatPtr(); + m2Ptr = m2.ToFloatPtr(); + k = m1.GetNumRows(); + l = m2.GetNumColumns(); + n = m1.GetNumColumns(); + + switch ( n ) { + case 1: { + if ( !( l ^ 6 ) ) { + switch ( k ) { + case 1: { // 1x1 * 1x6, no precision loss compared to FPU version + __asm { + mov esi, m2Ptr + mov edi, m1Ptr + mov eax, dstPtr + movss xmm0, [edi] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm1, [esi] + mulps xmm1, xmm0 + movaps [eax], xmm1 + movlps xmm2, [esi + 16] + mulps xmm2, xmm0 + movlps [eax + 16], xmm2 + } + return; + } + case 6: { // 6x1 * 1x6, no precision loss compared to FPU version + __asm { + mov esi, m2Ptr + mov edi, m1Ptr + mov eax, dstPtr + xorps xmm1, xmm1 + movaps xmm0, [edi] + movlps xmm1, [edi+16] + movlhps xmm1, xmm0 + movhlps xmm2, xmm0 + movlhps xmm2, xmm1 + // row 0 and 1 + movaps xmm3, [esi] + movaps xmm4, xmm3 + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm5, xmm3 + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) + movaps xmm6, xmm3 + shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm4, xmm0 + mulps xmm5, xmm1 + mulps xmm6, xmm2 + movaps [eax], xmm4 + movaps [eax + 16], xmm5 + movaps [eax + 32], xmm6 + // row 2 and 3 + movaps xmm4, xmm3 + shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) + movaps xmm5, xmm3 + shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) + shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 ) + mulps xmm4, xmm0 + mulps xmm5, xmm1 + mulps xmm3, xmm2 + movaps [eax + 48], xmm4 + movaps [eax + 64], xmm5 + movaps [eax + 80], xmm3 + // row 4 and 5 + movlps xmm3, [esi + 16] + movaps xmm4, xmm3 + shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm5, xmm3 + shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) + shufps xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 ) + mulps xmm4, xmm0 + mulps xmm5, xmm1 + mulps xmm3, xmm2 + movaps [eax + 96], xmm4 + movaps [eax + 112], xmm5 + movaps [eax + 128], xmm3 + } + return; + } + } + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0]; + m2Ptr++; + } + m1Ptr++; + } + break; + } + case 2: { + if ( !( l ^ 6 ) ) { + switch ( k ) { + case 2: { // 2x2 * 2x6 + +#define MUL_Nx2_2x6_INIT \ + __asm mov esi, m2Ptr \ + __asm mov edi, m1Ptr \ + __asm mov eax, dstPtr \ + __asm movaps xmm0, [esi] \ + __asm movlps xmm1, [esi+16] \ + __asm movhps xmm1, [esi+40] \ + __asm movlps xmm2, [esi+24] \ + __asm movhps xmm2, [esi+32] + +#define MUL_Nx2_2x6_ROW2( row ) \ + __asm movaps xmm3, [edi+row*16] \ + __asm movaps xmm5, xmm0 \ + __asm movaps xmm4, xmm3 \ + __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm5, xmm4 \ + __asm movaps xmm4, xmm3 \ + __asm movaps xmm6, xmm2 \ + __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 ) \ + __asm mulps xmm6, xmm4 \ + __asm addps xmm5, xmm6 \ + __asm movaps [eax+row*48], xmm5 \ + __asm movaps xmm4, xmm3 \ + __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm movaps xmm7, xmm1 \ + __asm mulps xmm7, xmm4 \ + __asm movaps xmm4, xmm3 \ + __asm movaps xmm5, xmm0 \ + __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) \ + __asm mulps xmm5, xmm4 \ + __asm movaps xmm4, xmm3 \ + __asm movaps xmm6, xmm2 \ + __asm shufps xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 ) \ + __asm mulps xmm6, xmm4 \ + __asm addps xmm5, xmm6 \ + __asm shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) \ + __asm movaps xmm6, xmm1 \ + __asm mulps xmm6, xmm3 \ + __asm movaps xmm4, xmm7 \ + __asm movlhps xmm7, xmm6 \ + __asm movhlps xmm6, xmm4 \ + __asm addps xmm6, xmm7 \ + __asm movlps [eax+row*48+16], xmm6 \ + __asm movlps [eax+row*48+24], xmm5 \ + __asm movhps [eax+row*48+32], xmm5 \ + __asm movhps [eax+row*48+40], xmm6 + + MUL_Nx2_2x6_INIT + MUL_Nx2_2x6_ROW2( 0 ) + + return; + } + case 6: { // 6x2 * 2x6 + + MUL_Nx2_2x6_INIT + MUL_Nx2_2x6_ROW2( 0 ) + MUL_Nx2_2x6_ROW2( 1 ) + MUL_Nx2_2x6_ROW2( 2 ) + + return; + } + } + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l]; + m2Ptr++; + } + m1Ptr += 2; + } + break; + } + case 3: { + if ( !( l ^ 6 ) ) { + switch ( k ) { + case 3: { // 3x3 * 3x6 + __asm { + mov esi, m2Ptr + mov edi, m1Ptr + mov eax, dstPtr + movaps xmm5, xmmword ptr [esi] + movlps xmm6, qword ptr [esi+24] + movhps xmm6, qword ptr [esi+32] + movaps xmm7, xmmword ptr [esi+48] + movss xmm0, dword ptr [edi] + shufps xmm0, xmm0, 0 + mulps xmm0, xmm5 + movss xmm1, dword ptr [edi+4] + shufps xmm1, xmm1, 0 + mulps xmm1, xmm6 + movss xmm2, dword ptr [edi+8] + shufps xmm2, xmm2, 0 + mulps xmm2, xmm7 + addps xmm0, xmm1 + addps xmm0, xmm2 + movaps xmmword ptr [eax], xmm0 + movss xmm3, dword ptr [edi+12] + shufps xmm3, xmm3, 0 + mulps xmm3, xmm5 + movss xmm4, dword ptr [edi+16] + shufps xmm4, xmm4, 0 + mulps xmm4, xmm6 + movss xmm0, dword ptr [edi+20] + shufps xmm0, xmm0, 0 + mulps xmm0, xmm7 + addps xmm3, xmm4 + addps xmm0, xmm3 + movlps qword ptr [eax+24], xmm0 + movhps qword ptr [eax+32], xmm0 + movss xmm1, dword ptr [edi+24] + shufps xmm1, xmm1, 0 + mulps xmm1, xmm5 + movss xmm2, dword ptr [edi+28] + shufps xmm2, xmm2, 0 + mulps xmm2, xmm6 + movss xmm3, dword ptr [edi+32] + shufps xmm3, xmm3, 0 + mulps xmm3, xmm7 + addps xmm1, xmm2 + addps xmm1, xmm3 + movaps xmmword ptr [eax+48], xmm1 + movlps xmm5, qword ptr [esi+16] + movlps xmm6, qword ptr [esi+40] + movlps xmm7, qword ptr [esi+64] + shufps xmm5, xmm5, 0x44 + shufps xmm6, xmm6, 0x44 + shufps xmm7, xmm7, 0x44 + movaps xmm3, xmmword ptr [edi] + movlps xmm4, qword ptr [edi+16] + movaps xmm0, xmm3 + shufps xmm0, xmm0, 0xF0 + mulps xmm0, xmm5 + movaps xmm1, xmm3 + shufps xmm1, xmm4, 0x05 + mulps xmm1, xmm6 + shufps xmm3, xmm4, 0x5A + mulps xmm3, xmm7 + addps xmm1, xmm0 + addps xmm1, xmm3 + movlps qword ptr [eax+16], xmm1 + movhps qword ptr [eax+40], xmm1 + movss xmm0, dword ptr [edi+24] + shufps xmm0, xmm0, 0 + mulps xmm0, xmm5 + movss xmm2, dword ptr [edi+28] + shufps xmm2, xmm2, 0 + mulps xmm2, xmm6 + movss xmm4, dword ptr [edi+32] + shufps xmm4, xmm4, 0 + mulps xmm4, xmm7 + addps xmm0, xmm2 + addps xmm0, xmm4 + movlps qword ptr [eax+64], xmm0 + } + return; + } + case 6: { // 6x3 * 3x6 +#define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \ + __asm mov esi, m2Ptr \ + __asm mov edi, m1Ptr \ + __asm mov eax, dstPtr \ + __asm movlps xmm0, [esi+ 0*4] \ + __asm movhps xmm0, [esi+ 2*4] \ + __asm movlps xmm1, [esi+ 6*4] \ + __asm movhps xmm1, [esi+ 8*4] \ + __asm movlps xmm2, [esi+12*4] \ + __asm movhps xmm2, [esi+14*4] + +#define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \ + __asm movss xmm3, [edi+(row*3+0)*4] \ + __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm3, xmm0 \ + __asm movss xmm4, [edi+(row*3+1)*4] \ + __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm4, xmm1 \ + __asm addps xmm3, xmm4 \ + __asm movss xmm5, [edi+(row*3+2)*4] \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm5, xmm2 \ + __asm addps xmm3, xmm5 \ + __asm movlps [eax+(row*6+0)*4], xmm3 \ + __asm movhps [eax+(row*6+2)*4], xmm3 + +#define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \ + __asm movlps xmm0, [esi+ 4*4] \ + __asm movlps xmm1, [esi+10*4] \ + __asm movlps xmm2, [esi+16*4] \ + __asm shufps xmm0, xmm0, 0x44 \ + __asm shufps xmm1, xmm1, 0x44 \ + __asm shufps xmm2, xmm2, 0x44 \ + __asm movlps xmm3, [edi+0*4] \ + __asm movhps xmm3, [edi+2*4] \ + __asm movaps xmm4, xmm3 \ + __asm movaps xmm5, xmm3 \ + __asm shufps xmm3, xmm3, 0xF0 \ + __asm mulps xmm3, xmm0 \ + __asm movlps xmm6, [edi+4*4] \ + __asm movhps xmm6, [edi+6*4] \ + __asm shufps xmm4, xmm6, 0x05 \ + __asm mulps xmm4, xmm1 \ + __asm addps xmm3, xmm4 \ + __asm shufps xmm5, xmm6, 0x5A \ + __asm mulps xmm5, xmm2 \ + __asm addps xmm3, xmm5 \ + __asm movlps [eax+4*4], xmm3 \ + __asm movhps [eax+10*4], xmm3 \ + __asm movaps xmm5, xmm6 \ + __asm movlps xmm3, [edi+8*4] \ + __asm movhps xmm3, [edi+10*4] \ + __asm movaps xmm4, xmm3 \ + __asm shufps xmm5, xmm3, 0x5A \ + __asm mulps xmm5, xmm0 \ + __asm shufps xmm6, xmm3, 0xAF \ + __asm mulps xmm6, xmm1 \ + __asm addps xmm5, xmm6 \ + __asm shufps xmm4, xmm4, 0xF0 \ + __asm mulps xmm4, xmm2 \ + __asm addps xmm4, xmm5 \ + __asm movlps [eax+16*4], xmm4 \ + __asm movhps [eax+22*4], xmm4 \ + __asm movlps xmm6, [edi+12*4] \ + __asm movhps xmm6, [edi+14*4] \ + __asm movaps xmm5, xmm6 \ + __asm movaps xmm4, xmm6 \ + __asm shufps xmm6, xmm6, 0xF0 \ + __asm mulps xmm6, xmm0 \ + __asm movlps xmm3, [edi+16*4] \ + __asm shufps xmm5, xmm3, 0x05 \ + __asm mulps xmm5, xmm1 \ + __asm addps xmm5, xmm6 \ + __asm shufps xmm4, xmm3, 0x5A \ + __asm mulps xmm4, xmm2 \ + __asm addps xmm4, xmm5 \ + __asm movlps [eax+28*4], xmm4 \ + __asm movhps [eax+34*4], xmm4 + + MUL_Nx3_3x6_FIRST4COLUMNS_INIT + MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 ) + MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 ) + MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 ) + MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 ) + MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 ) + MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 ) + MUL_Nx3_3x6_LAST2COLUMNS_ROW6 + + return; + } + } + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2 * l]; + m2Ptr++; + } + m1Ptr += 3; + } + break; + } + case 4: { + if ( !( l ^ 6 ) ) { + switch ( k ) { + case 4: { // 4x4 * 4x6 + +#define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \ + __asm mov esi, m2Ptr \ + __asm mov edi, m1Ptr \ + __asm mov eax, dstPtr \ + __asm movlps xmm0, [esi+ 0*4] \ + __asm movhps xmm0, [esi+ 2*4] \ + __asm movlps xmm1, [esi+ 6*4] \ + __asm movhps xmm1, [esi+ 8*4] \ + __asm movlps xmm2, [esi+12*4] \ + __asm movhps xmm2, [esi+14*4] \ + __asm movlps xmm3, [esi+18*4] \ + __asm movhps xmm3, [esi+20*4] + +#define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \ + __asm movss xmm4, [edi+row*16+0*4] \ + __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm4, xmm0 \ + __asm movss xmm5, [edi+row*16+1*4] \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm5, xmm1 \ + __asm addps xmm4, xmm5 \ + __asm movss xmm6, [edi+row*16+2*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm2 \ + __asm addps xmm4, xmm6 \ + __asm movss xmm7, [edi+row*16+3*4] \ + __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm7, xmm3 \ + __asm addps xmm4, xmm7 \ + __asm movlps [eax+row*24+0], xmm4 \ + __asm movhps [eax+row*24+8], xmm4 + +#define MUL_Nx4_4x6_LAST2COLUMNS_INIT \ + __asm movlps xmm0, [esi+ 4*4] \ + __asm movlps xmm1, [esi+10*4] \ + __asm movlps xmm2, [esi+16*4] \ + __asm movlps xmm3, [esi+22*4] \ + __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) + +#define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \ + __asm movlps xmm7, [edi+row*32+ 0*4] \ + __asm movhps xmm7, [edi+row*32+ 4*4] \ + __asm movaps xmm6, xmm7 \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 ) \ + __asm mulps xmm6, xmm0 \ + __asm shufps xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 ) \ + __asm mulps xmm7, xmm1 \ + __asm addps xmm6, xmm7 \ + __asm movlps xmm4, [edi+row*32+ 2*4] \ + __asm movhps xmm4, [edi+row*32+ 6*4] \ + __asm movaps xmm5, xmm4 \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 ) \ + __asm mulps xmm5, xmm2 \ + __asm addps xmm6, xmm5 \ + __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 ) \ + __asm mulps xmm4, xmm3 \ + __asm addps xmm6, xmm4 \ + __asm movlps [eax+row*48+ 4*4], xmm6 \ + __asm movhps [eax+row*48+10*4], xmm6 + + MUL_Nx4_4x6_FIRST4COLUMNS_INIT + MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 ) + MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 ) + MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 ) + MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 ) + MUL_Nx4_4x6_LAST2COLUMNS_INIT + MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 ) + MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 ) + + return; + } + case 6: { // 6x4 * 4x6 + + MUL_Nx4_4x6_FIRST4COLUMNS_INIT + MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 ) + MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 ) + MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 ) + MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 ) + MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 ) + MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 ) + MUL_Nx4_4x6_LAST2COLUMNS_INIT + MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 ) + MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 ) + MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 ) + + return; + } + } + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2 * l] + + m1Ptr[3] * m2Ptr[3 * l]; + m2Ptr++; + } + m1Ptr += 4; + } + break; + } + case 5: { + if ( !( l ^ 6 ) ) { + switch ( k ) { + case 5: { // 5x5 * 5x6 + +#define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \ + __asm mov esi, m2Ptr \ + __asm mov edi, m1Ptr \ + __asm mov eax, dstPtr \ + __asm movlps xmm0, [esi+ 0*4] \ + __asm movhps xmm0, [esi+ 2*4] \ + __asm movlps xmm1, [esi+ 6*4] \ + __asm movhps xmm1, [esi+ 8*4] \ + __asm movlps xmm2, [esi+12*4] \ + __asm movhps xmm2, [esi+14*4] \ + __asm movlps xmm3, [esi+18*4] \ + __asm movhps xmm3, [esi+20*4] \ + __asm movlps xmm4, [esi+24*4] \ + __asm movhps xmm4, [esi+26*4] + +#define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \ + __asm movss xmm6, [edi+row*20+0*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm0 \ + __asm movss xmm5, [edi+row*20+1*4] \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm5, xmm1 \ + __asm addps xmm6, xmm5 \ + __asm movss xmm5, [edi+row*20+2*4] \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm5, xmm2 \ + __asm addps xmm6, xmm5 \ + __asm movss xmm5, [edi+row*20+3*4] \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm5, xmm3 \ + __asm addps xmm6, xmm5 \ + __asm movss xmm5, [edi+row*20+4*4] \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm5, xmm4 \ + __asm addps xmm6, xmm5 \ + __asm movlps [eax+row*24+0], xmm6 \ + __asm movhps [eax+row*24+8], xmm6 + +#define MUL_Nx5_5x6_LAST2COLUMNS_INIT \ + __asm movlps xmm0, [esi+ 4*4] \ + __asm movlps xmm1, [esi+10*4] \ + __asm movlps xmm2, [esi+16*4] \ + __asm movlps xmm3, [esi+22*4] \ + __asm movlps xmm4, [esi+28*4] \ + __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm shufps xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm shufps xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm shufps xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) + +#define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \ + __asm movlps xmm7, [edi+row*40+ 0*4] \ + __asm movhps xmm7, [edi+row*40+ 6*4] \ + __asm movaps xmm6, xmm7 \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 ) \ + __asm mulps xmm6, xmm0 \ + __asm movaps xmm5, xmm7 \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \ + __asm mulps xmm5, xmm1 \ + __asm addps xmm6, xmm5 \ + __asm movlps xmm7, [edi+row*40+ 2*4] \ + __asm movhps xmm7, [edi+row*40+ 8*4] \ + __asm movaps xmm5, xmm7 \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 ) \ + __asm mulps xmm5, xmm2 \ + __asm addps xmm6, xmm5 \ + __asm movaps xmm5, xmm7 \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \ + __asm mulps xmm5, xmm3 \ + __asm addps xmm6, xmm5 \ + __asm movlps xmm5, [edi+row*40+ 4*4] \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm mulps xmm5, xmm4 \ + __asm addps xmm6, xmm5 \ + __asm movlps [eax+row*48+ 4*4], xmm6 \ + __asm movhps [eax+row*48+10*4], xmm6 + +#define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row ) \ + __asm movlps xmm6, [edi+20*4+0*4] \ + __asm unpcklps xmm6, xmm6 \ + __asm mulps xmm6, xmm0 \ + __asm movlps xmm5, [edi+20*4+2*4] \ + __asm unpcklps xmm5, xmm5 \ + __asm mulps xmm5, xmm2 \ + __asm addps xmm6, xmm5 \ + __asm movss xmm5, [edi+20*4+4*4] \ + __asm unpcklps xmm5, xmm5 \ + __asm mulps xmm5, xmm4 \ + __asm addps xmm6, xmm5 \ + __asm movhlps xmm7, xmm6 \ + __asm addps xmm6, xmm7 \ + __asm movlps [eax+row*24+4*4], xmm6 + + MUL_Nx5_5x6_FIRST4COLUMNS_INIT + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 ) + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 ) + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 ) + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 ) + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 ) + MUL_Nx5_5x6_LAST2COLUMNS_INIT + MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 ) + MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 ) + MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 ) + + return; + } + case 6: { // 6x5 * 5x6 + + MUL_Nx5_5x6_FIRST4COLUMNS_INIT + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 ) + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 ) + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 ) + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 ) + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 ) + MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 ) + MUL_Nx5_5x6_LAST2COLUMNS_INIT + MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 ) + MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 ) + MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 ) + + return; + } + } + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2 * l] + + m1Ptr[3] * m2Ptr[3 * l] + m1Ptr[4] * m2Ptr[4 * l]; + m2Ptr++; + } + m1Ptr += 5; + } + break; + } + case 6: { + switch ( k ) { + case 1: { + if ( !( l ^ 1 ) ) { // 1x6 * 6x1 + dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] + + m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5]; + return; + } + break; + } + case 2: { + if ( !( l ^ 2 ) ) { // 2x6 * 6x2 + +#define MUL_Nx6_6x2_INIT \ + __asm mov esi, m2Ptr \ + __asm mov edi, m1Ptr \ + __asm mov eax, dstPtr \ + __asm movaps xmm0, [esi] \ + __asm movaps xmm1, [esi+16] \ + __asm movaps xmm2, [esi+32] + +#define MUL_Nx6_6x2_ROW2( row ) \ + __asm movaps xmm7, [edi+row*48+0*4] \ + __asm movaps xmm6, xmm7 \ + __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm mulps xmm7, xmm0 \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 ) \ + __asm mulps xmm6, xmm1 \ + __asm addps xmm7, xmm6 \ + __asm movaps xmm6, [edi+row*48+4*4] \ + __asm movaps xmm5, xmm6 \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm mulps xmm6, xmm2 \ + __asm addps xmm7, xmm6 \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) \ + __asm mulps xmm5, xmm0 \ + __asm movaps xmm6, [edi+row*48+24+2*4] \ + __asm movaps xmm4, xmm6 \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm mulps xmm6, xmm1 \ + __asm addps xmm5, xmm6 \ + __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 ) \ + __asm mulps xmm4, xmm2 \ + __asm addps xmm5, xmm4 \ + __asm movaps xmm4, xmm5 \ + __asm movhlps xmm5, xmm7 \ + __asm movlhps xmm7, xmm4 \ + __asm addps xmm7, xmm5 \ + __asm movaps [eax+row*16], xmm7 + + MUL_Nx6_6x2_INIT + MUL_Nx6_6x2_ROW2( 0 ) + + return; + } + break; + } + case 3: { + if ( !( l ^ 3 ) ) { // 3x6 * 6x3 + +#define MUL_Nx6_6x3_INIT \ + __asm mov esi, m2Ptr \ + __asm mov edi, m1Ptr \ + __asm mov eax, dstPtr \ + __asm movss xmm0, [esi+ 0*4] \ + __asm movhps xmm0, [esi+ 1*4] \ + __asm movss xmm1, [esi+ 3*4] \ + __asm movhps xmm1, [esi+ 4*4] \ + __asm movss xmm2, [esi+ 6*4] \ + __asm movhps xmm2, [esi+ 7*4] \ + __asm movss xmm3, [esi+ 9*4] \ + __asm movhps xmm3, [esi+10*4] \ + __asm movss xmm4, [esi+12*4] \ + __asm movhps xmm4, [esi+13*4] \ + __asm movss xmm5, [esi+15*4] \ + __asm movhps xmm5, [esi+16*4] + +#define MUL_Nx6_6x3_ROW( row ) \ + __asm movss xmm7, [edi+row*24+0] \ + __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm7, xmm0 \ + __asm movss xmm6, [edi+row*24+4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm1 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+row*24+8] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm2 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+row*24+12] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm3 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+row*24+16] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm4 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+row*24+20] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm5 \ + __asm addps xmm7, xmm6 \ + __asm movss [eax+row*12+0], xmm7 \ + __asm movhps [eax+row*12+4], xmm7 + + MUL_Nx6_6x3_INIT + MUL_Nx6_6x3_ROW( 0 ) + MUL_Nx6_6x3_ROW( 1 ) + MUL_Nx6_6x3_ROW( 2 ) + + return; + } + break; + } + case 4: { + if ( !( l ^ 4 ) ) { // 4x6 * 6x4 + +#define MUL_Nx6_6x4_INIT \ + __asm mov esi, m2Ptr \ + __asm mov edi, m1Ptr \ + __asm mov eax, dstPtr \ + __asm movaps xmm0, [esi] \ + __asm movaps xmm1, [esi+16] \ + __asm movaps xmm2, [esi+32] \ + __asm movaps xmm3, [esi+48] \ + __asm movaps xmm4, [esi+64] \ + __asm movaps xmm5, [esi+80] + +#define MUL_Nx6_6x4_ROW( row ) \ + __asm movss xmm7, [edi+row*24+0] \ + __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm7, xmm0 \ + __asm movss xmm6, [edi+row*24+4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm1 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+row*24+8] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm2 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+row*24+12] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm3 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+row*24+16] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm4 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+row*24+20] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm5 \ + __asm addps xmm7, xmm6 \ + __asm movaps [eax+row*16], xmm7 - dstPtr = dst.ToFloatPtr(); - m1Ptr = m1.ToFloatPtr(); - m2Ptr = m2.ToFloatPtr(); - k = m1.GetNumRows(); - l = m2.GetNumColumns(); - n = m1.GetNumColumns(); + MUL_Nx6_6x4_INIT + MUL_Nx6_6x4_ROW( 0 ) + MUL_Nx6_6x4_ROW( 1 ) + MUL_Nx6_6x4_ROW( 2 ) + MUL_Nx6_6x4_ROW( 3 ) - switch( n ) { - case 1: { - if ( !(l^6) ) { - switch( k ) { - case 1: { // 1x1 * 1x6, no precision loss compared to FPU version - __asm { - mov esi, m2Ptr - mov edi, m1Ptr - mov eax, dstPtr - movss xmm0, [edi] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm1, [esi] - mulps xmm1, xmm0 - movaps [eax], xmm1 - movlps xmm2, [esi+16] - mulps xmm2, xmm0 - movlps [eax+16], xmm2 - } - return; - } - case 6: { // 6x1 * 1x6, no precision loss compared to FPU version - __asm { - mov esi, m2Ptr - mov edi, m1Ptr - mov eax, dstPtr - xorps xmm1, xmm1 - movaps xmm0, [edi] - movlps xmm1, [edi+16] - movlhps xmm1, xmm0 - movhlps xmm2, xmm0 - movlhps xmm2, xmm1 - // row 0 and 1 - movaps xmm3, [esi] - movaps xmm4, xmm3 - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm5, xmm3 - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) - movaps xmm6, xmm3 - shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm6, xmm2 - movaps [eax], xmm4 - movaps [eax+16], xmm5 - movaps [eax+32], xmm6 - // row 2 and 3 - movaps xmm4, xmm3 - shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) - movaps xmm5, xmm3 - shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) - shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 ) - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm3, xmm2 - movaps [eax+48], xmm4 - movaps [eax+64], xmm5 - movaps [eax+80], xmm3 - // row 4 and 5 - movlps xmm3, [esi+16] - movaps xmm4, xmm3 - shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm5, xmm3 - shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) - shufps xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 ) - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm3, xmm2 - movaps [eax+96], xmm4 - movaps [eax+112], xmm5 - movaps [eax+128], xmm3 - } - return; - } - } - } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0]; - m2Ptr++; - } - m1Ptr++; + return; } break; } - case 2: { - if ( !(l^6) ) { - switch( k ) { - case 2: { // 2x2 * 2x6 - - #define MUL_Nx2_2x6_INIT \ - __asm mov esi, m2Ptr \ - __asm mov edi, m1Ptr \ - __asm mov eax, dstPtr \ - __asm movaps xmm0, [esi] \ - __asm movlps xmm1, [esi+16] \ - __asm movhps xmm1, [esi+40] \ - __asm movlps xmm2, [esi+24] \ - __asm movhps xmm2, [esi+32] - - #define MUL_Nx2_2x6_ROW2( row ) \ - __asm movaps xmm3, [edi+row*16] \ - __asm movaps xmm5, xmm0 \ - __asm movaps xmm4, xmm3 \ - __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm5, xmm4 \ - __asm movaps xmm4, xmm3 \ - __asm movaps xmm6, xmm2 \ - __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 ) \ - __asm mulps xmm6, xmm4 \ - __asm addps xmm5, xmm6 \ - __asm movaps [eax+row*48], xmm5 \ - __asm movaps xmm4, xmm3 \ - __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm movaps xmm7, xmm1 \ - __asm mulps xmm7, xmm4 \ - __asm movaps xmm4, xmm3 \ - __asm movaps xmm5, xmm0 \ - __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) \ - __asm mulps xmm5, xmm4 \ - __asm movaps xmm4, xmm3 \ - __asm movaps xmm6, xmm2 \ - __asm shufps xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 ) \ - __asm mulps xmm6, xmm4 \ - __asm addps xmm5, xmm6 \ - __asm shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) \ - __asm movaps xmm6, xmm1 \ - __asm mulps xmm6, xmm3 \ - __asm movaps xmm4, xmm7 \ - __asm movlhps xmm7, xmm6 \ - __asm movhlps xmm6, xmm4 \ - __asm addps xmm6, xmm7 \ - __asm movlps [eax+row*48+16], xmm6 \ - __asm movlps [eax+row*48+24], xmm5 \ - __asm movhps [eax+row*48+32], xmm5 \ - __asm movhps [eax+row*48+40], xmm6 - - MUL_Nx2_2x6_INIT - MUL_Nx2_2x6_ROW2( 0 ) - - return; - } - case 6: { // 6x2 * 2x6 - - MUL_Nx2_2x6_INIT - MUL_Nx2_2x6_ROW2( 0 ) - MUL_Nx2_2x6_ROW2( 1 ) - MUL_Nx2_2x6_ROW2( 2 ) - - return; - } - } - } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l]; - m2Ptr++; - } - m1Ptr += 2; + case 5: { + if ( !( l ^ 5 ) ) { // 5x6 * 6x5 + +#define MUL_Nx6_6x5_INIT \ + __asm mov esi, m2Ptr \ + __asm mov edi, m1Ptr \ + __asm mov eax, dstPtr \ + __asm movaps xmm0, [esi] \ + __asm movlps xmm1, [esi+20] \ + __asm movhps xmm1, [esi+28] \ + __asm movlps xmm2, [esi+40] \ + __asm movhps xmm2, [esi+48] \ + __asm movlps xmm3, [esi+60] \ + __asm movhps xmm3, [esi+68] \ + __asm movaps xmm4, [esi+80] \ + __asm movlps xmm5, [esi+100] \ + __asm movhps xmm5, [esi+108] + +#define MUL_Nx6_6x5_ROW( row ) \ + __asm movss xmm7, [edi+row*24+0] \ + __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm7, xmm0 \ + __asm fld dword ptr [edi+(row*6+0)*4] \ + __asm fmul dword ptr [esi+(4+0*5)*4] \ + __asm movss xmm6, [edi+row*24+4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm1 \ + __asm addps xmm7, xmm6 \ + __asm fld dword ptr [edi+(row*6+1)*4] \ + __asm fmul dword ptr [esi+(4+1*5)*4] \ + __asm faddp st(1),st \ + __asm movss xmm6, [edi+row*24+8] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm2 \ + __asm addps xmm7, xmm6 \ + __asm fld dword ptr [edi+(row*6+2)*4] \ + __asm fmul dword ptr [esi+(4+2*5)*4] \ + __asm faddp st(1),st \ + __asm movss xmm6, [edi+row*24+12] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm3 \ + __asm addps xmm7, xmm6 \ + __asm fld dword ptr [edi+(row*6+3)*4] \ + __asm fmul dword ptr [esi+(4+3*5)*4] \ + __asm faddp st(1),st \ + __asm movss xmm6, [edi+row*24+16] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm4 \ + __asm addps xmm7, xmm6 \ + __asm fld dword ptr [edi+(row*6+4)*4] \ + __asm fmul dword ptr [esi+(4+4*5)*4] \ + __asm faddp st(1),st \ + __asm movss xmm6, [edi+row*24+20] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm5 \ + __asm addps xmm7, xmm6 \ + __asm fld dword ptr [edi+(row*6+5)*4] \ + __asm fmul dword ptr [esi+(4+5*5)*4] \ + __asm faddp st(1),st \ + __asm fstp dword ptr [eax+(row*5+4)*4] \ + __asm movlps [eax+row*20], xmm7 \ + __asm movhps [eax+row*20+8], xmm7 + + MUL_Nx6_6x5_INIT + MUL_Nx6_6x5_ROW( 0 ) + MUL_Nx6_6x5_ROW( 1 ) + MUL_Nx6_6x5_ROW( 2 ) + MUL_Nx6_6x5_ROW( 3 ) + MUL_Nx6_6x5_ROW( 4 ) + + return; } break; } - case 3: { - if ( !(l^6) ) { - switch( k ) { - case 3: { // 3x3 * 3x6 - __asm { - mov esi, m2Ptr - mov edi, m1Ptr - mov eax, dstPtr - movaps xmm5, xmmword ptr [esi] - movlps xmm6, qword ptr [esi+24] - movhps xmm6, qword ptr [esi+32] - movaps xmm7, xmmword ptr [esi+48] - movss xmm0, dword ptr [edi] - shufps xmm0, xmm0, 0 - mulps xmm0, xmm5 - movss xmm1, dword ptr [edi+4] - shufps xmm1, xmm1, 0 - mulps xmm1, xmm6 - movss xmm2, dword ptr [edi+8] - shufps xmm2, xmm2, 0 - mulps xmm2, xmm7 - addps xmm0, xmm1 - addps xmm0, xmm2 - movaps xmmword ptr [eax], xmm0 - movss xmm3, dword ptr [edi+12] - shufps xmm3, xmm3, 0 - mulps xmm3, xmm5 - movss xmm4, dword ptr [edi+16] - shufps xmm4, xmm4, 0 - mulps xmm4, xmm6 - movss xmm0, dword ptr [edi+20] - shufps xmm0, xmm0, 0 - mulps xmm0, xmm7 - addps xmm3, xmm4 - addps xmm0, xmm3 - movlps qword ptr [eax+24], xmm0 - movhps qword ptr [eax+32], xmm0 - movss xmm1, dword ptr [edi+24] - shufps xmm1, xmm1, 0 - mulps xmm1, xmm5 - movss xmm2, dword ptr [edi+28] - shufps xmm2, xmm2, 0 - mulps xmm2, xmm6 - movss xmm3, dword ptr [edi+32] - shufps xmm3, xmm3, 0 - mulps xmm3, xmm7 - addps xmm1, xmm2 - addps xmm1, xmm3 - movaps xmmword ptr [eax+48], xmm1 - movlps xmm5, qword ptr [esi+16] - movlps xmm6, qword ptr [esi+40] - movlps xmm7, qword ptr [esi+64] - shufps xmm5, xmm5, 0x44 - shufps xmm6, xmm6, 0x44 - shufps xmm7, xmm7, 0x44 - movaps xmm3, xmmword ptr [edi] - movlps xmm4, qword ptr [edi+16] - movaps xmm0, xmm3 - shufps xmm0, xmm0, 0xF0 - mulps xmm0, xmm5 - movaps xmm1, xmm3 - shufps xmm1, xmm4, 0x05 - mulps xmm1, xmm6 - shufps xmm3, xmm4, 0x5A - mulps xmm3, xmm7 - addps xmm1, xmm0 - addps xmm1, xmm3 - movlps qword ptr [eax+16], xmm1 - movhps qword ptr [eax+40], xmm1 - movss xmm0, dword ptr [edi+24] - shufps xmm0, xmm0, 0 - mulps xmm0, xmm5 - movss xmm2, dword ptr [edi+28] - shufps xmm2, xmm2, 0 - mulps xmm2, xmm6 - movss xmm4, dword ptr [edi+32] - shufps xmm4, xmm4, 0 - mulps xmm4, xmm7 - addps xmm0, xmm2 - addps xmm0, xmm4 - movlps qword ptr [eax+64], xmm0 - } - return; - } - case 6: { // 6x3 * 3x6 - #define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \ - __asm mov esi, m2Ptr \ - __asm mov edi, m1Ptr \ - __asm mov eax, dstPtr \ - __asm movlps xmm0, [esi+ 0*4] \ - __asm movhps xmm0, [esi+ 2*4] \ - __asm movlps xmm1, [esi+ 6*4] \ - __asm movhps xmm1, [esi+ 8*4] \ - __asm movlps xmm2, [esi+12*4] \ - __asm movhps xmm2, [esi+14*4] - - #define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \ - __asm movss xmm3, [edi+(row*3+0)*4] \ - __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm3, xmm0 \ - __asm movss xmm4, [edi+(row*3+1)*4] \ - __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm4, xmm1 \ - __asm addps xmm3, xmm4 \ - __asm movss xmm5, [edi+(row*3+2)*4] \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm5, xmm2 \ - __asm addps xmm3, xmm5 \ - __asm movlps [eax+(row*6+0)*4], xmm3 \ - __asm movhps [eax+(row*6+2)*4], xmm3 - - #define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \ - __asm movlps xmm0, [esi+ 4*4] \ - __asm movlps xmm1, [esi+10*4] \ - __asm movlps xmm2, [esi+16*4] \ - __asm shufps xmm0, xmm0, 0x44 \ - __asm shufps xmm1, xmm1, 0x44 \ - __asm shufps xmm2, xmm2, 0x44 \ - __asm movlps xmm3, [edi+0*4] \ - __asm movhps xmm3, [edi+2*4] \ - __asm movaps xmm4, xmm3 \ - __asm movaps xmm5, xmm3 \ - __asm shufps xmm3, xmm3, 0xF0 \ - __asm mulps xmm3, xmm0 \ - __asm movlps xmm6, [edi+4*4] \ - __asm movhps xmm6, [edi+6*4] \ - __asm shufps xmm4, xmm6, 0x05 \ - __asm mulps xmm4, xmm1 \ - __asm addps xmm3, xmm4 \ - __asm shufps xmm5, xmm6, 0x5A \ - __asm mulps xmm5, xmm2 \ - __asm addps xmm3, xmm5 \ - __asm movlps [eax+4*4], xmm3 \ - __asm movhps [eax+10*4], xmm3 \ - __asm movaps xmm5, xmm6 \ - __asm movlps xmm3, [edi+8*4] \ - __asm movhps xmm3, [edi+10*4] \ - __asm movaps xmm4, xmm3 \ - __asm shufps xmm5, xmm3, 0x5A \ - __asm mulps xmm5, xmm0 \ - __asm shufps xmm6, xmm3, 0xAF \ - __asm mulps xmm6, xmm1 \ - __asm addps xmm5, xmm6 \ - __asm shufps xmm4, xmm4, 0xF0 \ - __asm mulps xmm4, xmm2 \ - __asm addps xmm4, xmm5 \ - __asm movlps [eax+16*4], xmm4 \ - __asm movhps [eax+22*4], xmm4 \ - __asm movlps xmm6, [edi+12*4] \ - __asm movhps xmm6, [edi+14*4] \ - __asm movaps xmm5, xmm6 \ - __asm movaps xmm4, xmm6 \ - __asm shufps xmm6, xmm6, 0xF0 \ - __asm mulps xmm6, xmm0 \ - __asm movlps xmm3, [edi+16*4] \ - __asm shufps xmm5, xmm3, 0x05 \ - __asm mulps xmm5, xmm1 \ - __asm addps xmm5, xmm6 \ - __asm shufps xmm4, xmm3, 0x5A \ - __asm mulps xmm4, xmm2 \ - __asm addps xmm4, xmm5 \ - __asm movlps [eax+28*4], xmm4 \ - __asm movhps [eax+34*4], xmm4 - - MUL_Nx3_3x6_FIRST4COLUMNS_INIT - MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 ) - MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 ) - MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 ) - MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 ) - MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 ) - MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 ) - MUL_Nx3_3x6_LAST2COLUMNS_ROW6 - - return; - } + case 6: { + switch ( l ) { + case 1: { // 6x6 * 6x1 + __asm { + mov esi, m2Ptr + mov edi, m1Ptr + mov eax, dstPtr + movlps xmm7, qword ptr [esi] + movlps xmm6, qword ptr [esi+8] + shufps xmm7, xmm7, 0x44 + shufps xmm6, xmm6, 0x44 + movlps xmm0, qword ptr [edi ] + movhps xmm0, qword ptr [edi+ 24] + mulps xmm0, xmm7 + movlps xmm3, qword ptr [edi+ 8] + movhps xmm3, qword ptr [edi+ 32] + mulps xmm3, xmm6 + movlps xmm1, qword ptr [edi+ 48] + movhps xmm1, qword ptr [edi+ 72] + mulps xmm1, xmm7 + movlps xmm2, qword ptr [edi+ 96] + movhps xmm2, qword ptr [edi+120] + mulps xmm2, xmm7 + movlps xmm4, qword ptr [edi+ 56] + movhps xmm4, qword ptr [edi+ 80] + movlps xmm5, qword ptr [edi+104] + movhps xmm5, qword ptr [edi+128] + mulps xmm4, xmm6 + movlps xmm7, qword ptr [esi+16] + addps xmm0, xmm3 + shufps xmm7, xmm7, 0x44 + mulps xmm5, xmm6 + addps xmm1, xmm4 + movlps xmm3, qword ptr [edi+ 16] + movhps xmm3, qword ptr [edi+ 40] + addps xmm2, xmm5 + movlps xmm4, qword ptr [edi+ 64] + movhps xmm4, qword ptr [edi+ 88] + mulps xmm3, xmm7 + movlps xmm5, qword ptr [edi+112] + movhps xmm5, qword ptr [edi+136] + addps xmm0, xmm3 + mulps xmm4, xmm7 + mulps xmm5, xmm7 + addps xmm1, xmm4 + addps xmm2, xmm5 + movaps xmm6, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm6, xmm1, 0xDD + movaps xmm7, xmm2 + shufps xmm7, xmm2, 0x88 + shufps xmm2, xmm2, 0xDD + addps xmm0, xmm6 + addps xmm2, xmm7 + movlps [eax], xmm0 + movhps [eax+8], xmm0 + movlps [eax+16], xmm2 } + return; } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l]; - m2Ptr++; - } - m1Ptr += 3; + case 2: { // 6x6 * 6x2 + + MUL_Nx6_6x2_INIT + MUL_Nx6_6x2_ROW2( 0 ) + MUL_Nx6_6x2_ROW2( 1 ) + MUL_Nx6_6x2_ROW2( 2 ) + + return; } - break; - } - case 4: { - if ( !(l^6) ) { - switch( k ) { - case 4: { // 4x4 * 4x6 - - #define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \ - __asm mov esi, m2Ptr \ - __asm mov edi, m1Ptr \ - __asm mov eax, dstPtr \ - __asm movlps xmm0, [esi+ 0*4] \ - __asm movhps xmm0, [esi+ 2*4] \ - __asm movlps xmm1, [esi+ 6*4] \ - __asm movhps xmm1, [esi+ 8*4] \ - __asm movlps xmm2, [esi+12*4] \ - __asm movhps xmm2, [esi+14*4] \ - __asm movlps xmm3, [esi+18*4] \ - __asm movhps xmm3, [esi+20*4] - - #define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \ - __asm movss xmm4, [edi+row*16+0*4] \ - __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm4, xmm0 \ - __asm movss xmm5, [edi+row*16+1*4] \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm5, xmm1 \ - __asm addps xmm4, xmm5 \ - __asm movss xmm6, [edi+row*16+2*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm2 \ - __asm addps xmm4, xmm6 \ - __asm movss xmm7, [edi+row*16+3*4] \ - __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm7, xmm3 \ - __asm addps xmm4, xmm7 \ - __asm movlps [eax+row*24+0], xmm4 \ - __asm movhps [eax+row*24+8], xmm4 - - #define MUL_Nx4_4x6_LAST2COLUMNS_INIT \ - __asm movlps xmm0, [esi+ 4*4] \ - __asm movlps xmm1, [esi+10*4] \ - __asm movlps xmm2, [esi+16*4] \ - __asm movlps xmm3, [esi+22*4] \ - __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) - - #define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \ - __asm movlps xmm7, [edi+row*32+ 0*4] \ - __asm movhps xmm7, [edi+row*32+ 4*4] \ - __asm movaps xmm6, xmm7 \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 ) \ - __asm mulps xmm6, xmm0 \ - __asm shufps xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 ) \ - __asm mulps xmm7, xmm1 \ - __asm addps xmm6, xmm7 \ - __asm movlps xmm4, [edi+row*32+ 2*4] \ - __asm movhps xmm4, [edi+row*32+ 6*4] \ - __asm movaps xmm5, xmm4 \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 ) \ - __asm mulps xmm5, xmm2 \ - __asm addps xmm6, xmm5 \ - __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 ) \ - __asm mulps xmm4, xmm3 \ - __asm addps xmm6, xmm4 \ - __asm movlps [eax+row*48+ 4*4], xmm6 \ - __asm movhps [eax+row*48+10*4], xmm6 - - MUL_Nx4_4x6_FIRST4COLUMNS_INIT - MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 ) - MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 ) - MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 ) - MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 ) - MUL_Nx4_4x6_LAST2COLUMNS_INIT - MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 ) - MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 ) - - return; - } - case 6: { // 6x4 * 4x6 - - MUL_Nx4_4x6_FIRST4COLUMNS_INIT - MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 ) - MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 ) - MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 ) - MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 ) - MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 ) - MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 ) - MUL_Nx4_4x6_LAST2COLUMNS_INIT - MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 ) - MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 ) - MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 ) - - return; - } - } + case 3: { // 6x6 * 6x3 + + MUL_Nx6_6x3_INIT + MUL_Nx6_6x3_ROW( 0 ) + MUL_Nx6_6x3_ROW( 1 ) + MUL_Nx6_6x3_ROW( 2 ) + MUL_Nx6_6x3_ROW( 3 ) + MUL_Nx6_6x3_ROW( 4 ) + MUL_Nx6_6x3_ROW( 5 ) + + return; } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + - m1Ptr[3] * m2Ptr[3*l]; - m2Ptr++; - } - m1Ptr += 4; + case 4: { // 6x6 * 6x4 + + MUL_Nx6_6x4_INIT + MUL_Nx6_6x4_ROW( 0 ) + MUL_Nx6_6x4_ROW( 1 ) + MUL_Nx6_6x4_ROW( 2 ) + MUL_Nx6_6x4_ROW( 3 ) + MUL_Nx6_6x4_ROW( 4 ) + MUL_Nx6_6x4_ROW( 5 ) + + return; } - break; - } - case 5: { - if ( !(l^6) ) { - switch( k ) { - case 5: { // 5x5 * 5x6 - - #define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \ - __asm mov esi, m2Ptr \ - __asm mov edi, m1Ptr \ - __asm mov eax, dstPtr \ - __asm movlps xmm0, [esi+ 0*4] \ - __asm movhps xmm0, [esi+ 2*4] \ - __asm movlps xmm1, [esi+ 6*4] \ - __asm movhps xmm1, [esi+ 8*4] \ - __asm movlps xmm2, [esi+12*4] \ - __asm movhps xmm2, [esi+14*4] \ - __asm movlps xmm3, [esi+18*4] \ - __asm movhps xmm3, [esi+20*4] \ - __asm movlps xmm4, [esi+24*4] \ - __asm movhps xmm4, [esi+26*4] - - #define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \ - __asm movss xmm6, [edi+row*20+0*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm0 \ - __asm movss xmm5, [edi+row*20+1*4] \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm5, xmm1 \ - __asm addps xmm6, xmm5 \ - __asm movss xmm5, [edi+row*20+2*4] \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm5, xmm2 \ - __asm addps xmm6, xmm5 \ - __asm movss xmm5, [edi+row*20+3*4] \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm5, xmm3 \ - __asm addps xmm6, xmm5 \ - __asm movss xmm5, [edi+row*20+4*4] \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm5, xmm4 \ - __asm addps xmm6, xmm5 \ - __asm movlps [eax+row*24+0], xmm6 \ - __asm movhps [eax+row*24+8], xmm6 - - #define MUL_Nx5_5x6_LAST2COLUMNS_INIT \ - __asm movlps xmm0, [esi+ 4*4] \ - __asm movlps xmm1, [esi+10*4] \ - __asm movlps xmm2, [esi+16*4] \ - __asm movlps xmm3, [esi+22*4] \ - __asm movlps xmm4, [esi+28*4] \ - __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm shufps xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm shufps xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm shufps xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) - - #define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \ - __asm movlps xmm7, [edi+row*40+ 0*4] \ - __asm movhps xmm7, [edi+row*40+ 6*4] \ - __asm movaps xmm6, xmm7 \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 ) \ - __asm mulps xmm6, xmm0 \ - __asm movaps xmm5, xmm7 \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \ - __asm mulps xmm5, xmm1 \ - __asm addps xmm6, xmm5 \ - __asm movlps xmm7, [edi+row*40+ 2*4] \ - __asm movhps xmm7, [edi+row*40+ 8*4] \ - __asm movaps xmm5, xmm7 \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 ) \ - __asm mulps xmm5, xmm2 \ - __asm addps xmm6, xmm5 \ - __asm movaps xmm5, xmm7 \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \ - __asm mulps xmm5, xmm3 \ - __asm addps xmm6, xmm5 \ - __asm movlps xmm5, [edi+row*40+ 4*4] \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm mulps xmm5, xmm4 \ - __asm addps xmm6, xmm5 \ - __asm movlps [eax+row*48+ 4*4], xmm6 \ - __asm movhps [eax+row*48+10*4], xmm6 - - #define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row ) \ - __asm movlps xmm6, [edi+20*4+0*4] \ - __asm unpcklps xmm6, xmm6 \ - __asm mulps xmm6, xmm0 \ - __asm movlps xmm5, [edi+20*4+2*4] \ - __asm unpcklps xmm5, xmm5 \ - __asm mulps xmm5, xmm2 \ - __asm addps xmm6, xmm5 \ - __asm movss xmm5, [edi+20*4+4*4] \ - __asm unpcklps xmm5, xmm5 \ - __asm mulps xmm5, xmm4 \ - __asm addps xmm6, xmm5 \ - __asm movhlps xmm7, xmm6 \ - __asm addps xmm6, xmm7 \ - __asm movlps [eax+row*24+4*4], xmm6 - - MUL_Nx5_5x6_FIRST4COLUMNS_INIT - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 ) - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 ) - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 ) - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 ) - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 ) - MUL_Nx5_5x6_LAST2COLUMNS_INIT - MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 ) - MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 ) - MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 ) - - return; - } - case 6: { // 6x5 * 5x6 - - MUL_Nx5_5x6_FIRST4COLUMNS_INIT - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 ) - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 ) - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 ) - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 ) - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 ) - MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 ) - MUL_Nx5_5x6_LAST2COLUMNS_INIT - MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 ) - MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 ) - MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 ) - - return; - } - } + case 5: { // 6x6 * 6x5 + + MUL_Nx6_6x5_INIT + MUL_Nx6_6x5_ROW( 0 ) + MUL_Nx6_6x5_ROW( 1 ) + MUL_Nx6_6x5_ROW( 2 ) + MUL_Nx6_6x5_ROW( 3 ) + MUL_Nx6_6x5_ROW( 4 ) + MUL_Nx6_6x5_ROW( 5 ) + + return; } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + - m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l]; - m2Ptr++; + case 6: { // 6x6 * 6x6 + __asm { + mov ecx, dword ptr m2Ptr + movlps xmm3, qword ptr [ecx+72] + mov edx, dword ptr m1Ptr + // Loading first 4 columns (upper 4 rows) of m2Ptr. + movaps xmm0, xmmword ptr [ecx] + movlps xmm1, qword ptr [ecx+24] + movhps xmm1, qword ptr [ecx+32] + movaps xmm2, xmmword ptr [ecx+48] + movhps xmm3, qword ptr [ecx+80] + // Calculating first 4 elements in the first row of the destination matrix. + movss xmm4, dword ptr [edx] + movss xmm5, dword ptr [edx+4] + mov eax, dword ptr dstPtr + shufps xmm4, xmm4, 0 + movss xmm6, dword ptr [edx+8] + shufps xmm5, xmm5, 0 + movss xmm7, dword ptr [edx+12] + mulps xmm4, xmm0 + shufps xmm6, xmm6, 0 + shufps xmm7, xmm7, 0 + mulps xmm5, xmm1 + mulps xmm6, xmm2 + addps xmm5, xmm4 + mulps xmm7, xmm3 + addps xmm6, xmm5 + addps xmm7, xmm6 + movaps xmmword ptr [eax], xmm7 + // Calculating first 4 elements in the second row of the destination matrix. + movss xmm4, dword ptr [edx+24] + shufps xmm4, xmm4, 0 + mulps xmm4, xmm0 + movss xmm5, dword ptr [edx+28] + shufps xmm5, xmm5, 0 + mulps xmm5, xmm1 + movss xmm6, dword ptr [edx+32] + shufps xmm6, xmm6, 0 + movss xmm7, dword ptr [edx+36] + shufps xmm7, xmm7, 0 + mulps xmm6, xmm2 + mulps xmm7, xmm3 + addps xmm7, xmm6 + addps xmm5, xmm4 + addps xmm7, xmm5 + // Calculating first 4 elements in the third row of the destination matrix. + movss xmm4, dword ptr [edx+48] + movss xmm5, dword ptr [edx+52] + movlps qword ptr [eax+24], xmm7 ; save 2nd + movhps qword ptr [eax+32], xmm7 ; row + movss xmm6, dword ptr [edx+56] + movss xmm7, dword ptr [edx+60] + shufps xmm4, xmm4, 0 + shufps xmm5, xmm5, 0 + shufps xmm6, xmm6, 0 + shufps xmm7, xmm7, 0 + mulps xmm4, xmm0 + mulps xmm5, xmm1 + mulps xmm6, xmm2 + mulps xmm7, xmm3 + addps xmm5, xmm4 + addps xmm7, xmm6 + addps xmm7, xmm5 + movaps xmmword ptr [eax+48], xmm7 + // Calculating first 4 elements in the fourth row of the destination matrix. + movss xmm4, dword ptr [edx+72] + movss xmm5, dword ptr [edx+76] + movss xmm6, dword ptr [edx+80] + movss xmm7, dword ptr [edx+84] + shufps xmm4, xmm4, 0 + shufps xmm5, xmm5, 0 + shufps xmm6, xmm6, 0 + shufps xmm7, xmm7, 0 + mulps xmm4, xmm0 + mulps xmm5, xmm1 + mulps xmm6, xmm2 + mulps xmm7, xmm3 + addps xmm4, xmm5 + addps xmm6, xmm4 + addps xmm7, xmm6 + movlps qword ptr [eax+72], xmm7 + movhps qword ptr [eax+80], xmm7 + // Calculating first 4 elements in the fifth row of the destination matrix. + movss xmm4, dword ptr [edx+96] + movss xmm5, dword ptr [edx+100] + movss xmm6, dword ptr [edx+104] + movss xmm7, dword ptr [edx+108] + shufps xmm4, xmm4, 0 + shufps xmm5, xmm5, 0 + shufps xmm6, xmm6, 0 + shufps xmm7, xmm7, 0 + mulps xmm4, xmm0 + mulps xmm5, xmm1 + mulps xmm6, xmm2 + mulps xmm7, xmm3 + addps xmm5, xmm4 + addps xmm7, xmm6 + addps xmm7, xmm5 + movaps xmmword ptr [eax+96], xmm7 + // Calculating first 4 elements in the sixth row of the destination matrix. + movss xmm4, dword ptr [edx+120] + movss xmm5, dword ptr [edx+124] + movss xmm6, dword ptr [edx+128] + movss xmm7, dword ptr [edx+132] + shufps xmm4, xmm4, 0 + shufps xmm5, xmm5, 0 + shufps xmm6, xmm6, 0 + shufps xmm7, xmm7, 0 + mulps xmm4, xmm0 + mulps xmm5, xmm1 + mulps xmm6, xmm2 + mulps xmm7, xmm3 + addps xmm4, xmm5 + addps xmm6, xmm4 + addps xmm7, xmm6 + movhps qword ptr [eax+128], xmm7 + movlps qword ptr [eax+120], xmm7 + // Loading first 4 columns (lower 2 rows) of m2Ptr. + movlps xmm0, qword ptr [ecx+96] + movhps xmm0, qword ptr [ecx+104] + movlps xmm1, qword ptr [ecx+120] + movhps xmm1, qword ptr [ecx+128] + // Calculating first 4 elements in the first row of the destination matrix. + movss xmm2, dword ptr [edx+16] + shufps xmm2, xmm2, 0 + movss xmm4, dword ptr [edx+40] + movss xmm3, dword ptr [edx+20] + movss xmm5, dword ptr [edx+44] + movaps xmm6, xmmword ptr [eax] + movlps xmm7, qword ptr [eax+24] + shufps xmm3, xmm3, 0 + shufps xmm5, xmm5, 0 + movhps xmm7, qword ptr [eax+32] + shufps xmm4, xmm4, 0 + mulps xmm5, xmm1 + mulps xmm2, xmm0 + mulps xmm3, xmm1 + mulps xmm4, xmm0 + addps xmm6, xmm2 + addps xmm7, xmm4 + addps xmm7, xmm5 + addps xmm6, xmm3 + movlps qword ptr [eax+24], xmm7 + movaps xmmword ptr [eax], xmm6 + movhps qword ptr [eax+32], xmm7 + // Calculating first 4 elements in the third row of the destination matrix. + movss xmm2, dword ptr [edx+64] + movss xmm4, dword ptr [edx+88] + movss xmm5, dword ptr [edx+92] + movss xmm3, dword ptr [edx+68] + movaps xmm6, xmmword ptr [eax+48] + movlps xmm7, qword ptr [eax+72] + movhps xmm7, qword ptr [eax+80] + shufps xmm2, xmm2, 0 + shufps xmm4, xmm4, 0 + shufps xmm5, xmm5, 0 + shufps xmm3, xmm3, 0 + mulps xmm2, xmm0 + mulps xmm4, xmm0 + mulps xmm5, xmm1 + mulps xmm3, xmm1 + addps xmm6, xmm2 + addps xmm6, xmm3 + addps xmm7, xmm4 + addps xmm7, xmm5 + movlps qword ptr [eax+72], xmm7 + movaps xmmword ptr [eax+48], xmm6 + movhps qword ptr [eax+80], xmm7 + // Calculating first 4 elements in the fifth row of the destination matrix. + movss xmm2, dword ptr [edx+112] + movss xmm3, dword ptr [edx+116] + movaps xmm6, xmmword ptr [eax+96] + shufps xmm2, xmm2, 0 + shufps xmm3, xmm3, 0 + mulps xmm2, xmm0 + mulps xmm3, xmm1 + addps xmm6, xmm2 + addps xmm6, xmm3 + movaps xmmword ptr [eax+96], xmm6 + // Calculating first 4 elements in the sixth row of the destination matrix. + movss xmm4, dword ptr [edx+136] + movss xmm5, dword ptr [edx+140] + movhps xmm7, qword ptr [eax+128] + movlps xmm7, qword ptr [eax+120] + shufps xmm4, xmm4, 0 + shufps xmm5, xmm5, 0 + mulps xmm4, xmm0 + mulps xmm5, xmm1 + addps xmm7, xmm4 + addps xmm7, xmm5 + // Calculating last 2 columns of the destination matrix. + movlps xmm0, qword ptr [ecx+16] + movhps xmm0, qword ptr [ecx+40] + movhps qword ptr [eax+128], xmm7 + movlps qword ptr [eax+120], xmm7 + movlps xmm2, qword ptr [ecx+64] + movhps xmm2, qword ptr [ecx+88] + movaps xmm3, xmm2 + shufps xmm3, xmm3, 4Eh + movlps xmm4, qword ptr [ecx+112] + movhps xmm4, qword ptr [ecx+136] + movaps xmm5, xmm4 + shufps xmm5, xmm5, 4Eh + movlps xmm6, qword ptr [edx] + movhps xmm6, qword ptr [edx+24] + movaps xmm7, xmm6 + shufps xmm7, xmm7, 0F0h + mulps xmm7, xmm0 + shufps xmm6, xmm6, 0A5h + movaps xmm1, xmm0 + shufps xmm1, xmm1, 4Eh + mulps xmm1, xmm6 + addps xmm7, xmm1 + movlps xmm6, qword ptr [edx+8] + movhps xmm6, qword ptr [edx+32] + movaps xmm1, xmm6 + shufps xmm1, xmm1, 0F0h + shufps xmm6, xmm6, 0A5h + mulps xmm1, xmm2 + mulps xmm6, xmm3 + addps xmm7, xmm1 + addps xmm7, xmm6 + movhps xmm6, qword ptr [edx+40] + movlps xmm6, qword ptr [edx+16] + movaps xmm1, xmm6 + shufps xmm1, xmm1, 0F0h + shufps xmm6, xmm6, 0A5h + mulps xmm1, xmm4 + mulps xmm6, xmm5 + addps xmm7, xmm1 + addps xmm7, xmm6 + movlps qword ptr [eax+16], xmm7 + movhps qword ptr [eax+40], xmm7 + movlps xmm6, qword ptr [edx+48] + movhps xmm6, qword ptr [edx+72] + movaps xmm7, xmm6 + shufps xmm7, xmm7, 0F0h + mulps xmm7, xmm0 + shufps xmm6, xmm6, 0A5h + movaps xmm1, xmm0 + shufps xmm1, xmm1, 4Eh + mulps xmm1, xmm6 + addps xmm7, xmm1 + movhps xmm6, qword ptr [edx+80] + movlps xmm6, qword ptr [edx+56] + movaps xmm1, xmm6 + shufps xmm1, xmm1, 0F0h + shufps xmm6, xmm6, 0A5h + mulps xmm1, xmm2 + mulps xmm6, xmm3 + addps xmm7, xmm1 + addps xmm7, xmm6 + movlps xmm6, qword ptr [edx+64] + movhps xmm6, qword ptr [edx+88] + movaps xmm1, xmm6 + shufps xmm1, xmm1, 0F0h + shufps xmm6, xmm6, 0A5h + mulps xmm1, xmm4 + mulps xmm6, xmm5 + addps xmm7, xmm1 + addps xmm7, xmm6 + movlps qword ptr [eax+64], xmm7 + movhps qword ptr [eax+88], xmm7 + movlps xmm6, qword ptr [edx+96] + movhps xmm6, qword ptr [edx+120] + movaps xmm7, xmm6 + shufps xmm7, xmm7, 0F0h + mulps xmm7, xmm0 + shufps xmm6, xmm6, 0A5h + movaps xmm1, xmm0 + shufps xmm1, xmm1, 4Eh + mulps xmm1, xmm6 + addps xmm7, xmm1 + movlps xmm6, qword ptr [edx+104] + movhps xmm6, qword ptr [edx+128] + movaps xmm1, xmm6 + shufps xmm1, xmm1, 0F0h + shufps xmm6, xmm6, 0A5h + mulps xmm1, xmm2 + mulps xmm6, xmm3 + addps xmm7, xmm1 + addps xmm7, xmm6 + movlps xmm6, qword ptr [edx+112] + movhps xmm6, qword ptr [edx+136] + movaps xmm1, xmm6 + shufps xmm1, xmm1, 0F0h + shufps xmm6, xmm6, 0A5h + mulps xmm1, xmm4 + mulps xmm6, xmm5 + addps xmm7, xmm1 + addps xmm7, xmm6 + movlps qword ptr [eax+112], xmm7 + movhps qword ptr [eax+136], xmm7 } - m1Ptr += 5; + return; } - break; - } - case 6: { - switch( k ) { - case 1: { - if ( !(l^1) ) { // 1x6 * 6x1 - dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] + - m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5]; - return; - } - break; - } - case 2: { - if ( !(l^2) ) { // 2x6 * 6x2 - - #define MUL_Nx6_6x2_INIT \ - __asm mov esi, m2Ptr \ - __asm mov edi, m1Ptr \ - __asm mov eax, dstPtr \ - __asm movaps xmm0, [esi] \ - __asm movaps xmm1, [esi+16] \ - __asm movaps xmm2, [esi+32] - - #define MUL_Nx6_6x2_ROW2( row ) \ - __asm movaps xmm7, [edi+row*48+0*4] \ - __asm movaps xmm6, xmm7 \ - __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm mulps xmm7, xmm0 \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 ) \ - __asm mulps xmm6, xmm1 \ - __asm addps xmm7, xmm6 \ - __asm movaps xmm6, [edi+row*48+4*4] \ - __asm movaps xmm5, xmm6 \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm mulps xmm6, xmm2 \ - __asm addps xmm7, xmm6 \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) \ - __asm mulps xmm5, xmm0 \ - __asm movaps xmm6, [edi+row*48+24+2*4] \ - __asm movaps xmm4, xmm6 \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm mulps xmm6, xmm1 \ - __asm addps xmm5, xmm6 \ - __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 ) \ - __asm mulps xmm4, xmm2 \ - __asm addps xmm5, xmm4 \ - __asm movaps xmm4, xmm5 \ - __asm movhlps xmm5, xmm7 \ - __asm movlhps xmm7, xmm4 \ - __asm addps xmm7, xmm5 \ - __asm movaps [eax+row*16], xmm7 - - MUL_Nx6_6x2_INIT - MUL_Nx6_6x2_ROW2( 0 ) - - return; - } - break; - } - case 3: { - if ( !(l^3) ) { // 3x6 * 6x3 - - #define MUL_Nx6_6x3_INIT \ - __asm mov esi, m2Ptr \ - __asm mov edi, m1Ptr \ - __asm mov eax, dstPtr \ - __asm movss xmm0, [esi+ 0*4] \ - __asm movhps xmm0, [esi+ 1*4] \ - __asm movss xmm1, [esi+ 3*4] \ - __asm movhps xmm1, [esi+ 4*4] \ - __asm movss xmm2, [esi+ 6*4] \ - __asm movhps xmm2, [esi+ 7*4] \ - __asm movss xmm3, [esi+ 9*4] \ - __asm movhps xmm3, [esi+10*4] \ - __asm movss xmm4, [esi+12*4] \ - __asm movhps xmm4, [esi+13*4] \ - __asm movss xmm5, [esi+15*4] \ - __asm movhps xmm5, [esi+16*4] - - #define MUL_Nx6_6x3_ROW( row ) \ - __asm movss xmm7, [edi+row*24+0] \ - __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm7, xmm0 \ - __asm movss xmm6, [edi+row*24+4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm1 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+row*24+8] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm2 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+row*24+12] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm3 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+row*24+16] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm4 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+row*24+20] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm5 \ - __asm addps xmm7, xmm6 \ - __asm movss [eax+row*12+0], xmm7 \ - __asm movhps [eax+row*12+4], xmm7 - - MUL_Nx6_6x3_INIT - MUL_Nx6_6x3_ROW( 0 ) - MUL_Nx6_6x3_ROW( 1 ) - MUL_Nx6_6x3_ROW( 2 ) - - return; - } - break; - } - case 4: { - if ( !(l^4) ) { // 4x6 * 6x4 - - #define MUL_Nx6_6x4_INIT \ - __asm mov esi, m2Ptr \ - __asm mov edi, m1Ptr \ - __asm mov eax, dstPtr \ - __asm movaps xmm0, [esi] \ - __asm movaps xmm1, [esi+16] \ - __asm movaps xmm2, [esi+32] \ - __asm movaps xmm3, [esi+48] \ - __asm movaps xmm4, [esi+64] \ - __asm movaps xmm5, [esi+80] - - #define MUL_Nx6_6x4_ROW( row ) \ - __asm movss xmm7, [edi+row*24+0] \ - __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm7, xmm0 \ - __asm movss xmm6, [edi+row*24+4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm1 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+row*24+8] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm2 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+row*24+12] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm3 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+row*24+16] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm4 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+row*24+20] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm5 \ - __asm addps xmm7, xmm6 \ - __asm movaps [eax+row*16], xmm7 - - MUL_Nx6_6x4_INIT - MUL_Nx6_6x4_ROW( 0 ) - MUL_Nx6_6x4_ROW( 1 ) - MUL_Nx6_6x4_ROW( 2 ) - MUL_Nx6_6x4_ROW( 3 ) - - return; - } - break; - } - case 5: { - if ( !(l^5) ) { // 5x6 * 6x5 - - #define MUL_Nx6_6x5_INIT \ - __asm mov esi, m2Ptr \ - __asm mov edi, m1Ptr \ - __asm mov eax, dstPtr \ - __asm movaps xmm0, [esi] \ - __asm movlps xmm1, [esi+20] \ - __asm movhps xmm1, [esi+28] \ - __asm movlps xmm2, [esi+40] \ - __asm movhps xmm2, [esi+48] \ - __asm movlps xmm3, [esi+60] \ - __asm movhps xmm3, [esi+68] \ - __asm movaps xmm4, [esi+80] \ - __asm movlps xmm5, [esi+100] \ - __asm movhps xmm5, [esi+108] - - #define MUL_Nx6_6x5_ROW( row ) \ - __asm movss xmm7, [edi+row*24+0] \ - __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm7, xmm0 \ - __asm fld dword ptr [edi+(row*6+0)*4] \ - __asm fmul dword ptr [esi+(4+0*5)*4] \ - __asm movss xmm6, [edi+row*24+4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm1 \ - __asm addps xmm7, xmm6 \ - __asm fld dword ptr [edi+(row*6+1)*4] \ - __asm fmul dword ptr [esi+(4+1*5)*4] \ - __asm faddp st(1),st \ - __asm movss xmm6, [edi+row*24+8] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm2 \ - __asm addps xmm7, xmm6 \ - __asm fld dword ptr [edi+(row*6+2)*4] \ - __asm fmul dword ptr [esi+(4+2*5)*4] \ - __asm faddp st(1),st \ - __asm movss xmm6, [edi+row*24+12] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm3 \ - __asm addps xmm7, xmm6 \ - __asm fld dword ptr [edi+(row*6+3)*4] \ - __asm fmul dword ptr [esi+(4+3*5)*4] \ - __asm faddp st(1),st \ - __asm movss xmm6, [edi+row*24+16] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm4 \ - __asm addps xmm7, xmm6 \ - __asm fld dword ptr [edi+(row*6+4)*4] \ - __asm fmul dword ptr [esi+(4+4*5)*4] \ - __asm faddp st(1),st \ - __asm movss xmm6, [edi+row*24+20] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm5 \ - __asm addps xmm7, xmm6 \ - __asm fld dword ptr [edi+(row*6+5)*4] \ - __asm fmul dword ptr [esi+(4+5*5)*4] \ - __asm faddp st(1),st \ - __asm fstp dword ptr [eax+(row*5+4)*4] \ - __asm movlps [eax+row*20], xmm7 \ - __asm movhps [eax+row*20+8], xmm7 - - MUL_Nx6_6x5_INIT - MUL_Nx6_6x5_ROW( 0 ) - MUL_Nx6_6x5_ROW( 1 ) - MUL_Nx6_6x5_ROW( 2 ) - MUL_Nx6_6x5_ROW( 3 ) - MUL_Nx6_6x5_ROW( 4 ) - - return; - } - break; - } - case 6: { - switch( l ) { - case 1: { // 6x6 * 6x1 - __asm { - mov esi, m2Ptr - mov edi, m1Ptr - mov eax, dstPtr - movlps xmm7, qword ptr [esi] - movlps xmm6, qword ptr [esi+8] - shufps xmm7, xmm7, 0x44 - shufps xmm6, xmm6, 0x44 - movlps xmm0, qword ptr [edi ] - movhps xmm0, qword ptr [edi+ 24] - mulps xmm0, xmm7 - movlps xmm3, qword ptr [edi+ 8] - movhps xmm3, qword ptr [edi+ 32] - mulps xmm3, xmm6 - movlps xmm1, qword ptr [edi+ 48] - movhps xmm1, qword ptr [edi+ 72] - mulps xmm1, xmm7 - movlps xmm2, qword ptr [edi+ 96] - movhps xmm2, qword ptr [edi+120] - mulps xmm2, xmm7 - movlps xmm4, qword ptr [edi+ 56] - movhps xmm4, qword ptr [edi+ 80] - movlps xmm5, qword ptr [edi+104] - movhps xmm5, qword ptr [edi+128] - mulps xmm4, xmm6 - movlps xmm7, qword ptr [esi+16] - addps xmm0, xmm3 - shufps xmm7, xmm7, 0x44 - mulps xmm5, xmm6 - addps xmm1, xmm4 - movlps xmm3, qword ptr [edi+ 16] - movhps xmm3, qword ptr [edi+ 40] - addps xmm2, xmm5 - movlps xmm4, qword ptr [edi+ 64] - movhps xmm4, qword ptr [edi+ 88] - mulps xmm3, xmm7 - movlps xmm5, qword ptr [edi+112] - movhps xmm5, qword ptr [edi+136] - addps xmm0, xmm3 - mulps xmm4, xmm7 - mulps xmm5, xmm7 - addps xmm1, xmm4 - addps xmm2, xmm5 - movaps xmm6, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm6, xmm1, 0xDD - movaps xmm7, xmm2 - shufps xmm7, xmm2, 0x88 - shufps xmm2, xmm2, 0xDD - addps xmm0, xmm6 - addps xmm2, xmm7 - movlps [eax], xmm0 - movhps [eax+8], xmm0 - movlps [eax+16], xmm2 - } - return; - } - case 2: { // 6x6 * 6x2 - - MUL_Nx6_6x2_INIT - MUL_Nx6_6x2_ROW2( 0 ) - MUL_Nx6_6x2_ROW2( 1 ) - MUL_Nx6_6x2_ROW2( 2 ) - - return; - } - case 3: { // 6x6 * 6x3 - - MUL_Nx6_6x3_INIT - MUL_Nx6_6x3_ROW( 0 ) - MUL_Nx6_6x3_ROW( 1 ) - MUL_Nx6_6x3_ROW( 2 ) - MUL_Nx6_6x3_ROW( 3 ) - MUL_Nx6_6x3_ROW( 4 ) - MUL_Nx6_6x3_ROW( 5 ) - - return; - } - case 4: { // 6x6 * 6x4 - - MUL_Nx6_6x4_INIT - MUL_Nx6_6x4_ROW( 0 ) - MUL_Nx6_6x4_ROW( 1 ) - MUL_Nx6_6x4_ROW( 2 ) - MUL_Nx6_6x4_ROW( 3 ) - MUL_Nx6_6x4_ROW( 4 ) - MUL_Nx6_6x4_ROW( 5 ) - - return; - } - case 5: { // 6x6 * 6x5 - - MUL_Nx6_6x5_INIT - MUL_Nx6_6x5_ROW( 0 ) - MUL_Nx6_6x5_ROW( 1 ) - MUL_Nx6_6x5_ROW( 2 ) - MUL_Nx6_6x5_ROW( 3 ) - MUL_Nx6_6x5_ROW( 4 ) - MUL_Nx6_6x5_ROW( 5 ) - - return; - } - case 6: { // 6x6 * 6x6 - __asm { - mov ecx, dword ptr m2Ptr - movlps xmm3, qword ptr [ecx+72] - mov edx, dword ptr m1Ptr - // Loading first 4 columns (upper 4 rows) of m2Ptr. - movaps xmm0, xmmword ptr [ecx] - movlps xmm1, qword ptr [ecx+24] - movhps xmm1, qword ptr [ecx+32] - movaps xmm2, xmmword ptr [ecx+48] - movhps xmm3, qword ptr [ecx+80] - // Calculating first 4 elements in the first row of the destination matrix. - movss xmm4, dword ptr [edx] - movss xmm5, dword ptr [edx+4] - mov eax, dword ptr dstPtr - shufps xmm4, xmm4, 0 - movss xmm6, dword ptr [edx+8] - shufps xmm5, xmm5, 0 - movss xmm7, dword ptr [edx+12] - mulps xmm4, xmm0 - shufps xmm6, xmm6, 0 - shufps xmm7, xmm7, 0 - mulps xmm5, xmm1 - mulps xmm6, xmm2 - addps xmm5, xmm4 - mulps xmm7, xmm3 - addps xmm6, xmm5 - addps xmm7, xmm6 - movaps xmmword ptr [eax], xmm7 - // Calculating first 4 elements in the second row of the destination matrix. - movss xmm4, dword ptr [edx+24] - shufps xmm4, xmm4, 0 - mulps xmm4, xmm0 - movss xmm5, dword ptr [edx+28] - shufps xmm5, xmm5, 0 - mulps xmm5, xmm1 - movss xmm6, dword ptr [edx+32] - shufps xmm6, xmm6, 0 - movss xmm7, dword ptr [edx+36] - shufps xmm7, xmm7, 0 - mulps xmm6, xmm2 - mulps xmm7, xmm3 - addps xmm7, xmm6 - addps xmm5, xmm4 - addps xmm7, xmm5 - // Calculating first 4 elements in the third row of the destination matrix. - movss xmm4, dword ptr [edx+48] - movss xmm5, dword ptr [edx+52] - movlps qword ptr [eax+24], xmm7 ; save 2nd - movhps qword ptr [eax+32], xmm7 ; row - movss xmm6, dword ptr [edx+56] - movss xmm7, dword ptr [edx+60] - shufps xmm4, xmm4, 0 - shufps xmm5, xmm5, 0 - shufps xmm6, xmm6, 0 - shufps xmm7, xmm7, 0 - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm6, xmm2 - mulps xmm7, xmm3 - addps xmm5, xmm4 - addps xmm7, xmm6 - addps xmm7, xmm5 - movaps xmmword ptr [eax+48], xmm7 - // Calculating first 4 elements in the fourth row of the destination matrix. - movss xmm4, dword ptr [edx+72] - movss xmm5, dword ptr [edx+76] - movss xmm6, dword ptr [edx+80] - movss xmm7, dword ptr [edx+84] - shufps xmm4, xmm4, 0 - shufps xmm5, xmm5, 0 - shufps xmm6, xmm6, 0 - shufps xmm7, xmm7, 0 - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm6, xmm2 - mulps xmm7, xmm3 - addps xmm4, xmm5 - addps xmm6, xmm4 - addps xmm7, xmm6 - movlps qword ptr [eax+72], xmm7 - movhps qword ptr [eax+80], xmm7 - // Calculating first 4 elements in the fifth row of the destination matrix. - movss xmm4, dword ptr [edx+96] - movss xmm5, dword ptr [edx+100] - movss xmm6, dword ptr [edx+104] - movss xmm7, dword ptr [edx+108] - shufps xmm4, xmm4, 0 - shufps xmm5, xmm5, 0 - shufps xmm6, xmm6, 0 - shufps xmm7, xmm7, 0 - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm6, xmm2 - mulps xmm7, xmm3 - addps xmm5, xmm4 - addps xmm7, xmm6 - addps xmm7, xmm5 - movaps xmmword ptr [eax+96], xmm7 - // Calculating first 4 elements in the sixth row of the destination matrix. - movss xmm4, dword ptr [edx+120] - movss xmm5, dword ptr [edx+124] - movss xmm6, dword ptr [edx+128] - movss xmm7, dword ptr [edx+132] - shufps xmm4, xmm4, 0 - shufps xmm5, xmm5, 0 - shufps xmm6, xmm6, 0 - shufps xmm7, xmm7, 0 - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm6, xmm2 - mulps xmm7, xmm3 - addps xmm4, xmm5 - addps xmm6, xmm4 - addps xmm7, xmm6 - movhps qword ptr [eax+128], xmm7 - movlps qword ptr [eax+120], xmm7 - // Loading first 4 columns (lower 2 rows) of m2Ptr. - movlps xmm0, qword ptr [ecx+96] - movhps xmm0, qword ptr [ecx+104] - movlps xmm1, qword ptr [ecx+120] - movhps xmm1, qword ptr [ecx+128] - // Calculating first 4 elements in the first row of the destination matrix. - movss xmm2, dword ptr [edx+16] - shufps xmm2, xmm2, 0 - movss xmm4, dword ptr [edx+40] - movss xmm3, dword ptr [edx+20] - movss xmm5, dword ptr [edx+44] - movaps xmm6, xmmword ptr [eax] - movlps xmm7, qword ptr [eax+24] - shufps xmm3, xmm3, 0 - shufps xmm5, xmm5, 0 - movhps xmm7, qword ptr [eax+32] - shufps xmm4, xmm4, 0 - mulps xmm5, xmm1 - mulps xmm2, xmm0 - mulps xmm3, xmm1 - mulps xmm4, xmm0 - addps xmm6, xmm2 - addps xmm7, xmm4 - addps xmm7, xmm5 - addps xmm6, xmm3 - movlps qword ptr [eax+24], xmm7 - movaps xmmword ptr [eax], xmm6 - movhps qword ptr [eax+32], xmm7 - // Calculating first 4 elements in the third row of the destination matrix. - movss xmm2, dword ptr [edx+64] - movss xmm4, dword ptr [edx+88] - movss xmm5, dword ptr [edx+92] - movss xmm3, dword ptr [edx+68] - movaps xmm6, xmmword ptr [eax+48] - movlps xmm7, qword ptr [eax+72] - movhps xmm7, qword ptr [eax+80] - shufps xmm2, xmm2, 0 - shufps xmm4, xmm4, 0 - shufps xmm5, xmm5, 0 - shufps xmm3, xmm3, 0 - mulps xmm2, xmm0 - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm3, xmm1 - addps xmm6, xmm2 - addps xmm6, xmm3 - addps xmm7, xmm4 - addps xmm7, xmm5 - movlps qword ptr [eax+72], xmm7 - movaps xmmword ptr [eax+48], xmm6 - movhps qword ptr [eax+80], xmm7 - // Calculating first 4 elements in the fifth row of the destination matrix. - movss xmm2, dword ptr [edx+112] - movss xmm3, dword ptr [edx+116] - movaps xmm6, xmmword ptr [eax+96] - shufps xmm2, xmm2, 0 - shufps xmm3, xmm3, 0 - mulps xmm2, xmm0 - mulps xmm3, xmm1 - addps xmm6, xmm2 - addps xmm6, xmm3 - movaps xmmword ptr [eax+96], xmm6 - // Calculating first 4 elements in the sixth row of the destination matrix. - movss xmm4, dword ptr [edx+136] - movss xmm5, dword ptr [edx+140] - movhps xmm7, qword ptr [eax+128] - movlps xmm7, qword ptr [eax+120] - shufps xmm4, xmm4, 0 - shufps xmm5, xmm5, 0 - mulps xmm4, xmm0 - mulps xmm5, xmm1 - addps xmm7, xmm4 - addps xmm7, xmm5 - // Calculating last 2 columns of the destination matrix. - movlps xmm0, qword ptr [ecx+16] - movhps xmm0, qword ptr [ecx+40] - movhps qword ptr [eax+128], xmm7 - movlps qword ptr [eax+120], xmm7 - movlps xmm2, qword ptr [ecx+64] - movhps xmm2, qword ptr [ecx+88] - movaps xmm3, xmm2 - shufps xmm3, xmm3, 4Eh - movlps xmm4, qword ptr [ecx+112] - movhps xmm4, qword ptr [ecx+136] - movaps xmm5, xmm4 - shufps xmm5, xmm5, 4Eh - movlps xmm6, qword ptr [edx] - movhps xmm6, qword ptr [edx+24] - movaps xmm7, xmm6 - shufps xmm7, xmm7, 0F0h - mulps xmm7, xmm0 - shufps xmm6, xmm6, 0A5h - movaps xmm1, xmm0 - shufps xmm1, xmm1, 4Eh - mulps xmm1, xmm6 - addps xmm7, xmm1 - movlps xmm6, qword ptr [edx+8] - movhps xmm6, qword ptr [edx+32] - movaps xmm1, xmm6 - shufps xmm1, xmm1, 0F0h - shufps xmm6, xmm6, 0A5h - mulps xmm1, xmm2 - mulps xmm6, xmm3 - addps xmm7, xmm1 - addps xmm7, xmm6 - movhps xmm6, qword ptr [edx+40] - movlps xmm6, qword ptr [edx+16] - movaps xmm1, xmm6 - shufps xmm1, xmm1, 0F0h - shufps xmm6, xmm6, 0A5h - mulps xmm1, xmm4 - mulps xmm6, xmm5 - addps xmm7, xmm1 - addps xmm7, xmm6 - movlps qword ptr [eax+16], xmm7 - movhps qword ptr [eax+40], xmm7 - movlps xmm6, qword ptr [edx+48] - movhps xmm6, qword ptr [edx+72] - movaps xmm7, xmm6 - shufps xmm7, xmm7, 0F0h - mulps xmm7, xmm0 - shufps xmm6, xmm6, 0A5h - movaps xmm1, xmm0 - shufps xmm1, xmm1, 4Eh - mulps xmm1, xmm6 - addps xmm7, xmm1 - movhps xmm6, qword ptr [edx+80] - movlps xmm6, qword ptr [edx+56] - movaps xmm1, xmm6 - shufps xmm1, xmm1, 0F0h - shufps xmm6, xmm6, 0A5h - mulps xmm1, xmm2 - mulps xmm6, xmm3 - addps xmm7, xmm1 - addps xmm7, xmm6 - movlps xmm6, qword ptr [edx+64] - movhps xmm6, qword ptr [edx+88] - movaps xmm1, xmm6 - shufps xmm1, xmm1, 0F0h - shufps xmm6, xmm6, 0A5h - mulps xmm1, xmm4 - mulps xmm6, xmm5 - addps xmm7, xmm1 - addps xmm7, xmm6 - movlps qword ptr [eax+64], xmm7 - movhps qword ptr [eax+88], xmm7 - movlps xmm6, qword ptr [edx+96] - movhps xmm6, qword ptr [edx+120] - movaps xmm7, xmm6 - shufps xmm7, xmm7, 0F0h - mulps xmm7, xmm0 - shufps xmm6, xmm6, 0A5h - movaps xmm1, xmm0 - shufps xmm1, xmm1, 4Eh - mulps xmm1, xmm6 - addps xmm7, xmm1 - movlps xmm6, qword ptr [edx+104] - movhps xmm6, qword ptr [edx+128] - movaps xmm1, xmm6 - shufps xmm1, xmm1, 0F0h - shufps xmm6, xmm6, 0A5h - mulps xmm1, xmm2 - mulps xmm6, xmm3 - addps xmm7, xmm1 - addps xmm7, xmm6 - movlps xmm6, qword ptr [edx+112] - movhps xmm6, qword ptr [edx+136] - movaps xmm1, xmm6 - shufps xmm1, xmm1, 0F0h - shufps xmm6, xmm6, 0A5h - mulps xmm1, xmm4 - mulps xmm6, xmm5 - addps xmm7, xmm1 - addps xmm7, xmm6 - movlps qword ptr [eax+112], xmm7 - movhps qword ptr [eax+136], xmm7 - } - return; - } - } - } } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + - m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l]; - m2Ptr++; - } - m1Ptr += 6; + } + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2 * l] + + m1Ptr[3] * m2Ptr[3 * l] + m1Ptr[4] * m2Ptr[4 * l] + m1Ptr[5] * m2Ptr[5 * l]; + m2Ptr++; } - break; + m1Ptr += 6; } - default: { - for ( i = 0; i < k; i++ ) { - for ( j = 0; j < l; j++ ) { - m2Ptr = m2.ToFloatPtr() + j; - sum = m1Ptr[0] * m2Ptr[0]; - for ( n = 1; n < m1.GetNumColumns(); n++ ) { - m2Ptr += l; - sum += m1Ptr[n] * m2Ptr[0]; - } - *dstPtr++ = sum; + break; + } + default: { + for ( i = 0; i < k; i++ ) { + for ( j = 0; j < l; j++ ) { + m2Ptr = m2.ToFloatPtr() + j; + sum = m1Ptr[0] * m2Ptr[0]; + for ( n = 1; n < m1.GetNumColumns(); n++ ) { + m2Ptr += l; + sum += m1Ptr[n] * m2Ptr[0]; } - m1Ptr += m1.GetNumColumns(); + *dstPtr++ = sum; } - break; + m1Ptr += m1.GetNumColumns(); } + break; + } } } @@ -9620,35 +9716,35 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m k = m1.GetNumColumns(); l = m2.GetNumColumns(); - switch( m1.GetNumRows() ) { - case 1: - if ( !((k^6)|(l^1)) ) { // 1x6 * 1x1 - __asm { - mov esi, m2Ptr - mov edi, m1Ptr - mov eax, dstPtr - movss xmm0, [esi] - shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movaps xmm1, xmm0 - mulps xmm0, [edi] - mulps xmm1, [edi+16] - movaps [eax], xmm0 - movlps [eax+16], xmm1 - } - return; + switch ( m1.GetNumRows() ) { + case 1: + if ( !( ( k ^ 6 ) | ( l ^ 1 ) ) ) { // 1x6 * 1x1 + __asm { + mov esi, m2Ptr + mov edi, m1Ptr + mov eax, dstPtr + movss xmm0, [esi] + shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) + movaps xmm1, xmm0 + mulps xmm0, [edi] + mulps xmm1, [edi + 16] + movaps [eax], xmm0 + movlps [eax + 16], xmm1 } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0]; - m2Ptr++; - } - m1Ptr++; + return; + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0]; + m2Ptr++; } - break; - case 2: - if ( !((k^6)|(l^2)) ) { // 2x6 * 2x2 - #define MUL_2xN_2x2_INIT \ + m1Ptr++; + } + break; + case 2: + if ( !( ( k ^ 6 ) | ( l ^ 2 ) ) ) { // 2x6 * 2x2 +#define MUL_2xN_2x2_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ @@ -9657,7 +9753,7 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m __asm movlps xmm1, [esi+8] \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) - #define MUL_2xN_2x2_ROW2( N, row ) \ +#define MUL_2xN_2x2_ROW2( N, row ) \ __asm movlps xmm6, [edi+(row+0*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm movlps xmm7, [edi+(row+1*N)*4] \ @@ -9667,26 +9763,26 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m __asm addps xmm6, xmm7 \ __asm movaps [eax+(row*2)*4], xmm6 - MUL_2xN_2x2_INIT - MUL_2xN_2x2_ROW2( 6, 0 ) - MUL_2xN_2x2_ROW2( 6, 2 ) - MUL_2xN_2x2_ROW2( 6, 4 ) + MUL_2xN_2x2_INIT + MUL_2xN_2x2_ROW2( 6, 0 ) + MUL_2xN_2x2_ROW2( 6, 2 ) + MUL_2xN_2x2_ROW2( 6, 4 ) - return; - } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l]; - m2Ptr++; - } - m1Ptr++; + return; + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l]; + m2Ptr++; } - break; - case 3: - if ( !((k^6)|(l^3)) ) { // 3x6 * 3x3 + m1Ptr++; + } + break; + case 3: + if ( !( ( k ^ 6 ) | ( l ^ 3 ) ) ) { // 3x6 * 3x3 - #define MUL_3xN_3x3_INIT \ +#define MUL_3xN_3x3_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ @@ -9697,12 +9793,12 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m __asm movss xmm2, [esi+(2*3+0)*4] \ __asm movhps xmm2, [esi+(2*3+1)*4] - #define MUL_3xN_3x3_INIT_ROW4 \ +#define MUL_3xN_3x3_INIT_ROW4 \ __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 ) \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 ) \ __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 ) - #define MUL_3xN_3x3_ROW4( N, row ) \ +#define MUL_3xN_3x3_ROW4( N, row ) \ __asm movlps xmm3, [edi+(row+0*N+0)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 ) \ __asm movlps xmm4, [edi+(row+1*N+0)*4] \ @@ -9746,17 +9842,17 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m __asm addps xmm3, xmm5 \ __asm movaps [eax+(row*3+8)*4], xmm3 - #define MUL_3xN_3x3_INIT_ROW4_ROW4 \ +#define MUL_3xN_3x3_INIT_ROW4_ROW4 \ __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) \ __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - #define MUL_3xN_3x3_INIT_ROW4_ROW \ +#define MUL_3xN_3x3_INIT_ROW4_ROW \ __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 ) \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 ) \ __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 ) - #define MUL_3xN_3x3_ROW( N, row ) \ +#define MUL_3xN_3x3_ROW( N, row ) \ __asm movss xmm3, [edi+(row+0*N)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm movss xmm4, [edi+(row+1*N)*4] \ @@ -9771,28 +9867,28 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m __asm movss [eax+(row*3+0)*4], xmm3 \ __asm movhps [eax+(row*3+1)*4], xmm3 - MUL_3xN_3x3_INIT - MUL_3xN_3x3_INIT_ROW4 - MUL_3xN_3x3_ROW4( 6, 0 ) - MUL_3xN_3x3_INIT_ROW4_ROW - MUL_3xN_3x3_ROW( 6, 4 ) - MUL_3xN_3x3_ROW( 6, 5 ) + MUL_3xN_3x3_INIT + MUL_3xN_3x3_INIT_ROW4 + MUL_3xN_3x3_ROW4( 6, 0 ) + MUL_3xN_3x3_INIT_ROW4_ROW + MUL_3xN_3x3_ROW( 6, 4 ) + MUL_3xN_3x3_ROW( 6, 5 ) - return; - } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l]; - m2Ptr++; - } - m1Ptr++; + return; + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2 * k] * m2Ptr[2 * l]; + m2Ptr++; } - break; - case 4: - if ( !((k^6)|(l^4)) ) { // 4x6 * 4x4 + m1Ptr++; + } + break; + case 4: + if ( !( ( k ^ 6 ) | ( l ^ 4 ) ) ) { // 4x6 * 4x4 - #define MUL_4xN_4x4_INIT \ +#define MUL_4xN_4x4_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ @@ -9801,7 +9897,7 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m __asm movaps xmm2, [esi+32] \ __asm movaps xmm3, [esi+48] - #define MUL_4xN_4x4_ROW( N, row ) \ +#define MUL_4xN_4x4_ROW( N, row ) \ __asm movss xmm7, [edi+(row+0*N)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ @@ -9819,30 +9915,30 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m __asm addps xmm7, xmm6 \ __asm movaps [eax+row*16], xmm7 - MUL_4xN_4x4_INIT - MUL_4xN_4x4_ROW( 6, 0 ) - MUL_4xN_4x4_ROW( 6, 1 ) - MUL_4xN_4x4_ROW( 6, 2 ) - MUL_4xN_4x4_ROW( 6, 3 ) - MUL_4xN_4x4_ROW( 6, 4 ) - MUL_4xN_4x4_ROW( 6, 5 ) + MUL_4xN_4x4_INIT + MUL_4xN_4x4_ROW( 6, 0 ) + MUL_4xN_4x4_ROW( 6, 1 ) + MUL_4xN_4x4_ROW( 6, 2 ) + MUL_4xN_4x4_ROW( 6, 3 ) + MUL_4xN_4x4_ROW( 6, 4 ) + MUL_4xN_4x4_ROW( 6, 5 ) - return; - } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + - m1Ptr[3*k] * m2Ptr[3*l]; - m2Ptr++; - } - m1Ptr++; + return; + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2 * k] * m2Ptr[2 * l] + + m1Ptr[3 * k] * m2Ptr[3 * l]; + m2Ptr++; } - break; - case 5: - if ( !((k^6)|(l^5)) ) { // 5x6 * 5x5 + m1Ptr++; + } + break; + case 5: + if ( !( ( k ^ 6 ) | ( l ^ 5 ) ) ) { // 5x6 * 5x5 - #define MUL_5xN_5x5_INIT \ +#define MUL_5xN_5x5_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ @@ -9857,7 +9953,7 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m __asm movlps xmm4, [esi+20*4] \ __asm movhps xmm4, [esi+22*4] - #define MUL_5xN_5x5_ROW( N, row ) \ +#define MUL_5xN_5x5_ROW( N, row ) \ __asm movss xmm6, [edi+(row+0*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm0 \ @@ -9895,240 +9991,240 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m __asm movlps [eax+(row*5+0)*4], xmm6 \ __asm movhps [eax+(row*5+2)*4], xmm6 - MUL_5xN_5x5_INIT - MUL_5xN_5x5_ROW( 6, 0 ) - MUL_5xN_5x5_ROW( 6, 1 ) - MUL_5xN_5x5_ROW( 6, 2 ) - MUL_5xN_5x5_ROW( 6, 3 ) - MUL_5xN_5x5_ROW( 6, 4 ) - MUL_5xN_5x5_ROW( 6, 5 ) + MUL_5xN_5x5_INIT + MUL_5xN_5x5_ROW( 6, 0 ) + MUL_5xN_5x5_ROW( 6, 1 ) + MUL_5xN_5x5_ROW( 6, 2 ) + MUL_5xN_5x5_ROW( 6, 3 ) + MUL_5xN_5x5_ROW( 6, 4 ) + MUL_5xN_5x5_ROW( 6, 5 ) + + return; + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2 * k] * m2Ptr[2 * l] + + m1Ptr[3 * k] * m2Ptr[3 * l] + m1Ptr[4 * k] * m2Ptr[4 * l]; + m2Ptr++; + } + m1Ptr++; + } + break; + case 6: + if ( !( l ^ 6 ) ) { + switch ( k ) { + case 1: { // 6x1 * 6x6 +#define MUL_6xN_6x6_FIRST4COLUMNS_INIT \ + __asm mov esi, m2Ptr \ + __asm mov edi, m1Ptr \ + __asm mov eax, dstPtr \ + __asm movlps xmm0, [esi+ 0*4] \ + __asm movhps xmm0, [esi+ 2*4] \ + __asm movlps xmm1, [esi+ 6*4] \ + __asm movhps xmm1, [esi+ 8*4] \ + __asm movlps xmm2, [esi+12*4] \ + __asm movhps xmm2, [esi+14*4] \ + __asm movlps xmm3, [esi+18*4] \ + __asm movhps xmm3, [esi+20*4] \ + __asm movlps xmm4, [esi+24*4] \ + __asm movhps xmm4, [esi+26*4] \ + __asm movlps xmm5, [esi+30*4] \ + __asm movhps xmm5, [esi+32*4] + +#define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row ) \ + __asm movss xmm7, [edi+(row+0*N)*4] \ + __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm7, xmm0 \ + __asm movss xmm6, [edi+(row+1*N)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm1 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+(row+2*N)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm2 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+(row+3*N)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm3 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+(row+4*N)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm4 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+(row+5*N)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm5 \ + __asm addps xmm7, xmm6 \ + __asm movlps [eax+(row*6+0)*4], xmm7 \ + __asm movhps [eax+(row*6+2)*4], xmm7 + +#define MUL_6xN_6x6_LAST2COLUMNS_INIT \ + __asm movlps xmm0, [esi+ 4*4] \ + __asm movlps xmm1, [esi+10*4] \ + __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm movlps xmm2, [esi+16*4] \ + __asm movlps xmm3, [esi+22*4] \ + __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm movlps xmm4, [esi+28*4] \ + __asm movlps xmm5, [esi+34*4] \ + __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \ + __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 ) + +#define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row ) \ + __asm movlps xmm7, [edi+(row*2+0*N)*4] \ + __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm mulps xmm7, xmm0 \ + __asm movlps xmm6, [edi+(row*2+1*N)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm mulps xmm6, xmm1 \ + __asm addps xmm7, xmm6 \ + __asm movlps xmm6, [edi+(row*2+2*N)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm mulps xmm6, xmm2 \ + __asm addps xmm7, xmm6 \ + __asm movlps xmm6, [edi+(row*2+3*N)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm mulps xmm6, xmm3 \ + __asm addps xmm7, xmm6 \ + __asm movlps xmm6, [edi+(row*2+4*N)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm mulps xmm6, xmm4 \ + __asm addps xmm7, xmm6 \ + __asm movlps xmm6, [edi+(row*2+5*N)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ + __asm mulps xmm6, xmm5 \ + __asm addps xmm7, xmm6 \ + __asm movlps [eax+(row*12+ 4)*4], xmm7 \ + __asm movhps [eax+(row*12+10)*4], xmm7 + +#define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row ) \ + __asm movss xmm7, [edi+(1*N-1)*4] \ + __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm7, xmm0 \ + __asm movss xmm6, [edi+(2*N-1)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm1 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+(3*N-1)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm2 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+(4*N-1)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm3 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+(5*N-1)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm4 \ + __asm addps xmm7, xmm6 \ + __asm movss xmm6, [edi+(6*N-1)*4] \ + __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ + __asm mulps xmm6, xmm5 \ + __asm addps xmm7, xmm6 \ + __asm movlps [eax+(row*6+4)*4], xmm7 + + MUL_6xN_6x6_FIRST4COLUMNS_INIT + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 ) + MUL_6xN_6x6_LAST2COLUMNS_INIT + MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 ) + + return; + } + case 2: { // 6x2 * 6x6 + + MUL_6xN_6x6_FIRST4COLUMNS_INIT + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 ) + MUL_6xN_6x6_LAST2COLUMNS_INIT + MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 ) return; } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + - m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l]; - m2Ptr++; - } - m1Ptr++; + case 3: { // 6x3 * 6x6 + + MUL_6xN_6x6_FIRST4COLUMNS_INIT + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 ) + MUL_6xN_6x6_LAST2COLUMNS_INIT + MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 ) + MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 ) + + return; } - break; - case 6: - if ( !(l^6) ) { - switch( k ) { - case 1: { // 6x1 * 6x6 - #define MUL_6xN_6x6_FIRST4COLUMNS_INIT \ - __asm mov esi, m2Ptr \ - __asm mov edi, m1Ptr \ - __asm mov eax, dstPtr \ - __asm movlps xmm0, [esi+ 0*4] \ - __asm movhps xmm0, [esi+ 2*4] \ - __asm movlps xmm1, [esi+ 6*4] \ - __asm movhps xmm1, [esi+ 8*4] \ - __asm movlps xmm2, [esi+12*4] \ - __asm movhps xmm2, [esi+14*4] \ - __asm movlps xmm3, [esi+18*4] \ - __asm movhps xmm3, [esi+20*4] \ - __asm movlps xmm4, [esi+24*4] \ - __asm movhps xmm4, [esi+26*4] \ - __asm movlps xmm5, [esi+30*4] \ - __asm movhps xmm5, [esi+32*4] - - #define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row ) \ - __asm movss xmm7, [edi+(row+0*N)*4] \ - __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm7, xmm0 \ - __asm movss xmm6, [edi+(row+1*N)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm1 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+(row+2*N)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm2 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+(row+3*N)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm3 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+(row+4*N)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm4 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+(row+5*N)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm5 \ - __asm addps xmm7, xmm6 \ - __asm movlps [eax+(row*6+0)*4], xmm7 \ - __asm movhps [eax+(row*6+2)*4], xmm7 - - #define MUL_6xN_6x6_LAST2COLUMNS_INIT \ - __asm movlps xmm0, [esi+ 4*4] \ - __asm movlps xmm1, [esi+10*4] \ - __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm movlps xmm2, [esi+16*4] \ - __asm movlps xmm3, [esi+22*4] \ - __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm movlps xmm4, [esi+28*4] \ - __asm movlps xmm5, [esi+34*4] \ - __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \ - __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 ) - - #define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row ) \ - __asm movlps xmm7, [edi+(row*2+0*N)*4] \ - __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm mulps xmm7, xmm0 \ - __asm movlps xmm6, [edi+(row*2+1*N)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm mulps xmm6, xmm1 \ - __asm addps xmm7, xmm6 \ - __asm movlps xmm6, [edi+(row*2+2*N)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm mulps xmm6, xmm2 \ - __asm addps xmm7, xmm6 \ - __asm movlps xmm6, [edi+(row*2+3*N)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm mulps xmm6, xmm3 \ - __asm addps xmm7, xmm6 \ - __asm movlps xmm6, [edi+(row*2+4*N)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm mulps xmm6, xmm4 \ - __asm addps xmm7, xmm6 \ - __asm movlps xmm6, [edi+(row*2+5*N)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ - __asm mulps xmm6, xmm5 \ - __asm addps xmm7, xmm6 \ - __asm movlps [eax+(row*12+ 4)*4], xmm7 \ - __asm movhps [eax+(row*12+10)*4], xmm7 - - #define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row ) \ - __asm movss xmm7, [edi+(1*N-1)*4] \ - __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm7, xmm0 \ - __asm movss xmm6, [edi+(2*N-1)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm1 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+(3*N-1)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm2 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+(4*N-1)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm3 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+(5*N-1)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm4 \ - __asm addps xmm7, xmm6 \ - __asm movss xmm6, [edi+(6*N-1)*4] \ - __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ - __asm mulps xmm6, xmm5 \ - __asm addps xmm7, xmm6 \ - __asm movlps [eax+(row*6+4)*4], xmm7 - - MUL_6xN_6x6_FIRST4COLUMNS_INIT - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 ) - MUL_6xN_6x6_LAST2COLUMNS_INIT - MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 ) - - return; - } - case 2: { // 6x2 * 6x6 - - MUL_6xN_6x6_FIRST4COLUMNS_INIT - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 ) - MUL_6xN_6x6_LAST2COLUMNS_INIT - MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 ) - - return; - } - case 3: { // 6x3 * 6x6 - - MUL_6xN_6x6_FIRST4COLUMNS_INIT - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 ) - MUL_6xN_6x6_LAST2COLUMNS_INIT - MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 ) - MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 ) - - return; - } - case 4: { // 6x4 * 6x6 - - MUL_6xN_6x6_FIRST4COLUMNS_INIT - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 ) - MUL_6xN_6x6_LAST2COLUMNS_INIT - MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 ) - MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 ) - - return; - } - case 5: { // 6x5 * 6x6 - - MUL_6xN_6x6_FIRST4COLUMNS_INIT - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 ) - MUL_6xN_6x6_LAST2COLUMNS_INIT - MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 ) - MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 ) - MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 ) - - return; - } - case 6: { // 6x6 * 6x6 - - MUL_6xN_6x6_FIRST4COLUMNS_INIT - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 ) - MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 ) - MUL_6xN_6x6_LAST2COLUMNS_INIT - MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 ) - MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 ) - MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 ) - - return; - } - } + case 4: { // 6x4 * 6x6 + + MUL_6xN_6x6_FIRST4COLUMNS_INIT + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 ) + MUL_6xN_6x6_LAST2COLUMNS_INIT + MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 ) + MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 ) + + return; } - for ( i = 0; i < k; i++ ) { - m2Ptr = m2.ToFloatPtr(); - for ( j = 0; j < l; j++ ) { - *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + - m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l]; - m2Ptr++; - } - m1Ptr++; + case 5: { // 6x5 * 6x6 + + MUL_6xN_6x6_FIRST4COLUMNS_INIT + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 ) + MUL_6xN_6x6_LAST2COLUMNS_INIT + MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 ) + MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 ) + MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 ) + + return; } - break; - default: - for ( i = 0; i < k; i++ ) { - for ( j = 0; j < l; j++ ) { - m1Ptr = m1.ToFloatPtr() + i; - m2Ptr = m2.ToFloatPtr() + j; - sum = m1Ptr[0] * m2Ptr[0]; - for ( n = 1; n < m1.GetNumRows(); n++ ) { - m1Ptr += k; - m2Ptr += l; - sum += m1Ptr[0] * m2Ptr[0]; - } - *dstPtr++ = sum; + case 6: { // 6x6 * 6x6 + + MUL_6xN_6x6_FIRST4COLUMNS_INIT + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 ) + MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 ) + MUL_6xN_6x6_LAST2COLUMNS_INIT + MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 ) + MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 ) + MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 ) + + return; + } + } + } + for ( i = 0; i < k; i++ ) { + m2Ptr = m2.ToFloatPtr(); + for ( j = 0; j < l; j++ ) { + *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2 * k] * m2Ptr[2 * l] + + m1Ptr[3 * k] * m2Ptr[3 * l] + m1Ptr[4 * k] * m2Ptr[4 * l] + m1Ptr[5 * k] * m2Ptr[5 * l]; + m2Ptr++; + } + m1Ptr++; + } + break; + default: + for ( i = 0; i < k; i++ ) { + for ( j = 0; j < l; j++ ) { + m1Ptr = m1.ToFloatPtr() + i; + m2Ptr = m2.ToFloatPtr() + j; + sum = m1Ptr[0] * m2Ptr[0]; + for ( n = 1; n < m1.GetNumRows(); n++ ) { + m1Ptr += k; + m2Ptr += l; + sum += m1Ptr[0] * m2Ptr[0]; } + *dstPtr++ = sum; } + } break; } } @@ -10156,54 +10252,86 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, co // unrolled cases for n < 8 if ( n < 8 ) { - #define NSKIP( n, s ) ((n<<3)|(s&7)) - switch( NSKIP( n, skip ) ) { - case NSKIP( 1, 0 ): x[0] = b[0]; - return; - case NSKIP( 2, 0 ): x[0] = b[0]; - case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; - return; - case NSKIP( 3, 0 ): x[0] = b[0]; - case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; - case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; - return; - case NSKIP( 4, 0 ): x[0] = b[0]; - case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; - case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; - case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; - return; - case NSKIP( 5, 0 ): x[0] = b[0]; - case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; - case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; - case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; - case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; - return; - case NSKIP( 6, 0 ): x[0] = b[0]; - case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; - case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; - case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; - case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; - case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4]; - return; - case NSKIP( 7, 0 ): x[0] = b[0]; - case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; - case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; - case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; - case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; - case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4]; - case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5]; - return; +#define NSKIP( n, s ) ((n<<3)|(s&7)) + switch ( NSKIP( n, skip ) ) { + case NSKIP( 1, 0 ): + x[0] = b[0]; + return; + case NSKIP( 2, 0 ): + x[0] = b[0]; + case NSKIP( 2, 1 ): + x[1] = b[1] - lptr[1 * nc + 0] * x[0]; + return; + case NSKIP( 3, 0 ): + x[0] = b[0]; + case NSKIP( 3, 1 ): + x[1] = b[1] - lptr[1 * nc + 0] * x[0]; + case NSKIP( 3, 2 ): + x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; + return; + case NSKIP( 4, 0 ): + x[0] = b[0]; + case NSKIP( 4, 1 ): + x[1] = b[1] - lptr[1 * nc + 0] * x[0]; + case NSKIP( 4, 2 ): + x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; + case NSKIP( 4, 3 ): + x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2]; + return; + case NSKIP( 5, 0 ): + x[0] = b[0]; + case NSKIP( 5, 1 ): + x[1] = b[1] - lptr[1 * nc + 0] * x[0]; + case NSKIP( 5, 2 ): + x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; + case NSKIP( 5, 3 ): + x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2]; + case NSKIP( 5, 4 ): + x[4] = b[4] - lptr[4 * nc + 0] * x[0] - lptr[4 * nc + 1] * x[1] - lptr[4 * nc + 2] * x[2] - lptr[4 * nc + 3] * x[3]; + return; + case NSKIP( 6, 0 ): + x[0] = b[0]; + case NSKIP( 6, 1 ): + x[1] = b[1] - lptr[1 * nc + 0] * x[0]; + case NSKIP( 6, 2 ): + x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; + case NSKIP( 6, 3 ): + x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2]; + case NSKIP( 6, 4 ): + x[4] = b[4] - lptr[4 * nc + 0] * x[0] - lptr[4 * nc + 1] * x[1] - lptr[4 * nc + 2] * x[2] - lptr[4 * nc + 3] * x[3]; + case NSKIP( 6, 5 ): + x[5] = b[5] - lptr[5 * nc + 0] * x[0] - lptr[5 * nc + 1] * x[1] - lptr[5 * nc + 2] * x[2] - lptr[5 * nc + 3] * x[3] - lptr[5 * nc + 4] * x[4]; + return; + case NSKIP( 7, 0 ): + x[0] = b[0]; + case NSKIP( 7, 1 ): + x[1] = b[1] - lptr[1 * nc + 0] * x[0]; + case NSKIP( 7, 2 ): + x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; + case NSKIP( 7, 3 ): + x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2]; + case NSKIP( 7, 4 ): + x[4] = b[4] - lptr[4 * nc + 0] * x[0] - lptr[4 * nc + 1] * x[1] - lptr[4 * nc + 2] * x[2] - lptr[4 * nc + 3] * x[3]; + case NSKIP( 7, 5 ): + x[5] = b[5] - lptr[5 * nc + 0] * x[0] - lptr[5 * nc + 1] * x[1] - lptr[5 * nc + 2] * x[2] - lptr[5 * nc + 3] * x[3] - lptr[5 * nc + 4] * x[4]; + case NSKIP( 7, 6 ): + x[6] = b[6] - lptr[6 * nc + 0] * x[0] - lptr[6 * nc + 1] * x[1] - lptr[6 * nc + 2] * x[2] - lptr[6 * nc + 3] * x[3] - lptr[6 * nc + 4] * x[4] - lptr[6 * nc + 5] * x[5]; + return; } return; } // process first 4 rows - switch( skip ) { - case 0: x[0] = b[0]; - case 1: x[1] = b[1] - lptr[1*nc+0] * x[0]; - case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; - case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; - skip = 4; + switch ( skip ) { + case 0: + x[0] = b[0]; + case 1: + x[1] = b[1] - lptr[1 * nc + 0] * x[0]; + case 2: + x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; + case 3: + x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2]; + skip = 4; } lptr = L[skip]; @@ -10230,56 +10358,56 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, co jnz loopurow // aligned - looprow: + looprow: mov ecx, eax neg ecx movaps xmm0, [esi+ecx] mulps xmm0, [edi+ecx] add ecx, 12*4 jg donedot8 - dot8: - movaps xmm1, [esi+ecx-(8*4)] - mulps xmm1, [edi+ecx-(8*4)] + dot8: + movaps xmm1, [esi+ecx-( 8*4 )] + mulps xmm1, [edi+ecx-( 8*4 )] addps xmm0, xmm1 - movaps xmm3, [esi+ecx-(4*4)] - mulps xmm3, [edi+ecx-(4*4)] + movaps xmm3, [esi+ecx-( 4*4 )] + mulps xmm3, [edi+ecx-( 4*4 )] addps xmm0, xmm3 add ecx, 8*4 jle dot8 - donedot8: + donedot8: sub ecx, 4*4 jg donedot4 - //dot4: - movaps xmm1, [esi+ecx-(4*4)] - mulps xmm1, [edi+ecx-(4*4)] + //dot4: + movaps xmm1, [esi+ecx-( 4*4 )] + mulps xmm1, [edi+ecx-( 4*4 )] addps xmm0, xmm1 add ecx, 4*4 - donedot4: + donedot4: movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm0, xmm1 - sub ecx, 4*4 + sub ecx, 4 * 4 jz dot0 add ecx, 4 jz dot1 add ecx, 4 jz dot2 - //dot3: - movss xmm1, [esi-(3*4)] - mulss xmm1, [edi-(3*4)] + //dot3: + movss xmm1, [esi - ( 3 * 4 )] + mulss xmm1, [edi - ( 3 * 4 )] addss xmm0, xmm1 - dot2: - movss xmm3, [esi-(2*4)] - mulss xmm3, [edi-(2*4)] + dot2: + movss xmm3, [esi - ( 2 * 4 )] + mulss xmm3, [edi - ( 2 * 4 )] addss xmm0, xmm3 - dot1: - movss xmm5, [esi-(1*4)] - mulss xmm5, [edi-(1*4)] + dot1: + movss xmm5, [esi - ( 1 * 4 )] + mulss xmm5, [edi - ( 1 * 4 )] addss xmm0, xmm5 - dot0: - movss xmm1, [ebx+eax] + dot0: + movss xmm1, [ebx + eax] subss xmm1, xmm0 movss [esi], xmm1 add eax, 4 @@ -10293,63 +10421,63 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, co jmp looprow // unaligned - loopurow: + loopurow: mov ecx, eax neg ecx - movups xmm0, [esi+ecx] - movups xmm1, [edi+ecx] + movups xmm0, [esi + ecx] + movups xmm1, [edi + ecx] mulps xmm0, xmm1 - add ecx, 12*4 + add ecx, 12 * 4 jg doneudot8 - udot8: - movups xmm1, [esi+ecx-(8*4)] - movups xmm2, [edi+ecx-(8*4)] + udot8: + movups xmm1, [esi + ecx - ( 8 * 4 )] + movups xmm2, [edi + ecx - ( 8 * 4 )] mulps xmm1, xmm2 addps xmm0, xmm1 - movups xmm3, [esi+ecx-(4*4)] - movups xmm4, [edi+ecx-(4*4)] + movups xmm3, [esi + ecx - ( 4 * 4 )] + movups xmm4, [edi + ecx - ( 4 * 4 )] mulps xmm3, xmm4 addps xmm0, xmm3 - add ecx, 8*4 + add ecx, 8 * 4 jle udot8 - doneudot8: - sub ecx, 4*4 + doneudot8: + sub ecx, 4 * 4 jg doneudot4 - //udot4: - movups xmm1, [esi+ecx-(4*4)] - movups xmm2, [edi+ecx-(4*4)] + //udot4: + movups xmm1, [esi + ecx - ( 4 * 4 )] + movups xmm2, [edi + ecx - ( 4 * 4 )] mulps xmm1, xmm2 addps xmm0, xmm1 - add ecx, 4*4 - doneudot4: + add ecx, 4 * 4 + doneudot4: movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm0, xmm1 - sub ecx, 4*4 + sub ecx, 4 * 4 jz udot0 add ecx, 4 jz udot1 add ecx, 4 jz udot2 - //udot3: - movss xmm1, [esi-(3*4)] - movss xmm2, [edi-(3*4)] + //udot3: + movss xmm1, [esi - ( 3 * 4 )] + movss xmm2, [edi - ( 3 * 4 )] mulss xmm1, xmm2 addss xmm0, xmm1 - udot2: - movss xmm3, [esi-(2*4)] - movss xmm4, [edi-(2*4)] + udot2: + movss xmm3, [esi - ( 2 * 4 )] + movss xmm4, [edi - ( 2 * 4 )] mulss xmm3, xmm4 addss xmm0, xmm3 - udot1: - movss xmm5, [esi-(1*4)] - movss xmm6, [edi-(1*4)] + udot1: + movss xmm5, [esi - ( 1 * 4 )] + movss xmm6, [edi - ( 1 * 4 )] mulss xmm5, xmm6 addss xmm0, xmm5 - udot0: - movss xmm1, [ebx+eax] + udot0: + movss xmm1, [ebx + eax] subss xmm1, xmm0 movss [esi], xmm1 add eax, 4 @@ -10361,7 +10489,7 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, co add edi, ecx add edi, 4 jmp loopurow - done: + done: pop ebx } } @@ -10384,51 +10512,51 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo // unrolled cases for n < 8 if ( n < 8 ) { - switch( n ) { - case 0: - return; - case 1: - x[0] = b[0]; - return; - case 2: - x[1] = b[1]; - x[0] = b[0] - lptr[1*nc+0] * x[1]; - return; - case 3: - x[2] = b[2]; - x[1] = b[1] - lptr[2*nc+1] * x[2]; - x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; - return; - case 4: - x[3] = b[3]; - x[2] = b[2] - lptr[3*nc+2] * x[3]; - x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; - x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; - return; - case 5: - x[4] = b[4]; - x[3] = b[3] - lptr[4*nc+3] * x[4]; - x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; - x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; - x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; - return; - case 6: - x[5] = b[5]; - x[4] = b[4] - lptr[5*nc+4] * x[5]; - x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4]; - x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; - x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; - x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; - return; - case 7: - x[6] = b[6]; - x[5] = b[5] - lptr[6*nc+5] * x[6]; - x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5]; - x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4]; - x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; - x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; - x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; - return; + switch ( n ) { + case 0: + return; + case 1: + x[0] = b[0]; + return; + case 2: + x[1] = b[1]; + x[0] = b[0] - lptr[1 * nc + 0] * x[1]; + return; + case 3: + x[2] = b[2]; + x[1] = b[1] - lptr[2 * nc + 1] * x[2]; + x[0] = b[0] - lptr[2 * nc + 0] * x[2] - lptr[1 * nc + 0] * x[1]; + return; + case 4: + x[3] = b[3]; + x[2] = b[2] - lptr[3 * nc + 2] * x[3]; + x[1] = b[1] - lptr[3 * nc + 1] * x[3] - lptr[2 * nc + 1] * x[2]; + x[0] = b[0] - lptr[3 * nc + 0] * x[3] - lptr[2 * nc + 0] * x[2] - lptr[1 * nc + 0] * x[1]; + return; + case 5: + x[4] = b[4]; + x[3] = b[3] - lptr[4 * nc + 3] * x[4]; + x[2] = b[2] - lptr[4 * nc + 2] * x[4] - lptr[3 * nc + 2] * x[3]; + x[1] = b[1] - lptr[4 * nc + 1] * x[4] - lptr[3 * nc + 1] * x[3] - lptr[2 * nc + 1] * x[2]; + x[0] = b[0] - lptr[4 * nc + 0] * x[4] - lptr[3 * nc + 0] * x[3] - lptr[2 * nc + 0] * x[2] - lptr[1 * nc + 0] * x[1]; + return; + case 6: + x[5] = b[5]; + x[4] = b[4] - lptr[5 * nc + 4] * x[5]; + x[3] = b[3] - lptr[5 * nc + 3] * x[5] - lptr[4 * nc + 3] * x[4]; + x[2] = b[2] - lptr[5 * nc + 2] * x[5] - lptr[4 * nc + 2] * x[4] - lptr[3 * nc + 2] * x[3]; + x[1] = b[1] - lptr[5 * nc + 1] * x[5] - lptr[4 * nc + 1] * x[4] - lptr[3 * nc + 1] * x[3] - lptr[2 * nc + 1] * x[2]; + x[0] = b[0] - lptr[5 * nc + 0] * x[5] - lptr[4 * nc + 0] * x[4] - lptr[3 * nc + 0] * x[3] - lptr[2 * nc + 0] * x[2] - lptr[1 * nc + 0] * x[1]; + return; + case 7: + x[6] = b[6]; + x[5] = b[5] - lptr[6 * nc + 5] * x[6]; + x[4] = b[4] - lptr[6 * nc + 4] * x[6] - lptr[5 * nc + 4] * x[5]; + x[3] = b[3] - lptr[6 * nc + 3] * x[6] - lptr[5 * nc + 3] * x[5] - lptr[4 * nc + 3] * x[4]; + x[2] = b[2] - lptr[6 * nc + 2] * x[6] - lptr[5 * nc + 2] * x[5] - lptr[4 * nc + 2] * x[4] - lptr[3 * nc + 2] * x[3]; + x[1] = b[1] - lptr[6 * nc + 1] * x[6] - lptr[5 * nc + 1] * x[5] - lptr[4 * nc + 1] * x[4] - lptr[3 * nc + 1] * x[3] - lptr[2 * nc + 1] * x[2]; + x[0] = b[0] - lptr[6 * nc + 0] * x[6] - lptr[5 * nc + 0] * x[5] - lptr[4 * nc + 0] * x[4] - lptr[3 * nc + 0] * x[3] - lptr[2 * nc + 0] * x[2] - lptr[1 * nc + 0] * x[1]; + return; } return; } @@ -10458,49 +10586,49 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo mov ebx, b // ebx = b mov edx, nc // edx = nc*sizeof(float) shl edx, 2 - process4rows_1: + process4rows_1: movlps xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1] movhps xmm0, [ebx+eax*4-8] // load b[i-4], b[i-3] xor ecx, ecx sub eax, m neg eax jz done4x4_1 - process4x4_1: // process 4x4 blocks + process4x4_1: // process 4x4 blocks movlps xmm2, [edi+0] movhps xmm2, [edi+8] add edi, edx movss xmm1, [esi+4*ecx+0] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps xmm3, [edi+0] - movhps xmm3, [edi+8] + movlps xmm3, [edi + 0] + movhps xmm3, [edi + 8] add edi, edx mulps xmm1, xmm2 subps xmm0, xmm1 - movss xmm1, [esi+4*ecx+4] + movss xmm1, [esi + 4 * ecx + 4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps xmm4, [edi+0] - movhps xmm4, [edi+8] + movlps xmm4, [edi + 0] + movhps xmm4, [edi + 8] add edi, edx mulps xmm1, xmm3 subps xmm0, xmm1 - movss xmm1, [esi+4*ecx+8] + movss xmm1, [esi + 4 * ecx + 8] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps xmm5, [edi+0] - movhps xmm5, [edi+8] + movlps xmm5, [edi + 0] + movhps xmm5, [edi + 8] add edi, edx mulps xmm1, xmm4 subps xmm0, xmm1 - movss xmm1, [esi+4*ecx+12] + movss xmm1, [esi + 4 * ecx + 12] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) add ecx, 4 cmp ecx, eax mulps xmm1, xmm5 subps xmm0, xmm1 jl process4x4_1 - done4x4_1: // process left over of the 4 rows - movlps xmm2, [edi+0] - movhps xmm2, [edi+8] - movss xmm1, [esi+4*ecx] + done4x4_1: // process left over of the 4 rows + movlps xmm2, [edi + 0] + movhps xmm2, [edi + 8] + movss xmm1, [esi + 4 * ecx] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm1, xmm2 subps xmm0, xmm1 @@ -10517,34 +10645,34 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 ) sub edi, edx - movss [esi-4], xmm3 // xptr[-1] = s3 + movss [esi - 4], xmm3 // xptr[-1] = s3 movss xmm4, xmm3 movss xmm5, xmm3 - mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3 - mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3 - mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3 + mulss xmm3, [edi + 8] // lptr[-1*nc+2] * s3 + mulss xmm4, [edi + 4] // lptr[-1*nc+1] * s3 + mulss xmm5, [edi + 0] // lptr[-1*nc+0] * s3 subss xmm2, xmm3 - movss [esi-8], xmm2 // xptr[-2] = s2 + movss [esi - 8], xmm2 // xptr[-2] = s2 movss xmm6, xmm2 sub edi, edx subss xmm0, xmm5 subss xmm1, xmm4 - mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2 - mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2 + mulss xmm2, [edi + 4] // lptr[-2*nc+1] * s2 + mulss xmm6, [edi + 0] // lptr[-2*nc+0] * s2 subss xmm1, xmm2 - movss [esi-12], xmm1 // xptr[-3] = s1 + movss [esi - 12], xmm1 // xptr[-3] = s1 subss xmm0, xmm6 sub edi, edx cmp eax, 4 - mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1 + mulss xmm1, [edi + 0] // lptr[-3*nc+0] * s1 subss xmm0, xmm1 - movss [esi-16], xmm0 // xptr[-4] = s0 + movss [esi - 16], xmm0 // xptr[-4] = s0 jl done4rows_1 sub edi, edx sub edi, 16 sub esi, 16 jmp process4rows_1 - done4rows_1: + done4rows_1: pop ebx } @@ -10560,39 +10688,39 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo mov ebx, b // ebx = b mov edx, nc // edx = nc*sizeof(float) shl edx, 2 - process4rows: + process4rows: movlps xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1] movhps xmm0, [ebx+eax*4-8] // load b[i-4], b[i-3] sub eax, m jz done4x4 neg eax xor ecx, ecx - process4x4: // process 4x4 blocks + process4x4: // process 4x4 blocks movlps xmm2, [edi+0] movhps xmm2, [edi+8] add edi, edx movss xmm1, [esi+4*ecx+0] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps xmm3, [edi+0] - movhps xmm3, [edi+8] + movlps xmm3, [edi + 0] + movhps xmm3, [edi + 8] add edi, edx mulps xmm1, xmm2 subps xmm0, xmm1 - movss xmm1, [esi+4*ecx+4] + movss xmm1, [esi + 4 * ecx + 4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps xmm4, [edi+0] - movhps xmm4, [edi+8] + movlps xmm4, [edi + 0] + movhps xmm4, [edi + 8] add edi, edx mulps xmm1, xmm3 subps xmm0, xmm1 - movss xmm1, [esi+4*ecx+8] + movss xmm1, [esi + 4 * ecx + 8] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps xmm5, [edi+0] - movhps xmm5, [edi+8] + movlps xmm5, [edi + 0] + movhps xmm5, [edi + 8] add edi, edx mulps xmm1, xmm4 subps xmm0, xmm1 - movss xmm1, [esi+4*ecx+12] + movss xmm1, [esi + 4 * ecx + 12] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) add ecx, 4 cmp ecx, eax @@ -10602,7 +10730,7 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo imul ecx, edx sub edi, ecx neg eax - done4x4: // process left over of the 4 rows + done4x4: // process left over of the 4 rows add eax, m sub eax, 4 movaps xmm1, xmm0 @@ -10612,44 +10740,44 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 ) sub edi, edx - movss [esi-4], xmm3 // xptr[-1] = s3 + movss [esi - 4], xmm3 // xptr[-1] = s3 movss xmm4, xmm3 movss xmm5, xmm3 - mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3 - mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3 - mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3 + mulss xmm3, [edi + 8] // lptr[-1*nc+2] * s3 + mulss xmm4, [edi + 4] // lptr[-1*nc+1] * s3 + mulss xmm5, [edi + 0] // lptr[-1*nc+0] * s3 subss xmm2, xmm3 - movss [esi-8], xmm2 // xptr[-2] = s2 + movss [esi - 8], xmm2 // xptr[-2] = s2 movss xmm6, xmm2 sub edi, edx subss xmm0, xmm5 subss xmm1, xmm4 - mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2 - mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2 + mulss xmm2, [edi + 4] // lptr[-2*nc+1] * s2 + mulss xmm6, [edi + 0] // lptr[-2*nc+0] * s2 subss xmm1, xmm2 - movss [esi-12], xmm1 // xptr[-3] = s1 + movss [esi - 12], xmm1 // xptr[-3] = s1 subss xmm0, xmm6 sub edi, edx cmp eax, 4 - mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1 + mulss xmm1, [edi + 0] // lptr[-3*nc+0] * s1 subss xmm0, xmm1 - movss [esi-16], xmm0 // xptr[-4] = s0 + movss [esi - 16], xmm0 // xptr[-4] = s0 jl done4rows sub edi, edx sub edi, 16 sub esi, 16 jmp process4rows - done4rows: + done4rows: pop ebx } } // process left over rows - for ( i = (m&3)-1; i >= 0; i-- ) { + for ( i = ( m & 3 ) - 1; i >= 0; i-- ) { s0 = b[i]; lptr = L[0] + i; for ( j = i + 1; j < n; j++ ) { - s0 -= lptr[j*nc] * x[j]; + s0 -= lptr[j * nc] * x[j]; } x[i] = s0; } @@ -10671,14 +10799,14 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo xptr = x + m; // process 4 rows at a time for ( i = m; i >= 4; i -= 4 ) { - s0 = b[i-4]; - s1 = b[i-3]; - s2 = b[i-2]; - s3 = b[i-1]; + s0 = b[i - 4]; + s1 = b[i - 3]; + s2 = b[i - 2]; + s3 = b[i - 1]; // process 4x4 blocks xptr2 = xptr; // x + i; lptr2 = lptr; // ptr = L[i] + i - 4; - for ( j = 0; j < m-i; j += 4 ) { + for ( j = 0; j < m - i; j += 4 ) { t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; @@ -10740,14 +10868,14 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo xptr = x + m; // process 4 rows at a time for ( i = m; i >= 4; i -= 4 ) { - s0 = b[i-4]; - s1 = b[i-3]; - s2 = b[i-2]; - s3 = b[i-1]; + s0 = b[i - 4]; + s1 = b[i - 3]; + s2 = b[i - 2]; + s3 = b[i - 1]; // process 4x4 blocks xptr2 = xptr; // x + i; lptr2 = lptr; // ptr = L[i] + i - 4; - for ( j = 0; j < m-i; j += 4 ) { + for ( j = 0; j < m - i; j += 4 ) { t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; @@ -10803,7 +10931,7 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo s0 = b[i]; lptr = L[0] + i; for ( j = i + 1; j < m; j++ ) { - s0 -= lptr[j*nc] * x[j]; + s0 -= lptr[j * nc] * x[j]; } x[i] = s0; } @@ -10827,8 +10955,8 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int float *v, *diag, *invDiagPtr, *mptr; double s0, s1, s2, sum, d; - v = (float *) _alloca16( n * sizeof( float ) ); - diag = (float *) _alloca16( n * sizeof( float ) ); + v = ( float * ) _alloca16( n * sizeof( float ) ); + diag = ( float * ) _alloca16( n * sizeof( float ) ); invDiagPtr = invDiag.ToFloatPtr(); nc = mat.GetNumColumns(); @@ -10856,12 +10984,13 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int mptr = mat[0]; for ( j = 1; j < n; j++ ) { - mptr[j*nc+0] = ( mptr[j*nc+0] ) * d; + mptr[j * nc + 0] = ( mptr[j * nc + 0] ) * d; } mptr = mat[1]; - v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; + v[0] = diag[0] * mptr[0]; + s0 = v[0] * mptr[0]; sum = mptr[1] - s0; if ( sum == 0.0f ) { @@ -10878,13 +11007,15 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int mptr = mat[0]; for ( j = 2; j < n; j++ ) { - mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d; + mptr[j * nc + 1] = ( mptr[j * nc + 1] - v[0] * mptr[j * nc + 0] ) * d; } mptr = mat[2]; - v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; - v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; + v[0] = diag[0] * mptr[0]; + s0 = v[0] * mptr[0]; + v[1] = diag[1] * mptr[1]; + s1 = v[1] * mptr[1]; sum = mptr[2] - s0 - s1; if ( sum == 0.0f ) { @@ -10901,14 +11032,17 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int mptr = mat[0]; for ( j = 3; j < n; j++ ) { - mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d; + mptr[j * nc + 2] = ( mptr[j * nc + 2] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] ) * d; } mptr = mat[3]; - v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; - v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; - v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2]; + v[0] = diag[0] * mptr[0]; + s0 = v[0] * mptr[0]; + v[1] = diag[1] * mptr[1]; + s1 = v[1] * mptr[1]; + v[2] = diag[2] * mptr[2]; + s2 = v[2] * mptr[2]; sum = mptr[3] - s0 - s1 - s2; if ( sum == 0.0f ) { @@ -10925,7 +11059,7 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int mptr = mat[0]; for ( j = 4; j < n; j++ ) { - mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d; + mptr[j * nc + 3] = ( mptr[j * nc + 3] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] - v[2] * mptr[j * nc + 2] ) * d; } int ncf = nc * sizeof( float ); @@ -10939,158 +11073,158 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int push ebx mov ebx, 4 - loopRow: - cmp ebx, n - jge done - - mov ecx, ebx // esi = i - shl ecx, 2 // esi = i * 4 - mov edx, diag // edx = diag - add edx, ecx // edx = &diag[i] - mov edi, ebx // edi = i - imul edi, ncf // edi = i * nc * sizeof( float ) - add edi, mptr // edi = mat[i] - add edi, ecx // edi = &mat[i][i] - mov esi, v // ecx = v - add esi, ecx // ecx = &v[i] - mov eax, invDiagPtr // eax = invDiagPtr - add eax, ecx // eax = &invDiagPtr[i] - neg ecx - - movaps xmm0, [edx+ecx] - mulps xmm0, [edi+ecx] - movaps [esi+ecx], xmm0 - mulps xmm0, [edi+ecx] - add ecx, 12*4 - jg doneDot8 + loopRow: + cmp ebx, n + jge done + + mov ecx, ebx // esi = i + shl ecx, 2 // esi = i * 4 + mov edx, diag // edx = diag + add edx, ecx // edx = &diag[i] + mov edi, ebx // edi = i + imul edi, ncf // edi = i * nc * sizeof( float ) + add edi, mptr // edi = mat[i] + add edi, ecx // edi = &mat[i][i] + mov esi, v // ecx = v + add esi, ecx // ecx = &v[i] + mov eax, invDiagPtr // eax = invDiagPtr + add eax, ecx // eax = &invDiagPtr[i] + neg ecx + + movaps xmm0, [edx+ecx] + mulps xmm0, [edi+ecx] + movaps [esi+ecx], xmm0 + mulps xmm0, [edi+ecx] + add ecx, 12*4 + jg doneDot8 dot8: - movaps xmm1, [edx+ecx-(8*4)] - mulps xmm1, [edi+ecx-(8*4)] - movaps [esi+ecx-(8*4)], xmm1 - mulps xmm1, [edi+ecx-(8*4)] - addps xmm0, xmm1 - movaps xmm2, [edx+ecx-(4*4)] - mulps xmm2, [edi+ecx-(4*4)] - movaps [esi+ecx-(4*4)], xmm2 - mulps xmm2, [edi+ecx-(4*4)] - addps xmm0, xmm2 - add ecx, 8*4 - jle dot8 + movaps xmm1, [edx+ecx-( 8*4 )] + mulps xmm1, [edi+ecx-( 8*4 )] + movaps [esi+ecx-( 8*4 )], xmm1 + mulps xmm1, [edi+ecx-( 8*4 )] + addps xmm0, xmm1 + movaps xmm2, [edx+ecx-( 4*4 )] + mulps xmm2, [edi+ecx-( 4*4 )] + movaps [esi+ecx-( 4*4 )], xmm2 + mulps xmm2, [edi+ecx-( 4*4 )] + addps xmm0, xmm2 + add ecx, 8*4 + jle dot8 doneDot8: - sub ecx, 4*4 - jg doneDot4 - movaps xmm1, [edx+ecx-(4*4)] - mulps xmm1, [edi+ecx-(4*4)] - movaps [esi+ecx-(4*4)], xmm1 - mulps xmm1, [edi+ecx-(4*4)] - addps xmm0, xmm1 - add ecx, 4*4 + sub ecx, 4*4 + jg doneDot4 + movaps xmm1, [edx+ecx-( 4*4 )] + mulps xmm1, [edi+ecx-( 4*4 )] + movaps [esi+ecx-( 4*4 )], xmm1 + mulps xmm1, [edi+ecx-( 4*4 )] + addps xmm0, xmm1 + add ecx, 4*4 doneDot4: - sub ecx, 2*4 - jg doneDot2 - movlps xmm3, [edx+ecx-(2*4)] - movlps xmm4, [edi+ecx-(2*4)] - mulps xmm3, xmm4 - movlps [esi+ecx-(2*4)], xmm3 - mulps xmm3, xmm4 - addps xmm0, xmm3 - add ecx, 2*4 + sub ecx, 2*4 + jg doneDot2 + movlps xmm3, [edx+ecx-( 2*4 )] + movlps xmm4, [edi+ecx-( 2*4 )] + mulps xmm3, xmm4 + movlps [esi+ecx-( 2*4 )], xmm3 + mulps xmm3, xmm4 + addps xmm0, xmm3 + add ecx, 2*4 doneDot2: - sub ecx, 1*4 - jg doneDot1 - movss xmm3, [edx+ecx-(1*4)] - movss xmm4, [edi+ecx-(1*4)] - mulss xmm3, xmm4 - movss [esi+ecx-(1*4)], xmm3 - mulss xmm3, xmm4 - addss xmm0, xmm3 + sub ecx, 1*4 + jg doneDot1 + movss xmm3, [edx+ecx-( 1*4 )] + movss xmm4, [edi+ecx-( 1*4 )] + mulss xmm3, xmm4 + movss [esi+ecx-( 1*4 )], xmm3 + mulss xmm3, xmm4 + addss xmm0, xmm3 doneDot1: - movhlps xmm2, xmm0 - addps xmm0, xmm2 - movaps xmm2, xmm0 - shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 ) - addss xmm0, xmm2 - movss xmm1, [edi] - subss xmm1, xmm0 - movss [edi], xmm1 // mptr[i] = sum; - movss [edx], xmm1 // diag[i] = sum; - - // if ( sum == 0.0f ) return false; - movaps xmm2, xmm1 - cmpeqss xmm2, SIMD_SP_zero - andps xmm2, SIMD_SP_tiny - orps xmm1, xmm2 - - rcpss xmm7, xmm1 - mulss xmm1, xmm7 - mulss xmm1, xmm7 - addss xmm7, xmm7 - subss xmm7, xmm1 - movss [eax], xmm7 // invDiagPtr[i] = 1.0f / sum; - - mov edx, n // edx = n - sub edx, ebx // edx = n - i - dec edx // edx = n - i - 1 - jle doneSubRow // if ( i + 1 >= n ) return true; - - mov eax, ebx // eax = i - shl eax, 2 // eax = i * 4 - neg eax + movhlps xmm2, xmm0 + addps xmm0, xmm2 + movaps xmm2, xmm0 + shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 ) + addss xmm0, xmm2 + movss xmm1, [edi] + subss xmm1, xmm0 + movss [edi], xmm1 // mptr[i] = sum; + movss [edx], xmm1 // diag[i] = sum; + + // if ( sum == 0.0f ) return false; + movaps xmm2, xmm1 + cmpeqss xmm2, SIMD_SP_zero + andps xmm2, SIMD_SP_tiny + orps xmm1, xmm2 + + rcpss xmm7, xmm1 + mulss xmm1, xmm7 + mulss xmm1, xmm7 + addss xmm7, xmm7 + subss xmm7, xmm1 + movss [eax], xmm7 // invDiagPtr[i] = 1.0f / sum; + + mov edx, n // edx = n + sub edx, ebx // edx = n - i + dec edx // edx = n - i - 1 + jle doneSubRow // if ( i + 1 >= n ) return true; + + mov eax, ebx // eax = i + shl eax, 2 // eax = i * 4 + neg eax loopSubRow: - add edi, ncf - mov ecx, eax - movaps xmm0, [esi+ecx] - mulps xmm0, [edi+ecx] - add ecx, 12*4 - jg doneSubDot8 - subDot8: - movaps xmm1, [esi+ecx-(8*4)] - mulps xmm1, [edi+ecx-(8*4)] - addps xmm0, xmm1 - movaps xmm2, [esi+ecx-(4*4)] - mulps xmm2, [edi+ecx-(4*4)] - addps xmm0, xmm2 - add ecx, 8*4 - jle subDot8 - doneSubDot8: - sub ecx, 4*4 - jg doneSubDot4 - movaps xmm1, [esi+ecx-(4*4)] - mulps xmm1, [edi+ecx-(4*4)] - addps xmm0, xmm1 - add ecx, 4*4 - doneSubDot4: - sub ecx, 2*4 - jg doneSubDot2 - movlps xmm3, [esi+ecx-(2*4)] - movlps xmm4, [edi+ecx-(2*4)] - mulps xmm3, xmm4 - addps xmm0, xmm3 - add ecx, 2*4 - doneSubDot2: - sub ecx, 1*4 - jg doneSubDot1 - movss xmm3, [esi+ecx-(1*4)] - movss xmm4, [edi+ecx-(1*4)] - mulss xmm3, xmm4 - addss xmm0, xmm3 - doneSubDot1: - movhlps xmm2, xmm0 - addps xmm0, xmm2 - movaps xmm2, xmm0 - shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 ) - addss xmm0, xmm2 - movss xmm1, [edi] - subss xmm1, xmm0 - mulss xmm1, xmm7 - movss [edi], xmm1 - dec edx - jg loopSubRow + add edi, ncf + mov ecx, eax + movaps xmm0, [esi + ecx] + mulps xmm0, [edi + ecx] + add ecx, 12 * 4 + jg doneSubDot8 + subDot8: + movaps xmm1, [esi + ecx - ( 8 * 4 )] + mulps xmm1, [edi + ecx - ( 8 * 4 )] + addps xmm0, xmm1 + movaps xmm2, [esi + ecx - ( 4 * 4 )] + mulps xmm2, [edi + ecx - ( 4 * 4 )] + addps xmm0, xmm2 + add ecx, 8 * 4 + jle subDot8 + doneSubDot8: + sub ecx, 4 * 4 + jg doneSubDot4 + movaps xmm1, [esi + ecx - ( 4 * 4 )] + mulps xmm1, [edi + ecx - ( 4 * 4 )] + addps xmm0, xmm1 + add ecx, 4 * 4 + doneSubDot4: + sub ecx, 2 * 4 + jg doneSubDot2 + movlps xmm3, [esi + ecx - ( 2 * 4 )] + movlps xmm4, [edi + ecx - ( 2 * 4 )] + mulps xmm3, xmm4 + addps xmm0, xmm3 + add ecx, 2 * 4 + doneSubDot2: + sub ecx, 1 * 4 + jg doneSubDot1 + movss xmm3, [esi + ecx - ( 1 * 4 )] + movss xmm4, [edi + ecx - ( 1 * 4 )] + mulss xmm3, xmm4 + addss xmm0, xmm3 + doneSubDot1: + movhlps xmm2, xmm0 + addps xmm0, xmm2 + movaps xmm2, xmm0 + shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 ) + addss xmm0, xmm2 + movss xmm1, [edi] + subss xmm1, xmm0 + mulss xmm1, xmm7 + movss [edi], xmm1 + dec edx + jg loopSubRow doneSubRow: - inc ebx - jmp loopRow - done: + inc ebx + jmp loopRow + done: pop ebx } @@ -11102,8 +11236,8 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int float *v, *diag, *mptr; double s0, s1, s2, s3, sum, d; - v = (float *) _alloca16( n * sizeof( float ) ); - diag = (float *) _alloca16( n * sizeof( float ) ); + v = ( float * ) _alloca16( n * sizeof( float ) ); + diag = ( float * ) _alloca16( n * sizeof( float ) ); nc = mat.GetNumColumns(); @@ -11128,12 +11262,13 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int mptr = mat[0]; for ( j = 1; j < n; j++ ) { - mptr[j*nc+0] = ( mptr[j*nc+0] ) * d; + mptr[j * nc + 0] = ( mptr[j * nc + 0] ) * d; } mptr = mat[1]; - v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; + v[0] = diag[0] * mptr[0]; + s0 = v[0] * mptr[0]; sum = mptr[1] - s0; if ( sum == 0.0f ) { @@ -11150,13 +11285,15 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int mptr = mat[0]; for ( j = 2; j < n; j++ ) { - mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d; + mptr[j * nc + 1] = ( mptr[j * nc + 1] - v[0] * mptr[j * nc + 0] ) * d; } mptr = mat[2]; - v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; - v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; + v[0] = diag[0] * mptr[0]; + s0 = v[0] * mptr[0]; + v[1] = diag[1] * mptr[1]; + s1 = v[1] * mptr[1]; sum = mptr[2] - s0 - s1; if ( sum == 0.0f ) { @@ -11173,14 +11310,17 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int mptr = mat[0]; for ( j = 3; j < n; j++ ) { - mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d; + mptr[j * nc + 2] = ( mptr[j * nc + 2] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] ) * d; } mptr = mat[3]; - v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; - v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; - v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2]; + v[0] = diag[0] * mptr[0]; + s0 = v[0] * mptr[0]; + v[1] = diag[1] * mptr[1]; + s1 = v[1] * mptr[1]; + v[2] = diag[2] * mptr[2]; + s2 = v[2] * mptr[2]; sum = mptr[3] - s0 - s1 - s2; if ( sum == 0.0f ) { @@ -11197,27 +11337,41 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int mptr = mat[0]; for ( j = 4; j < n; j++ ) { - mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d; + mptr[j * nc + 3] = ( mptr[j * nc + 3] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] - v[2] * mptr[j * nc + 2] ) * d; } for ( i = 4; i < n; i++ ) { mptr = mat[i]; - v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; - v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; - v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2]; - v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3]; - for ( k = 4; k < i-3; k += 4 ) { - v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0]; - v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1]; - v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2]; - v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3]; + v[0] = diag[0] * mptr[0]; + s0 = v[0] * mptr[0]; + v[1] = diag[1] * mptr[1]; + s1 = v[1] * mptr[1]; + v[2] = diag[2] * mptr[2]; + s2 = v[2] * mptr[2]; + v[3] = diag[3] * mptr[3]; + s3 = v[3] * mptr[3]; + for ( k = 4; k < i - 3; k += 4 ) { + v[k + 0] = diag[k + 0] * mptr[k + 0]; + s0 += v[k + 0] * mptr[k + 0]; + v[k + 1] = diag[k + 1] * mptr[k + 1]; + s1 += v[k + 1] * mptr[k + 1]; + v[k + 2] = diag[k + 2] * mptr[k + 2]; + s2 += v[k + 2] * mptr[k + 2]; + v[k + 3] = diag[k + 3] * mptr[k + 3]; + s3 += v[k + 3] * mptr[k + 3]; } - switch( i - k ) { - case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2]; - case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1]; - case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0]; + switch ( i - k ) { + case 3: + v[k + 2] = diag[k + 2] * mptr[k + 2]; + s0 += v[k + 2] * mptr[k + 2]; + case 2: + v[k + 1] = diag[k + 1] * mptr[k + 1]; + s1 += v[k + 1] * mptr[k + 1]; + case 1: + v[k + 0] = diag[k + 0] * mptr[k + 0]; + s2 += v[k + 0] * mptr[k + 0]; } sum = s3; sum += s2; @@ -11237,30 +11391,37 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int return true; } - mptr = mat[i+1]; - for ( j = i+1; j < n; j++ ) { + mptr = mat[i + 1]; + for ( j = i + 1; j < n; j++ ) { s0 = mptr[0] * v[0]; s1 = mptr[1] * v[1]; s2 = mptr[2] * v[2]; s3 = mptr[3] * v[3]; - for ( k = 4; k < i-7; k += 8 ) { - s0 += mptr[k+0] * v[k+0]; - s1 += mptr[k+1] * v[k+1]; - s2 += mptr[k+2] * v[k+2]; - s3 += mptr[k+3] * v[k+3]; - s0 += mptr[k+4] * v[k+4]; - s1 += mptr[k+5] * v[k+5]; - s2 += mptr[k+6] * v[k+6]; - s3 += mptr[k+7] * v[k+7]; + for ( k = 4; k < i - 7; k += 8 ) { + s0 += mptr[k + 0] * v[k + 0]; + s1 += mptr[k + 1] * v[k + 1]; + s2 += mptr[k + 2] * v[k + 2]; + s3 += mptr[k + 3] * v[k + 3]; + s0 += mptr[k + 4] * v[k + 4]; + s1 += mptr[k + 5] * v[k + 5]; + s2 += mptr[k + 6] * v[k + 6]; + s3 += mptr[k + 7] * v[k + 7]; } - switch( i - k ) { - case 7: s0 += mptr[k+6] * v[k+6]; - case 6: s1 += mptr[k+5] * v[k+5]; - case 5: s2 += mptr[k+4] * v[k+4]; - case 4: s3 += mptr[k+3] * v[k+3]; - case 3: s0 += mptr[k+2] * v[k+2]; - case 2: s1 += mptr[k+1] * v[k+1]; - case 1: s2 += mptr[k+0] * v[k+0]; + switch ( i - k ) { + case 7: + s0 += mptr[k + 6] * v[k + 6]; + case 6: + s1 += mptr[k + 5] * v[k + 5]; + case 5: + s2 += mptr[k + 4] * v[k + 4]; + case 4: + s3 += mptr[k + 3] * v[k + 3]; + case 3: + s0 += mptr[k + 2] * v[k + 2]; + case 2: + s1 += mptr[k + 1] * v[k + 1]; + case 1: + s2 += mptr[k + 0] * v[k + 0]; } sum = s3; sum += s2; @@ -11313,7 +11474,7 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble ALIGN16( float blendQuat3[4] ); for ( int j = 0; j < 4; j++ ) { - int n = index[i+j]; + int n = index[i + j]; jointVert0[j] = joints[n].t[0]; jointVert1[j] = joints[n].t[1]; @@ -11551,15 +11712,15 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble cosom[2] += jointQuat3[2] * blendQuat3[2]; cosom[3] += jointQuat3[3] * blendQuat3[3]; - signBit[0] = (*(unsigned int *)&cosom[0]) & ( 1 << 31 ); - signBit[1] = (*(unsigned int *)&cosom[1]) & ( 1 << 31 ); - signBit[2] = (*(unsigned int *)&cosom[2]) & ( 1 << 31 ); - signBit[3] = (*(unsigned int *)&cosom[3]) & ( 1 << 31 ); + signBit[0] = ( *( unsigned int * )&cosom[0] ) & ( 1 << 31 ); + signBit[1] = ( *( unsigned int * )&cosom[1] ) & ( 1 << 31 ); + signBit[2] = ( *( unsigned int * )&cosom[2] ) & ( 1 << 31 ); + signBit[3] = ( *( unsigned int * )&cosom[3] ) & ( 1 << 31 ); - (*(unsigned int *)&cosom[0]) ^= signBit[0]; - (*(unsigned int *)&cosom[1]) ^= signBit[1]; - (*(unsigned int *)&cosom[2]) ^= signBit[2]; - (*(unsigned int *)&cosom[3]) ^= signBit[3]; + ( *( unsigned int * )&cosom[0] ) ^= signBit[0]; + ( *( unsigned int * )&cosom[1] ) ^= signBit[1]; + ( *( unsigned int * )&cosom[2] ) ^= signBit[2]; + ( *( unsigned int * )&cosom[3] ) ^= signBit[3]; scale0[0] = 1.0f - cosom[0] * cosom[0]; scale0[1] = 1.0f - cosom[1] * cosom[1]; @@ -11606,10 +11767,10 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2]; scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3]; - (*(unsigned int *)&scale1[0]) ^= signBit[0]; - (*(unsigned int *)&scale1[1]) ^= signBit[1]; - (*(unsigned int *)&scale1[2]) ^= signBit[2]; - (*(unsigned int *)&scale1[3]) ^= signBit[3]; + ( *( unsigned int * )&scale1[0] ) ^= signBit[0]; + ( *( unsigned int * )&scale1[1] ) ^= signBit[1]; + ( *( unsigned int * )&scale1[2] ) ^= signBit[2]; + ( *( unsigned int * )&scale1[3] ) ^= signBit[3]; jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0]; jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1]; @@ -11634,7 +11795,7 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble #endif for ( int j = 0; j < 4; j++ ) { - int n = index[i+j]; + int n = index[i + j]; joints[n].t[0] = jointVert0[j]; joints[n].t[1] = jointVert1[j]; @@ -11669,9 +11830,9 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w; - signBit = (*(unsigned int *)&cosom) & ( 1 << 31 ); + signBit = ( *( unsigned int * )&cosom ) & ( 1 << 31 ); - (*(unsigned int *)&cosom) ^= signBit; + ( *( unsigned int * )&cosom ) ^= signBit; scale0 = 1.0f - cosom * cosom; scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0; @@ -11680,7 +11841,7 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom; scale1 = idMath::Sin16( lerp * omega ) * sinom; - (*(unsigned int *)&scale1) ^= signBit; + ( *( unsigned int * )&scale1 ) ^= signBit; jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x; jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y; @@ -11698,16 +11859,16 @@ void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, con assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE ); assert( sizeof( idJointMat ) == JOINTMAT_SIZE ); - assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) ); + assert( ( int )( &( ( idJointQuat * )0 )->t ) == ( int )( &( ( idJointQuat * )0 )->q ) + ( int )sizeof( ( ( idJointQuat * )0 )->q ) ); for ( int i = 0; i < numJoints; i++ ) { const float *q = jointQuats[i].q.ToFloatPtr(); float *m = jointMats[i].ToFloatPtr(); - m[0*4+3] = q[4]; - m[1*4+3] = q[5]; - m[2*4+3] = q[6]; + m[0 * 4 + 3] = q[4]; + m[1 * 4 + 3] = q[5]; + m[2 * 4 + 3] = q[6]; float x2 = q[0] + q[0]; float y2 = q[1] + q[1]; @@ -11718,33 +11879,33 @@ void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, con float yy = q[1] * y2; float zz = q[2] * z2; - m[0*4+0] = 1.0f - yy - zz; - m[1*4+1] = 1.0f - xx - zz; - m[2*4+2] = 1.0f - xx - yy; + m[0 * 4 + 0] = 1.0f - yy - zz; + m[1 * 4 + 1] = 1.0f - xx - zz; + m[2 * 4 + 2] = 1.0f - xx - yy; } { float yz = q[1] * z2; float wx = q[3] * x2; - m[2*4+1] = yz - wx; - m[1*4+2] = yz + wx; + m[2 * 4 + 1] = yz - wx; + m[1 * 4 + 2] = yz + wx; } { float xy = q[0] * y2; float wz = q[3] * z2; - m[1*4+0] = xy - wz; - m[0*4+1] = xy + wz; + m[1 * 4 + 0] = xy - wz; + m[0 * 4 + 1] = xy + wz; } { float xz = q[0] * z2; float wy = q[3] * y2; - m[0*4+2] = xz - wy; - m[2*4+0] = xz + wy; + m[0 * 4 + 2] = xz - wy; + m[2 * 4 + 0] = xz + wy; } } } @@ -11758,7 +11919,7 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE ); assert( sizeof( idJointMat ) == JOINTMAT_SIZE ); - assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) ); + assert( ( int )( &( ( idJointQuat * )0 )->t ) == ( int )( &( ( idJointQuat * )0 )->q ) + ( int )sizeof( ( ( idJointQuat * )0 )->q ) ); #if 1 @@ -11774,7 +11935,7 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c add esi, eax neg eax - loopMat4: + loopMat4: movss xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4] movss xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4] movss xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4] @@ -11783,9 +11944,9 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 ) - movss xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4] - movss xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4] - movss xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4] + movss xmm0, [esi + eax + 2 * JOINTMAT_SIZE + 0 * 16 + 0 * 4] + movss xmm1, [esi + eax + 2 * JOINTMAT_SIZE + 1 * 16 + 1 * 4] + movss xmm2, [esi + eax + 2 * JOINTMAT_SIZE + 2 * 16 + 2 * 4] movss xmm5, xmm0 movss xmm6, xmm1 @@ -11795,9 +11956,9 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 ) - movss xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4] - movss xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4] - movss xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4] + movss xmm0, [esi + eax + 1 * JOINTMAT_SIZE + 0 * 16 + 0 * 4] + movss xmm1, [esi + eax + 1 * JOINTMAT_SIZE + 1 * 16 + 1 * 4] + movss xmm2, [esi + eax + 1 * JOINTMAT_SIZE + 2 * 16 + 2 * 4] movss xmm5, xmm0 movss xmm6, xmm1 @@ -11807,9 +11968,9 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 ) - movss xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4] - movss xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4] - movss xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4] + movss xmm0, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 0 * 4] + movss xmm1, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 1 * 4] + movss xmm2, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 2 * 4] movss xmm5, xmm0 movss xmm6, xmm1 @@ -11881,38 +12042,38 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c // ------------------- - add edi, 4*JOINTQUAT_SIZE + add edi, 4 * JOINTQUAT_SIZE - movzx ecx, byte ptr shuffle[0*4+0] // ecx = k0 - movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; + movzx ecx, byte ptr shuffle[0 * 4 + 0] // ecx = k0 + movss [edi + ecx * 4 - 4 * JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; - movzx edx, byte ptr shuffle[0*4+1] // edx = k1 - movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4] + movzx edx, byte ptr shuffle[0 * 4 + 1] // edx = k1 + movss xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 0 * 4] xorps xmm4, xmm2 - subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4] + subss xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 1 * 4] mulss xmm4, xmm6 - movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; + movss [edi + edx * 4 - 4 * JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; - movzx ecx, byte ptr shuffle[0*4+2] // ecx = k2 - movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4] + movzx ecx, byte ptr shuffle[0 * 4 + 2] // ecx = k2 + movss xmm3, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 2 * 4] xorps xmm3, xmm1 - subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4] + subss xmm3, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 0 * 4] mulss xmm3, xmm6 - movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; + movss [edi + ecx * 4 - 4 * JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; - movzx edx, byte ptr shuffle[0*4+3] // edx = k3 - movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4] + movzx edx, byte ptr shuffle[0 * 4 + 3] // edx = k3 + movss xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 1 * 4] xorps xmm4, xmm0 - subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4] + subss xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 2 * 4] mulss xmm4, xmm6 - movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; + movss [edi + edx * 4 - 4 * JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; - mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4] - mov [edi-4*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; - mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4] - mov [edi-4*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; - mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4] - mov [edi-4*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; + mov ecx, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 3 * 4] + mov [edi - 4 * JOINTQUAT_SIZE + 16], ecx // q[4] = m[0 * 4 + 3]; + mov edx, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 3 * 4] + mov [edi - 4 * JOINTQUAT_SIZE + 20], edx // q[5] = m[1 * 4 + 3]; + mov ecx, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 3 * 4] + mov [edi - 4 * JOINTQUAT_SIZE + 24], ecx // q[6] = m[2 * 4 + 3]; shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) @@ -11920,36 +12081,36 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movzx ecx, byte ptr shuffle[1*4+0] // ecx = k0 - movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; + movzx ecx, byte ptr shuffle[1 * 4 + 0] // ecx = k0 + movss [edi + ecx * 4 - 3 * JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; - movzx edx, byte ptr shuffle[1*4+1] // edx = k1 - movss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4] + movzx edx, byte ptr shuffle[1 * 4 + 1] // edx = k1 + movss xmm4, [esi + eax + 1 * JOINTMAT_SIZE + 1 * 16 + 0 * 4] xorps xmm4, xmm2 - subss xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4] + subss xmm4, [esi + eax + 1 * JOINTMAT_SIZE + 0 * 16 + 1 * 4] mulss xmm4, xmm6 - movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; + movss [edi + edx * 4 - 3 * JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; - movzx ecx, byte ptr shuffle[1*4+2] // ecx = k2 - movss xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4] + movzx ecx, byte ptr shuffle[1 * 4 + 2] // ecx = k2 + movss xmm3, [esi + eax + 1 * JOINTMAT_SIZE + 0 * 16 + 2 * 4] xorps xmm3, xmm1 - subss xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4] + subss xmm3, [esi + eax + 1 * JOINTMAT_SIZE + 2 * 16 + 0 * 4] mulss xmm3, xmm6 - movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; + movss [edi + ecx * 4 - 3 * JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; - movzx edx, byte ptr shuffle[1*4+3] // edx = k3 - movss xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4] + movzx edx, byte ptr shuffle[1 * 4 + 3] // edx = k3 + movss xmm4, [esi + eax + 1 * JOINTMAT_SIZE + 2 * 16 + 1 * 4] xorps xmm4, xmm0 - subss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4] + subss xmm4, [esi + eax + 1 * JOINTMAT_SIZE + 1 * 16 + 2 * 4] mulss xmm4, xmm6 - movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; + movss [edi + edx * 4 - 3 * JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; - mov ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4] - mov [edi-3*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; - mov edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4] - mov [edi-3*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; - mov ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4] - mov [edi-3*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; + mov ecx, [esi + eax + 1 * JOINTMAT_SIZE + 0 * 16 + 3 * 4] + mov [edi - 3 * JOINTQUAT_SIZE + 16], ecx // q[4] = m[0 * 4 + 3]; + mov edx, [esi + eax + 1 * JOINTMAT_SIZE + 1 * 16 + 3 * 4] + mov [edi - 3 * JOINTQUAT_SIZE + 20], edx // q[5] = m[1 * 4 + 3]; + mov ecx, [esi + eax + 1 * JOINTMAT_SIZE + 2 * 16 + 3 * 4] + mov [edi - 3 * JOINTQUAT_SIZE + 24], ecx // q[6] = m[2 * 4 + 3]; shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) @@ -11957,36 +12118,36 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movzx ecx, byte ptr shuffle[2*4+0] // ecx = k0 - movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; + movzx ecx, byte ptr shuffle[2 * 4 + 0] // ecx = k0 + movss [edi + ecx * 4 - 2 * JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; - movzx edx, byte ptr shuffle[2*4+1] // edx = k1 - movss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4] + movzx edx, byte ptr shuffle[2 * 4 + 1] // edx = k1 + movss xmm4, [esi + eax + 2 * JOINTMAT_SIZE + 1 * 16 + 0 * 4] xorps xmm4, xmm2 - subss xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4] + subss xmm4, [esi + eax + 2 * JOINTMAT_SIZE + 0 * 16 + 1 * 4] mulss xmm4, xmm6 - movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; + movss [edi + edx * 4 - 2 * JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; - movzx ecx, byte ptr shuffle[2*4+2] // ecx = k2 - movss xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4] + movzx ecx, byte ptr shuffle[2 * 4 + 2] // ecx = k2 + movss xmm3, [esi + eax + 2 * JOINTMAT_SIZE + 0 * 16 + 2 * 4] xorps xmm3, xmm1 - subss xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4] + subss xmm3, [esi + eax + 2 * JOINTMAT_SIZE + 2 * 16 + 0 * 4] mulss xmm3, xmm6 - movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; + movss [edi + ecx * 4 - 2 * JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; - movzx edx, byte ptr shuffle[2*4+3] // edx = k3 - movss xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4] + movzx edx, byte ptr shuffle[2 * 4 + 3] // edx = k3 + movss xmm4, [esi + eax + 2 * JOINTMAT_SIZE + 2 * 16 + 1 * 4] xorps xmm4, xmm0 - subss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4] + subss xmm4, [esi + eax + 2 * JOINTMAT_SIZE + 1 * 16 + 2 * 4] mulss xmm4, xmm6 - movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; + movss [edi + edx * 4 - 2 * JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; - mov ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4] - mov [edi-2*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; - mov edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4] - mov [edi-2*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; - mov ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4] - mov [edi-2*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; + mov ecx, [esi + eax + 2 * JOINTMAT_SIZE + 0 * 16 + 3 * 4] + mov [edi - 2 * JOINTQUAT_SIZE + 16], ecx // q[4] = m[0 * 4 + 3]; + mov edx, [esi + eax + 2 * JOINTMAT_SIZE + 1 * 16 + 3 * 4] + mov [edi - 2 * JOINTQUAT_SIZE + 20], edx // q[5] = m[1 * 4 + 3]; + mov ecx, [esi + eax + 2 * JOINTMAT_SIZE + 2 * 16 + 3 * 4] + mov [edi - 2 * JOINTQUAT_SIZE + 24], ecx // q[6] = m[2 * 4 + 3]; shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) @@ -11994,41 +12155,41 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movzx ecx, byte ptr shuffle[3*4+0] // ecx = k0 - movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; + movzx ecx, byte ptr shuffle[3 * 4 + 0] // ecx = k0 + movss [edi + ecx * 4 - 1 * JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; - movzx edx, byte ptr shuffle[3*4+1] // edx = k1 - movss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4] + movzx edx, byte ptr shuffle[3 * 4 + 1] // edx = k1 + movss xmm4, [esi + eax + 3 * JOINTMAT_SIZE + 1 * 16 + 0 * 4] xorps xmm4, xmm2 - subss xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4] + subss xmm4, [esi + eax + 3 * JOINTMAT_SIZE + 0 * 16 + 1 * 4] mulss xmm4, xmm6 - movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; + movss [edi + edx * 4 - 1 * JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; - movzx ecx, byte ptr shuffle[3*4+2] // ecx = k2 - movss xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4] + movzx ecx, byte ptr shuffle[3 * 4 + 2] // ecx = k2 + movss xmm3, [esi + eax + 3 * JOINTMAT_SIZE + 0 * 16 + 2 * 4] xorps xmm3, xmm1 - subss xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4] + subss xmm3, [esi + eax + 3 * JOINTMAT_SIZE + 2 * 16 + 0 * 4] mulss xmm3, xmm6 - movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; + movss [edi + ecx * 4 - 1 * JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; - movzx edx, byte ptr shuffle[3*4+3] // edx = k3 - movss xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4] + movzx edx, byte ptr shuffle[3 * 4 + 3] // edx = k3 + movss xmm4, [esi + eax + 3 * JOINTMAT_SIZE + 2 * 16 + 1 * 4] xorps xmm4, xmm0 - subss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4] + subss xmm4, [esi + eax + 3 * JOINTMAT_SIZE + 1 * 16 + 2 * 4] mulss xmm4, xmm6 - movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; + movss [edi + edx * 4 - 1 * JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; - mov ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4] - mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; - mov edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4] - mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; - mov ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4] - mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; + mov ecx, [esi + eax + 3 * JOINTMAT_SIZE + 0 * 16 + 3 * 4] + mov [edi - 1 * JOINTQUAT_SIZE + 16], ecx // q[4] = m[0 * 4 + 3]; + mov edx, [esi + eax + 3 * JOINTMAT_SIZE + 1 * 16 + 3 * 4] + mov [edi - 1 * JOINTQUAT_SIZE + 20], edx // q[5] = m[1 * 4 + 3]; + mov ecx, [esi + eax + 3 * JOINTMAT_SIZE + 2 * 16 + 3 * 4] + mov [edi - 1 * JOINTQUAT_SIZE + 24], ecx // q[6] = m[2 * 4 + 3]; - add eax, 4*JOINTMAT_SIZE + add eax, 4 * JOINTMAT_SIZE jl loopMat4 - done4: + done4: mov eax, numJoints and eax, 3 jz done1 @@ -12036,10 +12197,10 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c add esi, eax neg eax - loopMat1: - movss xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4] - movss xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4] - movss xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4] + loopMat1: + movss xmm5, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 0 * 4] + movss xmm6, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 1 * 4] + movss xmm7, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 2 * 4] // ------------------- @@ -12109,40 +12270,40 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c movzx ecx, byte ptr shuffle[0] // ecx = k0 add edi, JOINTQUAT_SIZE - movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; + movss [edi + ecx * 4 - 1 * JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[1] // edx = k1 - movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4] + movss xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 0 * 4] xorps xmm4, xmm2 - subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4] + subss xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 1 * 4] mulss xmm4, xmm6 - movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; + movss [edi + edx * 4 - 1 * JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[2] // ecx = k2 - movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4] + movss xmm3, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 2 * 4] xorps xmm3, xmm1 - subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4] + subss xmm3, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 0 * 4] mulss xmm3, xmm6 - movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; + movss [edi + ecx * 4 - 1 * JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[3] // edx = k3 - movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4] + movss xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 1 * 4] xorps xmm4, xmm0 - subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4] + subss xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 2 * 4] mulss xmm4, xmm6 - movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; + movss [edi + edx * 4 - 1 * JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; - mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4] - mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; - mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4] - mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; - mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4] - mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; + mov ecx, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 3 * 4] + mov [edi - 1 * JOINTQUAT_SIZE + 16], ecx // q[4] = m[0 * 4 + 3]; + mov edx, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 3 * 4] + mov [edi - 1 * JOINTQUAT_SIZE + 20], edx // q[5] = m[1 * 4 + 3]; + mov ecx, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 3 * 4] + mov [edi - 1 * JOINTQUAT_SIZE + 24], ecx // q[6] = m[2 * 4 + 3]; add eax, JOINTMAT_SIZE jl loopMat1 - done1: + done1: } #elif 0 @@ -12291,7 +12452,7 @@ void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *paren add edi, eax neg eax - loopJoint: + loopJoint: movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0 mov edx, [edi+eax] @@ -12303,70 +12464,70 @@ void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *paren shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm0 - movss xmm5, [esi+edx+ 4] + movss xmm5, [esi + edx + 4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm1 addps xmm4, xmm5 - movss xmm6, [esi+edx+ 8] + movss xmm6, [esi + edx + 8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm2 addps xmm4, xmm6 - movss xmm5, [esi+edx+16] + movss xmm5, [esi + edx + 16] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm0 - movss xmm7, [esi+edx+12] + movss xmm7, [esi + edx + 12] shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) addps xmm4, xmm7 - movaps [esi+ecx+ 0], xmm4 + movaps [esi + ecx + 0], xmm4 - movss xmm6, [esi+edx+20] + movss xmm6, [esi + edx + 20] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm1 addps xmm5, xmm6 - movss xmm7, [esi+edx+24] + movss xmm7, [esi + edx + 24] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm5, xmm7 - movss xmm6, [esi+edx+32] + movss xmm6, [esi + edx + 32] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 - movss xmm3, [esi+edx+28] + movss xmm3, [esi + edx + 28] shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 ) addps xmm5, xmm3 - movaps [esi+ecx+16], xmm5 + movaps [esi + ecx + 16], xmm5 - movss xmm7, [esi+edx+36] + movss xmm7, [esi + edx + 36] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 - movss xmm3, [esi+edx+40] + movss xmm3, [esi + edx + 40] shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, xmm2 addps xmm6, xmm3 - movss xmm7, [esi+edx+44] + movss xmm7, [esi + edx + 44] shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) addps xmm6, xmm7 - movaps [esi+ecx+32], xmm6 + movaps [esi + ecx + 32], xmm6 add ecx, JOINTMAT_SIZE add eax, 4 jle loopJoint - done: + done: } #else int i; - for( i = firstJoint; i <= lastJoint; i++ ) { + for ( i = firstJoint; i <= lastJoint; i++ ) { assert( parents[i] < i ); jointMats[i] *= jointMats[parents[i]]; } @@ -12398,7 +12559,7 @@ void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *par add edi, edx imul eax, 4 - loopJoint: + loopJoint: movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0 mov edx, [edi+eax] @@ -12409,66 +12570,66 @@ void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *par movss xmm6, [esi+edx+12] shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 ) subps xmm0, xmm6 - movss xmm7, [esi+edx+28] + movss xmm7, [esi + edx + 28] shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) subps xmm1, xmm7 - movss xmm3, [esi+edx+44] + movss xmm3, [esi + edx + 44] shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 ) subps xmm2, xmm3 - movss xmm4, [esi+edx+ 0] + movss xmm4, [esi + edx + 0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm0 - movss xmm5, [esi+edx+16] + movss xmm5, [esi + edx + 16] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm1 addps xmm4, xmm5 - movss xmm6, [esi+edx+32] + movss xmm6, [esi + edx + 32] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm2 addps xmm4, xmm6 - movaps [esi+ecx+ 0], xmm4 + movaps [esi + ecx + 0], xmm4 - movss xmm5, [esi+edx+ 4] + movss xmm5, [esi + edx + 4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm0 - movss xmm6, [esi+edx+20] + movss xmm6, [esi + edx + 20] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm1 addps xmm5, xmm6 - movss xmm7, [esi+edx+36] + movss xmm7, [esi + edx + 36] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm5, xmm7 - movaps [esi+ecx+16], xmm5 + movaps [esi + ecx + 16], xmm5 - movss xmm6, [esi+edx+ 8] + movss xmm6, [esi + edx + 8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 - movss xmm7, [esi+edx+24] + movss xmm7, [esi + edx + 24] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 - movss xmm3, [esi+edx+40] + movss xmm3, [esi + edx + 40] shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, xmm2 addps xmm6, xmm3 - movaps [esi+ecx+32], xmm6 + movaps [esi + ecx + 32], xmm6 sub ecx, JOINTMAT_SIZE sub eax, 4 jge loopJoint - done: + done: } #else int i; - for( i = lastJoint; i >= firstJoint; i-- ) { + for ( i = lastJoint; i >= firstJoint; i-- ) { assert( parents[i] < i ); jointMats[i] /= jointMats[parents[i]]; } @@ -12485,12 +12646,11 @@ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, c #if 1 assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE ); assert( sizeof( idJointMat ) == JOINTMAT_SIZE ); - __asm - { + __asm { mov eax, numVerts test eax, eax jz done @@ -12504,7 +12664,7 @@ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, c add ecx, eax neg eax - loopVert: + loopVert: mov ebx, [edx] movaps xmm2, [esi] add edx, 8 @@ -12520,7 +12680,7 @@ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, c jne doneWeight - loopWeight: + loopWeight: mov ebx, [edx] movaps xmm5, [esi] add edx, 8 @@ -12540,7 +12700,7 @@ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, c je loopWeight - doneWeight: + doneWeight: add eax, DRAWVERT_SIZE movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0 @@ -12559,24 +12719,24 @@ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, c shufps xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 ) // xmm5 = m7+t2, m6+m8 addss xmm5, xmm6 // xmm5 = m6+m8+m7+t2 - movss [ecx+eax-DRAWVERT_SIZE+8], xmm5 + movss [ecx + eax - DRAWVERT_SIZE + 8], xmm5 jl loopVert - done: + done: } #else int i, j; - const byte *jointsPtr = (byte *)joints; + const byte *jointsPtr = ( byte * )joints; - for( j = i = 0; i < numVerts; i++ ) { + for ( j = i = 0; i < numVerts; i++ ) { idVec3 v; - v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j]; - while( index[j*2+1] == 0 ) { + v = ( *( idJointMat * )( jointsPtr + index[j * 2 + 0] ) ) * weights[j]; + while ( index[j * 2 + 1] == 0 ) { j++; - v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j]; + v += ( *( idJointMat * )( jointsPtr + index[j * 2 + 0] ) ) * weights[j]; } j++; @@ -12595,7 +12755,7 @@ void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const flo #if 1 assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); __asm { push ebx @@ -12628,13 +12788,13 @@ void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const flo add esi, eax neg eax - loopVert: - movss xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0] + loopVert: + movss xmm4, [esi + eax + DRAWVERT_XYZ_OFFSET + 0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4] + movss xmm5, [esi + eax + DRAWVERT_XYZ_OFFSET + 4] mulps xmm4, xmm0 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) - movss xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8] + movss xmm6, [esi + eax + DRAWVERT_XYZ_OFFSET + 8] mulps xmm5, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) addps xmm4, xmm5 @@ -12652,10 +12812,10 @@ void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const flo inc edi or dl, cl add eax, DRAWVERT_SIZE - mov byte ptr [edi-1], cl + mov byte ptr [edi - 1], cl jl loopVert - done: + done: mov esi, totalOr mov byte ptr [esi], dl pop ebx @@ -12725,7 +12885,7 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c ALIGN16( float p7[4] ); assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); __asm { mov ecx, planes @@ -12749,13 +12909,13 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c movaps p2, xmm2 movaps p3, xmm3 - movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X - movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51 + movlps xmm4, [ecx + 64] // xmm4 = p40, p41, X, X + movhps xmm4, [ecx + 80] // xmm4 = p40, p41, p50, p51 movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51 - movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X - movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53 + movlps xmm6, [ecx + 72] // xmm6 = p42, p43, X, X + movhps xmm6, [ecx + 88] // xmm6 = p42, p43, p52, p53 movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52 shufps xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53 @@ -12774,18 +12934,18 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c add esi, eax neg eax - loopVert2: + loopVert2: movaps xmm6, p0 - movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + movss xmm0, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm7, p1 - movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] + movss xmm1, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movaps xmm7, p2 - movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] + movss xmm2, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 @@ -12795,16 +12955,16 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c movmskps ecx, xmm6 movaps xmm6, p0 - movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + movss xmm3, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm3 movaps xmm7, p1 - movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] + movss xmm4, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movaps xmm7, p2 - movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] + movss xmm5, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 @@ -12832,30 +12992,30 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c mov dh, dl shl dl, 4 shl dh, 2 - and edx, (3<<4)|(3<<12) + and edx, ( 3 << 4 ) | ( 3 << 12 ) or ecx, edx - add eax, 2*DRAWVERT_SIZE - mov word ptr [edi-2], cx + add eax, 2 * DRAWVERT_SIZE + mov word ptr [edi - 2], cx jl loopVert2 - done2: + done2: mov eax, numVerts and eax, 1 jz done movaps xmm6, p0 - movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0] + movss xmm0, [esi + DRAWVERT_XYZ_OFFSET + 0] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm7, p1 - movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4] + movss xmm1, [esi + DRAWVERT_XYZ_OFFSET + 4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movaps xmm7, p2 - movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8] + movss xmm2, [esi + DRAWVERT_XYZ_OFFSET + 8] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 @@ -12880,7 +13040,7 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c mov byte ptr [edi], cl - done: + done: } @@ -12891,8 +13051,8 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c for ( i = 0; i < numVerts; i += 2 ) { unsigned short bits0, bits1; float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; - const idVec3 &v0 = verts[i+0].xyz; - const idVec3 &v1 = verts[i+1].xyz; + const idVec3 &v0 = verts[i + 0].xyz; + const idVec3 &v1 = verts[i + 1].xyz; d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3]; d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3]; @@ -12909,21 +13069,21 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3]; d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3]; - bits0 = FLOATSIGNBITSET( d0 ) << (0+0); - bits0 |= FLOATSIGNBITSET( d1 ) << (0+1); - bits0 |= FLOATSIGNBITSET( d2 ) << (0+2); - bits0 |= FLOATSIGNBITSET( d3 ) << (0+3); - bits0 |= FLOATSIGNBITSET( d4 ) << (0+4); - bits0 |= FLOATSIGNBITSET( d5 ) << (0+5); - - bits1 = FLOATSIGNBITSET( d6 ) << (8+0); - bits1 |= FLOATSIGNBITSET( d7 ) << (8+1); - bits1 |= FLOATSIGNBITSET( d8 ) << (8+2); - bits1 |= FLOATSIGNBITSET( d9 ) << (8+3); - bits1 |= FLOATSIGNBITSET( d10 ) << (8+4); - bits1 |= FLOATSIGNBITSET( d11 ) << (8+5); - - *(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F; + bits0 = FLOATSIGNBITSET( d0 ) << ( 0 + 0 ); + bits0 |= FLOATSIGNBITSET( d1 ) << ( 0 + 1 ); + bits0 |= FLOATSIGNBITSET( d2 ) << ( 0 + 2 ); + bits0 |= FLOATSIGNBITSET( d3 ) << ( 0 + 3 ); + bits0 |= FLOATSIGNBITSET( d4 ) << ( 0 + 4 ); + bits0 |= FLOATSIGNBITSET( d5 ) << ( 0 + 5 ); + + bits1 = FLOATSIGNBITSET( d6 ) << ( 8 + 0 ); + bits1 |= FLOATSIGNBITSET( d7 ) << ( 8 + 1 ); + bits1 |= FLOATSIGNBITSET( d8 ) << ( 8 + 2 ); + bits1 |= FLOATSIGNBITSET( d9 ) << ( 8 + 3 ); + bits1 |= FLOATSIGNBITSET( d10 ) << ( 8 + 4 ); + bits1 |= FLOATSIGNBITSET( d11 ) << ( 8 + 5 ); + + *( unsigned short * )( cullBits + i ) = ( bits0 | bits1 ) ^ 0x3F3F; } if ( numVerts & 1 ) { @@ -12962,7 +13122,7 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con #if 1 assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); __asm { mov eax, numVerts @@ -12975,16 +13135,16 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con movss xmm5, [ecx+16] shufps xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) - movss xmm5, [ecx+ 4] - movss xmm6, [ecx+20] + movss xmm5, [ecx + 4] + movss xmm6, [ecx + 20] shufps xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) shufps xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) - movss xmm6, [ecx+ 8] - movss xmm7, [ecx+24] + movss xmm6, [ecx + 8] + movss xmm7, [ecx + 24] shufps xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 ) - movss xmm7, [ecx+12] - movss xmm0, [ecx+28] + movss xmm7, [ecx + 12] + movss xmm0, [ecx + 28] shufps xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 ) @@ -12993,17 +13153,17 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con add edi, eax neg eax - loopVert2: - movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] - movss xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + loopVert2: + movss xmm0, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] + movss xmm1, [edx + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm0, xmm4 - movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] - movss xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] + movss xmm1, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] + movss xmm2, [edx + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] shufps xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm1, xmm5 - movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] - movss xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] + movss xmm2, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] + movss xmm3, [edx + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] shufps xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm2, xmm6 addps xmm0, xmm1 @@ -13015,27 +13175,27 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con subps xmm2, xmm0 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) shufps xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 ) - add edx, 2*DRAWVERT_SIZE + add edx, 2 * DRAWVERT_SIZE movmskps ecx, xmm0 - mov byte ptr [edi+eax+0], cl - add esi, 4*4 + mov byte ptr [edi + eax + 0], cl + add esi, 4 * 4 movmskps ecx, xmm1 - mov byte ptr [edi+eax+1], cl + mov byte ptr [edi + eax + 1], cl add eax, 2 jl loopVert2 - done2: + done2: mov eax, numVerts and eax, 1 jz done - movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + movss xmm0, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm0, xmm4 - movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] + movss xmm1, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm1, xmm5 - movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] + movss xmm2, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm2, xmm6 addps xmm0, xmm1 @@ -13049,7 +13209,7 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con movmskps ecx, xmm0 mov byte ptr [edi], cl - done: + done: } #else @@ -13061,18 +13221,18 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con unsigned short bits; float d0, d1, d2, d3; - const idVec3 &v0 = verts[i+0].xyz; - const idVec3 &v1 = verts[i+1].xyz; + const idVec3 &v0 = verts[i + 0].xyz; + const idVec3 &v1 = verts[i + 1].xyz; d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3]; d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3]; d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3]; d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3]; - texCoords[i+0][0] = d0; - texCoords[i+0][1] = d1; - texCoords[i+1][0] = d2; - texCoords[i+1][1] = d3; + texCoords[i + 0][0] = d0; + texCoords[i + 0][1] = d1; + texCoords[i + 1][0] = d2; + texCoords[i + 1][1] = d3; bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; @@ -13089,7 +13249,7 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con bits |= FLOATSIGNBITSET( d2 ) << 10; bits |= FLOATSIGNBITSET( d3 ) << 11; - *(unsigned short *)(cullBits + i) = bits; + *( unsigned short * )( cullBits + i ) = bits; } if ( numVerts & 1 ) { @@ -13130,7 +13290,7 @@ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *vert #if 1 assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); __asm { mov eax, numIndexes @@ -13145,7 +13305,7 @@ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *vert add eax, 4*12 jge done4 - loopPlane4: + loopPlane4: mov ebx, [edi+eax-4*12+4] imul ebx, DRAWVERT_SIZE mov ecx, [edi+eax-4*12+0] @@ -13167,129 +13327,129 @@ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *vert shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 ) - movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] - subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] + movss xmm3, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0] + subss xmm3, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] - movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] - subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] + movss xmm4, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4] + subss xmm4, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] - movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] - subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + movss xmm5, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8] + subss xmm5, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] - mov ebx, [edi+eax-3*12+4] + mov ebx, [edi + eax - 3 * 12 + 4] imul ebx, DRAWVERT_SIZE - mov ecx, [edi+eax-3*12+0] + mov ecx, [edi + eax - 3 * 12 + 0] imul ecx, DRAWVERT_SIZE shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 ) - movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] - subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] + movss xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0] + subss xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] movss xmm0, xmm6 - movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] - subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] + movss xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4] + subss xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] movss xmm1, xmm7 - movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] - subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + movss xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8] + subss xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] movss xmm2, xmm6 - mov ebx, [edi+eax-3*12+8] + mov ebx, [edi + eax - 3 * 12 + 8] imul ebx, DRAWVERT_SIZE shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 ) - movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] - subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] + movss xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0] + subss xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] movss xmm3, xmm7 - movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] - subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] + movss xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4] + subss xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] movss xmm4, xmm6 - movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] - subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + movss xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8] + subss xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] movss xmm5, xmm7 - mov ebx, [edi+eax-2*12+4] + mov ebx, [edi + eax - 2 * 12 + 4] imul ebx, DRAWVERT_SIZE - mov ecx, [edi+eax-2*12+0] + mov ecx, [edi + eax - 2 * 12 + 0] imul ecx, DRAWVERT_SIZE shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 ) - movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] - subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] + movss xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0] + subss xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] movss xmm0, xmm6 - movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] - subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] + movss xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4] + subss xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] movss xmm1, xmm7 - movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] - subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + movss xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8] + subss xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] movss xmm2, xmm6 - mov ebx, [edi+eax-2*12+8] + mov ebx, [edi + eax - 2 * 12 + 8] imul ebx, DRAWVERT_SIZE shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 ) - movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] - subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] + movss xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0] + subss xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] movss xmm3, xmm7 - movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] - subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] + movss xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4] + subss xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] movss xmm4, xmm6 - movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] - subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + movss xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8] + subss xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] movss xmm5, xmm7 - mov ebx, [edi+eax-1*12+4] + mov ebx, [edi + eax - 1 * 12 + 4] imul ebx, DRAWVERT_SIZE - mov ecx, [edi+eax-1*12+0] + mov ecx, [edi + eax - 1 * 12 + 0] imul ecx, DRAWVERT_SIZE shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 ) - movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] - subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] + movss xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0] + subss xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] movss xmm0, xmm6 - movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] - subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] + movss xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4] + subss xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] movss xmm1, xmm7 - movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] - subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + movss xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8] + subss xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] movss xmm2, xmm6 - mov ebx, [edi+eax-1*12+8] + mov ebx, [edi + eax - 1 * 12 + 8] imul ebx, DRAWVERT_SIZE - movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] - subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] + movss xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0] + subss xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] movss xmm3, xmm7 - movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] - subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] + movss xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4] + subss xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] movss xmm4, xmm6 - movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] - subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + movss xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8] + subss xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] movss xmm5, xmm7 movaps xmm6, xmm4 @@ -13317,121 +13477,121 @@ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *vert addps xmm3, xmm6 rsqrtps xmm3, xmm3 - add edx, 4*16 - mov ecx, [edi+eax-1*12+0] + add edx, 4 * 16 + mov ecx, [edi + eax - 1 * 12 + 0] imul ecx, DRAWVERT_SIZE mulps xmm0, xmm3 mulps xmm1, xmm3 mulps xmm2, xmm3 - movss [edx-1*16+0], xmm0 - movss [edx-1*16+4], xmm1 - movss [edx-1*16+8], xmm2 + movss [edx - 1 * 16 + 0], xmm0 + movss [edx - 1 * 16 + 4], xmm1 + movss [edx - 1 * 16 + 8], xmm2 - mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] - mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] - mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + mulss xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] + mulss xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] + mulss xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] xorps xmm0, SIMD_SP_singleSignBitMask subss xmm0, xmm1 subss xmm0, xmm2 - movss [edx-1*16+12], xmm0 + movss [edx - 1 * 16 + 12], xmm0 - mov ecx, [edi+eax-2*12+0] + mov ecx, [edi + eax - 2 * 12 + 0] imul ecx, DRAWVERT_SIZE shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [edx-2*16+0], xmm0 - movss [edx-2*16+4], xmm1 - movss [edx-2*16+8], xmm2 + movss [edx - 2 * 16 + 0], xmm0 + movss [edx - 2 * 16 + 4], xmm1 + movss [edx - 2 * 16 + 8], xmm2 - mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] - mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] - mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + mulss xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] + mulss xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] + mulss xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] xorps xmm0, SIMD_SP_singleSignBitMask subss xmm0, xmm1 subss xmm0, xmm2 - movss [edx-2*16+12], xmm0 + movss [edx - 2 * 16 + 12], xmm0 - mov ecx, [edi+eax-3*12+0] + mov ecx, [edi + eax - 3 * 12 + 0] imul ecx, DRAWVERT_SIZE shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [edx-3*16+0], xmm0 - movss [edx-3*16+4], xmm1 - movss [edx-3*16+8], xmm2 + movss [edx - 3 * 16 + 0], xmm0 + movss [edx - 3 * 16 + 4], xmm1 + movss [edx - 3 * 16 + 8], xmm2 - mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] - mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] - mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + mulss xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] + mulss xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] + mulss xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] xorps xmm0, SIMD_SP_singleSignBitMask subss xmm0, xmm1 subss xmm0, xmm2 - movss [edx-3*16+12], xmm0 + movss [edx - 3 * 16 + 12], xmm0 - mov ecx, [edi+eax-4*12+0] + mov ecx, [edi + eax - 4 * 12 + 0] imul ecx, DRAWVERT_SIZE shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [edx-4*16+0], xmm0 - movss [edx-4*16+4], xmm1 - movss [edx-4*16+8], xmm2 + movss [edx - 4 * 16 + 0], xmm0 + movss [edx - 4 * 16 + 4], xmm1 + movss [edx - 4 * 16 + 8], xmm2 - mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] - mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] - mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + mulss xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] + mulss xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] + mulss xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] xorps xmm0, SIMD_SP_singleSignBitMask subss xmm0, xmm1 subss xmm0, xmm2 - movss [edx-4*16+12], xmm0 + movss [edx - 4 * 16 + 12], xmm0 - add eax, 4*12 + add eax, 4 * 12 jle loopPlane4 - done4: + done4: - sub eax, 4*12 + sub eax, 4 * 12 jge done - loopPlane1: - mov ebx, [edi+eax+4] + loopPlane1: + mov ebx, [edi + eax + 4] imul ebx, DRAWVERT_SIZE - mov ecx, [edi+eax+0] + mov ecx, [edi + eax + 0] imul ecx, DRAWVERT_SIZE - movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] - subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] + movss xmm0, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0] + subss xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] - movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] - subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] + movss xmm1, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4] + subss xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] - movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] - subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + movss xmm2, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8] + subss xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] - mov ebx, [edi+eax+8] + mov ebx, [edi + eax + 8] imul ebx, DRAWVERT_SIZE - movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] - subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] + movss xmm3, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0] + subss xmm3, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] - movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] - subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] + movss xmm4, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4] + subss xmm4, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] - movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] - subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + movss xmm5, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8] + subss xmm5, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] movss xmm6, xmm4 mulss xmm6, xmm2 @@ -13458,29 +13618,29 @@ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *vert addss xmm3, xmm6 rsqrtss xmm3, xmm3 - add edx, 1*16 + add edx, 1 * 16 mulss xmm0, xmm3 mulss xmm1, xmm3 mulss xmm2, xmm3 - movss [edx-1*16+0], xmm0 - movss [edx-1*16+4], xmm1 - movss [edx-1*16+8], xmm2 + movss [edx - 1 * 16 + 0], xmm0 + movss [edx - 1 * 16 + 4], xmm1 + movss [edx - 1 * 16 + 8], xmm2 - mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] - mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] - mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] + mulss xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0] + mulss xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4] + mulss xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8] xorps xmm0, SIMD_SP_singleSignBitMask subss xmm0, xmm1 subss xmm0, xmm2 - movss [edx-1*16+12], xmm0 + movss [edx - 1 * 16 + 12], xmm0 - add eax, 1*12 + add eax, 1 * 12 jl loopPlane1 - done: + done: } #else @@ -13646,9 +13806,9 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons int i; assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET ); - assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); - assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->normal == DRAWVERT_NORMAL_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); assert( planes != NULL ); assert( verts != NULL ); @@ -13661,7 +13821,7 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons } #endif - bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) ); + bool *used = ( bool * )_alloca16( numVerts * sizeof( used[0] ) ); memset( used, 0, numVerts * sizeof( used[0] ) ); for ( i = 0; i <= numIndexes - 12; i += 12 ) { @@ -13969,10 +14129,10 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons tmp[2] -= d4[2] * d8[2]; tmp[3] -= d4[3] * d8[3]; - signBit[0] = ( *(unsigned int *)&tmp[0] ) & ( 1 << 31 ); - signBit[1] = ( *(unsigned int *)&tmp[1] ) & ( 1 << 31 ); - signBit[2] = ( *(unsigned int *)&tmp[2] ) & ( 1 << 31 ); - signBit[3] = ( *(unsigned int *)&tmp[3] ) & ( 1 << 31 ); + signBit[0] = ( *( unsigned int * )&tmp[0] ) & ( 1 << 31 ); + signBit[1] = ( *( unsigned int * )&tmp[1] ) & ( 1 << 31 ); + signBit[2] = ( *( unsigned int * )&tmp[2] ) & ( 1 << 31 ); + signBit[3] = ( *( unsigned int * )&tmp[3] ) & ( 1 << 31 ); // first tangent t0[0] = d0[0] * d9[0]; @@ -14025,10 +14185,10 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons tmp[2] = idMath::RSqrt( tmp[2] ); tmp[3] = idMath::RSqrt( tmp[3] ); - *(unsigned int *)&tmp[0] ^= signBit[0]; - *(unsigned int *)&tmp[1] ^= signBit[1]; - *(unsigned int *)&tmp[2] ^= signBit[2]; - *(unsigned int *)&tmp[3] ^= signBit[3]; + *( unsigned int * )&tmp[0] ^= signBit[0]; + *( unsigned int * )&tmp[1] ^= signBit[1]; + *( unsigned int * )&tmp[2] ^= signBit[2]; + *( unsigned int * )&tmp[3] ^= signBit[3]; t0[0] *= tmp[0]; t0[1] *= tmp[1]; @@ -14096,10 +14256,10 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons tmp[2] = idMath::RSqrt( tmp[2] ); tmp[3] = idMath::RSqrt( tmp[3] ); - *(unsigned int *)&tmp[0] ^= signBit[0]; - *(unsigned int *)&tmp[1] ^= signBit[1]; - *(unsigned int *)&tmp[2] ^= signBit[2]; - *(unsigned int *)&tmp[3] ^= signBit[3]; + *( unsigned int * )&tmp[0] ^= signBit[0]; + *( unsigned int * )&tmp[1] ^= signBit[1]; + *( unsigned int * )&tmp[2] ^= signBit[2]; + *( unsigned int * )&tmp[3] ^= signBit[3]; t3[0] *= tmp[0]; t3[1] *= tmp[1]; @@ -14448,7 +14608,7 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons // area sign bit tmp = d3 * d9 - d4 * d8; - signBit[0] = ( *(unsigned int *)&tmp ) & ( 1 << 31 ); + signBit[0] = ( *( unsigned int * )&tmp ) & ( 1 << 31 ); // first tangent t0 = d0 * d9 - d4 * d5; @@ -14456,7 +14616,7 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons t2 = d2 * d9 - d4 * d7; tmp = idMath::RSqrt( t0 * t0 + t1 * t1 + t2 * t2 ); - *(unsigned int *)&tmp ^= signBit[0]; + *( unsigned int * )&tmp ^= signBit[0]; t0 *= tmp; t1 *= tmp; @@ -14468,7 +14628,7 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons t5 = d3 * d7 - d2 * d8; tmp = idMath::RSqrt( t3 * t3 + t4 * t4 + t5 * t5 ); - *(unsigned int *)&tmp ^= signBit[0]; + *( unsigned int * )&tmp ^= signBit[0]; t3 *= tmp; t4 *= tmp; @@ -14605,7 +14765,7 @@ void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const domin for ( j = 0; j < 4; j++ ) { const idDrawVert *a, *b, *c; - const dominantTri_s &dt = dominantTris[i+j]; + const dominantTri_s &dt = dominantTris[i + j]; s0[j] = dt.normalizationScale[0]; s1[j] = dt.normalizationScale[1]; @@ -15097,9 +15257,9 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts ALIGN16( float normal[12] ); assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET ); - assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); - assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->normal == DRAWVERT_NORMAL_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); assert( verts != NULL ); assert( numVerts >= 0 ); @@ -15122,7 +15282,7 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts sub eax, DRAWVERT_SIZE*4 jl loopVert1 - loopVert4: + loopVert4: sub eax, DRAWVERT_SIZE*4 @@ -15172,48 +15332,48 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts // save the 4 idDrawVert::normal to project the tangents - movaps [normal+ 0], xmm0 - movaps [normal+16], xmm1 - movaps [normal+32], xmm2 + movaps [normal + 0], xmm0 + movaps [normal + 16], xmm1 + movaps [normal + 32], xmm2 - movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_NORMAL_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_NORMAL_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_NORMAL_OFFSET + 8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_NORMAL_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_NORMAL_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_NORMAL_OFFSET + 8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_NORMAL_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_NORMAL_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_NORMAL_OFFSET + 8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_NORMAL_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_NORMAL_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_NORMAL_OFFSET + 8], xmm2 // project and normalize 4 idDrawVert::tangent[0] - movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, X, X - movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, 3, 4 - movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8] // 5, X, X, X - movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4] // 5, X, 1, 2 - movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, X, X - movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, 9, 10 - movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8] // 11, X, X, X - movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4] // 11, X, 7, 8 + movss xmm0, [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT0_OFFSET + 0] // 0, X, X, X + movhps xmm0, [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT0_OFFSET + 0] // 0, X, 3, 4 + movss xmm2, [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT0_OFFSET + 8] // 5, X, X, X + movhps xmm2, [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT0_OFFSET + 4] // 5, X, 1, 2 + movss xmm4, [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT0_OFFSET + 0] // 6, X, X, X + movhps xmm4, [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT0_OFFSET + 0] // 6, X, 9, 10 + movss xmm3, [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT0_OFFSET + 8] // 11, X, X, X + movhps xmm3, [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT0_OFFSET + 4] // 11, X, 7, 8 movaps xmm1, xmm0 movaps xmm5, xmm2 @@ -15227,17 +15387,17 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts movaps xmm4, xmm1 movaps xmm5, xmm2 - mulps xmm3, [normal+ 0] - mulps xmm4, [normal+16] - mulps xmm5, [normal+32] + mulps xmm3, [normal + 0] + mulps xmm4, [normal + 16] + mulps xmm5, [normal + 32] addps xmm3, xmm4 addps xmm3, xmm5 movaps xmm4, xmm3 movaps xmm5, xmm3 - mulps xmm3, [normal+ 0] - mulps xmm4, [normal+16] - mulps xmm5, [normal+32] + mulps xmm3, [normal + 0] + mulps xmm4, [normal + 16] + mulps xmm5, [normal + 32] subps xmm0, xmm3 subps xmm1, xmm4 subps xmm2, xmm5 @@ -15267,44 +15427,44 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts mulps xmm1, xmm3 mulps xmm2, xmm3 - movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT0_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT0_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT0_OFFSET + 8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT0_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT0_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT0_OFFSET + 8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT0_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT0_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT0_OFFSET + 8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT0_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT0_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT0_OFFSET + 8], xmm2 // project and normalize 4 idDrawVert::tangent[1] - movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, X, X - movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, 3, 4 - movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8] // 5, X, X, X - movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4] // 5, X, 1, 2 - movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, X, X - movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, 9, 10 - movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8] // 11, X, X, X - movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4] // 11, X, 7, 8 + movss xmm0, [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT1_OFFSET + 0] // 0, X, X, X + movhps xmm0, [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT1_OFFSET + 0] // 0, X, 3, 4 + movss xmm2, [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT1_OFFSET + 8] // 5, X, X, X + movhps xmm2, [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT1_OFFSET + 4] // 5, X, 1, 2 + movss xmm4, [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT1_OFFSET + 0] // 6, X, X, X + movhps xmm4, [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT1_OFFSET + 0] // 6, X, 9, 10 + movss xmm3, [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT1_OFFSET + 8] // 11, X, X, X + movhps xmm3, [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT1_OFFSET + 4] // 11, X, 7, 8 movaps xmm1, xmm0 movaps xmm5, xmm2 @@ -15318,17 +15478,17 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts movaps xmm4, xmm1 movaps xmm5, xmm2 - mulps xmm3, [normal+ 0] - mulps xmm4, [normal+16] - mulps xmm5, [normal+32] + mulps xmm3, [normal + 0] + mulps xmm4, [normal + 16] + mulps xmm5, [normal + 32] addps xmm3, xmm4 addps xmm3, xmm5 movaps xmm4, xmm3 movaps xmm5, xmm3 - mulps xmm3, [normal+ 0] - mulps xmm4, [normal+16] - mulps xmm5, [normal+32] + mulps xmm3, [normal + 0] + mulps xmm4, [normal + 16] + mulps xmm5, [normal + 32] subps xmm0, xmm3 subps xmm1, xmm4 subps xmm2, xmm5 @@ -15358,48 +15518,48 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts mulps xmm1, xmm3 mulps xmm2, xmm3 - movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT1_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT1_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT1_OFFSET + 8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT1_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT1_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT1_OFFSET + 8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT1_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT1_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT1_OFFSET + 8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT1_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT1_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT1_OFFSET + 8], xmm2 - add eax, DRAWVERT_SIZE*8 + add eax, DRAWVERT_SIZE * 8 jle loopVert4 - sub eax, DRAWVERT_SIZE*4 + sub eax, DRAWVERT_SIZE * 4 jge done - loopVert1: + loopVert1: // normalize one idDrawVert::normal - movss xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0] - movss xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4] - movss xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8] + movss xmm0, [esi + eax + DRAWVERT_NORMAL_OFFSET + 0] + movss xmm1, [esi + eax + DRAWVERT_NORMAL_OFFSET + 4] + movss xmm2, [esi + eax + DRAWVERT_NORMAL_OFFSET + 8] movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 @@ -15425,30 +15585,30 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts mulss xmm1, xmm3 mulss xmm2, xmm3 - movss [esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_NORMAL_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_NORMAL_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_NORMAL_OFFSET + 8], xmm2 // project and normalize one idDrawVert::tangent[0] - movss xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0] - movss xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4] - movss xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8] + movss xmm0, [esi + eax + DRAWVERT_TANGENT0_OFFSET + 0] + movss xmm1, [esi + eax + DRAWVERT_TANGENT0_OFFSET + 4] + movss xmm2, [esi + eax + DRAWVERT_TANGENT0_OFFSET + 8] movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 - mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0] - mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4] - mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8] + mulss xmm3, [esi + eax + DRAWVERT_NORMAL_OFFSET + 0] + mulss xmm4, [esi + eax + DRAWVERT_NORMAL_OFFSET + 4] + mulss xmm5, [esi + eax + DRAWVERT_NORMAL_OFFSET + 8] addss xmm3, xmm4 addss xmm3, xmm5 movss xmm4, xmm3 movss xmm5, xmm3 - mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0] - mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4] - mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8] + mulss xmm3, [esi + eax + DRAWVERT_NORMAL_OFFSET + 0] + mulss xmm4, [esi + eax + DRAWVERT_NORMAL_OFFSET + 4] + mulss xmm5, [esi + eax + DRAWVERT_NORMAL_OFFSET + 8] subss xmm0, xmm3 subss xmm1, xmm4 subss xmm2, xmm5 @@ -15478,30 +15638,30 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts mulss xmm1, xmm3 mulss xmm2, xmm3 - movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_TANGENT0_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_TANGENT0_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_TANGENT0_OFFSET + 8], xmm2 // project and normalize one idDrawVert::tangent[1] - movss xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0] - movss xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4] - movss xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8] + movss xmm0, [esi + eax + DRAWVERT_TANGENT1_OFFSET + 0] + movss xmm1, [esi + eax + DRAWVERT_TANGENT1_OFFSET + 4] + movss xmm2, [esi + eax + DRAWVERT_TANGENT1_OFFSET + 8] movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 - mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0] - mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4] - mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8] + mulss xmm3, [esi + eax + DRAWVERT_NORMAL_OFFSET + 0] + mulss xmm4, [esi + eax + DRAWVERT_NORMAL_OFFSET + 4] + mulss xmm5, [esi + eax + DRAWVERT_NORMAL_OFFSET + 8] addss xmm3, xmm4 addss xmm3, xmm5 movss xmm4, xmm3 movss xmm5, xmm3 - mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0] - mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4] - mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8] + mulss xmm3, [esi + eax + DRAWVERT_NORMAL_OFFSET + 0] + mulss xmm4, [esi + eax + DRAWVERT_NORMAL_OFFSET + 4] + mulss xmm5, [esi + eax + DRAWVERT_NORMAL_OFFSET + 8] subss xmm0, xmm3 subss xmm1, xmm4 subss xmm2, xmm5 @@ -15531,14 +15691,14 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts mulss xmm1, xmm3 mulss xmm2, xmm3 - movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0 - movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1 - movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2 + movss [esi + eax + DRAWVERT_TANGENT1_OFFSET + 0], xmm0 + movss [esi + eax + DRAWVERT_TANGENT1_OFFSET + 4], xmm1 + movss [esi + eax + DRAWVERT_TANGENT1_OFFSET + 8], xmm2 add eax, DRAWVERT_SIZE jl loopVert1 - done: + done: } } @@ -15550,12 +15710,12 @@ idSIMD_SSE::CreateTextureSpaceLightVectors void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); - assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET ); - assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); - assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->normal == DRAWVERT_NORMAL_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); - bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) ); + bool *used = ( bool * )_alloca16( numVerts * sizeof( used[0] ) ); memset( used, 0, numVerts * sizeof( used[0] ) ); for ( int i = numIndexes - 1; i >= 0; i-- ) { @@ -15584,7 +15744,7 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co mov ecx, lightVectors sub ecx, 3*4 - loopVert: + loopVert: inc eax jge done @@ -15625,13 +15785,13 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co addps xmm5, xmm4 addps xmm5, xmm2 - movlps [ecx+0], xmm5 + movlps [ecx + 0], xmm5 shufps xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 ) - movss [ecx+8], xmm5 + movss [ecx + 8], xmm5 jmp loopVert - done: + done: } #elif 1 @@ -15684,7 +15844,7 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co neg eax dec eax - loopVert4: + loopVert4: inc eax jge done4 @@ -15783,9 +15943,9 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co imul edx, 12 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [ecx+edx+0], xmm5 - movss [ecx+edx+4], xmm6 - movss [ecx+edx+8], xmm0 + movss [ecx + edx + 0], xmm5 + movss [ecx + edx + 4], xmm6 + movss [ecx + edx + 8], xmm0 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 ) mov edx, usedVertNums[8] @@ -15793,9 +15953,9 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co imul edx, 12 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [ecx+edx+0], xmm5 - movss [ecx+edx+4], xmm6 - movss [ecx+edx+8], xmm0 + movss [ecx + edx + 0], xmm5 + movss [ecx + edx + 4], xmm6 + movss [ecx + edx + 8], xmm0 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 ) mov edx, usedVertNums[12] @@ -15803,14 +15963,14 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co imul edx, 12 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [ecx+edx+0], xmm5 - movss [ecx+edx+4], xmm6 - movss [ecx+edx+8], xmm0 + movss [ecx + edx + 0], xmm5 + movss [ecx + edx + 4], xmm6 + movss [ecx + edx + 8], xmm0 xor ecx, ecx jmp loopVert4 - done4: + done4: test ecx, ecx jz done xor eax, eax @@ -15818,49 +15978,49 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co imul edi, 12 add edi, lightVectors - loopVert1: - movss xmm0, lightDir0[eax*4] - movss xmm1, lightDir1[eax*4] - movss xmm2, lightDir2[eax*4] + loopVert1: + movss xmm0, lightDir0[eax * 4] + movss xmm1, lightDir1[eax * 4] + movss xmm2, lightDir2[eax * 4] - mov edx, usedVertNums[eax*4] + mov edx, usedVertNums[eax * 4] imul edx, 12 - movss xmm3, tangent0[eax*4] + movss xmm3, tangent0[eax * 4] mulss xmm3, xmm0 - movss xmm4, tangent1[eax*4] + movss xmm4, tangent1[eax * 4] mulss xmm4, xmm1 - movss xmm5, tangent2[eax*4] + movss xmm5, tangent2[eax * 4] mulss xmm5, xmm2 addss xmm3, xmm4 addss xmm5, xmm3 - movss [edi+edx+0], xmm5 + movss [edi + edx + 0], xmm5 - movss xmm3, tangent3[eax*4] + movss xmm3, tangent3[eax * 4] mulss xmm3, xmm0 - movss xmm4, tangent4[eax*4] + movss xmm4, tangent4[eax * 4] mulss xmm4, xmm1 - movss xmm6, tangent5[eax*4] + movss xmm6, tangent5[eax * 4] mulss xmm6, xmm2 addss xmm3, xmm4 addss xmm6, xmm3 - movss [edi+edx+4], xmm6 + movss [edi + edx + 4], xmm6 - mulss xmm0, normal0[eax*4] - mulss xmm1, normal1[eax*4] - mulss xmm2, normal2[eax*4] + mulss xmm0, normal0[eax * 4] + mulss xmm1, normal1[eax * 4] + mulss xmm2, normal2[eax * 4] addss xmm0, xmm1 addss xmm0, xmm2 - movss [edi+edx+8], xmm0 + movss [edi + edx + 8], xmm0 inc eax dec ecx jg loopVert1 - done: + done: } #else @@ -15978,12 +16138,12 @@ idSIMD_SSE::CreateSpecularTextureCoords void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); - assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); - assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET ); - assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); - assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->normal == DRAWVERT_NORMAL_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); + assert( ( int ) & ( ( idDrawVert * )0 )->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); - bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) ); + bool *used = ( bool * )_alloca16( numVerts * sizeof( used[0] ) ); memset( used, 0, numVerts * sizeof( used[0] ) ); for ( int i = numIndexes - 1; i >= 0; i-- ) { @@ -16016,7 +16176,7 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id mov ecx, texCoords sub ecx, 4*4 - loopVert: + loopVert: inc eax jge done @@ -16059,16 +16219,16 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id mulps xmm1, xmm5 addps xmm0, xmm1 - movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0] - movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4] + movss xmm2, [edi + DRAWVERT_TANGENT0_OFFSET + 0] + movhps xmm2, [edi + DRAWVERT_TANGENT0_OFFSET + 4] mulps xmm2, xmm0 - movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0] - movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4] + movss xmm3, [edi + DRAWVERT_TANGENT1_OFFSET + 0] + movhps xmm3, [edi + DRAWVERT_TANGENT1_OFFSET + 4] mulps xmm3, xmm0 - movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0] - movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4] + movss xmm4, [edi + DRAWVERT_NORMAL_OFFSET + 0] + movhps xmm4, [edi + DRAWVERT_NORMAL_OFFSET + 4] mulps xmm4, xmm0 movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2 @@ -16083,12 +16243,12 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id addps xmm5, xmm4 addps xmm5, xmm2 - movaps [ecx+0], xmm5 - movss [ecx+12], xmm3 + movaps [ecx + 0], xmm5 + movss [ecx + 12], xmm3 jmp loopVert - done: + done: } #elif 0 @@ -16159,7 +16319,7 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id neg eax dec eax - loopVert4: + loopVert4: inc eax jge done4 @@ -16302,10 +16462,10 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id shl edx, 4 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [ecx+edx+0], xmm5 - movss [ecx+edx+4], xmm6 - movss [ecx+edx+8], xmm0 - movss [ecx+edx+12], xmm3 + movss [ecx + edx + 0], xmm5 + movss [ecx + edx + 4], xmm6 + movss [ecx + edx + 8], xmm0 + movss [ecx + edx + 12], xmm3 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 ) mov edx, usedVertNums[8] @@ -16313,10 +16473,10 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id shl edx, 4 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [ecx+edx+0], xmm5 - movss [ecx+edx+4], xmm6 - movss [ecx+edx+8], xmm0 - movss [ecx+edx+12], xmm3 + movss [ecx + edx + 0], xmm5 + movss [ecx + edx + 4], xmm6 + movss [ecx + edx + 8], xmm0 + movss [ecx + edx + 12], xmm3 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 ) mov edx, usedVertNums[12] @@ -16324,15 +16484,15 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id shl edx, 4 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) - movss [ecx+edx+0], xmm5 - movss [ecx+edx+4], xmm6 - movss [ecx+edx+8], xmm0 - movss [ecx+edx+12], xmm3 + movss [ecx + edx + 0], xmm5 + movss [ecx + edx + 4], xmm6 + movss [ecx + edx + 8], xmm0 + movss [ecx + edx + 12], xmm3 xor ecx, ecx jmp loopVert4 - done4: + done4: test ecx, ecx jz done xor eax, eax @@ -16340,15 +16500,15 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id shl edi, 4 add edi, texCoords - loopVert1: - movss xmm6, lightDir0[eax*4] + loopVert1: + movss xmm6, lightDir0[eax * 4] movss xmm0, xmm6 mulss xmm6, xmm6 - movss xmm7, lightDir1[eax*4] + movss xmm7, lightDir1[eax * 4] movss xmm1, xmm7 mulss xmm7, xmm7 addss xmm6, xmm7 - movss xmm5, lightDir2[eax*4] + movss xmm5, lightDir2[eax * 4] movss xmm2, xmm5 mulss xmm5, xmm5 addss xmm6, xmm5 @@ -16358,14 +16518,14 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id mulss xmm1, xmm6 mulss xmm2, xmm6 - movss xmm3, viewDir0[eax*4] + movss xmm3, viewDir0[eax * 4] movss xmm7, xmm3 mulss xmm7, xmm7 - movss xmm4, viewDir1[eax*4] + movss xmm4, viewDir1[eax * 4] movss xmm6, xmm4 mulss xmm6, xmm6 addss xmm7, xmm6 - movss xmm5, viewDir2[eax*4] + movss xmm5, viewDir2[eax * 4] movss xmm6, xmm5 mulss xmm6, xmm6 addss xmm7, xmm6 @@ -16378,44 +16538,44 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id mulss xmm5, xmm7 addss xmm2, xmm5 - mov edx, usedVertNums[eax*4] + mov edx, usedVertNums[eax * 4] shl edx, 4 - movss xmm3, tangent0[eax*4] + movss xmm3, tangent0[eax * 4] mulss xmm3, xmm0 - movss xmm4, tangent1[eax*4] + movss xmm4, tangent1[eax * 4] mulss xmm4, xmm1 addss xmm3, xmm4 - movss xmm5, tangent2[eax*4] + movss xmm5, tangent2[eax * 4] mulss xmm5, xmm2 addss xmm5, xmm3 - movss [edi+edx+0], xmm5 + movss [edi + edx + 0], xmm5 - movss xmm3, tangent3[eax*4] + movss xmm3, tangent3[eax * 4] mulss xmm3, xmm0 - movss xmm4, tangent4[eax*4] + movss xmm4, tangent4[eax * 4] mulss xmm4, xmm1 addss xmm3, xmm4 - movss xmm6, tangent5[eax*4] + movss xmm6, tangent5[eax * 4] mulss xmm6, xmm2 addss xmm6, xmm3 - movss [edi+edx+4], xmm6 + movss [edi + edx + 4], xmm6 - mulss xmm0, normal0[eax*4] - mulss xmm1, normal1[eax*4] + mulss xmm0, normal0[eax * 4] + mulss xmm1, normal1[eax * 4] addss xmm0, xmm1 - mulss xmm2, normal2[eax*4] + mulss xmm2, normal2[eax * 4] addss xmm0, xmm2 - movss [edi+edx+8], xmm0 + movss [edi + edx + 8], xmm0 movss xmm3, SIMD_SP_one - movss [edi+edx+12], xmm3 + movss [edi + edx + 12], xmm3 inc eax dec ecx jg loopVert1 - done: + done: } #else @@ -16687,75 +16847,75 @@ int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, c add edx, eax neg eax - loop4: - prefetchnta [edx+128] - prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] + loop4: + prefetchnta [edx + 128] + prefetchnta [esi + 4 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET] - cmp dword ptr [edx+eax+0], ebx + cmp dword ptr [edx + eax + 0], ebx jne skip1 - mov dword ptr [edx+eax+0], ecx - movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] - movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + mov dword ptr [edx + eax + 0], ecx + movss xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] add ecx, 2 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 ); orps xmm0, xmm5 - movaps [edi+0*16], xmm0 + movaps [edi + 0 * 16], xmm0 subps xmm0, xmm6 - movaps [edi+1*16], xmm0 - add edi, 2*16 + movaps [edi + 1 * 16], xmm0 + add edi, 2 * 16 - skip1: - cmp dword ptr [edx+eax+4], ebx + skip1: + cmp dword ptr [edx + eax + 4], ebx jne skip2 - mov dword ptr [edx+eax+4], ecx - movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] - movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] + mov dword ptr [edx + eax + 4], ecx + movss xmm1, [esi + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] + movhps xmm1, [esi + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] add ecx, 2 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 ) orps xmm1, xmm5 - movaps [edi+0*16], xmm1 + movaps [edi + 0 * 16], xmm1 subps xmm1, xmm7 - movaps [edi+1*16], xmm1 - add edi, 2*16 + movaps [edi + 1 * 16], xmm1 + add edi, 2 * 16 - skip2: - cmp dword ptr [edx+eax+8], ebx + skip2: + cmp dword ptr [edx + eax + 8], ebx jne skip3 - mov dword ptr [edx+eax+8], ecx - movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] - movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + mov dword ptr [edx + eax + 8], ecx + movss xmm2, [esi + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm2, [esi + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] add ecx, 2 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 ); orps xmm2, xmm5 - movaps [edi+0*16], xmm2 + movaps [edi + 0 * 16], xmm2 subps xmm2, xmm6 - movaps [edi+1*16], xmm2 - add edi, 2*16 + movaps [edi + 1 * 16], xmm2 + add edi, 2 * 16 - skip3: - cmp dword ptr [edx+eax+12], ebx + skip3: + cmp dword ptr [edx + eax + 12], ebx jne skip4 - mov dword ptr [edx+eax+12], ecx - movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] - movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] + mov dword ptr [edx + eax + 12], ecx + movss xmm3, [esi + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] + movhps xmm3, [esi + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] add ecx, 2 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 ) orps xmm3, xmm5 - movaps [edi+0*16], xmm3 + movaps [edi + 0 * 16], xmm3 subps xmm3, xmm7 - movaps [edi+1*16], xmm3 - add edi, 2*16 + movaps [edi + 1 * 16], xmm3 + add edi, 2 * 16 - skip4: - add esi, 4*DRAWVERT_SIZE - add eax, 4*4 + skip4: + add esi, 4 * DRAWVERT_SIZE + add eax, 4 * 4 jl loop4 - done4: + done4: mov eax, numVerts and eax, 3 jz done1 @@ -16763,28 +16923,28 @@ int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, c add edx, eax neg eax - loop1: - cmp dword ptr [edx+eax+0], ebx + loop1: + cmp dword ptr [edx + eax + 0], ebx jne skip0 - mov dword ptr [edx+eax+0], ecx - movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] - movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + mov dword ptr [edx + eax + 0], ecx + movss xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] add ecx, 2 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 ) orps xmm0, xmm5 - movaps [edi+0*16], xmm0 + movaps [edi + 0 * 16], xmm0 subps xmm0, xmm6 - movaps [edi+1*16], xmm0 - add edi, 2*16 + movaps [edi + 1 * 16], xmm0 + add edi, 2 * 16 - skip0: + skip0: add esi, DRAWVERT_SIZE add eax, 4 jl loop1 - done1: + done1: pop ebx mov outVerts, ecx } @@ -16798,18 +16958,18 @@ int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, c continue; } const float *v = verts[i].xyz.ToFloatPtr(); - vertexCache[outVerts+0][0] = v[0]; - vertexCache[outVerts+0][1] = v[1]; - vertexCache[outVerts+0][2] = v[2]; - vertexCache[outVerts+0][3] = 1.0f; + vertexCache[outVerts + 0][0] = v[0]; + vertexCache[outVerts + 0][1] = v[1]; + vertexCache[outVerts + 0][2] = v[2]; + vertexCache[outVerts + 0][3] = 1.0f; // R_SetupProjection() builds the projection matrix with a slight crunch // for depth, which keeps this w=0 division from rasterizing right at the // wrap around point and causing depth fighting with the rear caps - vertexCache[outVerts+1][0] = v[0] - lightOrigin[0]; - vertexCache[outVerts+1][1] = v[1] - lightOrigin[1]; - vertexCache[outVerts+1][2] = v[2] - lightOrigin[2]; - vertexCache[outVerts+1][3] = 0.0f; + vertexCache[outVerts + 1][0] = v[0] - lightOrigin[0]; + vertexCache[outVerts + 1][1] = v[1] - lightOrigin[1]; + vertexCache[outVerts + 1][2] = v[2] - lightOrigin[2]; + vertexCache[outVerts + 1][3] = 0.0f; vertRemap[i] = outVerts; outVerts += 2; } @@ -16841,42 +17001,42 @@ int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, cons add edi, eax neg eax - loop4: + loop4: prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 ); - movaps [edi+eax+1*16], xmm0 + movaps [edi + eax + 1 * 16], xmm0 orps xmm0, xmm4 - movaps [edi+eax+0*16], xmm0 + movaps [edi + eax + 0 * 16], xmm0 - movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] - movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] + movss xmm1, [esi + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] + movhps xmm1, [esi + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 ) - movaps [edi+eax+3*16], xmm1 + movaps [edi + eax + 3 * 16], xmm1 orps xmm1, xmm5 - movaps [edi+eax+2*16], xmm1 + movaps [edi + eax + 2 * 16], xmm1 - movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] - movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + movss xmm2, [esi + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm2, [esi + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 ); - movaps [edi+eax+5*16], xmm2 + movaps [edi + eax + 5 * 16], xmm2 orps xmm2, xmm6 - movaps [edi+eax+4*16], xmm2 + movaps [edi + eax + 4 * 16], xmm2 - movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] - movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] + movss xmm3, [esi + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] + movhps xmm3, [esi + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4] shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 ) - movaps [edi+eax+7*16], xmm3 + movaps [edi + eax + 7 * 16], xmm3 orps xmm3, xmm7 - movaps [edi+eax+6*16], xmm3 + movaps [edi + eax + 6 * 16], xmm3 - add esi, 4*DRAWVERT_SIZE - add eax, 4*8*4 + add esi, 4 * DRAWVERT_SIZE + add eax, 4 * 8 * 4 jl loop4 - done4: + done4: mov eax, numVerts and eax, 3 jz done1 @@ -16884,19 +17044,19 @@ int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, cons add edi, eax neg eax - loop1: - movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] - movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] + loop1: + movss xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8] + movhps xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0] shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 ); - movaps [edi+eax+1*16], xmm0 + movaps [edi + eax + 1 * 16], xmm0 orps xmm0, xmm4 - movaps [edi+eax+0*16], xmm0 + movaps [edi + eax + 0 * 16], xmm0 add esi, DRAWVERT_SIZE - add eax, 8*4 + add eax, 8 * 4 jl loop1 - done1: + done1: } return numVerts * 2; @@ -16904,15 +17064,15 @@ int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, cons for ( int i = 0; i < numVerts; i++ ) { const float *v = verts[i].xyz.ToFloatPtr(); - vertexCache[i*2+0][0] = v[0]; - vertexCache[i*2+0][1] = v[1]; - vertexCache[i*2+0][2] = v[2]; - vertexCache[i*2+0][3] = 1.0f; - - vertexCache[i*2+1][0] = v[0]; - vertexCache[i*2+1][1] = v[1]; - vertexCache[i*2+1][2] = v[2]; - vertexCache[i*2+1][3] = 0.0f; + vertexCache[i * 2 + 0][0] = v[0]; + vertexCache[i * 2 + 0][1] = v[1]; + vertexCache[i * 2 + 0][2] = v[2]; + vertexCache[i * 2 + 0][3] = 1.0f; + + vertexCache[i * 2 + 1][0] = v[0]; + vertexCache[i * 2 + 1][1] = v[1]; + vertexCache[i * 2 + 1][2] = v[2]; + vertexCache[i * 2 + 1][3] = 0.0f; } return numVerts * 2; @@ -16937,25 +17097,25 @@ static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, cons neg eax align 16 - loop2: + loop2: add edi, 2*4*4 movsx ecx, word ptr [esi+eax+0] cvtsi2ss xmm0, ecx shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps [edi-2*4*4+0], xmm0 - movhps [edi-2*4*4+8], xmm0 + movlps [edi - 2 * 4 * 4 + 0], xmm0 + movhps [edi - 2 * 4 * 4 + 8], xmm0 - movsx edx, word ptr [esi+eax+2] + movsx edx, word ptr [esi + eax + 2] cvtsi2ss xmm1, edx shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps [edi-1*4*4+0], xmm1 - movhps [edi-1*4*4+8], xmm1 + movlps [edi - 1 * 4 * 4 + 0], xmm1 + movhps [edi - 1 * 4 * 4 + 8], xmm1 - add eax, 2*2 + add eax, 2 * 2 jl loop2 - done2: + done2: mov eax, numSamples and eax, 1 jz done @@ -16963,10 +17123,10 @@ static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, cons movsx ecx, word ptr [esi] cvtsi2ss xmm0, ecx shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps [edi+0], xmm0 - movhps [edi+8], xmm0 + movlps [edi + 0], xmm0 + movhps [edi + 8], xmm0 - done: + done: } } @@ -16988,7 +17148,7 @@ static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, co neg eax align 16 - loop2: + loop2: add edi, 8*4 movsx ecx, word ptr [esi+eax+0] @@ -17007,7 +17167,7 @@ static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, co add eax, 2*2 jl loop2 - done2: + done2: } } @@ -17029,7 +17189,7 @@ static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, cons neg eax align 16 - loop2: + loop2: add edi, 4*4 movsx ecx, word ptr [esi+eax+0] @@ -17039,13 +17199,13 @@ static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, cons cvtsi2ss xmm1, edx shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps [edi-4*4+0], xmm0 - movhps [edi-4*4+8], xmm0 + movlps [edi - 4 * 4 + 0], xmm0 + movhps [edi - 4 * 4 + 8], xmm0 - add eax, 2*2 + add eax, 2 * 2 jl loop2 - done2: + done2: mov eax, numSamples and eax, 1 jz done @@ -17055,7 +17215,7 @@ static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, cons shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) movlps [edi], xmm0 - done: + done: } } @@ -17077,7 +17237,7 @@ static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, co neg eax align 16 - loop2: + loop2: add edi, 4*4 movsx ecx, word ptr [esi+eax+0] @@ -17093,7 +17253,7 @@ static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, co add eax, 2*2 jl loop2 - done2: + done2: } } @@ -17115,7 +17275,7 @@ static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, cons neg eax align 16 - loop2: + loop2: add edi, 2*4 movsx ecx, word ptr [esi+eax+0] @@ -17129,7 +17289,7 @@ static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, cons add eax, 2*2 jl loop2 - done2: + done2: mov eax, numSamples and eax, 1 jz done @@ -17138,7 +17298,7 @@ static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, cons cvtsi2ss xmm0, ecx movss [edi], xmm0 - done: + done: } } @@ -17194,25 +17354,25 @@ static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, cons neg eax align 16 - loop2: - add edi, 2*16 + loop2: + add edi, 2 * 16 - movss xmm0, [esi+eax+0] + movss xmm0, [esi + eax + 0] mulss xmm0, xmm7 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps [edi-32], xmm0 - movlps [edi-24], xmm0 + movlps [edi - 32], xmm0 + movlps [edi - 24], xmm0 - movss xmm1, [esi+eax+4] + movss xmm1, [esi + eax + 4] mulss xmm1, xmm7 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps [edi-16], xmm1 - movlps [edi- 8], xmm1 + movlps [edi - 16], xmm1 + movlps [edi - 8], xmm1 - add eax, 2*4 + add eax, 2 * 4 jl loop2 - done2: + done2: mov eax, numSamples and eax, 1 jz done @@ -17220,10 +17380,10 @@ static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, cons movss xmm0, [esi] mulss xmm0, xmm7 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps [edi+0], xmm0 - movlps [edi+8], xmm0 + movlps [edi + 0], xmm0 + movlps [edi + 8], xmm0 - done: + done: } } @@ -17232,7 +17392,7 @@ static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, cons SSE_UpSample11kHzStereoOGGTo44kHz ============ */ -static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) { +static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float *const *src, const int numSamples ) { float constant = 32768.0f; __asm { mov esi, src @@ -17251,26 +17411,26 @@ static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const neg eax align 16 - loop2: - add edi, 4*16 + loop2: + add edi, 4 * 16 - movlps xmm0, [ecx+eax] - movlps xmm1, [edx+eax] + movlps xmm0, [ecx + eax] + movlps xmm1, [edx + eax] unpcklps xmm0, xmm1 mulps xmm0, xmm7 - movlps [edi-8*8], xmm0 - movlps [edi-7*8], xmm0 - movlps [edi-6*8], xmm0 - movlps [edi-5*8], xmm0 - movhps [edi-4*8], xmm0 - movhps [edi-3*8], xmm0 - movhps [edi-2*8], xmm0 - movhps [edi-1*8], xmm0 - - add eax, 2*4 + movlps [edi - 8 * 8], xmm0 + movlps [edi - 7 * 8], xmm0 + movlps [edi - 6 * 8], xmm0 + movlps [edi - 5 * 8], xmm0 + movhps [edi - 4 * 8], xmm0 + movhps [edi - 3 * 8], xmm0 + movhps [edi - 2 * 8], xmm0 + movhps [edi - 1 * 8], xmm0 + + add eax, 2 * 4 jl loop2 - done2: + done2: mov eax, numSamples and eax, 1 jz done @@ -17279,12 +17439,12 @@ static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const movss xmm1, [edx] unpcklps xmm0, xmm1 mulps xmm0, xmm7 - movlps [edi+0*8], xmm0 - movlps [edi+1*8], xmm0 - movlps [edi+2*8], xmm0 - movlps [edi+3*8], xmm0 + movlps [edi + 0 * 8], xmm0 + movlps [edi + 1 * 8], xmm0 + movlps [edi + 2 * 8], xmm0 + movlps [edi + 3 * 8], xmm0 - done: + done: } } @@ -17309,20 +17469,20 @@ static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, cons neg eax align 16 - loop2: - add edi, 2*8 + loop2: + add edi, 2 * 8 - movss xmm0, [esi+eax+0] - movss xmm1, [esi+eax+4] + movss xmm0, [esi + eax + 0] + movss xmm1, [esi + eax + 4] shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm0, xmm7 - movlps [edi-16], xmm0 - movhps [edi- 8], xmm0 + movlps [edi - 16], xmm0 + movhps [edi - 8], xmm0 - add eax, 2*4 + add eax, 2 * 4 jl loop2 - done2: + done2: mov eax, numSamples and eax, 1 jz done @@ -17330,9 +17490,9 @@ static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, cons movss xmm0, [esi] mulss xmm0, xmm7 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) - movlps [edi+0], xmm0 + movlps [edi + 0], xmm0 - done: + done: } } @@ -17341,7 +17501,7 @@ static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, cons SSE_UpSample22kHzStereoOGGTo44kHz ============ */ -static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) { +static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float *const *src, const int numSamples ) { float constant = 32768.0f; __asm { mov esi, src @@ -17360,22 +17520,22 @@ static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const neg eax align 16 - loop2: - add edi, 2*16 + loop2: + add edi, 2 * 16 - movlps xmm0, [ecx+eax] - movlps xmm1, [edx+eax] + movlps xmm0, [ecx + eax] + movlps xmm1, [edx + eax] unpcklps xmm0, xmm1 mulps xmm0, xmm7 - movlps [edi-4*8], xmm0 - movlps [edi-3*8], xmm0 - movhps [edi-2*8], xmm0 - movhps [edi-1*8], xmm0 + movlps [edi - 4 * 8], xmm0 + movlps [edi - 3 * 8], xmm0 + movhps [edi - 2 * 8], xmm0 + movhps [edi - 1 * 8], xmm0 - add eax, 2*4 + add eax, 2 * 4 jl loop2 - done2: + done2: mov eax, numSamples and eax, 1 jz done @@ -17384,10 +17544,10 @@ static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const movss xmm1, [edx] unpcklps xmm0, xmm1 mulps xmm0, xmm7 - movlps [edi+0*8], xmm0 - movlps [edi+1*8], xmm0 + movlps [edi + 0 * 8], xmm0 + movlps [edi + 1 * 8], xmm0 - done: + done: } } @@ -17406,7 +17566,7 @@ static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, cons SSE_UpSample44kHzStereoOGGTo44kHz ============ */ -static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) { +static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float *const *src, const int numSamples ) { float constant = 32768.0f; __asm { mov esi, src @@ -17425,20 +17585,20 @@ static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const neg eax align 16 - loop2: + loop2: add edi, 16 - movlps xmm0, [ecx+eax] - movlps xmm1, [edx+eax] + movlps xmm0, [ecx + eax] + movlps xmm1, [edx + eax] unpcklps xmm0, xmm1 mulps xmm0, xmm7 - movlps [edi-2*8], xmm0 - movhps [edi-1*8], xmm0 + movlps [edi - 2 * 8], xmm0 + movhps [edi - 1 * 8], xmm0 - add eax, 2*4 + add eax, 2 * 4 jl loop2 - done2: + done2: mov eax, numSamples and eax, 1 jz done @@ -17447,9 +17607,9 @@ static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const movss xmm1, [edx] unpcklps xmm0, xmm1 mulps xmm0, xmm7 - movlps [edi+0*8], xmm0 + movlps [edi + 0 * 8], xmm0 - done: + done: } } @@ -17460,7 +17620,7 @@ idSIMD_SSE::UpSampleOGGTo44kHz Duplicate samples for 44kHz output. ============ */ -void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) { +void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float *const *ogg, const int numSamples, const int kHz, const int numChannels ) { if ( kHz == 11025 ) { if ( numChannels == 1 ) { SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples ); @@ -17517,38 +17677,38 @@ void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *s shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 ) addps xmm7, xmm7 - loop16: - add edi, 4*4*4 + loop16: + add edi, 4 * 4 * 4 - movaps xmm0, [esi+eax+0*4*4] + movaps xmm0, [esi + eax + 0 * 4 * 4] movaps xmm1, xmm0 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) mulps xmm0, xmm6 - addps xmm0, [edi-4*4*4] + addps xmm0, [edi - 4 * 4 * 4] addps xmm6, xmm7 - movaps [edi-4*4*4], xmm0 + movaps [edi - 4 * 4 * 4], xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 ) mulps xmm1, xmm6 - addps xmm1, [edi-3*4*4] + addps xmm1, [edi - 3 * 4 * 4] addps xmm6, xmm7 - movaps [edi-3*4*4], xmm1 + movaps [edi - 3 * 4 * 4], xmm1 - movaps xmm2, [esi+eax+1*4*4] + movaps xmm2, [esi + eax + 1 * 4 * 4] movaps xmm3, xmm2 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) mulps xmm2, xmm6 - addps xmm2, [edi-2*4*4] + addps xmm2, [edi - 2 * 4 * 4] addps xmm6, xmm7 - movaps [edi-2*4*4], xmm2 + movaps [edi - 2 * 4 * 4], xmm2 shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) mulps xmm3, xmm6 - addps xmm3, [edi-1*4*4] + addps xmm3, [edi - 1 * 4 * 4] addps xmm6, xmm7 - movaps [edi-1*4*4], xmm3 + movaps [edi - 1 * 4 * 4], xmm3 - add eax, 2*4*4 + add eax, 2 * 4 * 4 jl loop16 } @@ -17574,11 +17734,11 @@ void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *s incL *= 2; incR *= 2; - for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) { - mixBuffer[i*2+0] += samples[i+0] * sL0; - mixBuffer[i*2+1] += samples[i+0] * sR0; - mixBuffer[i*2+2] += samples[i+1] * sL1; - mixBuffer[i*2+3] += samples[i+1] * sR1; + for ( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) { + mixBuffer[i * 2 + 0] += samples[i + 0] * sL0; + mixBuffer[i * 2 + 1] += samples[i + 0] * sR0; + mixBuffer[i * 2 + 2] += samples[i + 1] * sL1; + mixBuffer[i * 2 + 3] += samples[i + 1] * sR1; sL0 += incL; sR0 += incR; sL1 += incL; @@ -17620,34 +17780,34 @@ void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 ) addps xmm7, xmm7 - loop16: - add edi, 4*4*4 + loop16: + add edi, 4 * 4 * 4 - movaps xmm0, [esi+eax+0*4*4] + movaps xmm0, [esi + eax + 0 * 4 * 4] mulps xmm0, xmm6 - addps xmm0, [edi-4*4*4] + addps xmm0, [edi - 4 * 4 * 4] addps xmm6, xmm7 - movaps [edi-4*4*4], xmm0 + movaps [edi - 4 * 4 * 4], xmm0 - movaps xmm2, [esi+eax+1*4*4] + movaps xmm2, [esi + eax + 1 * 4 * 4] mulps xmm2, xmm6 - addps xmm2, [edi-3*4*4] + addps xmm2, [edi - 3 * 4 * 4] addps xmm6, xmm7 - movaps [edi-3*4*4], xmm2 + movaps [edi - 3 * 4 * 4], xmm2 - movaps xmm3, [esi+eax+2*4*4] + movaps xmm3, [esi + eax + 2 * 4 * 4] mulps xmm3, xmm6 - addps xmm3, [edi-2*4*4] + addps xmm3, [edi - 2 * 4 * 4] addps xmm6, xmm7 - movaps [edi-2*4*4], xmm3 + movaps [edi - 2 * 4 * 4], xmm3 - movaps xmm4, [esi+eax+3*4*4] + movaps xmm4, [esi + eax + 3 * 4 * 4] mulps xmm4, xmm6 - addps xmm4, [edi-1*4*4] + addps xmm4, [edi - 1 * 4 * 4] addps xmm6, xmm7 - movaps [edi-1*4*4], xmm4 + movaps [edi - 1 * 4 * 4], xmm4 - add eax, 4*4*4 + add eax, 4 * 4 * 4 jl loop16 } @@ -17673,11 +17833,11 @@ void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float incL *= 2; incR *= 2; - for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) { - mixBuffer[i*2+0] += samples[i*2+0] * sL0; - mixBuffer[i*2+1] += samples[i*2+1] * sR0; - mixBuffer[i*2+2] += samples[i*2+2] * sL1; - mixBuffer[i*2+3] += samples[i*2+3] * sR1; + for ( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) { + mixBuffer[i * 2 + 0] += samples[i * 2 + 0] * sL0; + mixBuffer[i * 2 + 1] += samples[i * 2 + 1] * sR0; + mixBuffer[i * 2 + 2] += samples[i * 2 + 2] * sL1; + mixBuffer[i * 2 + 3] += samples[i * 2 + 3] * sR1; sL0 += incL; sR0 += incR; sL1 += incL; @@ -17724,8 +17884,8 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *s xorps xmm5, xmm5 movhps xmm5, incs - movlps xmm7, incs+8 - movhps xmm7, incs+16 + movlps xmm7, incs + 8 + movhps xmm7, incs + 16 addps xmm3, xmm5 addps xmm4, xmm7 shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 ) @@ -17735,53 +17895,53 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *s addps xmm6, xmm6 addps xmm7, xmm7 - loop24: - add edi, 6*16 + loop24: + add edi, 6 * 16 - movaps xmm0, [esi+eax] + movaps xmm0, [esi + eax] movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm1, xmm2 - addps xmm1, [edi-6*16] + addps xmm1, [edi - 6 * 16] addps xmm2, xmm5 - movaps [edi-6*16], xmm1 + movaps [edi - 6 * 16], xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) mulps xmm1, xmm3 - addps xmm1, [edi-5*16] + addps xmm1, [edi - 5 * 16] addps xmm3, xmm6 - movaps [edi-5*16], xmm1 + movaps [edi - 5 * 16], xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm1, xmm4 - addps xmm1, [edi-4*16] + addps xmm1, [edi - 4 * 16] addps xmm4, xmm7 - movaps [edi-4*16], xmm1 + movaps [edi - 4 * 16], xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 ) mulps xmm1, xmm2 - addps xmm1, [edi-3*16] + addps xmm1, [edi - 3 * 16] addps xmm2, xmm5 - movaps [edi-3*16], xmm1 + movaps [edi - 3 * 16], xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 ) mulps xmm1, xmm3 - addps xmm1, [edi-2*16] + addps xmm1, [edi - 2 * 16] addps xmm3, xmm6 - movaps [edi-2*16], xmm1 + movaps [edi - 2 * 16], xmm1 shufps xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 ) mulps xmm0, xmm4 - addps xmm0, [edi-1*16] + addps xmm0, [edi - 1 * 16] addps xmm4, xmm7 - movaps [edi-1*16], xmm0 + movaps [edi - 1 * 16], xmm0 - add eax, 4*4 + add eax, 4 * 4 jl loop24 } @@ -17822,21 +17982,21 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *s incL4 *= 2; incL5 *= 2; - for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) { - mixBuffer[i*6+ 0] += samples[i+0] * sL0; - mixBuffer[i*6+ 1] += samples[i+0] * sL1; - mixBuffer[i*6+ 2] += samples[i+0] * sL2; - mixBuffer[i*6+ 3] += samples[i+0] * sL3; + for ( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) { + mixBuffer[i * 6 + 0] += samples[i + 0] * sL0; + mixBuffer[i * 6 + 1] += samples[i + 0] * sL1; + mixBuffer[i * 6 + 2] += samples[i + 0] * sL2; + mixBuffer[i * 6 + 3] += samples[i + 0] * sL3; - mixBuffer[i*6+ 4] += samples[i+0] * sL4; - mixBuffer[i*6+ 5] += samples[i+0] * sL5; - mixBuffer[i*6+ 6] += samples[i+1] * sL6; - mixBuffer[i*6+ 7] += samples[i+1] * sL7; + mixBuffer[i * 6 + 4] += samples[i + 0] * sL4; + mixBuffer[i * 6 + 5] += samples[i + 0] * sL5; + mixBuffer[i * 6 + 6] += samples[i + 1] * sL6; + mixBuffer[i * 6 + 7] += samples[i + 1] * sL7; - mixBuffer[i*6+ 8] += samples[i+1] * sL8; - mixBuffer[i*6+ 9] += samples[i+1] * sL9; - mixBuffer[i*6+10] += samples[i+1] * sL10; - mixBuffer[i*6+11] += samples[i+1] * sL11; + mixBuffer[i * 6 + 8] += samples[i + 1] * sL8; + mixBuffer[i * 6 + 9] += samples[i + 1] * sL9; + mixBuffer[i * 6 + 10] += samples[i + 1] * sL10; + mixBuffer[i * 6 + 11] += samples[i + 1] * sL11; sL0 += incL0; sL1 += incL1; @@ -17896,8 +18056,8 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float xorps xmm5, xmm5 movhps xmm5, incs - movlps xmm7, incs+ 8 - movhps xmm7, incs+16 + movlps xmm7, incs + 8 + movhps xmm7, incs + 16 addps xmm3, xmm5 addps xmm4, xmm7 shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 ) @@ -17907,32 +18067,32 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float addps xmm6, xmm6 addps xmm7, xmm7 - loop12: - add edi, 3*16 + loop12: + add edi, 3 * 16 - movaps xmm0, [esi+eax+0] + movaps xmm0, [esi + eax + 0] movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 ) mulps xmm1, xmm2 - addps xmm1, [edi-3*16] + addps xmm1, [edi - 3 * 16] addps xmm2, xmm5 - movaps [edi-3*16], xmm1 + movaps [edi - 3 * 16], xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 ) mulps xmm1, xmm3 - addps xmm1, [edi-2*16] + addps xmm1, [edi - 2 * 16] addps xmm3, xmm6 - movaps [edi-2*16], xmm1 + movaps [edi - 2 * 16], xmm1 - add eax, 4*4 + add eax, 4 * 4 shufps xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 ) mulps xmm0, xmm4 - addps xmm0, [edi-1*16] + addps xmm0, [edi - 1 * 16] addps xmm4, xmm7 - movaps [edi-1*16], xmm0 + movaps [edi - 1 * 16], xmm0 jl loop12 @@ -17977,21 +18137,21 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float incL4 *= 2; incL5 *= 2; - for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) { - mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0; - mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1; - mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2; - mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3; + for ( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) { + mixBuffer[i * 6 + 0] += samples[i * 2 + 0 + 0] * sL0; + mixBuffer[i * 6 + 1] += samples[i * 2 + 0 + 1] * sL1; + mixBuffer[i * 6 + 2] += samples[i * 2 + 0 + 0] * sL2; + mixBuffer[i * 6 + 3] += samples[i * 2 + 0 + 0] * sL3; - mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4; - mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5; - mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6; - mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7; + mixBuffer[i * 6 + 4] += samples[i * 2 + 0 + 0] * sL4; + mixBuffer[i * 6 + 5] += samples[i * 2 + 0 + 1] * sL5; + mixBuffer[i * 6 + 6] += samples[i * 2 + 2 + 0] * sL6; + mixBuffer[i * 6 + 7] += samples[i * 2 + 2 + 1] * sL7; - mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8; - mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9; - mixBuffer[i*6+10] += samples[i*2+2+0] * sL10; - mixBuffer[i*6+11] += samples[i*2+2+1] * sL11; + mixBuffer[i * 6 + 8] += samples[i * 2 + 2 + 0] * sL8; + mixBuffer[i * 6 + 9] += samples[i * 2 + 2 + 0] * sL9; + mixBuffer[i * 6 + 10] += samples[i * 2 + 2 + 0] * sL10; + mixBuffer[i * 6 + 11] += samples[i * 2 + 2 + 1] * sL11; sL0 += incL0; sL1 += incL1; @@ -18031,7 +18191,7 @@ void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuf add edi, eax neg eax - loop16: + loop16: movaps xmm0, [edi+eax+0*16] movaps xmm2, [edi+eax+1*16] @@ -18084,15 +18244,13 @@ void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuf } else if ( mixBuffer[i] >= 32767.0f ) { samples[i] = 32767; } else { - samples[i] = (short) mixBuffer[i]; + samples[i] = ( short ) mixBuffer[i]; } } #endif } -#endif /* _MSC_VER */ - /* ============ idSIMD_SSE::CullByFrustum @@ -18194,3 +18352,121 @@ void VPCALL idSIMD_SSE::CullByFrustum2( idDrawVert *verts, const int numVerts, c pointCull[j] = mask_lo & mask6 | ( mask_hi & mask6 ) << 6; } } + +// Revelator: these work whether in gcc clang or msvc x86 or x64 (no inline assembly used) +#elif defined(_MSC_VER) && defined(_M_X64) + +#include + +/* +============ +idSIMD_SSE::GetName +============ +*/ +const char *idSIMD_SSE::GetName( void ) const { + return "MMX & SSE"; +} + +/* +============ +idSIMD_SSE::CullByFrustum +============ +*/ +void VPCALL idSIMD_SSE::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) { + __m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] ); + __m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] ); + __m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] ); + __m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] ); + __m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); + __m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] ); + __m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); + __m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] ); + + for ( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + __m128 vX = _mm_set1_ps( vec.x ); + __m128 vY = _mm_set1_ps( vec.y ); + __m128 vZ = _mm_set1_ps( vec.z ); + __m128 d14 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA14, vX ), + _mm_mul_ps( fB14, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC14, vZ ), + fD14 + ) + ); + __m128 d56 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA56, vX ), + _mm_mul_ps( fB56, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC56, vZ ), + fD56 + ) + ); + const short mask6 = ( 1 << 6 ) - 1; + __m128 eps = _mm_set1_ps( epsilon ); + int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) ); + int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) ); + int mask_lo = mask_lo14 | mask_lo56 << 4; + pointCull[j] = mask_lo & mask6; + } +} + +/* +============ +idSIMD_SSE::CullByFrustum2 +============ +*/ +void VPCALL idSIMD_SSE::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) { + __m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] ); + __m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] ); + __m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] ); + __m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] ); + __m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); + __m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] ); + __m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); + __m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] ); + + for ( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + __m128 vX = _mm_set1_ps( vec.x ); + __m128 vY = _mm_set1_ps( vec.y ); + __m128 vZ = _mm_set1_ps( vec.z ); + __m128 d14 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA14, vX ), + _mm_mul_ps( fB14, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC14, vZ ), + fD14 + ) + ); + __m128 d56 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA56, vX ), + _mm_mul_ps( fB56, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC56, vZ ), + fD56 + ) + ); + const short mask6 = ( 1 << 6 ) - 1; + __m128 eps = _mm_set1_ps( epsilon ); + int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) ); + int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) ); + eps = _mm_set1_ps( -epsilon ); + int mask_hi14 = _mm_movemask_ps( _mm_cmpgt_ps( d14, eps ) ); + int mask_hi56 = _mm_movemask_ps( _mm_cmpgt_ps( d56, eps ) ); + int mask_lo = mask_lo14 | mask_lo56 << 4; + int mask_hi = mask_hi14 | mask_hi56 << 4; + pointCull[j] = mask_lo & mask6 | ( mask_hi & mask6 ) << 6; + } +} + +#endif /* _MSC_VER */ diff --git a/neo/idlib/math/Simd_SSE.h b/neo/idlib/math/Simd_SSE.h index 859d8ce07..2391fb106 100644 --- a/neo/idlib/math/Simd_SSE.h +++ b/neo/idlib/math/Simd_SSE.h @@ -46,10 +46,14 @@ class idSIMD_SSE : public idSIMD_MMX { using idSIMD_MMX::MinMax; virtual const char *VPCALL GetName( void ) const; + virtual void VPCALL Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ); virtual void VPCALL MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ); virtual void VPCALL Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ); + virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ); + virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ); + #elif defined(_MSC_VER) && defined(_M_IX86) virtual const char *VPCALL GetName( void ) const; @@ -143,12 +147,18 @@ class idSIMD_SSE : public idSIMD_MMX { virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ); virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ); -#endif + virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ); + virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ); + +// Revelator: these work whether in gcc clang or msvc x86 or x64 (no inline assembly used) +#elif defined(_MSC_VER) && defined(_M_X64) + + virtual const char *VPCALL GetName( void ) const; - // Revelator: these work whether in gcc clang or msvc x86 or x64 (no inline assembly used) virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ); virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ); +#endif /* _MSC_VER */ }; #endif /* !__MATH_SIMD_SSE_H__ */