diff --git a/neo/idlib/math/Simd_AVX.cpp b/neo/idlib/math/Simd_AVX.cpp
index 659e01ff6..4ef11728f 100644
--- a/neo/idlib/math/Simd_AVX.cpp
+++ b/neo/idlib/math/Simd_AVX.cpp
@@ -35,6 +35,10 @@ If you have questions concerning this license or the applicable additional terms
 //
 //===============================================================
 
+// Revelator: these work with gcc, clang or msvc on x86 or x64 (no inline assembly used)
+#if defined(_MSC_VER) && ( defined(_M_X64) || defined(_M_IX86) ) || \
+	defined(__GNUC__) && ( defined(__i386__) || defined (__x86_64__) ) && defined(__AVX__)
+
 #include <immintrin.h>
 
 #include "idlib/geometry/DrawVert.h"
@@ -122,3 +126,5 @@ void VPCALL idSIMD_AVX::CullByFrustum2( idDrawVert *verts, const int numVerts, c
 	}
 	_mm256_zeroupper();
 }
+
+#endif  /* ( MSVC on x86/x64 ) || ( GCC/Clang on x86/x64 with __AVX__ ) */
diff --git a/neo/idlib/math/Simd_AVX.h b/neo/idlib/math/Simd_AVX.h
index 6b8b9bdc3..a8030bb2f 100644
--- a/neo/idlib/math/Simd_AVX.h
+++ b/neo/idlib/math/Simd_AVX.h
@@ -28,10 +28,15 @@
 
 class idSIMD_AVX : public idSIMD_SSE3 {
 public:
-	// Revelator: these work whether in gcc clang or msvc x86 or x64 (no inline assembly used)
+// Revelator: these work with gcc, clang or msvc on x86 or x64 (no inline assembly used)
+#if defined(_MSC_VER) && ( defined(_M_X64) || defined(_M_IX86) ) || \
+	defined(__GNUC__) && ( defined(__i386__) || defined (__x86_64__) ) && defined(__AVX__)
+
 	virtual const char *VPCALL GetName( void ) const;
 	virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
 	virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
+
+#endif  /* ( MSVC on x86/x64 ) || ( GCC/Clang on x86/x64 with __AVX__ ) */
 };
 
 #endif /* !__MATH_SIMD_AVX_H__ */
diff --git a/neo/idlib/math/Simd_AVX2.cpp b/neo/idlib/math/Simd_AVX2.cpp
index c338027f3..3b6f9b1cb 100644
--- a/neo/idlib/math/Simd_AVX2.cpp
+++ b/neo/idlib/math/Simd_AVX2.cpp
@@ -26,7 +26,7 @@ If you have questions concerning this license or the applicable additional terms
 ===========================================================================
 */
 
-#include "sys/platform.h"
+#include "sys/platform.h"
 #include "Simd_AVX2.h"
 
 //===============================================================
@@ -35,6 +35,10 @@ If you have questions concerning this license or the applicable additional terms
 //
 //===============================================================
 
+// Revelator: these work with gcc, clang or msvc on x86 or x64 (no inline assembly used)
+#if defined(_MSC_VER) && ( defined(_M_X64) || defined(_M_IX86) ) || \
+	defined(__GNUC__) && ( defined(__i386__) || defined (__x86_64__) ) && defined(__AVX2__)
+
 #include <immintrin.h>
 
 #include "idlib/geometry/DrawVert.h"
@@ -62,20 +66,17 @@ void VPCALL idSIMD_AVX2::CullByFrustum( idDrawVert *verts, const int numVerts, c
 	const __m256 fC = _mm256_set_ps( 0, 0, frustum[5][2], frustum[4][2], frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );
 	const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
 	const __m256 eps = _mm256_set1_ps( epsilon );
-	const byte mask6 = (1 << 6) - 1;
+	const byte mask6 = ( 1 << 6 ) - 1;
 
 	for ( int j = 0; j < numVerts; j++ ) {
 		idVec3 &vec = verts[j].xyz;
 		__m256 vX = _mm256_set1_ps( vec.x );
 		__m256 vY = _mm256_set1_ps( vec.y );
 		__m256 vZ = _mm256_set1_ps( vec.z );
-		__m256 d = _mm256_fmadd_ps( fA, vX,
-			_mm256_fmadd_ps( fB, vY,
-				_mm256_fmadd_ps( fC, vZ, fD )
-			) 
-		);
+		__m256 d = _mm256_fmadd_ps( fA, vX, _mm256_fmadd_ps( fB, vY,
+		                                    _mm256_fmadd_ps( fC, vZ, fD ) ) );
 		int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) );
-		pointCull[j] = (byte)mask_lo & mask6;
+		pointCull[j] = ( byte )mask_lo & mask6;
 	}
 	_mm256_zeroupper();
 }
@@ -92,21 +93,20 @@ void VPCALL idSIMD_AVX2::CullByFrustum2( idDrawVert *verts, const int numVerts,
 	const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
 	const __m256 eps = _mm256_set1_ps( epsilon );
 	static const __m256 epsM = _mm256_set1_ps( -epsilon );
-	const short mask6 = (1 << 6) - 1;
+	const short mask6 = ( 1 << 6 ) - 1;
 
 	for ( int j = 0; j < numVerts; j++ ) {
 		idVec3 &vec = verts[j].xyz;
 		__m256 vX = _mm256_set1_ps( vec.x );
 		__m256 vY = _mm256_set1_ps( vec.y );
 		__m256 vZ = _mm256_set1_ps( vec.z );
-		__m256 d = _mm256_fmadd_ps( fA, vX,
-			_mm256_fmadd_ps( fB, vY,
-				_mm256_fmadd_ps( fC, vZ, fD )
-			) 
-		);
+		__m256 d = _mm256_fmadd_ps( fA, vX, _mm256_fmadd_ps( fB, vY, 
+											_mm256_fmadd_ps( fC, vZ, fD ) ) );
 		int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) );
 		int mask_hi = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_GT_OQ ) );
-		pointCull[j] = (unsigned short)(mask_lo & mask6 | (mask_hi & mask6) << 6);
+		pointCull[j] = ( unsigned short )( mask_lo & mask6 | ( mask_hi & mask6 ) << 6 );
 	}
 	_mm256_zeroupper();
 }
+
+#endif
diff --git a/neo/idlib/math/Simd_AVX2.h b/neo/idlib/math/Simd_AVX2.h
index 764f1ee8d..1698363ec 100644
--- a/neo/idlib/math/Simd_AVX2.h
+++ b/neo/idlib/math/Simd_AVX2.h
@@ -28,10 +28,15 @@
 
 class idSIMD_AVX2 : public idSIMD_AVX {
 public:
-	// Revelator: these work whether gcc clang or msvc in x86 or x64 (no inline assembly used)
+// Revelator: these work with gcc, clang or msvc on x86 or x64 (no inline assembly used)
+#if defined(_MSC_VER) && ( defined(_M_X64) || defined(_M_IX86) ) || \
+	defined(__GNUC__) && ( defined(__i386__) || defined (__x86_64__) ) && defined(__AVX2__)
+
 	virtual const char *VPCALL GetName( void ) const;
 	virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
 	virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
+
+#endif  /* ( MSVC on x86/x64 ) || ( GCC/Clang on x86/x64 with __AVX2__ ) */
 };
 
 #endif /* !__MATH_SIMD_AVX2_H__ */
diff --git a/neo/idlib/math/Simd_SSE.cpp b/neo/idlib/math/Simd_SSE.cpp
index ae0bc6f08..7e3a6b6b4 100644
--- a/neo/idlib/math/Simd_SSE.cpp
+++ b/neo/idlib/math/Simd_SSE.cpp
@@ -35,8 +35,6 @@ If you have questions concerning this license or the applicable additional terms
 //                                                        E
 //===============================================================
 
-#include <xmmintrin.h>
-
 #include "idlib/geometry/DrawVert.h"
 #include "idlib/geometry/JointTransform.h"
 #include "idlib/math/Vector.h"
@@ -55,6 +53,8 @@ If you have questions concerning this license or the applicable additional terms
 
 #if defined(__GNUC__) && defined(__SSE__)
 
+#include <xmmintrin.h>
+
 #define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
 #define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
 
@@ -90,12 +90,12 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe
 	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;	// Declare 8 xmm registers.
 	int count_l4 = count;                                   // count_l4 = eax
 	int count_l1 = count;                                   // count_l1 = edx
-	char *constant_p = (char *)&constant;                   // constant_p = edi
-	char *src_p = (char *) src;                             // src_p = esi
-	char *dst_p = (char *) dst;                             // dst_p = ecx
+	char *constant_p = ( char * )&constant;                 // constant_p = edi
+	char *src_p = ( char * ) src;                           // src_p = esi
+	char *dst_p = ( char * ) dst;                           // dst_p = ecx
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( ptrdiff_t(&src->xyz) - ptrdiff_t(src) == DRAWVERT_XYZ_OFFSET );
+	assert( ptrdiff_t( &src->xyz ) - ptrdiff_t( src ) == DRAWVERT_XYZ_OFFSET );
 
 	/*
 		and			eax, ~3
@@ -109,100 +109,100 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
 	*/
 	count_l4 = count_l4 & ~3;
-	xmm4 = _mm_load_ss((float *) (constant_p));
-	xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ));
-	xmm5 = _mm_load_ss((float *) (constant_p + 4));
-	xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
-	xmm6 = _mm_load_ss((float *) (constant_p + 8));
-	xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
-	xmm7 = _mm_load_ss((float *) (constant_p + 12));
-	xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
+	xmm4 = _mm_load_ss( ( float * )( constant_p ) );
+	xmm4 = _mm_shuffle_ps( xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) );
+	xmm5 = _mm_load_ss( ( float * )( constant_p + 4 ) );
+	xmm5 = _mm_shuffle_ps( xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) );
+	xmm6 = _mm_load_ss( ( float * )( constant_p + 8 ) );
+	xmm6 = _mm_shuffle_ps( xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) );
+	xmm7 = _mm_load_ss( ( float * )( constant_p + 12 ) );
+	xmm7 = _mm_shuffle_ps( xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) );
 
 	/*
 		jz			startVert1
 	*/
-	if(count_l4 != 0) {
-	/*
-		imul		eax, DRAWVERT_SIZE
-		add			esi, eax
-		neg			eax
-	*/
+	if ( count_l4 != 0 ) {
+		/*
+			imul		eax, DRAWVERT_SIZE
+			add			esi, eax
+			neg			eax
+		*/
 		count_l4 = count_l4 * DRAWVERT_SIZE;
 		src_p = src_p + count_l4;
 		count_l4 = -count_l4;
-	/*
-	loopVert4:
-	*/
+		/*
+		loopVert4:
+		*/
 		do {
-	/*
-		movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
-		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
-		movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
-		movaps		xmm1, xmm0												//  3,  X,  0,  1
-	*/
-			xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));        // 3,  X,  X,  X
-			xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8));        // 2,  X,  X,  X
-			xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3,  X,  0,  1
+			/*
+				movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
+				movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
+				movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
+				movaps		xmm1, xmm0												//  3,  X,  0,  1
+			*/
+			xmm0 = _mm_load_ss( ( float * )( src_p + count_l4 + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0 ) ); // 3,  X,  X,  X
+			xmm2 = _mm_load_ss( ( float * )( src_p + count_l4 + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8 ) ); // 2,  X,  X,  X
+			xmm0 = _mm_loadh_pi( xmm0, ( __m64 * )( src_p + count_l4 + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0 ) ); // 3,  X,  0,  1
 			xmm1 = xmm0;							                                                    // 3,  X,  0,  1
 
-	/*
-		movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
-		shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5
-	*/
-			xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 4,  5,  0,  1
-			xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ));                               // 2,  X,  4,  5
-
-	/*
-		movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
-		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
-		shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9
-	*/
-			xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));        // 9,  X,  X,  X
-			xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9,  X,  6,  7
-			xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ));                               // 0,  3,  6,  9
-	/*
-		movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
-		shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10
-	*/
-			xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4));  // 10, 11, 6,  7
-			xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ));                               // 1,  4,  7,  10
-	/*
-		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
-		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11
-	*/
-			xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8));  // 10, 11, 8,  X
-			xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ));                               // 2,  5,  8,  11
-
-	/*
-		add			ecx, 16
-		add			eax, 4*DRAWVERT_SIZE
-	*/
+			/*
+				movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
+				shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5
+			*/
+			xmm1 = _mm_loadl_pi( xmm1, ( __m64 * )( src_p + count_l4 + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4 ) ); // 4,  5,  0,  1
+			xmm2 = _mm_shuffle_ps( xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) );                             // 2,  X,  4,  5
+
+			/*
+				movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
+				movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
+				shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9
+			*/
+			xmm3 = _mm_load_ss( ( float * )( src_p + count_l4 + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0 ) ); // 9,  X,  X,  X
+			xmm3 = _mm_loadh_pi( xmm3, ( __m64 * )( src_p + count_l4 + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0 ) ); // 9,  X,  6,  7
+			xmm0 = _mm_shuffle_ps( xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) );                             // 0,  3,  6,  9
+			/*
+				movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
+				shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10
+			*/
+			xmm3 = _mm_loadl_pi( xmm3, ( __m64 * )( src_p + count_l4 + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4 ) ); // 10, 11, 6,  7
+			xmm1 = _mm_shuffle_ps( xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) );                             // 1,  4,  7,  10
+			/*
+				movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
+				shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11
+			*/
+			xmm3 = _mm_loadh_pi( xmm3, ( __m64 * )( src_p + count_l4 + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8 ) ); // 10, 11, 8,  X
+			xmm2 = _mm_shuffle_ps( xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) );                             // 2,  5,  8,  11
+
+			/*
+				add			ecx, 16
+				add			eax, 4*DRAWVERT_SIZE
+			*/
 			dst_p = dst_p + 16;
-			count_l4 = count_l4 + 4*DRAWVERT_SIZE;
+			count_l4 = count_l4 + 4 * DRAWVERT_SIZE;
 
-	/*
-		mulps		xmm0, xmm4
-		mulps		xmm1, xmm5
-		mulps		xmm2, xmm6
-		addps		xmm0, xmm7
-		addps		xmm0, xmm1
-		addps		xmm0, xmm2
-	*/
-			xmm0 = _mm_mul_ps(xmm0, xmm4);
-			xmm1 = _mm_mul_ps(xmm1, xmm5);
-			xmm2 = _mm_mul_ps(xmm2, xmm6);
-			xmm0 = _mm_add_ps(xmm0, xmm7);
-			xmm0 = _mm_add_ps(xmm0, xmm1);
-			xmm0 = _mm_add_ps(xmm0, xmm2);
-
-	/*
-		movlps		[ecx-16+0], xmm0
-		movhps		[ecx-16+8], xmm0
-		jl			loopVert4
-	*/
-			_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
-			_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
-		} while(count_l4 < 0);
+			/*
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				addps		xmm0, xmm7
+				addps		xmm0, xmm1
+				addps		xmm0, xmm2
+			*/
+			xmm0 = _mm_mul_ps( xmm0, xmm4 );
+			xmm1 = _mm_mul_ps( xmm1, xmm5 );
+			xmm2 = _mm_mul_ps( xmm2, xmm6 );
+			xmm0 = _mm_add_ps( xmm0, xmm7 );
+			xmm0 = _mm_add_ps( xmm0, xmm1 );
+			xmm0 = _mm_add_ps( xmm0, xmm2 );
+
+			/*
+				movlps		[ecx-16+0], xmm0
+				movhps		[ecx-16+8], xmm0
+				jl			loopVert4
+			*/
+			_mm_storel_pi( ( __m64 * )( dst_p - 16 + 0 ), xmm0 );
+			_mm_storeh_pi( ( __m64 * )( dst_p - 16 + 8 ), xmm0 );
+		} while ( count_l4 < 0 );
 	}
 
 	/*
@@ -211,39 +211,39 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe
 		jz			done
 	*/
 	count_l1 = count_l1 & 3;
-	if(count_l1 != 0) {
-	/*
-		loopVert1:
-		movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
-		movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
-		movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
-		mulss		xmm0, xmm4
-		mulss		xmm1, xmm5
-		mulss		xmm2, xmm6
-		addss		xmm0, xmm7
-		add			ecx, 4
-		addss		xmm0, xmm1
-		add			eax, DRAWVERT_SIZE
-		addss		xmm0, xmm2
-		dec			edx
-		movss		[ecx-4], xmm0
-		jnz			loopVert1
-	*/
+	if ( count_l1 != 0 ) {
+		/*
+			loopVert1:
+			movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
+			movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
+			movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
+			mulss		xmm0, xmm4
+			mulss		xmm1, xmm5
+			mulss		xmm2, xmm6
+			addss		xmm0, xmm7
+			add			ecx, 4
+			addss		xmm0, xmm1
+			add			eax, DRAWVERT_SIZE
+			addss		xmm0, xmm2
+			dec			edx
+			movss		[ecx-4], xmm0
+			jnz			loopVert1
+		*/
 		do {
-			xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0));
-			xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4));
-			xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8));
-			xmm0 = _mm_mul_ss(xmm0, xmm4);
-			xmm1 = _mm_mul_ss(xmm1, xmm5);
-			xmm2 = _mm_mul_ss(xmm2, xmm6);
-			xmm0 = _mm_add_ss(xmm0, xmm7);
+			xmm0 = _mm_load_ss( ( float * )( src_p + count_l4 + DRAWVERT_XYZ_OFFSET + 0 ) );
+			xmm1 = _mm_load_ss( ( float * )( src_p + count_l4 + DRAWVERT_XYZ_OFFSET + 4 ) );
+			xmm2 = _mm_load_ss( ( float * )( src_p + count_l4 + DRAWVERT_XYZ_OFFSET + 8 ) );
+			xmm0 = _mm_mul_ss( xmm0, xmm4 );
+			xmm1 = _mm_mul_ss( xmm1, xmm5 );
+			xmm2 = _mm_mul_ss( xmm2, xmm6 );
+			xmm0 = _mm_add_ss( xmm0, xmm7 );
 			dst_p = dst_p + 4;
-			xmm0 = _mm_add_ss(xmm0, xmm1);
+			xmm0 = _mm_add_ss( xmm0, xmm1 );
 			count_l4 = count_l4 + DRAWVERT_SIZE;
-			xmm0 = _mm_add_ss(xmm0, xmm2);
+			xmm0 = _mm_add_ss( xmm0, xmm2 );
 			count_l1 = count_l1 - 1;
-			_mm_store_ss((float *) (dst_p-4), xmm0);
-		} while( count_l1 != 0);
+			_mm_store_ss( ( float * )( dst_p - 4 ), xmm0 );
+		} while ( count_l1 != 0 );
 	}
 	/*
 		done:
@@ -258,7 +258,7 @@ idSIMD_SSE::MinMax
 void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( ptrdiff_t(&src->xyz) - ptrdiff_t(src) == DRAWVERT_XYZ_OFFSET );
+	assert( ptrdiff_t( &src->xyz ) - ptrdiff_t( src ) == DRAWVERT_XYZ_OFFSET );
 
 	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
 	char *indexes_p;
@@ -276,13 +276,13 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src,
 		movaps		xmm2, xmm0
 		movaps		xmm3, xmm1
 	*/
-		xmm0 = _mm_load_ss(&idMath::INFINITY);
-		// To satisfy the compiler use xmm0 instead.
-		xmm1 = _mm_xor_ps(xmm0, xmm0);
-		xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ));
-		xmm1 = _mm_sub_ps(xmm1, xmm0);
-		xmm2 = xmm0;
-		xmm3 = xmm1;
+	xmm0 = _mm_load_ss( &idMath::INFINITY );
+	// To satisfy the compiler use xmm0 instead.
+	xmm1 = _mm_xor_ps( xmm0, xmm0 );
+	xmm0 = _mm_shuffle_ps( xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) );
+	xmm1 = _mm_sub_ps( xmm1, xmm0 );
+	xmm2 = xmm0;
+	xmm3 = xmm1;
 
 	/*
 		mov			edi, indexes
@@ -291,91 +291,91 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src,
 		and			eax, ~3
 		jz			done4
 	*/
-		indexes_p = (char *) indexes;
-		src_p = (char *) src;
-		count_l = count;
-		count_l = count_l & ~3;
-		if(count_l != 0) {
-	/*
-		shl			eax, 2
-		add			edi, eax
-		neg			eax
-	*/
-			count_l = count_l << 2;
-			indexes_p = indexes_p + count_l;
-			count_l = -count_l;
-	/*
-	loop4:
-//		prefetchnta	[edi+128]
-//		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
-	*/
+	indexes_p = ( char * ) indexes;
+	src_p = ( char * ) src;
+	count_l = count;
+	count_l = count_l & ~3;
+	if ( count_l != 0 ) {
+		/*
+			shl			eax, 2
+			add			edi, eax
+			neg			eax
+		*/
+		count_l = count_l << 2;
+		indexes_p = indexes_p + count_l;
+		count_l = -count_l;
+		/*
+		loop4:
+		//		prefetchnta	[edi+128]
+		//		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
+		*/
 		do {
-	/*
-		mov			edx, [edi+eax+0]
-		imul		edx, DRAWVERT_SIZE
-		movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
-		minps		xmm0, xmm4
-		maxps		xmm1, xmm4
-	*/
-			edx = *((int*)(indexes_p+count_l+0));
+			/*
+				mov			edx, [edi+eax+0]
+				imul		edx, DRAWVERT_SIZE
+				movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
+				movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
+				minps		xmm0, xmm4
+				maxps		xmm1, xmm4
+			*/
+			edx = *( ( int * )( indexes_p + count_l + 0 ) );
 			edx = edx * DRAWVERT_SIZE;
-			xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
-			xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
-			xmm0 = _mm_min_ps(xmm0, xmm4);
-			xmm1 = _mm_max_ps(xmm1, xmm4);
-
-	/*
-		mov			edx, [edi+eax+4]
-		imul		edx, DRAWVERT_SIZE
-		movss		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
-		movhps		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
-		minps		xmm2, xmm5
-		maxps		xmm3, xmm5
-	*/
-			edx = *((int*)(indexes_p+count_l+4));
+			xmm4 = _mm_load_ss( ( float * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 8 ) );
+			xmm4 = _mm_loadh_pi( xmm4, ( __m64 * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 0 ) );
+			xmm0 = _mm_min_ps( xmm0, xmm4 );
+			xmm1 = _mm_max_ps( xmm1, xmm4 );
+
+			/*
+				mov			edx, [edi+eax+4]
+				imul		edx, DRAWVERT_SIZE
+				movss		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
+				movhps		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
+				minps		xmm2, xmm5
+				maxps		xmm3, xmm5
+			*/
+			edx = *( ( int * )( indexes_p + count_l + 4 ) );
 			edx = edx * DRAWVERT_SIZE;
-			xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
-			xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
-			xmm2 = _mm_min_ps(xmm2, xmm5);
-			xmm3 = _mm_max_ps(xmm3, xmm5);
-
-	/*
-		mov			edx, [edi+eax+8]
-		imul		edx, DRAWVERT_SIZE
-		movss		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
-		minps		xmm0, xmm6
-		maxps		xmm1, xmm6
-	*/
-			edx = *((int*)(indexes_p+count_l+8));
+			xmm5 = _mm_load_ss( ( float * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 0 ) );
+			xmm5 = _mm_loadh_pi( xmm5, ( __m64 * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 4 ) );
+			xmm2 = _mm_min_ps( xmm2, xmm5 );
+			xmm3 = _mm_max_ps( xmm3, xmm5 );
+
+			/*
+				mov			edx, [edi+eax+8]
+				imul		edx, DRAWVERT_SIZE
+				movss		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
+				movhps		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
+				minps		xmm0, xmm6
+				maxps		xmm1, xmm6
+			*/
+			edx = *( ( int * )( indexes_p + count_l + 8 ) );
 			edx = edx * DRAWVERT_SIZE;
-			xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
-			xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
-			xmm0 = _mm_min_ps(xmm0, xmm6);
-			xmm1 = _mm_max_ps(xmm1, xmm6);
-
-	/*
-		mov			edx, [edi+eax+12]
-		imul		edx, DRAWVERT_SIZE
-		movss		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
-		movhps		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
-		minps		xmm2, xmm7
-		maxps		xmm3, xmm7
-	*/
-			edx = *((int*)(indexes_p+count_l+12));
+			xmm6 = _mm_load_ss( ( float * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 8 ) );
+			xmm6 = _mm_loadh_pi( xmm6, ( __m64 * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 0 ) );
+			xmm0 = _mm_min_ps( xmm0, xmm6 );
+			xmm1 = _mm_max_ps( xmm1, xmm6 );
+
+			/*
+				mov			edx, [edi+eax+12]
+				imul		edx, DRAWVERT_SIZE
+				movss		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
+				movhps		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
+				minps		xmm2, xmm7
+				maxps		xmm3, xmm7
+			*/
+			edx = *( ( int * )( indexes_p + count_l + 12 ) );
 			edx = edx * DRAWVERT_SIZE;
-			xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
-			xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
-			xmm2 = _mm_min_ps(xmm2, xmm7);
-			xmm3 = _mm_max_ps(xmm3, xmm7);
-
-	/*
-		add			eax, 4*4
-		jl			loop4
-	*/
-			count_l = count_l + 4*4;
-		} while (count_l < 0);
+			xmm7 = _mm_load_ss( ( float * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 0 ) );
+			xmm7 = _mm_loadh_pi( xmm7, ( __m64 * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 4 ) );
+			xmm2 = _mm_min_ps( xmm2, xmm7 );
+			xmm3 = _mm_max_ps( xmm3, xmm7 );
+
+			/*
+				add			eax, 4*4
+				jl			loop4
+			*/
+			count_l = count_l + 4 * 4;
+		} while ( count_l < 0 );
 	}
 	/*
 	done4:
@@ -385,40 +385,40 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src,
 	*/
 	count_l = count;
 	count_l = count_l & 3;
-	if(count_l != 0) {
-	/*
-		shl			eax, 2
-		add			edi, eax
-		neg			eax
-	*/
+	if ( count_l != 0 ) {
+		/*
+			shl			eax, 2
+			add			edi, eax
+			neg			eax
+		*/
 		count_l = count_l << 2;
 		indexes_p = indexes_p + count_l;
 		count_l = -count_l;
-	/*
-	loop1:
-	*/
-		do{
-	/*
-		mov			edx, [edi+eax+0]
-		imul		edx, DRAWVERT_SIZE;
-		movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
-		minps		xmm0, xmm4
-		maxps		xmm1, xmm4
-	*/
-			edx = *((int*)(indexes_p+count_l+0));
+		/*
+		loop1:
+		*/
+		do {
+			/*
+				mov			edx, [edi+eax+0]
+				imul		edx, DRAWVERT_SIZE;
+				movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
+				movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
+				minps		xmm0, xmm4
+				maxps		xmm1, xmm4
+			*/
+			edx = *( ( int * )( indexes_p + count_l + 0 ) );
 			edx = edx * DRAWVERT_SIZE;
-			xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
-			xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
-			xmm0 = _mm_min_ps(xmm0, xmm4);
-			xmm1 = _mm_max_ps(xmm1, xmm4);
-
-	/*
-		add			eax, 4
-		jl			loop1
-	*/
+			xmm4 = _mm_load_ss( ( float * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 8 ) );
+			xmm4 = _mm_loadh_pi( xmm4, ( __m64 * )( src_p + edx + DRAWVERT_XYZ_OFFSET + 0 ) );
+			xmm0 = _mm_min_ps( xmm0, xmm4 );
+			xmm1 = _mm_max_ps( xmm1, xmm4 );
+
+			/*
+				add			eax, 4
+				jl			loop1
+			*/
 			count_l = count_l + 4;
-		} while (count_l < 0);
+		} while ( count_l < 0 );
 
 	}
 
@@ -435,16 +435,16 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src,
 		movhps		[edi], xmm1
 		movss		[edi+8], xmm1
 	*/
-	xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ));
-	xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ));
-	xmm0 = _mm_min_ps(xmm0, xmm2);
-	xmm1 = _mm_max_ps(xmm1, xmm3);
-	min_p = (char *) &min;
-	_mm_storeh_pi((__m64 *)(min_p), xmm0);
-	_mm_store_ss((float *)(min_p+8), xmm0);
-	max_p = (char *) &max;
-	_mm_storeh_pi((__m64 *)(max_p), xmm1);
-	_mm_store_ss((float *)(max_p+8), xmm1);
+	xmm2 = _mm_shuffle_ps( xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ) );
+	xmm3 = _mm_shuffle_ps( xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ) );
+	xmm0 = _mm_min_ps( xmm0, xmm2 );
+	xmm1 = _mm_max_ps( xmm1, xmm3 );
+	min_p = ( char * ) &min;
+	_mm_storeh_pi( ( __m64 * )( min_p ), xmm0 );
+	_mm_store_ss( ( float * )( min_p + 8 ), xmm0 );
+	max_p = ( char * ) &max;
+	_mm_storeh_pi( ( __m64 * )( max_p ), xmm1 );
+	_mm_store_ss( ( float * )( max_p + 8 ), xmm1 );
 }
 
 /*
@@ -479,10 +479,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *
 		and			eax, ~3
 	*/
 	count_l4 = count;
-	constant_p = (char *) &constant;
+	constant_p = ( char * ) &constant;
 	count_l1 = count_l4;
-	src_p = (char *) src;
-	dst_p = (char *) dst;
+	src_p = ( char * ) src;
+	dst_p = ( char * ) dst;
 	count_l4 = count_l4 & ~3;
 
 	/*
@@ -493,91 +493,91 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *
 		movss		xmm7, [edi+8]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
 	*/
-	xmm5 = _mm_load_ss((float *) (constant_p+0));
-	xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
-	xmm6 = _mm_load_ss((float *) (constant_p+4));
-	xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
-	xmm7 = _mm_load_ss((float *) (constant_p+8));
-	xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
+	xmm5 = _mm_load_ss( ( float * )( constant_p + 0 ) );
+	xmm5 = _mm_shuffle_ps( xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) );
+	xmm6 = _mm_load_ss( ( float * )( constant_p + 4 ) );
+	xmm6 = _mm_shuffle_ps( xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) );
+	xmm7 = _mm_load_ss( ( float * )( constant_p + 8 ) );
+	xmm7 = _mm_shuffle_ps( xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) );
 
 	/*
 		jz			startVert1
 	*/
-	if (count_l4 != 0) {
-	/*
-		imul		eax, 16
-		add			esi, eax
-		neg			eax
-	*/
+	if ( count_l4 != 0 ) {
+		/*
+			imul		eax, 16
+			add			esi, eax
+			neg			eax
+		*/
 		count_l4 = count_l4 * 16;
 		src_p = src_p + count_l4;
 		count_l4 = -count_l4;
-	/*
-	loopVert4:
-	*/
+		/*
+		loopVert4:
+		*/
 		do {
-	/*
-		movlps		xmm1, [esi+eax+ 0]
-		movlps		xmm3, [esi+eax+ 8]
-		movhps		xmm1, [esi+eax+16]
-		movhps		xmm3, [esi+eax+24]
-		movlps		xmm2, [esi+eax+32]
-		movlps		xmm4, [esi+eax+40]
-		movhps		xmm2, [esi+eax+48]
-		movhps		xmm4, [esi+eax+56]
-		movaps		xmm0, xmm1
-		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
-		shufps		xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
-		movaps		xmm2, xmm3
-		shufps		xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
-		shufps		xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
-	*/
-			xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0));
-			xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8));
-			xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16));
-			xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24));
-			xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32));
-			xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40));
-			xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48));
-			xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56));
+			/*
+				movlps		xmm1, [esi+eax+ 0]
+				movlps		xmm3, [esi+eax+ 8]
+				movhps		xmm1, [esi+eax+16]
+				movhps		xmm3, [esi+eax+24]
+				movlps		xmm2, [esi+eax+32]
+				movlps		xmm4, [esi+eax+40]
+				movhps		xmm2, [esi+eax+48]
+				movhps		xmm4, [esi+eax+56]
+				movaps		xmm0, xmm1
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
+				movaps		xmm2, xmm3
+				shufps		xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
+			*/
+			xmm1 = _mm_loadl_pi( xmm1, ( __m64 * )( src_p + count_l4 + 0 ) );
+			xmm3 = _mm_loadl_pi( xmm3, ( __m64 * )( src_p + count_l4 + 8 ) );
+			xmm1 = _mm_loadh_pi( xmm1, ( __m64 * )( src_p + count_l4 + 16 ) );
+			xmm3 = _mm_loadh_pi( xmm3, ( __m64 * )( src_p + count_l4 + 24 ) );
+			xmm2 = _mm_loadl_pi( xmm2, ( __m64 * )( src_p + count_l4 + 32 ) );
+			xmm4 = _mm_loadl_pi( xmm4, ( __m64 * )( src_p + count_l4 + 40 ) );
+			xmm2 = _mm_loadh_pi( xmm2, ( __m64 * )( src_p + count_l4 + 48 ) );
+			xmm4 = _mm_loadh_pi( xmm4, ( __m64 * )( src_p + count_l4 + 56 ) );
 
 			xmm0 = xmm1;
-			xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ));
-			xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ));
+			xmm0 = _mm_shuffle_ps( xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) );
+			xmm1 = _mm_shuffle_ps( xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) );
 			xmm2 = xmm3;
-			xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ));
-			xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ));
+			xmm2 = _mm_shuffle_ps( xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) );
+			xmm3 = _mm_shuffle_ps( xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) );
 
-	/*
-		add			ecx, 16
-		add			eax, 4*16
-	*/
+			/*
+				add			ecx, 16
+				add			eax, 4*16
+			*/
 			dst_p = dst_p + 16;
-			count_l4 = count_l4 + 4*16;
-
-	/*
-		mulps		xmm0, xmm5
-		mulps		xmm1, xmm6
-		mulps		xmm2, xmm7
-		addps		xmm0, xmm3
-		addps		xmm0, xmm1
-		addps		xmm0, xmm2
-	*/
-			xmm0 = _mm_mul_ps(xmm0, xmm5);
-			xmm1 = _mm_mul_ps(xmm1, xmm6);
-			xmm2 = _mm_mul_ps(xmm2, xmm7);
-			xmm0 = _mm_add_ps(xmm0, xmm3);
-			xmm0 = _mm_add_ps(xmm0, xmm1);
-			xmm0 = _mm_add_ps(xmm0, xmm2);
+			count_l4 = count_l4 + 4 * 16;
 
-	/*
-		movlps		[ecx-16+0], xmm0
-		movhps		[ecx-16+8], xmm0
-		jl			loopVert4
-	*/
-			_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
-			_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
-		} while (count_l4 < 0);
+			/*
+				mulps		xmm0, xmm5
+				mulps		xmm1, xmm6
+				mulps		xmm2, xmm7
+				addps		xmm0, xmm3
+				addps		xmm0, xmm1
+				addps		xmm0, xmm2
+			*/
+			xmm0 = _mm_mul_ps( xmm0, xmm5 );
+			xmm1 = _mm_mul_ps( xmm1, xmm6 );
+			xmm2 = _mm_mul_ps( xmm2, xmm7 );
+			xmm0 = _mm_add_ps( xmm0, xmm3 );
+			xmm0 = _mm_add_ps( xmm0, xmm1 );
+			xmm0 = _mm_add_ps( xmm0, xmm2 );
+
+			/*
+				movlps		[ecx-16+0], xmm0
+				movhps		[ecx-16+8], xmm0
+				jl			loopVert4
+			*/
+			_mm_storel_pi( ( __m64 * )( dst_p - 16 + 0 ), xmm0 );
+			_mm_storeh_pi( ( __m64 * )( dst_p - 16 + 8 ), xmm0 );
+		} while ( count_l4 < 0 );
 	}
 
 	/*
@@ -587,74 +587,178 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *
 	*/
 	count_l1 = count_l1 & 3;
 
-	if(count_l1 != 0) {
-	/*
-	loopVert1:
-	*/
+	if ( count_l1 != 0 ) {
+		/*
+		loopVert1:
+		*/
 		do {
-	/*
-		movss		xmm0, [esi+eax+0]
-		movss		xmm1, [esi+eax+4]
-		movss		xmm2, [esi+eax+8]
-		mulss		xmm0, xmm5
-		mulss		xmm1, xmm6
-		mulss		xmm2, xmm7
-		addss		xmm0, [esi+eax+12]
-		add			ecx, 4
-		addss		xmm0, xmm1
-		add			eax, 16
-		addss		xmm0, xmm2
-		dec			edx
-		movss		[ecx-4], xmm0
-		jnz			loopVert1
-	*/
-			xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0));
-			xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4));
-			xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8));
-			xmm3 = _mm_load_ss((float *) (src_p+count_l4+12));
-
-			xmm0 = _mm_mul_ss(xmm0, xmm5);
-			xmm1 = _mm_mul_ss(xmm1, xmm6);
-			xmm2 = _mm_mul_ss(xmm2, xmm7);
-
-			xmm0 = _mm_add_ss(xmm0, xmm3);
+			/*
+				movss		xmm0, [esi+eax+0]
+				movss		xmm1, [esi+eax+4]
+				movss		xmm2, [esi+eax+8]
+				mulss		xmm0, xmm5
+				mulss		xmm1, xmm6
+				mulss		xmm2, xmm7
+				addss		xmm0, [esi+eax+12]
+				add			ecx, 4
+				addss		xmm0, xmm1
+				add			eax, 16
+				addss		xmm0, xmm2
+				dec			edx
+				movss		[ecx-4], xmm0
+				jnz			loopVert1
+			*/
+			xmm0 = _mm_load_ss( ( float * )( src_p + count_l4 + 0 ) );
+			xmm1 = _mm_load_ss( ( float * )( src_p + count_l4 + 4 ) );
+			xmm2 = _mm_load_ss( ( float * )( src_p + count_l4 + 8 ) );
+			xmm3 = _mm_load_ss( ( float * )( src_p + count_l4 + 12 ) );
+
+			xmm0 = _mm_mul_ss( xmm0, xmm5 );
+			xmm1 = _mm_mul_ss( xmm1, xmm6 );
+			xmm2 = _mm_mul_ss( xmm2, xmm7 );
+
+			xmm0 = _mm_add_ss( xmm0, xmm3 );
 			dst_p = dst_p + 4;
-			xmm0 = _mm_add_ss(xmm0, xmm1);
+			xmm0 = _mm_add_ss( xmm0, xmm1 );
 			count_l4 = count_l4 + 16;
-			xmm0 = _mm_add_ss(xmm0, xmm2);
+			xmm0 = _mm_add_ss( xmm0, xmm2 );
 			count_l1 = count_l1 - 1;
-			_mm_store_ss((float *) (dst_p-4), xmm0);
-		} while (count_l1 != 0);
+			_mm_store_ss( ( float * )( dst_p - 4 ), xmm0 );
+		} while ( count_l1 != 0 );
 	}
 	/*
 	done:
 	*/
 }
 
-#elif defined(_MSC_VER) && defined(_M_IX86)
+/*
+============
+idSIMD_SSE::CullByFrustum
+============
+*/
+void VPCALL idSIMD_SSE::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) {	// writes one 6-bit mask per vertex into pointCull[0..numVerts-1]
+	__m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] );	// component 0 of planes 0..3; _mm_set_ps places its last argument in lane 0
+	__m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] );	// component 0 of planes 4..5; the two upper lanes are zero padding
+	__m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] );	// component 1, same lane layout
+	__m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] );
+	__m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );	// component 2
+	__m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] );
+	__m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );	// component 3 (the additive term below)
+	__m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] );
 
-#define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
-#define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
+	for ( int j = 0; j < numVerts; j++ ) {	// one vertex per iteration, tested against all six planes at once
+		idVec3 &vec = verts[j].xyz;
+		__m128 vX = _mm_set1_ps( vec.x );	// broadcast each coordinate to all four lanes
+		__m128 vY = _mm_set1_ps( vec.y );
+		__m128 vZ = _mm_set1_ps( vec.z );
+		__m128 d14 = _mm_add_ps(	// lane i = f[i][0]*x + f[i][1]*y + f[i][2]*z + f[i][3] for planes 0..3
+			_mm_add_ps(
+				_mm_mul_ps( fA14, vX ),
+				_mm_mul_ps( fB14, vY )
+			),
+			_mm_add_ps(
+				_mm_mul_ps( fC14, vZ ),
+				fD14
+			)
+		);
+		__m128 d56 = _mm_add_ps(	// same expression for planes 4..5 in lanes 0..1; lanes 2..3 use the zero coefficients
+			_mm_add_ps(
+				_mm_mul_ps( fA56, vX ),
+				_mm_mul_ps( fB56, vY )
+			),
+			_mm_add_ps(
+				_mm_mul_ps( fC56, vZ ),
+				fD56
+			)
+		);
+		const short mask6 = ( 1 << 6 ) - 1;	// 0x3F: keeps only the six plane bits
+		__m128 eps = _mm_set1_ps( epsilon );
+		int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) );	// bit k set when lane k is below epsilon
+		int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) );
+		int mask_lo = mask_lo14 | mask_lo56 << 4;	// shift binds tighter than |: planes 4..5 land in bits 4..5
+		pointCull[j] = mask_lo & mask6;	// discard bits 6..7, which come from the padding lanes
+	}
+}
 
-// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
-#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 )											\
-	__asm	movaps		reg4, reg2								/* reg4 =  8,  9, 10, 11 */		\
-	__asm	unpcklps	reg2, reg3								/* reg2 =  8, 12,  9, 13 */		\
-	__asm	unpckhps	reg4, reg3								/* reg4 = 10, 14, 11, 15 */		\
-	__asm	movaps		reg3, reg0								/* reg3 =  0,  1,  2,  3 */		\
-	__asm	unpcklps	reg0, reg1								/* reg0 =  0,  4,  1,  5 */		\
-	__asm	unpckhps	reg3, reg1								/* reg3 =  2,  6,  3,  7 */		\
-	__asm	movaps		reg1, reg0								/* reg1 =  0,  4,  1,  5 */		\
-	__asm	shufps		reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 )	/* reg0 =  0,  4,  8, 12 */		\
-	__asm	shufps		reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 )	/* reg1 =  1,  5,  9, 13 */		\
-	__asm	movaps		reg2, reg3								/* reg2 =  2,  6,  3,  7 */		\
-	__asm	shufps		reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 )	/* reg2 =  2,  6, 10, 14 */		\
-	__asm	shufps		reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 )	/* reg3 =  3,  7, 11, 15 */
+/*
+============
+idSIMD_SSE::CullByFrustum2
+============
+*/
+void VPCALL idSIMD_SSE::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) {	// writes one 12-bit mask per vertex into pointCull[0..numVerts-1]
+	__m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] );	// component 0 of planes 0..3; _mm_set_ps places its last argument in lane 0
+	__m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] );	// component 0 of planes 4..5; the two upper lanes are zero padding
+	__m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] );	// component 1, same lane layout
+	__m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] );
+	__m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );	// component 2
+	__m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] );
+	__m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );	// component 3 (the additive term below)
+	__m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] );
 
-// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary)
-#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 )						\
-	__asm	movlps		reg1, [address+ 0]						/* reg1 =  0,  1,  X,  X */		\
-	__asm	movlps		reg3, [address+ 8]						/* reg3 =  2,  3,  X,  X */		\
+	for ( int j = 0; j < numVerts; j++ ) {	// one vertex per iteration, tested against all six planes at once
+		idVec3 &vec = verts[j].xyz;
+		__m128 vX = _mm_set1_ps( vec.x );	// broadcast each coordinate to all four lanes
+		__m128 vY = _mm_set1_ps( vec.y );
+		__m128 vZ = _mm_set1_ps( vec.z );
+		__m128 d14 = _mm_add_ps(	// lane i = f[i][0]*x + f[i][1]*y + f[i][2]*z + f[i][3] for planes 0..3
+			_mm_add_ps(
+				_mm_mul_ps( fA14, vX ),
+				_mm_mul_ps( fB14, vY )
+			),
+			_mm_add_ps(
+				_mm_mul_ps( fC14, vZ ),
+				fD14
+			)
+		);
+		__m128 d56 = _mm_add_ps(	// same expression for planes 4..5 in lanes 0..1; lanes 2..3 use the zero coefficients
+			_mm_add_ps(
+				_mm_mul_ps( fA56, vX ),
+				_mm_mul_ps( fB56, vY )
+			),
+			_mm_add_ps(
+				_mm_mul_ps( fC56, vZ ),
+				fD56
+			)
+		);
+		const short mask6 = ( 1 << 6 ) - 1;	// 0x3F: keeps only the six plane bits of each half
+		__m128 eps = _mm_set1_ps( epsilon );
+		int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) );	// bit k set when lane k is below epsilon
+		int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) );
+		eps = _mm_set1_ps( -epsilon );	// eps is reused for the second threshold
+		int mask_hi14 = _mm_movemask_ps( _mm_cmpgt_ps( d14, eps ) );	// bit k set when lane k is above -epsilon
+		int mask_hi56 = _mm_movemask_ps( _mm_cmpgt_ps( d56, eps ) );
+		int mask_lo = mask_lo14 | mask_lo56 << 4;	// shift binds tighter than |: planes 4..5 land in bits 4..5
+		int mask_hi = mask_hi14 | mask_hi56 << 4;
+		pointCull[j] = mask_lo & mask6 | ( mask_hi & mask6 ) << 6;	// bits 0..5: below epsilon; bits 6..11: above -epsilon
+	}
+}
+
+#elif defined(_MSC_VER) && defined(_M_IX86)
+
+#include <xmmintrin.h>
+
+#define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
+#define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
+
+// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
+#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 )											\
+	__asm	movaps		reg4, reg2								/* reg4 =  8,  9, 10, 11 */		\
+	__asm	unpcklps	reg2, reg3								/* reg2 =  8, 12,  9, 13 */		\
+	__asm	unpckhps	reg4, reg3								/* reg4 = 10, 14, 11, 15 */		\
+	__asm	movaps		reg3, reg0								/* reg3 =  0,  1,  2,  3 */		\
+	__asm	unpcklps	reg0, reg1								/* reg0 =  0,  4,  1,  5 */		\
+	__asm	unpckhps	reg3, reg1								/* reg3 =  2,  6,  3,  7 */		\
+	__asm	movaps		reg1, reg0								/* reg1 =  0,  4,  1,  5 */		\
+	__asm	shufps		reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 )	/* reg0 =  0,  4,  8, 12 */		\
+	__asm	shufps		reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 )	/* reg1 =  1,  5,  9, 13 */		\
+	__asm	movaps		reg2, reg3								/* reg2 =  2,  6,  3,  7 */		\
+	__asm	shufps		reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 )	/* reg2 =  2,  6, 10, 14 */		\
+	__asm	shufps		reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 )	/* reg3 =  3,  7, 11, 15 */
+
+// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary)
+#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 )						\
+	__asm	movlps		reg1, [address+ 0]						/* reg1 =  0,  1,  X,  X */		\
+	__asm	movlps		reg3, [address+ 8]						/* reg3 =  2,  3,  X,  X */		\
 	__asm	movhps		reg1, [address+16]						/* reg1 =  0,  1,  4,  5 */		\
 	__asm	movhps		reg3, [address+24]						/* reg3 =  2,  3,  6,  7 */		\
 	__asm	movlps		reg2, [address+32]						/* reg2 =  8,  9,  X,  X */		\
@@ -1017,17 +1121,17 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *
 #define ALIGN8_INIT1( X, INIT )				ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
 
 ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
-ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
+ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1 << 15 );
 
-ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) );
-ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) );
-ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) );
-ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) );
+ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle0, ( 3 << 0 ) | ( 2 << 8 ) | ( 1 << 16 ) | ( 0 << 24 ) );
+ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle1, ( 0 << 0 ) | ( 1 << 8 ) | ( 2 << 16 ) | ( 3 << 24 ) );
+ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle2, ( 1 << 0 ) | ( 0 << 8 ) | ( 3 << 16 ) | ( 2 << 24 ) );
+ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle3, ( 2 << 0 ) | ( 3 << 8 ) | ( 0 << 16 ) | ( 1 << 24 ) );
 
-ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, (unsigned int) ( 1 << 31 ), 0, 0, 0 );
-ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, (unsigned int) ( 1 << 31 ) );
-ALIGN4_INIT1( unsigned int SIMD_SP_absMask, (unsigned int) ~( 1 << 31 ) );
-ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, (unsigned int) ~( 1 << 23 ) );
+ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, ( unsigned int )( 1 << 31 ), 0, 0, 0 );
+ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, ( unsigned int )( 1 << 31 ) );
+ALIGN4_INIT1( unsigned int SIMD_SP_absMask, ( unsigned int ) ~( 1 << 31 ) );
+ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, ( unsigned int ) ~( 1 << 23 ) );
 ALIGN4_INIT1( unsigned int SIMD_SP_not, 0xFFFFFFFF );
 
 ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
@@ -1036,7 +1140,7 @@ ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
 ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
 ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
 ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
-ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
+ALIGN4_INIT1( float SIMD_SP_maxShort, ( 1 << 15 ) );
 ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
 ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
 ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
@@ -1047,7 +1151,7 @@ ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f );
 
 ALIGN4_INIT1( float SIMD_SP_rsqrt_c0,  3.0f );
 ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f );
-ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f );
+ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f * 0.5f );
 
 ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f );
 ALIGN4_INIT1( float SIMD_SP_sin_c1,  2.7526e-06f );
@@ -1952,7 +2056,7 @@ float SSE_ATan( float y, float x ) {
 	if ( fabs( y ) > fabs( x ) ) {
 		a = -x / y;
 		d = idMath::HALF_PI;
-		*((unsigned int *)&d) ^= ( *((unsigned int *)&x) ^ *((unsigned int *)&y) ) & (1<<31);
+		*( ( unsigned int * )&d ) ^= ( *( ( unsigned int * )&x ) ^ * ( ( unsigned int * )&y ) ) & ( 1 << 31 );
 	} else {
 		a = y / x;
 		d = 0.0f;
@@ -2179,63 +2283,62 @@ void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src,
 	int pre, post;
 
 	//	1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
-	__asm
-	{
-		movss	xmm1,constant
-		shufps	xmm1,xmm1,0
+	__asm {
+		movss	xmm1, constant
+		shufps	xmm1, xmm1, 0
 
 		KFLOATINITDS( dst, src, count, pre, post )
-		and		eax,15
+		and		eax, 15
 		jne		lpNA
 		jmp		lpA
 		align	16
-lpA:
-		movaps	xmm2,[edx+ebx]
-		movaps	xmm3,[edx+ebx+16]
-		rcpps	xmm4,xmm2
-		rcpps	xmm5,xmm3
-		prefetchnta	[edx+ebx+64]
-		mulps	xmm2,xmm4
-		mulps	xmm2,xmm4
-		mulps	xmm3,xmm5
-		mulps	xmm3,xmm5
-		addps	xmm4,xmm4
-		addps	xmm5,xmm5
-		subps	xmm4,xmm2
-		subps	xmm5,xmm3
-		mulps	xmm4,xmm1
-		mulps	xmm5,xmm1
-		movaps	[edi+ebx],xmm4
-		movaps	[edi+ebx+16],xmm5
-		add		ebx,16*2
+		lpA:
+		movaps	xmm2, [edx + ebx]
+		movaps	xmm3, [edx + ebx + 16]
+		rcpps	xmm4, xmm2
+		rcpps	xmm5, xmm3
+		prefetchnta	[edx + ebx + 64]
+		mulps	xmm2, xmm4
+		mulps	xmm2, xmm4
+		mulps	xmm3, xmm5
+		mulps	xmm3, xmm5
+		addps	xmm4, xmm4
+		addps	xmm5, xmm5
+		subps	xmm4, xmm2
+		subps	xmm5, xmm3
+		mulps	xmm4, xmm1
+		mulps	xmm5, xmm1
+		movaps	[edi + ebx], xmm4
+		movaps	[edi + ebx + 16], xmm5
+		add		ebx, 16 * 2
 		jl		lpA
 		jmp		done
 		align	16
-lpNA:
-		movups	xmm2,[edx+ebx]
-		movups	xmm3,[edx+ebx+16]
-		rcpps	xmm4,xmm2
-		rcpps	xmm5,xmm3
-		prefetchnta	[edx+ebx+64]
-		mulps	xmm2,xmm4
-		mulps	xmm2,xmm4
-		mulps	xmm3,xmm5
-		mulps	xmm3,xmm5
-		addps	xmm4,xmm4
-		addps	xmm5,xmm5
-		subps	xmm4,xmm2
-		subps	xmm5,xmm3
-		mulps	xmm4,xmm1
-		mulps	xmm5,xmm1
-		movaps	[edi+ebx],xmm4
-		movaps	[edi+ebx+16],xmm5
-		add		ebx,16*2
+		lpNA:
+		movups	xmm2, [edx + ebx]
+		movups	xmm3, [edx + ebx + 16]
+		rcpps	xmm4, xmm2
+		rcpps	xmm5, xmm3
+		prefetchnta	[edx + ebx + 64]
+		mulps	xmm2, xmm4
+		mulps	xmm2, xmm4
+		mulps	xmm3, xmm5
+		mulps	xmm3, xmm5
+		addps	xmm4, xmm4
+		addps	xmm5, xmm5
+		subps	xmm4, xmm2
+		subps	xmm5, xmm3
+		mulps	xmm4, xmm1
+		mulps	xmm5, xmm1
+		movaps	[edi + ebx], xmm4
+		movaps	[edi + ebx + 16], xmm5
+		add		ebx, 16 * 2
 		jl		lpNA
-done:
-		mov		edx,src
-		mov		edi,dst
-		KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ),
-					KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count )
+		done:
+		mov		edx, src
+		mov		edi, dst
+		KFLOATOPER( KDIVDSS1( [edi + ebx], xmm1, [edx + ebx] ),
+		            KDIVDSS4( [edi + ebx], xmm1, [edx + ebx] ), count )
 	}
 }
 
@@ -2247,66 +2350,65 @@ idSIMD_SSE::Div
 ============
 */
 void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) {
-	int		pre,post;
+	int		pre, post;	// scratch variables handed to KFLOATINITDSS below
 
 	//	1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
-	__asm
-	{
+	__asm {
 		KFLOATINITDSS( dst, src0, src1, count, pre, post )
-		and		eax,15
+		and		eax, 15	// nonzero low 4 bits select lpNA (eax is presumably set up by KFLOATINITDSS - confirm)
 		jne		lpNA
 		jmp		lpA
 		align	16
-lpA:
-		movaps	xmm2,[esi+ebx]
-		movaps	xmm3,[esi+ebx+16]
-		rcpps	xmm4,xmm2
-		rcpps	xmm5,xmm3
-		prefetchnta	[esi+ebx+64]
-		mulps	xmm2,xmm4
-		mulps	xmm2,xmm4
-		mulps	xmm3,xmm5
-		mulps	xmm3,xmm5
-		addps	xmm4,xmm4
-		addps	xmm5,xmm5
-		subps	xmm4,xmm2
-		subps	xmm5,xmm3
-		mulps	xmm4,[edx+ebx]
-		mulps	xmm5,[edx+ebx+16]
-		movaps	[edi+ebx],xmm4
-		movaps	[edi+ebx+16],xmm5
-		add		ebx,16*2
+		lpA:
+		movaps	xmm2, [esi + ebx]	// aligned load of four divisors x
+		movaps	xmm3, [esi + ebx + 16]	// second vector: eight floats per iteration
+		rcpps	xmm4, xmm2	// r = approximate 1/x
+		rcpps	xmm5, xmm3
+		prefetchnta	[esi + ebx + 64]
+		mulps	xmm2, xmm4	// x * r
+		mulps	xmm2, xmm4	// x * r * r
+		mulps	xmm3, xmm5
+		mulps	xmm3, xmm5
+		addps	xmm4, xmm4	// 2 * r
+		addps	xmm5, xmm5
+		subps	xmm4, xmm2	// 2 * r - x * r * r, the refined reciprocal from the formula above
+		subps	xmm5, xmm3
+		mulps	xmm4, [edx + ebx]	// times the numerators, read straight from (aligned) memory
+		mulps	xmm5, [edx + ebx + 16]
+		movaps	[edi + ebx], xmm4	// aligned store of four quotients
+		movaps	[edi + ebx + 16], xmm5
+		add		ebx, 16 * 2	// advance 32 bytes; the jl below loops while the result is negative
 		jl		lpA
 		jmp		done
 		align	16
-lpNA:
-		movups	xmm2,[esi+ebx]
-		movups	xmm3,[esi+ebx+16]
-		rcpps	xmm4,xmm2
-		rcpps	xmm5,xmm3
-		prefetchnta	[esi+ebx+64]
-		mulps	xmm2,xmm4
-		mulps	xmm2,xmm4
-		mulps	xmm3,xmm5
-		mulps	xmm3,xmm5
-		addps	xmm4,xmm4
-		addps	xmm5,xmm5
-		subps	xmm4,xmm2
-		subps	xmm5,xmm3
-		movups	xmm2,[edx+ebx]
-		movups	xmm3,[edx+ebx+16]
-		mulps	xmm4,xmm2
-		mulps	xmm5,xmm3
-		movaps	[edi+ebx],xmm4
-		movaps	[edi+ebx+16],xmm5
-		add		ebx,16*2
+		lpNA:
+		movups	xmm2, [esi + ebx]	// unaligned load of four divisors x
+		movups	xmm3, [esi + ebx + 16]
+		rcpps	xmm4, xmm2	// same refinement sequence as lpA
+		rcpps	xmm5, xmm3
+		prefetchnta	[esi + ebx + 64]
+		mulps	xmm2, xmm4
+		mulps	xmm2, xmm4
+		mulps	xmm3, xmm5
+		mulps	xmm3, xmm5
+		addps	xmm4, xmm4
+		addps	xmm5, xmm5
+		subps	xmm4, xmm2
+		subps	xmm5, xmm3
+		movups	xmm2, [edx + ebx]	// numerators need an explicit unaligned load on this path
+		movups	xmm3, [edx + ebx + 16]
+		mulps	xmm4, xmm2
+		mulps	xmm5, xmm3
+		movaps	[edi + ebx], xmm4	// the store is still movaps: only the loads differ between the two paths
+		movaps	[edi + ebx + 16], xmm5
+		add		ebx, 16 * 2
 		jl		lpNA
-done:
-		mov		edx,src0
-		mov		esi,src1
-		mov		edi,dst
-		KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ),
-					KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count )
+		done:
+		mov		edx, src0	// reload the original pointers before the KFLOATOPER pass
+		mov		esi, src1
+		mov		edi, dst
+		KFLOATOPER( KDIVDSS1( [edi + ebx], [edx + ebx], [esi + ebx] ),
+		            KDIVDSS4( [edi + ebx], [edx + ebx], [esi + ebx] ), count )
 	}
 }
 /*
@@ -2527,8 +2629,7 @@ idSIMD_SSE::Dot
 ============
 */
 void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
-	__asm
-	{
+	__asm {
 		mov			eax, count
 		mov			edi, constant
 		mov			edx, eax
@@ -2538,9 +2639,9 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *s
 
 		movss		xmm4, [edi+0]
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm5, [edi+4]
+		movss		xmm5, [edi + 4]
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm6, [edi+8]
+		movss		xmm6, [edi + 8]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
 
 		jz			done4
@@ -2548,37 +2649,37 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *s
 		add			esi, eax
 		neg			eax
 
-	loop4:
-		movlps		xmm1, [esi+eax+ 0]
-		movlps		xmm2, [esi+eax+ 8]
-		movlps		xmm3, [esi+eax+16]
-		movhps		xmm1, [esi+eax+24]
-		movhps		xmm2, [esi+eax+32]
-		movhps		xmm3, [esi+eax+40]
+		loop4:
+		movlps		xmm1, [esi + eax + 0]
+		movlps		xmm2, [esi + eax + 8]
+		movlps		xmm3, [esi + eax + 16]
+		movhps		xmm1, [esi + eax + 24]
+		movhps		xmm2, [esi + eax + 32]
+		movhps		xmm3, [esi + eax + 40]
 		movaps		xmm0, xmm1
 		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
 		shufps		xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
 		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
 		add			ecx, 16
-		add			eax, 4*12
+		add			eax, 4 * 12
 		mulps		xmm0, xmm4
 		mulps		xmm1, xmm5
 		mulps		xmm2, xmm6
 		addps		xmm0, xmm1
 		addps		xmm0, xmm2
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
-		movlps		[ecx-16+0], xmm0
-		movhps		[ecx-16+8], xmm0
+		movlps		[ecx - 16 + 0], xmm0
+		movhps		[ecx - 16 + 8], xmm0
 		jl			loop4
 
-	done4:
+		done4:
 		and			edx, 3
 		jz			done1
 
-	loop1:
-		movss		xmm0, [esi+eax+0]
-		movss		xmm1, [esi+eax+4]
-		movss		xmm2, [esi+eax+8]
+		loop1:
+		movss		xmm0, [esi + eax + 0]
+		movss		xmm1, [esi + eax + 4]
+		movss		xmm2, [esi + eax + 8]
 		mulss		xmm0, xmm4
 		mulss		xmm1, xmm5
 		mulss		xmm2, xmm6
@@ -2587,10 +2688,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *s
 		add			eax, 12
 		addss		xmm0, xmm2
 		dec			edx
-		movss		[ecx-4], xmm0
+		movss		[ecx - 4], xmm0
 		jnz			loop1
 
-	done1:
+		done1:
 	}
 }
 
@@ -2612,9 +2713,9 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *
 
 		movss		xmm5, [edi+0]
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm6, [edi+4]
+		movss		xmm6, [edi + 4]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm7, [edi+8]
+		movss		xmm7, [edi + 8]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
 
 		jz			startVert1
@@ -2622,16 +2723,16 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *
 		add			esi, eax
 		neg			eax
 
-	loopVert4:
+		loopVert4:
 
-		movlps		xmm1, [esi+eax+ 0]
-		movlps		xmm3, [esi+eax+ 8]
-		movhps		xmm1, [esi+eax+16]
-		movhps		xmm3, [esi+eax+24]
-		movlps		xmm2, [esi+eax+32]
-		movlps		xmm4, [esi+eax+40]
-		movhps		xmm2, [esi+eax+48]
-		movhps		xmm4, [esi+eax+56]
+		movlps		xmm1, [esi + eax + 0]
+		movlps		xmm3, [esi + eax + 8]
+		movhps		xmm1, [esi + eax + 16]
+		movhps		xmm3, [esi + eax + 24]
+		movlps		xmm2, [esi + eax + 32]
+		movlps		xmm4, [esi + eax + 40]
+		movhps		xmm2, [esi + eax + 48]
+		movhps		xmm4, [esi + eax + 56]
 		movaps		xmm0, xmm1
 		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
 		shufps		xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
@@ -2640,7 +2741,7 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *
 		shufps		xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
 
 		add			ecx, 16
-		add			eax, 4*16
+		add			eax, 4 * 16
 
 		mulps		xmm0, xmm5
 		mulps		xmm1, xmm6
@@ -2649,31 +2750,31 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *
 		addps		xmm0, xmm1
 		addps		xmm0, xmm2
 
-		movlps		[ecx-16+0], xmm0
-		movhps		[ecx-16+8], xmm0
+		movlps		[ecx - 16 + 0], xmm0
+		movhps		[ecx - 16 + 8], xmm0
 		jl			loopVert4
 
-	startVert1:
+		startVert1:
 		and			edx, 3
 		jz			done
 
-	loopVert1:
-		movss		xmm0, [esi+eax+0]
-		movss		xmm1, [esi+eax+4]
-		movss		xmm2, [esi+eax+8]
+		loopVert1:
+		movss		xmm0, [esi + eax + 0]
+		movss		xmm1, [esi + eax + 4]
+		movss		xmm2, [esi + eax + 8]
 		mulss		xmm0, xmm5
 		mulss		xmm1, xmm6
 		mulss		xmm2, xmm7
-		addss		xmm0, [esi+eax+12]
+		addss		xmm0, [esi + eax + 12]
 		add			ecx, 4
 		addss		xmm0, xmm1
 		add			eax, 16
 		addss		xmm0, xmm2
 		dec			edx
-		movss		[ecx-4], xmm0
+		movss		[ecx - 4], xmm0
 		jnz			loopVert1
 
-	done:
+		done:
 	}
 }
 
@@ -2687,7 +2788,7 @@ idSIMD_SSE::Dot
 void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
 
 	// 0,  1,  2
 	// 3,  4,  5
@@ -2704,9 +2805,9 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVer
 
 		movss		xmm4, [edi+0]
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm5, [edi+4]
+		movss		xmm5, [edi + 4]
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm6, [edi+8]
+		movss		xmm6, [edi + 8]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
 
 		jz			startVert1
@@ -2714,27 +2815,27 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVer
 		add			esi, eax
 		neg			eax
 
-	loopVert4:
-		movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
-		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
-		movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
+		loopVert4:
+		movss		xmm0, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]	//  3,  X,  X,  X
+		movss		xmm2, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]	//  2,  X,  X,  X
+		movhps		xmm0, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]	//  3,  X,  0,  1
 		movaps		xmm1, xmm0												//  3,  X,  0,  1
 
-		movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
+		movlps		xmm1, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]	//  4,  5,  0,  1
 		shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5
 
-		movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
-		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
+		movss		xmm3, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]	//  9,  X,  X,  X
+		movhps		xmm3, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]	//  9,  X,  6,  7
 		shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9
 
-		movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
+		movlps		xmm3, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]	// 10, 11,  6,  7
 		shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10
 
-		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
+		movhps		xmm3, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]	// 10, 11,  8,  X
 		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11
 
 		add			ecx, 16
-		add			eax, 4*DRAWVERT_SIZE
+		add			eax, 4 * DRAWVERT_SIZE
 
 		mulps		xmm0, xmm4
 		mulps		xmm1, xmm5
@@ -2742,18 +2843,18 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVer
 		addps		xmm0, xmm1
 		addps		xmm0, xmm2
 
-		movlps		[ecx-16+0], xmm0
-		movhps		[ecx-16+8], xmm0
+		movlps		[ecx - 16 + 0], xmm0
+		movhps		[ecx - 16 + 8], xmm0
 		jl			loopVert4
 
-	startVert1:
+		startVert1:
 		and			edx, 3
 		jz			done
 
-	loopVert1:
-		movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
-		movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
-		movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
+		loopVert1:
+		movss		xmm0, [esi + eax + DRAWVERT_XYZ_OFFSET + 0]
+		movss		xmm1, [esi + eax + DRAWVERT_XYZ_OFFSET + 4]
+		movss		xmm2, [esi + eax + DRAWVERT_XYZ_OFFSET + 8]
 		mulss		xmm0, xmm4
 		mulss		xmm1, xmm5
 		mulss		xmm2, xmm6
@@ -2762,10 +2863,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVer
 		add			eax, DRAWVERT_SIZE
 		addss		xmm0, xmm2
 		dec			edx
-		movss		[ecx-4], xmm0
+		movss		[ecx - 4], xmm0
 		jnz			loopVert1
 
-	done:
+		done:
 	}
 }
 
@@ -2777,8 +2878,7 @@ idSIMD_SSE::Dot
 ============
 */
 void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
-	__asm
-	{
+	__asm {
 		mov			eax, count
 		mov			edi, constant
 		mov			edx, eax
@@ -2788,11 +2888,11 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *
 
 		movss		xmm4, [edi+0]
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm5, [edi+4]
+		movss		xmm5, [edi + 4]
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm6, [edi+8]
+		movss		xmm6, [edi + 8]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm7, [edi+12]
+		movss		xmm7, [edi + 12]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
 
 		jz			done4
@@ -2800,20 +2900,20 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *
 		add			esi, eax
 		neg			eax
 
-	loop4:
-		movlps		xmm1, [esi+eax+ 0]
-		movlps		xmm2, [esi+eax+ 8]
-		movlps		xmm3, [esi+eax+16]
-		movhps		xmm1, [esi+eax+24]
-		movhps		xmm2, [esi+eax+32]
-		movhps		xmm3, [esi+eax+40]
+		loop4:
+		movlps		xmm1, [esi + eax + 0]
+		movlps		xmm2, [esi + eax + 8]
+		movlps		xmm3, [esi + eax + 16]
+		movhps		xmm1, [esi + eax + 24]
+		movhps		xmm2, [esi + eax + 32]
+		movhps		xmm3, [esi + eax + 40]
 		movaps		xmm0, xmm1
 		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
 		shufps		xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
 		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
 
 		add			ecx, 16
-		add			eax, 4*12
+		add			eax, 4 * 12
 
 		mulps		xmm0, xmm4
 		mulps		xmm1, xmm5
@@ -2823,18 +2923,18 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *
 		addps		xmm0, xmm2
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
 
-		movlps		[ecx-16+0], xmm0
-		movhps		[ecx-16+8], xmm0
+		movlps		[ecx - 16 + 0], xmm0
+		movhps		[ecx - 16 + 8], xmm0
 		jl			loop4
 
-	done4:
+		done4:
 		and			edx, 3
 		jz			done1
 
-	loop1:
-		movss		xmm0, [esi+eax+0]
-		movss		xmm1, [esi+eax+4]
-		movss		xmm2, [esi+eax+8]
+		loop1:
+		movss		xmm0, [esi + eax + 0]
+		movss		xmm1, [esi + eax + 4]
+		movss		xmm2, [esi + eax + 8]
 		mulss		xmm0, xmm4
 		mulss		xmm1, xmm5
 		mulss		xmm2, xmm6
@@ -2844,10 +2944,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *
 		add			eax, 12
 		addss		xmm0, xmm2
 		dec			edx
-		movss		[ecx-4], xmm0
+		movss		[ecx - 4], xmm0
 		jnz			loop1
 
-	done1:
+		done1:
 	}
 }
 
@@ -2895,22 +2995,22 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane
 		mov			ecx, count
 
 		movlps		xmm4, [ebx]
-		shufps		xmm4, xmm4, SHUFFLEPS(1,0,1,0)
-		movlps		xmm5, [ebx+8]
-		shufps		xmm5, xmm5, SHUFFLEPS(1,0,1,0)
+		shufps		xmm4, xmm4, SHUFFLEPS( 1, 0, 1, 0 )
+		movlps		xmm5, [ebx + 8]
+		shufps		xmm5, xmm5, SHUFFLEPS( 1, 0, 1, 0 )
 
 		xorps		xmm0, xmm0
 		xorps		xmm1, xmm1
 
-	_lpAlignDest:
+		_lpAlignDest:
 		test		edx, 0x0f
 		jz			_destAligned
-		SINGLE_OP(eax,edx)
+		SINGLE_OP( eax, edx )
 		dec			ecx
 		jnz			_lpAlignDest
 		jmp			_vpExit
 
-	_destAligned:
+		_destAligned:
 		push		ecx
 
 		cmp			ecx, 4
@@ -2918,30 +3018,30 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane
 
 		and			ecx, ~3
 		shl			ecx, 2
-		lea			eax, [eax+ecx*4]
+		lea			eax, [eax + ecx * 4]
 		add			edx, ecx
 		neg			ecx
 
-		movlps		xmm0, [eax+ecx*4]
-		movhps		xmm0, [eax+ecx*4+16]
-		movlps		xmm2, [eax+ecx*4+32]
-		movhps		xmm2, [eax+ecx*4+48]
+		movlps		xmm0, [eax + ecx * 4]
+		movhps		xmm0, [eax + ecx * 4 + 16]
+		movlps		xmm2, [eax + ecx * 4 + 32]
+		movhps		xmm2, [eax + ecx * 4 + 48]
 		jmp			_lpStart
 
 		align	16
-	_lp:
-		prefetchnta	[eax+ecx*4+128]
+		_lp:
+		prefetchnta	[eax + ecx * 4 + 128]
 		addps		xmm1, xmm0
-		movlps		xmm0, [eax+ecx*4]
-		movhps		xmm0, [eax+ecx*4+16]
-		movlps		xmm2, [eax+ecx*4+32]
-		movhps		xmm2, [eax+ecx*4+48]
-		movaps		[edx+ecx-16],xmm1
-	_lpStart:
-		movlps		xmm1, [eax+ecx*4+8]
-		movhps		xmm1, [eax+ecx*4+24]
-		movlps		xmm3, [eax+ecx*4+40]
-		movhps		xmm3, [eax+ecx*4+56]
+		movlps		xmm0, [eax + ecx * 4]
+		movhps		xmm0, [eax + ecx * 4 + 16]
+		movlps		xmm2, [eax + ecx * 4 + 32]
+		movhps		xmm2, [eax + ecx * 4 + 48]
+		movaps		[edx + ecx - 16], xmm1
+		_lpStart:
+		movlps		xmm1, [eax + ecx * 4 + 8]
+		movhps		xmm1, [eax + ecx * 4 + 24]
+		movlps		xmm3, [eax + ecx * 4 + 40]
+		movhps		xmm3, [eax + ecx * 4 + 56]
 		add			ecx, 16
 		mulps		xmm1, xmm5
 		mulps		xmm2, xmm4
@@ -2950,23 +3050,23 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane
 		mulps		xmm0, xmm4
 		addps		xmm0, xmm1						// y1+w1 x1+z1 y0+w0 x0+z0
 		movaps		xmm1, xmm0
-		shufps		xmm0, xmm2, SHUFFLEPS(2,0,2,0)	// x3+z3 x2+z2 x1+z1 x0+z0
-		shufps		xmm1, xmm2, SHUFFLEPS(3,1,3,1)	// y3+w3 y2+w2 y1+w1 y0+w0
+		shufps		xmm0, xmm2, SHUFFLEPS( 2, 0, 2, 0 )	// x3+z3 x2+z2 x1+z1 x0+z0
+		shufps		xmm1, xmm2, SHUFFLEPS( 3, 1, 3, 1 )	// y3+w3 y2+w2 y1+w1 y0+w0
 		js			_lp
 		addps		xmm1, xmm0
-		movaps		[edx+ecx-16], xmm1
-	_post:
+		movaps		[edx + ecx - 16], xmm1
+		_post:
 		pop			ecx
 		and			ecx, 0x3
 		cmp			ecx, 2
 		jl			_post1
-		DUAL_OP(eax,edx)
+		DUAL_OP( eax, edx )
 		sub			ecx, 2
-	_post1:
+		_post1:
 		cmp			ecx, 1
 		jne			_vpExit
-		SINGLE_OP(eax,edx)
-	_vpExit:
+		SINGLE_OP( eax, edx )
+		_vpExit:
 	}
 
 #undef DUAL_OP
@@ -2984,7 +3084,7 @@ idSIMD_SSE::Dot
 void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
 
 	// 0,  1,  2
 	// 3,  4,  5
@@ -3001,11 +3101,11 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe
 
 		movss		xmm4, [edi+0]
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm5, [edi+4]
+		movss		xmm5, [edi + 4]
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm6, [edi+8]
+		movss		xmm6, [edi + 8]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm7, [edi+12]
+		movss		xmm7, [edi + 12]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
 
 		jz			startVert1
@@ -3013,27 +3113,27 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe
 		add			esi, eax
 		neg			eax
 
-	loopVert4:
-		movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
-		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
-		movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
+		loopVert4:
+		movss		xmm0, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]	//  3,  X,  X,  X
+		movss		xmm2, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]	//  2,  X,  X,  X
+		movhps		xmm0, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]	//  3,  X,  0,  1
 		movaps		xmm1, xmm0												//  3,  X,  0,  1
 
-		movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
+		movlps		xmm1, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]	//  4,  5,  0,  1
 		shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5
 
-		movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
-		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
+		movss		xmm3, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]	//  9,  X,  X,  X
+		movhps		xmm3, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]	//  9,  X,  6,  7
 		shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9
 
-		movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
+		movlps		xmm3, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]	// 10, 11,  6,  7
 		shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10
 
-		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
+		movhps		xmm3, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]	// 10, 11,  8,  X
 		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11
 
 		add			ecx, 16
-		add			eax, 4*DRAWVERT_SIZE
+		add			eax, 4 * DRAWVERT_SIZE
 
 		mulps		xmm0, xmm4
 		mulps		xmm1, xmm5
@@ -3042,18 +3142,18 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe
 		addps		xmm0, xmm1
 		addps		xmm0, xmm2
 
-		movlps		[ecx-16+0], xmm0
-		movhps		[ecx-16+8], xmm0
+		movlps		[ecx - 16 + 0], xmm0
+		movhps		[ecx - 16 + 8], xmm0
 		jl			loopVert4
 
-	startVert1:
+		startVert1:
 		and			edx, 3
 		jz			done
 
-	loopVert1:
-		movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
-		movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
-		movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
+		loopVert1:
+		movss		xmm0, [esi + eax + DRAWVERT_XYZ_OFFSET + 0]
+		movss		xmm1, [esi + eax + DRAWVERT_XYZ_OFFSET + 4]
+		movss		xmm2, [esi + eax + DRAWVERT_XYZ_OFFSET + 8]
 		mulss		xmm0, xmm4
 		mulss		xmm1, xmm5
 		mulss		xmm2, xmm6
@@ -3063,10 +3163,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVe
 		add			eax, DRAWVERT_SIZE
 		addss		xmm0, xmm2
 		dec			edx
-		movss		[ecx-4], xmm0
+		movss		[ecx - 4], xmm0
 		jnz			loopVert1
 
-	done:
+		done:
 	}
 }
 
@@ -3078,8 +3178,7 @@ idSIMD_SSE::Dot
 ============
 */
 void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
-	__asm
-	{
+	__asm {
 		mov			eax, count
 		mov			edi, src0
 		mov			edx, eax
@@ -3093,7 +3192,7 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1,
 		add			esi, eax
 		neg			eax
 
-	loop4:
+		loop4:
 		movlps		xmm0, [esi+eax]						// 0, 1, X, X
 		movlps		xmm3, [edi+eax]						// 0, 1, X, X
 		movlps		xmm1, [esi+eax+8]					// 2, 3, X, X
@@ -3121,21 +3220,21 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1,
 		addps		xmm7, xmm1
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 )
 
-		movlps		[ecx-16+0], xmm7
-		movhps		[ecx-16+8], xmm7
+		movlps		[ecx - 16 + 0], xmm7
+		movhps		[ecx - 16 + 8], xmm7
 		jl			loop4
 
-	done4:
+		done4:
 		and			edx, 3
 		jz			done1
 
-	loop1:
-		movss		xmm0, [esi+eax+0]
-		movss		xmm3, [edi+eax+0]
-		movss		xmm1, [esi+eax+4]
-		movss		xmm4, [edi+eax+4]
-		movss		xmm2, [esi+eax+8]
-		movss		xmm5, [edi+eax+8]
+		loop1:
+		movss		xmm0, [esi + eax + 0]
+		movss		xmm3, [edi + eax + 0]
+		movss		xmm1, [esi + eax + 4]
+		movss		xmm4, [edi + eax + 4]
+		movss		xmm2, [esi + eax + 8]
+		movss		xmm5, [edi + eax + 8]
 		mulss		xmm0, xmm3
 		mulss		xmm1, xmm4
 		mulss		xmm2, xmm5
@@ -3144,10 +3243,10 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1,
 		add			eax, 12
 		addss		xmm0, xmm2
 		dec			edx
-		movss		[ecx-4], xmm0
+		movss		[ecx - 4], xmm0
 		jnz			loop1
 
-	done1:
+		done1:
 	}
 }
 
@@ -3159,108 +3258,108 @@ idSIMD_SSE::Dot
 ============
 */
 void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) {
-	switch( count ) {
-		case 0:
-			dot = 0.0f;
-			return;
-		case 1:
-			dot = src1[0] * src2[0];
-			return;
-		case 2:
-			dot = src1[0] * src2[0] + src1[1] * src2[1];
-			return;
-		case 3:
-			dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
-			return;
-		default:
-			__asm {
-				mov			ecx, src1
-				mov			edx, src2
-				mov			eax, ecx
-				or			eax, edx
-				and			eax, 15
-				jz			alignedDot
-				// unaligned
-				mov			eax, count
-				shr			eax, 2
-				shl			eax, 4
-				add			ecx, eax
-				add			edx, eax
-				neg			eax
-				movups		xmm0, [ecx+eax]
-				movups		xmm1, [edx+eax]
-				mulps		xmm0, xmm1
-				add			eax, 16
-				jz			doneDot
+	switch ( count ) {
+	case 0:
+		dot = 0.0f;
+		return;
+	case 1:
+		dot = src1[0] * src2[0];
+		return;
+	case 2:
+		dot = src1[0] * src2[0] + src1[1] * src2[1];
+		return;
+	case 3:
+		dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
+		return;
+	default:
+		__asm {
+			mov			ecx, src1
+			mov			edx, src2
+			mov			eax, ecx
+			or			eax, edx
+			and			eax, 15
+			jz			alignedDot
+			// unaligned
+			mov			eax, count
+			shr			eax, 2
+			shl			eax, 4
+			add			ecx, eax
+			add			edx, eax
+			neg			eax
+			movups		xmm0, [ecx+eax]
+			movups		xmm1, [edx+eax]
+			mulps		xmm0, xmm1
+			add			eax, 16
+			jz			doneDot
 			loopUnalignedDot:
-				movups		xmm1, [ecx+eax]
-				movups		xmm2, [edx+eax]
-				mulps		xmm1, xmm2
-				addps		xmm0, xmm1
-				add			eax, 16
-				jl			loopUnalignedDot
-				jmp			doneDot
-				// aligned
+			movups		xmm1, [ecx+eax]
+			movups		xmm2, [edx+eax]
+			mulps		xmm1, xmm2
+			addps		xmm0, xmm1
+			add			eax, 16
+			jl			loopUnalignedDot
+			jmp			doneDot
+			// aligned
 			alignedDot:
-				mov			eax, count
-				shr			eax, 2
-				shl			eax, 4
-				add			ecx, eax
-				add			edx, eax
-				neg			eax
-				movaps		xmm0, [ecx+eax]
-				movaps		xmm1, [edx+eax]
-				mulps		xmm0, xmm1
-				add			eax, 16
-				jz			doneDot
+			mov			eax, count
+			shr			eax, 2
+			shl			eax, 4
+			add			ecx, eax
+			add			edx, eax
+			neg			eax
+			movaps		xmm0, [ecx+eax]
+			movaps		xmm1, [edx+eax]
+			mulps		xmm0, xmm1
+			add			eax, 16
+			jz			doneDot
 			loopAlignedDot:
-				movaps		xmm1, [ecx+eax]
-				movaps		xmm2, [edx+eax]
-				mulps		xmm1, xmm2
-				addps		xmm0, xmm1
-				add			eax, 16
-				jl			loopAlignedDot
+			movaps		xmm1, [ecx+eax]
+			movaps		xmm2, [edx+eax]
+			mulps		xmm1, xmm2
+			addps		xmm0, xmm1
+			add			eax, 16
+			jl			loopAlignedDot
 			doneDot:
+		}
+		switch ( count & 3 ) {
+		case 1:
+			__asm {
+				movss	xmm1, [ecx]
+				movss	xmm2, [edx]
+				mulss	xmm1, xmm2
+				addss	xmm0, xmm1
 			}
-			switch( count & 3 ) {
-				case 1:
-					__asm {
-						movss	xmm1, [ecx]
-						movss	xmm2, [edx]
-						mulss	xmm1, xmm2
-						addss	xmm0, xmm1
-					}
-					break;
-				case 2:
-					__asm {
-						xorps	xmm2, xmm2
-						movlps	xmm1, [ecx]
-						movlps	xmm2, [edx]
-						mulps	xmm1, xmm2
-						addps	xmm0, xmm1
-					}
-					break;
-				case 3:
-					__asm {
-						movss	xmm1, [ecx]
-						movhps	xmm1, [ecx+4]
-						movss	xmm2, [edx]
-						movhps	xmm2, [edx+4]
-						mulps	xmm1, xmm2
-						addps	xmm0, xmm1
-					}
-					break;
+			break;
+		case 2:
+			__asm {
+				xorps	xmm2, xmm2
+				movlps	xmm1, [ecx]
+				movlps	xmm2, [edx]
+				mulps	xmm1, xmm2
+				addps	xmm0, xmm1
 			}
+			break;
+		case 3:
 			__asm {
-				movhlps		xmm1, xmm0
-				addps		xmm0, xmm1
-				movaps		xmm1, xmm0
-				shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
-				addss		xmm0, xmm1
-				mov			eax, dot
-				movss		[eax], xmm0
+				movss	xmm1, [ecx]
+				movhps	xmm1, [ecx+4]
+				movss	xmm2, [edx]
+				movhps	xmm2, [edx+4]
+				mulps	xmm1, xmm2
+				addps	xmm0, xmm1
 			}
-			return;
+			break;
+		}
+		__asm {
+			movhlps		xmm1, xmm0
+			addps		xmm0, xmm1
+			movaps		xmm1, xmm0
+			shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
+			addss		xmm0, xmm1
+			mov			eax, dot
+			movss		[eax], xmm0
+		}
+		return;
 	}
 }
 
@@ -3558,10 +3657,10 @@ idSIMD_SSE::MinMax
 void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) {
 	int i, pre, post;
 
-	min = idMath::INFINITY; max = -idMath::INFINITY;
+	min = idMath::INFINITY;
+	max = -idMath::INFINITY;
 
-	__asm
-	{
+	__asm {
 		push		ebx
 		mov			eax, min
 		mov			ebx, max
@@ -3575,30 +3674,30 @@ void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const
 		jz			lpA
 		jmp			lpNA
 		align		16
-lpNA:
-		movups		xmm2, [edx+ebx]
-		movups		xmm3, [edx+ebx+16]
+		lpNA:
+		movups		xmm2, [edx + ebx]
+		movups		xmm3, [edx + ebx + 16]
 		minps		xmm0, xmm2
 		maxps		xmm1, xmm2
-		prefetchnta	[edx+ebx+64]
+		prefetchnta	[edx + ebx + 64]
 		minps		xmm0, xmm3
 		maxps		xmm1, xmm3
-		add			ebx, 16*2
+		add			ebx, 16 * 2
 		jl			lpNA
 		jmp			done2
-lpA:
-		movaps		xmm2, [edx+ebx]
-		movaps		xmm3, [edx+ebx+16]
+		lpA:
+		movaps		xmm2, [edx + ebx]
+		movaps		xmm3, [edx + ebx + 16]
 		minps		xmm0, xmm2
 		maxps		xmm1, xmm2
-		prefetchnta	[edx+ebx+64]
+		prefetchnta	[edx + ebx + 64]
 		minps		xmm0, xmm3
 		maxps		xmm1, xmm3
-		add			ebx, 16*2
+		add			ebx, 16 * 2
 		jl			lpA
 		jmp			done2
 		align		16
-done2:
+		done2:
 		movaps		xmm2, xmm0
 		movaps		xmm3, xmm1
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
@@ -3617,7 +3716,7 @@ void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const
 		mov			ebx, max
 		movss		[eax], xmm0
 		movss		[ebx], xmm1
-done:
+		done:
 		pop			ebx
 	}
 
@@ -3662,21 +3761,21 @@ void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, con
 		movlps		xmm2, [esi]
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
 		dec			eax
-		add			esi, 2*4
+		add			esi, 2 * 4
 		minps		xmm0, xmm2
 		maxps		xmm1, xmm2
-	startLoop:
-		imul		eax, 2*4
+		startLoop:
+		imul		eax, 2 * 4
 		add			esi, eax
 		neg			eax
-	loopVert:
-		movlps		xmm2, [esi+eax]
-		movhps		xmm2, [esi+eax+8]
-		add			eax, 4*4
+		loopVert:
+		movlps		xmm2, [esi + eax]
+		movhps		xmm2, [esi + eax + 8]
+		add			eax, 4 * 4
 		minps		xmm0, xmm2
 		maxps		xmm1, xmm2
 		jl			loopVert
-	done:
+		done:
 		movaps		xmm2, xmm0
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
 		minps		xmm0, xmm2
@@ -3713,33 +3812,33 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, con
 		add			esi, eax
 		neg			eax
 
-	loop4:
-//		prefetchnta	[esi+4*12]
+		loop4:
+		//		prefetchnta	[esi+4*12]
 
-		movss		xmm4, [esi+eax+0*12+8]
-		movhps		xmm4, [esi+eax+0*12+0]
+		movss		xmm4, [esi + eax + 0 * 12 + 8]
+		movhps		xmm4, [esi + eax + 0 * 12 + 0]
 		minps		xmm0, xmm4
 		maxps		xmm1, xmm4
 
-		movss		xmm5, [esi+eax+1*12+0]
-		movhps		xmm5, [esi+eax+1*12+4]
+		movss		xmm5, [esi + eax + 1 * 12 + 0]
+		movhps		xmm5, [esi + eax + 1 * 12 + 4]
 		minps		xmm2, xmm5
 		maxps		xmm3, xmm5
 
-		movss		xmm6, [esi+eax+2*12+8]
-		movhps		xmm6, [esi+eax+2*12+0]
+		movss		xmm6, [esi + eax + 2 * 12 + 8]
+		movhps		xmm6, [esi + eax + 2 * 12 + 0]
 		minps		xmm0, xmm6
 		maxps		xmm1, xmm6
 
-		movss		xmm7, [esi+eax+3*12+0]
-		movhps		xmm7, [esi+eax+3*12+4]
+		movss		xmm7, [esi + eax + 3 * 12 + 0]
+		movhps		xmm7, [esi + eax + 3 * 12 + 4]
 		minps		xmm2, xmm7
 		maxps		xmm3, xmm7
 
-		add			eax, 4*12
+		add			eax, 4 * 12
 		jl			loop4
 
-	done4:
+		done4:
 		mov			eax, count
 		and			eax, 3
 		jz			done1
@@ -3747,26 +3846,26 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, con
 		add			esi, eax
 		neg			eax
 
-	loop1:
-		movss		xmm4, [esi+eax+0*12+8]
-		movhps		xmm4, [esi+eax+0*12+0]
+		loop1:
+		movss		xmm4, [esi + eax + 0 * 12 + 8]
+		movhps		xmm4, [esi + eax + 0 * 12 + 0]
 		minps		xmm0, xmm4
 		maxps		xmm1, xmm4
 
 		add			eax, 12
 		jl			loop1
 
-	done1:
+		done1:
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
 		minps		xmm0, xmm2
 		maxps		xmm1, xmm3
 		mov			esi, min
 		movhps		[esi], xmm0
-		movss		[esi+8], xmm0
+		movss		[esi + 8], xmm0
 		mov			edi, max
 		movhps		[edi], xmm1
-		movss		[edi+8], xmm1
+		movss		[edi + 8], xmm1
 	}
 }
 
@@ -3778,7 +3877,7 @@ idSIMD_SSE::MinMax
 void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
 
 	__asm {
 
@@ -3797,33 +3896,33 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src,
 		add			esi, eax
 		neg			eax
 
-	loop4:
-//		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
+		loop4:
+		//		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
 
-		movss		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm4, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm4, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		minps		xmm0, xmm4
 		maxps		xmm1, xmm4
 
-		movss		xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
-		movhps		xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm5, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
+		movhps		xmm5, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
 		minps		xmm2, xmm5
 		maxps		xmm3, xmm5
 
-		movss		xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm6, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm6, [esi + eax + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		minps		xmm0, xmm6
 		maxps		xmm1, xmm6
 
-		movss		xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
-		movhps		xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm7, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
+		movhps		xmm7, [esi + eax + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
 		minps		xmm2, xmm7
 		maxps		xmm3, xmm7
 
-		add			eax, 4*DRAWVERT_SIZE
+		add			eax, 4 * DRAWVERT_SIZE
 		jl			loop4
 
-	done4:
+		done4:
 		mov			eax, count
 		and			eax, 3
 		jz			done1
@@ -3831,26 +3930,26 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src,
 		add			esi, eax
 		neg			eax
 
-	loop1:
-		movss		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		loop1:
+		movss		xmm4, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm4, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		minps		xmm0, xmm4
 		maxps		xmm1, xmm4
 
 		add			eax, DRAWVERT_SIZE
 		jl			loop1
 
-	done1:
+		done1:
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
 		minps		xmm0, xmm2
 		maxps		xmm1, xmm3
 		mov			esi, min
 		movhps		[esi], xmm0
-		movss		[esi+8], xmm0
+		movss		[esi + 8], xmm0
 		mov			edi, max
 		movhps		[edi], xmm1
-		movss		[edi+8], xmm1
+		movss		[edi + 8], xmm1
 	}
 }
 
@@ -3862,7 +3961,7 @@ idSIMD_SSE::MinMax
 void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
 
 	__asm {
 
@@ -3882,42 +3981,42 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src,
 		add			edi, eax
 		neg			eax
 
-	loop4:
-//		prefetchnta	[edi+128]
-//		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
+		loop4:
+		//		prefetchnta	[edi+128]
+		//		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
 
-		mov			edx, [edi+eax+0]
+		mov			edx, [edi + eax + 0]
 		imul		edx, DRAWVERT_SIZE
-		movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm4, [esi + edx + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm4, [esi + edx + DRAWVERT_XYZ_OFFSET + 0]
 		minps		xmm0, xmm4
 		maxps		xmm1, xmm4
 
-		mov			edx, [edi+eax+4]
+		mov			edx, [edi + eax + 4]
 		imul		edx, DRAWVERT_SIZE
-		movss		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
-		movhps		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm5, [esi + edx + DRAWVERT_XYZ_OFFSET + 0]
+		movhps		xmm5, [esi + edx + DRAWVERT_XYZ_OFFSET + 4]
 		minps		xmm2, xmm5
 		maxps		xmm3, xmm5
 
-		mov			edx, [edi+eax+8]
+		mov			edx, [edi + eax + 8]
 		imul		edx, DRAWVERT_SIZE
-		movss		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm6, [esi + edx + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm6, [esi + edx + DRAWVERT_XYZ_OFFSET + 0]
 		minps		xmm0, xmm6
 		maxps		xmm1, xmm6
 
-		mov			edx, [edi+eax+12]
+		mov			edx, [edi + eax + 12]
 		imul		edx, DRAWVERT_SIZE
-		movss		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
-		movhps		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm7, [esi + edx + DRAWVERT_XYZ_OFFSET + 0]
+		movhps		xmm7, [esi + edx + DRAWVERT_XYZ_OFFSET + 4]
 		minps		xmm2, xmm7
 		maxps		xmm3, xmm7
 
-		add			eax, 4*4
+		add			eax, 4 * 4
 		jl			loop4
 
-	done4:
+		done4:
 		mov			eax, count
 		and			eax, 3
 		jz			done1
@@ -3925,28 +4024,28 @@ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src,
 		add			edi, eax
 		neg			eax
 
-	loop1:
-		mov			edx, [edi+eax+0]
+		loop1:
+		mov			edx, [edi + eax + 0]
 		imul		edx, DRAWVERT_SIZE;
-		movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm4, [esi + edx + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm4, [esi + edx + DRAWVERT_XYZ_OFFSET + 0]
 		minps		xmm0, xmm4
 		maxps		xmm1, xmm4
 
 		add			eax, 4
 		jl			loop1
 
-	done1:
+		done1:
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
 		minps		xmm0, xmm2
 		maxps		xmm1, xmm3
 		mov			esi, min
 		movhps		[esi], xmm0
-		movss		[esi+8], xmm0
+		movss		[esi + 8], xmm0
 		mov			edi, max
 		movhps		[edi], xmm1
-		movss		[edi+8], xmm1
+		movss		[edi + 8], xmm1
 	}
 }
 
@@ -3958,64 +4057,63 @@ idSIMD_SSE::Clamp
 void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
 	int	i, pre, post;
 
-	__asm
-	{
-		movss	xmm0,min
-		movss	xmm1,max
-		shufps	xmm0,xmm0,0
-		shufps	xmm1,xmm1,0
+	__asm {
+		movss	xmm0, min
+		movss	xmm1, max
+		shufps	xmm0, xmm0, 0
+		shufps	xmm1, xmm1, 0
 
 		KFLOATINITDS( dst, src, count, pre, post )
-		and		eax,15
+		and		eax, 15
 		jne		lpNA
 		jmp		lpA
 		align	16
-lpA:
-		movaps	xmm2,[edx+ebx]
-		movaps	xmm3,[edx+ebx+16]
-		maxps	xmm2,xmm0
-		maxps	xmm3,xmm0
-		prefetchnta	[edx+ebx+64]
-		minps	xmm2,xmm1
-		minps	xmm3,xmm1
-		movaps	[edi+ebx],xmm2
-		movaps	[edi+ebx+16],xmm3
-		add		ebx,16*2
+		lpA:
+		movaps	xmm2, [edx + ebx]
+		movaps	xmm3, [edx + ebx + 16]
+		maxps	xmm2, xmm0
+		maxps	xmm3, xmm0
+		prefetchnta	[edx + ebx + 64]
+		minps	xmm2, xmm1
+		minps	xmm3, xmm1
+		movaps	[edi + ebx], xmm2
+		movaps	[edi + ebx + 16], xmm3
+		add		ebx, 16 * 2
 		jl		lpA
 		jmp		done
 
 		align	16
-lpNA:
-		movups	xmm2,[edx+ebx]
-		movups	xmm3,[edx+ebx+16]
-		maxps	xmm2,xmm0
-		maxps	xmm3,xmm0
-		prefetchnta	[edx+ebx+64]
-		minps	xmm2,xmm1
-		minps	xmm3,xmm1
-		movaps	[edi+ebx],xmm2
-		movaps	[edi+ebx+16],xmm3
-		add		ebx,16*2
+		lpNA:
+		movups	xmm2, [edx + ebx]
+		movups	xmm3, [edx + ebx + 16]
+		maxps	xmm2, xmm0
+		maxps	xmm3, xmm0
+		prefetchnta	[edx + ebx + 64]
+		minps	xmm2, xmm1
+		minps	xmm3, xmm1
+		movaps	[edi + ebx], xmm2
+		movaps	[edi + ebx + 16], xmm3
+		add		ebx, 16 * 2
 		jl		lpNA
-done:
+		done:
 	}
 
 	for ( i = 0; i < pre; i++ ) {
 		if ( src[i] < min )
-			dst[i] = min;
+		{ dst[i] = min; }
 		else if ( src[i] > max )
-			dst[i] = max;
+		{ dst[i] = max; }
 		else
-			dst[i] = src[i];
+		{ dst[i] = src[i]; }
 	}
 
-	for( i = count - post; i < count; i++ ) {
+	for ( i = count - post; i < count; i++ ) {
 		if ( src[i] < min )
-			dst[i] = min;
+		{ dst[i] = min; }
 		else if ( src[i] > max )
-			dst[i] = max;
+		{ dst[i] = max; }
 		else
-			dst[i] = src[i];
+		{ dst[i] = src[i]; }
 	}
 }
 
@@ -4027,53 +4125,52 @@ idSIMD_SSE::ClampMin
 void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) {
 	int	i, pre, post;
 
-	__asm
-	{
-		movss	xmm0,min
-		shufps	xmm0,xmm0,0
+	__asm {
+		movss	xmm0, min
+		shufps	xmm0, xmm0, 0
 
 		KFLOATINITDS( dst, src, count, pre, post )
-		and		eax,15
+		and		eax, 15
 		jne		lpNA
 		jmp		lpA
 		align	16
-lpA:
-		movaps	xmm2,[edx+ebx]
-		movaps	xmm3,[edx+ebx+16]
-		maxps	xmm2,xmm0
-		prefetchnta	[edx+ebx+64]
-		maxps	xmm3,xmm0
-		movaps	[edi+ebx],xmm2
-		movaps	[edi+ebx+16],xmm3
-		add		ebx,16*2
+		lpA:
+		movaps	xmm2, [edx + ebx]
+		movaps	xmm3, [edx + ebx + 16]
+		maxps	xmm2, xmm0
+		prefetchnta	[edx + ebx + 64]
+		maxps	xmm3, xmm0
+		movaps	[edi + ebx], xmm2
+		movaps	[edi + ebx + 16], xmm3
+		add		ebx, 16 * 2
 		jl		lpA
 		jmp		done
 
 		align	16
-lpNA:
-		movups	xmm2,[edx+ebx]
-		movups	xmm3,[edx+ebx+16]
-		maxps	xmm2,xmm0
-		prefetchnta	[edx+ebx+64]
-		maxps	xmm3,xmm0
-		movaps	[edi+ebx],xmm2
-		movaps	[edi+ebx+16],xmm3
-		add		ebx,16*2
+		lpNA:
+		movups	xmm2, [edx + ebx]
+		movups	xmm3, [edx + ebx + 16]
+		maxps	xmm2, xmm0
+		prefetchnta	[edx + ebx + 64]
+		maxps	xmm3, xmm0
+		movaps	[edi + ebx], xmm2
+		movaps	[edi + ebx + 16], xmm3
+		add		ebx, 16 * 2
 		jl		lpNA
-done:
+		done:
 	}
 
-	for( i = 0; i < pre; i++ ) {
+	for ( i = 0; i < pre; i++ ) {
 		if ( src[i] < min )
-			dst[i] = min;
+		{ dst[i] = min; }
 		else
-			dst[i] = src[i];
+		{ dst[i] = src[i]; }
 	}
-	for( i = count - post; i < count; i++ ) {
+	for ( i = count - post; i < count; i++ ) {
 		if ( src[i] < min )
-			dst[i] = min;
+		{ dst[i] = min; }
 		else
-			dst[i] = src[i];
+		{ dst[i] = src[i]; }
 	}
 }
 
@@ -4085,54 +4182,53 @@ idSIMD_SSE::ClampMax
 void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) {
 	int	i, pre, post;
 
-	__asm
-	{
-		movss	xmm1,max
-		shufps	xmm1,xmm1,0
+	__asm {
+		movss	xmm1, max
+		shufps	xmm1, xmm1, 0
 
 		KFLOATINITDS( dst, src, count, pre, post )
-		and		eax,15
+		and		eax, 15
 		jne		lpNA
 		jmp		lpA
 		align	16
-lpA:
-		movaps	xmm2,[edx+ebx]
-		movaps	xmm3,[edx+ebx+16]
-		minps	xmm2,xmm1
-		prefetchnta	[edx+ebx+64]
-		minps	xmm3,xmm1
-		movaps	[edi+ebx],xmm2
-		movaps	[edi+ebx+16],xmm3
-		add		ebx,16*2
+		lpA:
+		movaps	xmm2, [edx + ebx]
+		movaps	xmm3, [edx + ebx + 16]
+		minps	xmm2, xmm1
+		prefetchnta	[edx + ebx + 64]
+		minps	xmm3, xmm1
+		movaps	[edi + ebx], xmm2
+		movaps	[edi + ebx + 16], xmm3
+		add		ebx, 16 * 2
 		jl		lpA
 		jmp		done
 
 		align	16
-lpNA:
-		movups	xmm2,[edx+ebx]
-		movups	xmm3,[edx+ebx+16]
-		minps	xmm2,xmm1
-		prefetchnta	[edx+ebx+64]
-		minps	xmm3,xmm1
-		movaps	[edi+ebx],xmm2
-		movaps	[edi+ebx+16],xmm3
-		add		ebx,16*2
+		lpNA:
+		movups	xmm2, [edx + ebx]
+		movups	xmm3, [edx + ebx + 16]
+		minps	xmm2, xmm1
+		prefetchnta	[edx + ebx + 64]
+		minps	xmm3, xmm1
+		movaps	[edi + ebx], xmm2
+		movaps	[edi + ebx + 16], xmm3
+		add		ebx, 16 * 2
 		jl		lpNA
-done:
+		done:
 	}
 
-	for( i = 0; i < pre; i++ ) {
+	for ( i = 0; i < pre; i++ ) {
 		if ( src[i] > max )
-			dst[i] = max;
+		{ dst[i] = max; }
 		else
-			dst[i] = src[i];
+		{ dst[i] = src[i]; }
 	}
 
-	for( i = count - post; i < count; i++ ) {
+	for ( i = count - post; i < count; i++ ) {
 		if ( src[i] > max )
-			dst[i] = max;
+		{ dst[i] = max; }
 		else
-			dst[i] = src[i];
+		{ dst[i] = src[i]; }
 	}
 }
 
@@ -4152,11 +4248,11 @@ void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) {
 		add		edx, eax
 		neg		eax
 		xorps	xmm0, xmm0
-	loopZero16:
+		loopZero16:
 		movaps	[edx+eax], xmm0
 		add		eax, 16
 		jl		loopZero16
-	doneZero16:
+		doneZero16:
 	}
 }
 
@@ -4177,13 +4273,13 @@ void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) {
 		neg		eax
 		movss	xmm0, SIMD_SP_signBitMask
 		shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-	loopNegate16:
-		movaps	xmm1, [edx+eax]
+		loopNegate16:
+		movaps	xmm1, [edx + eax]
 		xorps	xmm1, xmm0
-		movaps	[edx+eax], xmm1
+		movaps	[edx + eax], xmm1
 		add		eax, 16
 		jl		loopNegate16
-	doneNegate16:
+		doneNegate16:
 	}
 }
 
@@ -4204,12 +4300,12 @@ void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count )
 		add		ecx, eax
 		add		edx, eax
 		neg		eax
-	loopCopy16:
+		loopCopy16:
 		movaps	xmm0, [ecx+eax]
 		movaps	[edx+eax], xmm0
 		add		eax, 16
 		jl		loopCopy16
-	doneCopy16:
+		doneCopy16:
 	}
 }
 
@@ -4232,13 +4328,13 @@ void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2,
 		add		ecx, eax
 		add		edx, eax
 		neg		eax
-	loopAdd16:
+		loopAdd16:
 		movaps	xmm0, [ecx+eax]
 		addps	xmm0, [edx+eax]
 		movaps	[esi+eax], xmm0
 		add		eax, 16
 		jl		loopAdd16
-	doneAdd16:
+		doneAdd16:
 	}
 }
 
@@ -4261,13 +4357,13 @@ void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2,
 		add		ecx, eax
 		add		edx, eax
 		neg		eax
-	loopSub16:
+		loopSub16:
 		movaps	xmm0, [ecx+eax]
 		subps	xmm0, [edx+eax]
 		movaps	[esi+eax], xmm0
 		add		eax, 16
 		jl		loopSub16
-	doneSub16:
+		doneSub16:
 	}
 }
 
@@ -4290,13 +4386,13 @@ void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float consta
 		add		edx, eax
 		neg		eax
 		shufps	xmm1, xmm1, 0x00
-	loopMulScalar16:
+		loopMulScalar16:
 		movaps	xmm0, [edx+eax]
 		mulps	xmm0, xmm1
 		movaps	[ecx+eax], xmm0
 		add		eax, 16
 		jl		loopMulScalar16
-	doneMulScalar16:
+		doneMulScalar16:
 	}
 }
 
@@ -4317,13 +4413,13 @@ void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int cou
 		add		ecx, eax
 		add		edx, eax
 		neg		eax
-	loopAddAssign16:
+		loopAddAssign16:
 		movaps	xmm0, [ecx+eax]
 		addps	xmm0, [edx+eax]
 		movaps	[ecx+eax], xmm0
 		add		eax, 16
 		jl		loopAddAssign16
-	doneAddAssign16:
+		doneAddAssign16:
 	}
 }
 
@@ -4344,13 +4440,13 @@ void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int cou
 		add		ecx, eax
 		add		edx, eax
 		neg		eax
-	loopSubAssign16:
+		loopSubAssign16:
 		movaps	xmm0, [ecx+eax]
 		subps	xmm0, [edx+eax]
 		movaps	[ecx+eax], xmm0
 		add		eax, 16
 		jl		loopSubAssign16
-	doneSubAssign16:
+		doneSubAssign16:
 	}
 }
 
@@ -4371,13 +4467,13 @@ void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int
 		add		ecx, eax
 		neg		eax
 		shufps	xmm1, xmm1, 0x00
-	loopMulAssign16:
+		loopMulAssign16:
 		movaps	xmm0, [ecx+eax]
 		mulps	xmm0, xmm1
 		movaps	[ecx+eax], xmm0
 		add		eax, 16
 		jl		loopMulAssign16
-	doneMulAssign16:
+		doneMulAssign16:
 	}
 }
 
@@ -4417,1444 +4513,698 @@ void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const
 	vPtr = vec.ToFloatPtr();
 	dstPtr = dst.ToFloatPtr();
 	numRows = mat.GetNumRows();
-	switch( mat.GetNumColumns() ) {
-		case 1: {
-			switch( numRows ) {
-				case 1: {		// 1x1 * 1x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						mulss		xmm0, [edi]
-						STORE1( 0, xmm0, xmm1 )
-					}
-					return;
-				}
-				case 6: {		// 6x1 * 1x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm1, xmm0
-						mulps		xmm0, [edi]
-						mulps		xmm1, [edi+16]
-						STORE4( 0, xmm0, xmm2 )
-						STORE2LO( 16, xmm1, xmm2 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0];
-						mPtr++;
-					}
-					return;
-				}
+	switch ( mat.GetNumColumns() ) {
+	case 1: {
+		switch ( numRows ) {
+		case 1: {		// 1x1 * 1x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				mulss		xmm0, [edi]
+				STORE1( 0, xmm0, xmm1 )
 			}
-			break;
+			return;
 		}
-		case 2: {
-			switch( numRows ) {
-				case 2: {		// 2x2 * 2x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						movss		xmm1, [esi+4]
-						movss		xmm2, [edi]
-						mulss		xmm2, xmm0
-						movss		xmm3, [edi+4]
-						mulss		xmm3, xmm1
-						addss		xmm2, xmm3
-						STORE1( 0, xmm2, xmm4 )
-						mulss		xmm0, [edi+8]
-						mulss		xmm1, [edi+8+4]
-						addss		xmm0, xmm1
-						STORE1( 4, xmm0, xmm4 )
-					}
-					return;
-				}
-				case 6: {		// 6x2 * 2x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm7, [esi]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movaps		xmm0, [edi]
-						mulps		xmm0, xmm7
-						movaps		xmm1, [edi+16]
-						mulps		xmm1, xmm7
-						movaps		xmm2, xmm0
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						movaps		xmm3, [edi+32]
-						addps		xmm0, xmm2
-						mulps		xmm3, xmm7
-						STORE4( 0, xmm0, xmm4 )
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm1, xmm3
-						addps		xmm3, xmm1
-						STORE2LO( 16, xmm3, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
-						mPtr += 2;
-					}
-					return;
-				}
+		case 6: {		// 6x1 * 1x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm1, xmm0
+				mulps		xmm0, [edi]
+				mulps		xmm1, [edi + 16]
+				STORE4( 0, xmm0, xmm2 )
+				STORE2LO( 16, xmm1, xmm2 )
 			}
-			break;
+			return;
 		}
-		case 3: {
-			switch( numRows ) {
-				case 3: {		// 3x3 * 3x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						movss		xmm4, [edi]
-						mulss		xmm4, xmm0
-						movss		xmm1, [esi+4]
-						movss		xmm5, [edi+4]
-						mulss		xmm5, xmm1
-						addss		xmm4, xmm5
-						movss		xmm2, [esi+8]
-						movss		xmm6, [edi+8]
-						mulss		xmm6, xmm2
-						addss		xmm4, xmm6
-						movss		xmm3, [edi+12]
-						mulss		xmm3, xmm0
-						STORE1( 0, xmm4, xmm7 );
-						movss		xmm5, [edi+12+4]
-						mulss		xmm5, xmm1
-						addss		xmm3, xmm5
-						movss		xmm6, [edi+12+8]
-						mulss		xmm6, xmm2
-						addss		xmm3, xmm6
-						mulss		xmm0, [edi+24]
-						mulss		xmm1, [edi+24+4]
-						STORE1( 4, xmm3, xmm7 );
-						addss		xmm0, xmm1
-						mulss		xmm2, [edi+24+8]
-						addss		xmm0, xmm2
-						STORE1( 8, xmm0, xmm7 );
-					}
-					return;
-				}
-				case 6: {		// 6x3 * 3x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm5, [esi]
-						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movss		xmm6, [esi+4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movss		xmm7, [esi+8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm0, [edi]								// xmm0 = 0, 1, 2, 3
-						movlps		xmm1, [edi+4*4]
-						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
-						movlps		xmm2, [edi+6*4]
-						movhps		xmm2, [edi+8*4]							// xmm2 = 6, 7, 8, 9
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
-						mulps		xmm0, xmm5
-						movlps		xmm3, [edi+10*4]
-						shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
-						movaps		xmm3, xmm1
-						shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
-						mulps		xmm1, xmm6
-						shufps		xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
-						mulps		xmm3, xmm7
-						addps		xmm0, xmm1
-						addps		xmm0, xmm3
-						STORE4( 0, xmm0, xmm4 )
-						movss		xmm1, [edi+12*4]
-						mulss		xmm1, xmm5
-						movss		xmm2, [edi+13*4]
-						mulss		xmm2, xmm6
-						movss		xmm3, [edi+14*4]
-						mulss		xmm3, xmm7
-						addss		xmm1, xmm2
-						addss		xmm1, xmm3
-						STORE1( 16, xmm1, xmm4 )
-						mulss		xmm5, [edi+15*4]
-						mulss		xmm6, [edi+16*4]
-						mulss		xmm7, [edi+17*4]
-						addss		xmm5, xmm6
-						addss		xmm5, xmm7
-						STORE1( 20, xmm5, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
-						mPtr += 3;
-					}
-					return;
-				}
+		default: {
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0];
+				mPtr++;
 			}
-			break;
+			return;
 		}
-		case 4: {
-			switch( numRows ) {
-				case 4: {		// 4x4 * 4x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, qword ptr [esi ]
-						movlps		xmm0, qword ptr [edi ]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm0, qword ptr [edi+16]
-						mulps		xmm0, xmm6
-						movlps		xmm7, qword ptr [esi+ 8]
-						movlps		xmm2, qword ptr [edi+ 8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm2, qword ptr [edi+24]
-						mulps		xmm2, xmm7
-						movlps		xmm1, qword ptr [edi+32]
-						movhps		xmm1, qword ptr [edi+48]
-						mulps		xmm1, xmm6
-						movlps		xmm3, qword ptr [edi+40]
-						addps		xmm0, xmm2
-						movhps		xmm3, qword ptr [edi+56]
-						mulps		xmm3, xmm7
-						movaps		xmm4, xmm0
-						addps		xmm1, xmm3
-						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm4
-						STORE4( 0, xmm0, xmm2 )
-					}
-					return;
-				}
-				case 6: {		// 6x4 * 4x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, qword ptr [esi+ 0]
-						movlps		xmm0, qword ptr [edi+ 0]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm0, qword ptr [edi+16]
-						mulps		xmm0, xmm6
-						movlps		xmm7, qword ptr [esi+ 8]
-						movlps		xmm2, qword ptr [edi+ 8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm2, qword ptr [edi+24]
-						mulps		xmm2, xmm7
-						movlps		xmm1, qword ptr [edi+32]
-						movhps		xmm1, qword ptr [edi+48]
-						mulps		xmm1, xmm6
-						movlps		xmm3, qword ptr [edi+40]
-						addps		xmm0, xmm2
-						movhps		xmm3, qword ptr [edi+56]
-						mulps		xmm3, xmm7
-						movaps		xmm4, xmm0
-						addps		xmm1, xmm3
-						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm4
-						movlps		xmm1, qword ptr [edi+64]
-						movhps		xmm1, qword ptr [edi+80]
-						STORE4( 0, xmm0, xmm4 )
-						mulps		xmm1, xmm6
-						movlps		xmm2, qword ptr [edi+72]
-						movhps		xmm2, qword ptr [edi+88]
-						mulps		xmm2, xmm7
-						addps		xmm1, xmm2
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm3, xmm1
-						addps		xmm1, xmm3
-						STORE2LO( 16, xmm1, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
-						mPtr += 4;
-					}
-					return;
-				}
-			}
-			break;
 		}
-		case 5: {
-			switch( numRows ) {
-				case 5: {		// 5x5 * 5x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [edi+5*4]							// xmm0 =  5,  X,  X,  X
-						movhps		xmm0, [edi+0*4]							// xmm0 =  5,  X,  0,  1
-						movss		xmm5, [edi+15*4]						// xmm4 = 15,  X,  X,  X
-						movhps		xmm5, [edi+10*4]						// xmm5 = 15,  X, 10, 11
-						movaps		xmm1, xmm0								// xmm1 =  5,  X,  0,  1
-						shufps		xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 =  0,  5, 10, 15
-						movlps		xmm1, [edi+6*4]							// xmm1 =  6,  7,  0,  1
-						movlps		xmm5, [edi+16*4]						// xmm5 = 16, 17, 10, 11
-						movaps		xmm2, xmm1								// xmm2 =  6,  7,  0,  1
-						shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 =  1,  6, 11, 16
-						movhps		xmm2, [edi+2*4]							// xmm2 =  6,  7,  2,  3
-						movhps		xmm5, [edi+12*4]						// xmm5 = 16, 17, 12, 13
-						movaps		xmm3, xmm2								// xmm3 =  6,  7,  2,  3
-						shufps		xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 =  2,  7, 12, 17
-						movlps		xmm3, [edi+8*4]							// xmm3 =  8,  9,  2,  3
-						movlps		xmm5, [edi+18*4]						// xmm5 = 18, 19, 12, 13
-						movss		xmm4, [edi+4*4]							// xmm4 =  4,  X,  X,  X
-						movlhps		xmm4, xmm3								// xmm4 =  4,  X,  8,  9
-						shufps		xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 =  3,  8, 13, 18
-						movhps		xmm5, [edi+14*4]						// xmm6 = 18, 19, 14, 15
-						shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 =  4,  9, 14, 19
-						movss		xmm7, [esi+0*4]
-						shufps		xmm7, xmm7, 0
-						mulps		xmm0, xmm7
-						movss		xmm5, [esi+1*4]
-						shufps		xmm5, xmm5, 0
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						movss		xmm6, [esi+2*4]
-						shufps		xmm6, xmm6, 0
-						mulps		xmm2, xmm6
-						addps		xmm0, xmm2
-						movss		xmm1, [esi+3*4]
-						shufps		xmm1, xmm1, 0
-						mulps		xmm3, xmm1
-						addps		xmm0, xmm3
-						movss		xmm2, [esi+4*4]
-						shufps		xmm2, xmm2, 0
-						mulps		xmm4, xmm2
-						addps		xmm0, xmm4
-						mulss		xmm7, [edi+20*4]
-						mulss		xmm5, [edi+21*4]
-						addps		xmm7, xmm5
-						mulss		xmm6, [edi+22*4]
-						addps		xmm7, xmm6
-						mulss		xmm1, [edi+23*4]
-						addps		xmm7, xmm1
-						mulss		xmm2, [edi+24*4]
-						addps		xmm7, xmm2
-						STORE4( 0, xmm0, xmm3 )
-						STORE1( 16, xmm7, xmm4 )
-					}
-					return;
-				}
-				case 6: {		// 6x5 * 5x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, [esi]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movlps		xmm7, [esi+8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movlps		xmm0, [edi]
-						movhps		xmm3, [edi+8]
-						movaps		xmm1, [edi+16]
-						movlps		xmm2, [edi+32]
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
-						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
-						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
-						mulps		xmm0, xmm6
-						mulps		xmm3, xmm7
-						movlps		xmm2, [edi+40]
-						addps		xmm0, xmm3								// xmm0 + xmm1
-						movhps		xmm5, [edi+40+8]
-						movlps		xmm3, [edi+40+16]
-						movhps		xmm3, [edi+40+24]
-						movlps		xmm4, [edi+40+32]
-						shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
-						shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
-						shufps		xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
-						mulps		xmm2, xmm6
-						mulps		xmm5, xmm7
-						addps		xmm2, xmm5								// xmm2 + xmm3
-						movss		xmm5, [esi+16]
-						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm4, xmm0
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
-						shufps		xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
-						addps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						STORE4( 0, xmm0, xmm2 )
-						movlps		xmm4, [edi+80]
-						movhps		xmm3, [edi+80+8]
-						movaps		xmm1, [edi+80+16]
-						movlps		xmm2, [edi+80+32]
-						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
-						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
-						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
-						mulps		xmm4, xmm6
-						mulps		xmm3, xmm7
-						mulps		xmm1, xmm5
-						addps		xmm4, xmm3								// xmm4 + xmm1
-						shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
-						addps		xmm4, xmm1
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
-						addps		xmm4, xmm1
-						STORE2LO( 16, xmm4, xmm2 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
-						mPtr += 5;
-					}
-					return;
-				}
+		break;
+	}
+	case 2: {
+		switch ( numRows ) {
+		case 2: {		// 2x2 * 2x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				movss		xmm1, [esi+4]
+				movss		xmm2, [edi]
+				mulss		xmm2, xmm0
+				movss		xmm3, [edi+4]
+				mulss		xmm3, xmm1
+				addss		xmm2, xmm3
+				STORE1( 0, xmm2, xmm4 )
+				mulss		xmm0, [edi + 8]
+				mulss		xmm1, [edi + 8 + 4]
+				addss		xmm0, xmm1
+				STORE1( 4, xmm0, xmm4 )
 			}
-			break;
+			return;
 		}
-		case 6: {
-			switch( numRows ) {
-				case 1: {		// 1x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						mulss		xmm0, [edi]
-						movss		xmm1, [esi+4]
-						mulss		xmm1, [edi+4]
-						movss		xmm2, [esi+8]
-						addss		xmm0, xmm1
-						mulss		xmm2, [edi+8]
-						movss		xmm3, [esi+12]
-						addss		xmm0, xmm2
-						mulss		xmm3, [edi+12]
-						movss		xmm4, [esi+16]
-						addss		xmm0, xmm3
-						mulss		xmm4, [edi+16]
-						movss		xmm5, [esi+20]
-						addss		xmm0, xmm4
-						mulss		xmm5, [edi+20]
-						movss		xmm6, [esi+24]
-						addss		xmm0, xmm5
-						mulss		xmm6, [edi+24]
-						addss		xmm0, xmm6
-						STORE1( 0, xmm0, xmm7 )
-					}
-					return;
-				}
-				case 2: {		// 2x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm0, xmm1
-						addps		xmm0, xmm1
-						STORE2LO( 0, xmm0, xmm3 )
-					}
-					return;
-				}
-				case 3: {		// 3x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm0, xmm1
-						addps		xmm0, xmm1
-						STORE2LO( 0, xmm0, xmm3 )
-						// row 2
-						movaps		xmm0, [edi+48]
-						movaps		xmm1, [edi+48+16]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						movhlps		xmm1, xmm0
-						addps		xmm0, xmm1
-						movaps		xmm1, xmm0
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
-						addss		xmm0, xmm1
-						STORE1( 8, xmm0, xmm3 )
-					}
-					return;
-				}
-				case 4: {		// 4x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm7, xmm0
-						movlhps		xmm7, xmm2
-						addps		xmm7, xmm1
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm7, xmm0
-						// row 2 and 3
-						movaps		xmm0, [edi+48]
-						movaps		xmm1, [edi+48+16]
-						movaps		xmm2, [edi+48+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						// last 4 additions for the first 4 rows and store result
-						movaps		xmm0, xmm7
-						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm7
-						STORE4( 0, xmm0, xmm4 )
-					}
-					return;
-				}
-				case 5: {		// 5x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm7, xmm0
-						movlhps		xmm7, xmm2
-						addps		xmm7, xmm1
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm7, xmm0
-						// row 2 and 3
-						movaps		xmm0, [edi+48]
-						movaps		xmm1, [edi+48+16]
-						movaps		xmm2, [edi+48+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						// last 4 additions for the first 4 rows and store result
-						movaps		xmm0, xmm7
-						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm7
-						STORE4( 0, xmm0, xmm3 )
-						// row 5
-						movaps		xmm0, [edi+96]
-						movaps		xmm1, [edi+96+16]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						movhlps		xmm1, xmm0
-						addps		xmm0, xmm1
-						movaps		xmm1, xmm0
-						shufps		xmm1, xmm1, 0x01
-						addss		xmm0, xmm1
-						STORE1( 16, xmm0, xmm3 )
-					}
-					return;
-				}
-				case 6: {		// 6x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm7, qword ptr [esi]
-						movlps		xmm6, qword ptr [esi+8]
-						shufps		xmm7, xmm7, 0x44
-						shufps		xmm6, xmm6, 0x44
-						movlps		xmm0, qword ptr [edi    ]
-						movhps		xmm0, qword ptr [edi+ 24]
-						mulps		xmm0, xmm7
-						movlps		xmm3, qword ptr [edi+  8]
-						movhps		xmm3, qword ptr [edi+ 32]
-						mulps		xmm3, xmm6
-						movlps		xmm1, qword ptr [edi+ 48]
-						movhps		xmm1, qword ptr [edi+ 72]
-						mulps		xmm1, xmm7
-						movlps		xmm2, qword ptr [edi+ 96]
-						movhps		xmm2, qword ptr [edi+120]
-						mulps		xmm2, xmm7
-						movlps		xmm4, qword ptr [edi+ 56]
-						movhps		xmm4, qword ptr [edi+ 80]
-						movlps		xmm5, qword ptr [edi+104]
-						movhps		xmm5, qword ptr [edi+128]
-						mulps		xmm4, xmm6
-						movlps		xmm7, qword ptr [esi+16]
-						addps		xmm0, xmm3
-						shufps		xmm7, xmm7, 0x44
-						mulps		xmm5, xmm6
-						addps		xmm1, xmm4
-						movlps		xmm3, qword ptr [edi+ 16]
-						movhps		xmm3, qword ptr [edi+ 40]
-						addps		xmm2, xmm5
-						movlps		xmm4, qword ptr [edi+ 64]
-						movhps		xmm4, qword ptr [edi+ 88]
-						mulps		xmm3, xmm7
-						movlps		xmm5, qword ptr [edi+112]
-						movhps		xmm5, qword ptr [edi+136]
-						addps		xmm0, xmm3
-						mulps		xmm4, xmm7
-						mulps		xmm5, xmm7
-						addps		xmm1, xmm4
-						addps		xmm2, xmm5
-						movaps		xmm6, xmm0
-						shufps		xmm0, xmm1, 0x88
-						shufps		xmm6, xmm1, 0xDD
-						movaps		xmm7, xmm2
-						shufps		xmm7, xmm2, 0x88
-						shufps		xmm2, xmm2, 0xDD
-						addps		xmm0, xmm6
-						addps		xmm2, xmm7
-						STORE4( 0, xmm0, xmm3 )
-						STORE2LO( 16, xmm2, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
-									mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
-						mPtr += 6;
-					}
-					return;
-				}
+		case 6: {		// 6x2 * 2x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm7, [esi]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movaps		xmm0, [edi]
+				mulps		xmm0, xmm7
+				movaps		xmm1, [edi + 16]
+				mulps		xmm1, xmm7
+				movaps		xmm2, xmm0
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				movaps		xmm3, [edi + 32]
+				addps		xmm0, xmm2
+				mulps		xmm3, xmm7
+				STORE4( 0, xmm0, xmm4 )
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm1, xmm3
+				addps		xmm3, xmm1
+				STORE2LO( 16, xmm3, xmm4 )
 			}
-			break;
+			return;
 		}
 		default: {
-			int numColumns = mat.GetNumColumns();
 			for ( int i = 0; i < numRows; i++ ) {
-				float sum = mPtr[0] * vPtr[0];
-				for ( int j = 1; j < numColumns; j++ ) {
-					sum += mPtr[j] * vPtr[j];
-				}
-				dstPtr[i] STOREC sum;
-				mPtr += numColumns;
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
+				mPtr += 2;
 			}
-			break;
+			return;
+		}
 		}
+		break;
 	}
-
-#undef STOREC
-#undef STORE4
-#undef STORE2HI
-#undef STORE2LO
-#undef STORE1
-}
-
-/*
-============
-idSIMD_SSE::MatX_MultiplyAddVecX
-
-	optimizes the following matrix multiplications:
-
-	NxN * Nx1
-	Nx6 * 6x1
-	6xN * Nx1
-
-	with N in the range [1-6]
-============
-*/
-void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
-#define STORE1( offset, reg1, reg2 )		\
-	__asm movss		reg2, [eax+offset]		\
-	__asm addss		reg2, reg1				\
-	__asm movss		[eax+offset], reg2
-#define STORE2LO( offset, reg1, reg2 )		\
-	__asm movlps	reg2, [eax+offset]		\
-	__asm addps		reg2, reg1				\
-	__asm movlps	[eax+offset], reg2
-#define STORE2HI( offset, reg1, reg2 )		\
-	__asm movhps	reg2, [eax+offset]		\
-	__asm addps		reg2, reg1				\
-	__asm movhps	[eax+offset], reg2
-#define STORE4( offset, reg1, reg2 )		\
-	__asm movlps	reg2, [eax+offset]		\
-	__asm movhps	reg2, [eax+offset+8]	\
-	__asm addps		reg2, reg1				\
-	__asm movlps	[eax+offset], reg2		\
-	__asm movhps	[eax+offset+8], reg2
-#define STOREC		+=
-
-	int numRows;
-	const float *mPtr, *vPtr;
-	float *dstPtr;
-
-	assert( vec.GetSize() >= mat.GetNumColumns() );
-	assert( dst.GetSize() >= mat.GetNumRows() );
-
-	mPtr = mat.ToFloatPtr();
-	vPtr = vec.ToFloatPtr();
-	dstPtr = dst.ToFloatPtr();
-	numRows = mat.GetNumRows();
-	switch( mat.GetNumColumns() ) {
-		case 1: {
-			switch( numRows ) {
-				case 1: {		// 1x1 * 1x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						mulss		xmm0, [edi]
-						STORE1( 0, xmm0, xmm1 )
-					}
-					return;
-				}
-				case 6: {		// 6x1 * 1x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm1, xmm0
-						mulps		xmm0, [edi]
-						mulps		xmm1, [edi+16]
-						STORE4( 0, xmm0, xmm2 )
-						STORE2LO( 16, xmm1, xmm2 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0];
-						mPtr++;
-					}
-					return;
-				}
+	case 3: {
+		switch ( numRows ) {
+		case 3: {		// 3x3 * 3x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				movss		xmm4, [edi]
+				mulss		xmm4, xmm0
+				movss		xmm1, [esi+4]
+				movss		xmm5, [edi+4]
+				mulss		xmm5, xmm1
+				addss		xmm4, xmm5
+				movss		xmm2, [esi+8]
+				movss		xmm6, [edi+8]
+				mulss		xmm6, xmm2
+				addss		xmm4, xmm6
+				movss		xmm3, [edi+12]
+				mulss		xmm3, xmm0
+				STORE1( 0, xmm4, xmm7 );
+				movss		xmm5, [edi + 12 + 4]
+				mulss		xmm5, xmm1
+				addss		xmm3, xmm5
+				movss		xmm6, [edi + 12 + 8]
+				mulss		xmm6, xmm2
+				addss		xmm3, xmm6
+				mulss		xmm0, [edi + 24]
+				mulss		xmm1, [edi + 24 + 4]
+				STORE1( 4, xmm3, xmm7 );
+				addss		xmm0, xmm1
+				mulss		xmm2, [edi + 24 + 8]
+				addss		xmm0, xmm2
+				STORE1( 8, xmm0, xmm7 );
 			}
-			break;
+			return;
 		}
-		case 2: {
-			switch( numRows ) {
-				case 2: {		// 2x2 * 2x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						movss		xmm1, [esi+4]
-						movss		xmm2, [edi]
-						mulss		xmm2, xmm0
-						movss		xmm3, [edi+4]
-						mulss		xmm3, xmm1
-						addss		xmm2, xmm3
-						STORE1( 0, xmm2, xmm4 )
-						mulss		xmm0, [edi+8]
-						mulss		xmm1, [edi+8+4]
-						addss		xmm0, xmm1
-						STORE1( 4, xmm0, xmm4 )
-					}
-					return;
-				}
-				case 6: {		// 6x2 * 2x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm7, [esi]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movaps		xmm0, [edi]
-						mulps		xmm0, xmm7
-						movaps		xmm1, [edi+16]
-						mulps		xmm1, xmm7
-						movaps		xmm2, xmm0
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						movaps		xmm3, [edi+32]
-						addps		xmm0, xmm2
-						mulps		xmm3, xmm7
-						STORE4( 0, xmm0, xmm4 )
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm1, xmm3
-						addps		xmm3, xmm1
-						STORE2LO( 16, xmm3, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
-						mPtr += 2;
-					}
-					return;
-				}
+		case 6: {		// 6x3 * 3x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm5, [esi]
+				shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movss		xmm6, [esi + 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movss		xmm7, [esi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm0, [edi]								// xmm0 = 0, 1, 2, 3
+				movlps		xmm1, [edi + 4 * 4]
+				shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
+				movlps		xmm2, [edi + 6 * 4]
+				movhps		xmm2, [edi + 8 * 4]							// xmm2 = 6, 7, 8, 9
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
+				mulps		xmm0, xmm5
+				movlps		xmm3, [edi + 10 * 4]
+				shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
+				movaps		xmm3, xmm1
+				shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
+				mulps		xmm1, xmm6
+				shufps		xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
+				mulps		xmm3, xmm7
+				addps		xmm0, xmm1
+				addps		xmm0, xmm3
+				STORE4( 0, xmm0, xmm4 )
+				movss		xmm1, [edi + 12 * 4]
+				mulss		xmm1, xmm5
+				movss		xmm2, [edi + 13 * 4]
+				mulss		xmm2, xmm6
+				movss		xmm3, [edi + 14 * 4]
+				mulss		xmm3, xmm7
+				addss		xmm1, xmm2
+				addss		xmm1, xmm3
+				STORE1( 16, xmm1, xmm4 )
+				mulss		xmm5, [edi + 15 * 4]
+				mulss		xmm6, [edi + 16 * 4]
+				mulss		xmm7, [edi + 17 * 4]
+				addss		xmm5, xmm6
+				addss		xmm5, xmm7
+				STORE1( 20, xmm5, xmm4 )
 			}
-			break;
+			return;
 		}
-		case 3: {
-			switch( numRows ) {
-				case 3: {		// 3x3 * 3x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						movss		xmm4, [edi]
-						mulss		xmm4, xmm0
-						movss		xmm1, [esi+4]
-						movss		xmm5, [edi+4]
-						mulss		xmm5, xmm1
-						addss		xmm4, xmm5
-						movss		xmm2, [esi+8]
-						movss		xmm6, [edi+8]
-						mulss		xmm6, xmm2
-						addss		xmm4, xmm6
-						movss		xmm3, [edi+12]
-						mulss		xmm3, xmm0
-						STORE1( 0, xmm4, xmm7 );
-						movss		xmm5, [edi+12+4]
-						mulss		xmm5, xmm1
-						addss		xmm3, xmm5
-						movss		xmm6, [edi+12+8]
-						mulss		xmm6, xmm2
-						addss		xmm3, xmm6
-						mulss		xmm0, [edi+24]
-						mulss		xmm1, [edi+24+4]
-						STORE1( 4, xmm3, xmm7 );
-						addss		xmm0, xmm1
-						mulss		xmm2, [edi+24+8]
-						addss		xmm0, xmm2
-						STORE1( 8, xmm0, xmm7 );
-					}
-					return;
-				}
-				case 6: {		// 6x3 * 3x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm5, [esi]
-						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movss		xmm6, [esi+4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movss		xmm7, [esi+8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm0, [edi]								// xmm0 = 0, 1, 2, 3
-						movlps		xmm1, [edi+4*4]
-						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
-						movlps		xmm2, [edi+6*4]
-						movhps		xmm2, [edi+8*4]							// xmm2 = 6, 7, 8, 9
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
-						mulps		xmm0, xmm5
-						movlps		xmm3, [edi+10*4]
-						shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
-						movaps		xmm3, xmm1
-						shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
-						mulps		xmm1, xmm6
-						shufps		xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
-						mulps		xmm3, xmm7
-						addps		xmm0, xmm1
-						addps		xmm0, xmm3
-						STORE4( 0, xmm0, xmm4 )
-						movss		xmm1, [edi+12*4]
-						mulss		xmm1, xmm5
-						movss		xmm2, [edi+13*4]
-						mulss		xmm2, xmm6
-						movss		xmm3, [edi+14*4]
-						mulss		xmm3, xmm7
-						addss		xmm1, xmm2
-						addss		xmm1, xmm3
-						STORE1( 16, xmm1, xmm4 )
-						mulss		xmm5, [edi+15*4]
-						mulss		xmm6, [edi+16*4]
-						mulss		xmm7, [edi+17*4]
-						addss		xmm5, xmm6
-						addss		xmm5, xmm7
-						STORE1( 20, xmm5, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
-						mPtr += 3;
-					}
-					return;
-				}
+		default: {
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
+				mPtr += 3;
 			}
-			break;
+			return;
 		}
-		case 4: {
-			switch( numRows ) {
-				case 4: {		// 4x4 * 4x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, qword ptr [esi ]
-						movlps		xmm0, qword ptr [edi ]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm0, qword ptr [edi+16]
-						mulps		xmm0, xmm6
-						movlps		xmm7, qword ptr [esi+ 8]
-						movlps		xmm2, qword ptr [edi+ 8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm2, qword ptr [edi+24]
-						mulps		xmm2, xmm7
-						movlps		xmm1, qword ptr [edi+32]
-						movhps		xmm1, qword ptr [edi+48]
-						mulps		xmm1, xmm6
-						movlps		xmm3, qword ptr [edi+40]
-						addps		xmm0, xmm2
-						movhps		xmm3, qword ptr [edi+56]
-						mulps		xmm3, xmm7
-						movaps		xmm4, xmm0
-						addps		xmm1, xmm3
-						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm4
-						STORE4( 0, xmm0, xmm2 )
-					}
-					return;
-				}
-				case 6: {		// 6x4 * 4x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, qword ptr [esi+ 0]
-						movlps		xmm0, qword ptr [edi+ 0]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm0, qword ptr [edi+16]
-						mulps		xmm0, xmm6
-						movlps		xmm7, qword ptr [esi+ 8]
-						movlps		xmm2, qword ptr [edi+ 8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm2, qword ptr [edi+24]
-						mulps		xmm2, xmm7
-						movlps		xmm1, qword ptr [edi+32]
-						movhps		xmm1, qword ptr [edi+48]
-						mulps		xmm1, xmm6
-						movlps		xmm3, qword ptr [edi+40]
-						addps		xmm0, xmm2
-						movhps		xmm3, qword ptr [edi+56]
-						mulps		xmm3, xmm7
-						movaps		xmm4, xmm0
-						addps		xmm1, xmm3
-						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm4
-						movlps		xmm1, qword ptr [edi+64]
-						movhps		xmm1, qword ptr [edi+80]
-						STORE4( 0, xmm0, xmm4 )
-						mulps		xmm1, xmm6
-						movlps		xmm2, qword ptr [edi+72]
-						movhps		xmm2, qword ptr [edi+88]
-						mulps		xmm2, xmm7
-						addps		xmm1, xmm2
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm3, xmm1
-						addps		xmm1, xmm3
-						STORE2LO( 16, xmm1, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
-						mPtr += 4;
-					}
-					return;
-				}
+		}
+		break;
+	}
+	case 4: {
+		switch ( numRows ) {
+		case 4: {		// 4x4 * 4x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, qword ptr [esi ]
+				movlps		xmm0, qword ptr [edi ]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm0, qword ptr [edi + 16]
+				mulps		xmm0, xmm6
+				movlps		xmm7, qword ptr [esi + 8]
+				movlps		xmm2, qword ptr [edi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm2, qword ptr [edi + 24]
+				mulps		xmm2, xmm7
+				movlps		xmm1, qword ptr [edi + 32]
+				movhps		xmm1, qword ptr [edi + 48]
+				mulps		xmm1, xmm6
+				movlps		xmm3, qword ptr [edi + 40]
+				addps		xmm0, xmm2
+				movhps		xmm3, qword ptr [edi + 56]
+				mulps		xmm3, xmm7
+				movaps		xmm4, xmm0
+				addps		xmm1, xmm3
+				shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm4
+				STORE4( 0, xmm0, xmm2 )
 			}
-			break;
+			return;
 		}
-		case 5: {
-			switch( numRows ) {
-				case 5: {		// 5x5 * 5x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [edi+5*4]							// xmm0 =  5,  X,  X,  X
-						movhps		xmm0, [edi+0*4]							// xmm0 =  5,  X,  0,  1
-						movss		xmm5, [edi+15*4]						// xmm4 = 15,  X,  X,  X
-						movhps		xmm5, [edi+10*4]						// xmm5 = 15,  X, 10, 11
-						movaps		xmm1, xmm0								// xmm1 =  5,  X,  0,  1
-						shufps		xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 =  0,  5, 10, 15
-						movlps		xmm1, [edi+6*4]							// xmm1 =  6,  7,  0,  1
-						movlps		xmm5, [edi+16*4]						// xmm5 = 16, 17, 10, 11
-						movaps		xmm2, xmm1								// xmm2 =  6,  7,  0,  1
-						shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 =  1,  6, 11, 16
-						movhps		xmm2, [edi+2*4]							// xmm2 =  6,  7,  2,  3
-						movhps		xmm5, [edi+12*4]						// xmm5 = 16, 17, 12, 13
-						movaps		xmm3, xmm2								// xmm3 =  6,  7,  2,  3
-						shufps		xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 =  2,  7, 12, 17
-						movlps		xmm3, [edi+8*4]							// xmm3 =  8,  9,  2,  3
-						movlps		xmm5, [edi+18*4]						// xmm5 = 18, 19, 12, 13
-						movss		xmm4, [edi+4*4]							// xmm4 =  4,  X,  X,  X
-						movlhps		xmm4, xmm3								// xmm4 =  4,  X,  8,  9
-						shufps		xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 =  3,  8, 13, 18
-						movhps		xmm5, [edi+14*4]						// xmm6 = 18, 19, 14, 15
-						shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 =  4,  9, 14, 19
-						movss		xmm7, [esi+0*4]
-						shufps		xmm7, xmm7, 0
-						mulps		xmm0, xmm7
-						movss		xmm5, [esi+1*4]
-						shufps		xmm5, xmm5, 0
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						movss		xmm6, [esi+2*4]
-						shufps		xmm6, xmm6, 0
-						mulps		xmm2, xmm6
-						addps		xmm0, xmm2
-						movss		xmm1, [esi+3*4]
-						shufps		xmm1, xmm1, 0
-						mulps		xmm3, xmm1
-						addps		xmm0, xmm3
-						movss		xmm2, [esi+4*4]
-						shufps		xmm2, xmm2, 0
-						mulps		xmm4, xmm2
-						addps		xmm0, xmm4
-						mulss		xmm7, [edi+20*4]
-						mulss		xmm5, [edi+21*4]
-						addps		xmm7, xmm5
-						mulss		xmm6, [edi+22*4]
-						addps		xmm7, xmm6
-						mulss		xmm1, [edi+23*4]
-						addps		xmm7, xmm1
-						mulss		xmm2, [edi+24*4]
-						addps		xmm7, xmm2
-						STORE4( 0, xmm0, xmm3 )
-						STORE1( 16, xmm7, xmm4 )
-					}
-					return;
-				}
-				case 6: {		// 6x5 * 5x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, [esi]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movlps		xmm7, [esi+8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movlps		xmm0, [edi]
-						movhps		xmm3, [edi+8]
-						movaps		xmm1, [edi+16]
-						movlps		xmm2, [edi+32]
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
-						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
-						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
-						mulps		xmm0, xmm6
-						mulps		xmm3, xmm7
-						movlps		xmm2, [edi+40]
-						addps		xmm0, xmm3								// xmm0 + xmm1
-						movhps		xmm5, [edi+40+8]
-						movlps		xmm3, [edi+40+16]
-						movhps		xmm3, [edi+40+24]
-						movlps		xmm4, [edi+40+32]
-						shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
-						shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
-						shufps		xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
-						mulps		xmm2, xmm6
-						mulps		xmm5, xmm7
-						addps		xmm2, xmm5								// xmm2 + xmm3
-						movss		xmm5, [esi+16]
-						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm4, xmm0
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
-						shufps		xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
-						addps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						STORE4( 0, xmm0, xmm2 )
-						movlps		xmm4, [edi+80]
-						movhps		xmm3, [edi+80+8]
-						movaps		xmm1, [edi+80+16]
-						movlps		xmm2, [edi+80+32]
-						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
-						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
-						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
-						mulps		xmm4, xmm6
-						mulps		xmm3, xmm7
-						mulps		xmm1, xmm5
-						addps		xmm4, xmm3								// xmm4 + xmm1
-						shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
-						addps		xmm4, xmm1
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
-						addps		xmm4, xmm1
-						STORE2LO( 16, xmm4, xmm2 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
-						mPtr += 5;
-					}
-					return;
-				}
+		case 6: {		// 6x4 * 4x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, qword ptr [esi+ 0]
+				movlps		xmm0, qword ptr [edi+ 0]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm0, qword ptr [edi + 16]
+				mulps		xmm0, xmm6
+				movlps		xmm7, qword ptr [esi + 8]
+				movlps		xmm2, qword ptr [edi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm2, qword ptr [edi + 24]
+				mulps		xmm2, xmm7
+				movlps		xmm1, qword ptr [edi + 32]
+				movhps		xmm1, qword ptr [edi + 48]
+				mulps		xmm1, xmm6
+				movlps		xmm3, qword ptr [edi + 40]
+				addps		xmm0, xmm2
+				movhps		xmm3, qword ptr [edi + 56]
+				mulps		xmm3, xmm7
+				movaps		xmm4, xmm0
+				addps		xmm1, xmm3
+				shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm4
+				movlps		xmm1, qword ptr [edi + 64]
+				movhps		xmm1, qword ptr [edi + 80]
+				STORE4( 0, xmm0, xmm4 )
+				mulps		xmm1, xmm6
+				movlps		xmm2, qword ptr [edi + 72]
+				movhps		xmm2, qword ptr [edi + 88]
+				mulps		xmm2, xmm7
+				addps		xmm1, xmm2
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm3, xmm1
+				addps		xmm1, xmm3
+				STORE2LO( 16, xmm1, xmm4 )
 			}
-			break;
+			return;
 		}
-		case 6: {
-			switch( numRows ) {
-				case 1: {		// 1x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						mulss		xmm0, [edi]
-						movss		xmm1, [esi+4]
-						mulss		xmm1, [edi+4]
-						movss		xmm2, [esi+8]
-						addss		xmm0, xmm1
-						mulss		xmm2, [edi+8]
-						movss		xmm3, [esi+12]
-						addss		xmm0, xmm2
-						mulss		xmm3, [edi+12]
-						movss		xmm4, [esi+16]
-						addss		xmm0, xmm3
-						mulss		xmm4, [edi+16]
-						movss		xmm5, [esi+20]
-						addss		xmm0, xmm4
-						mulss		xmm5, [edi+20]
-						movss		xmm6, [esi+24]
-						addss		xmm0, xmm5
-						mulss		xmm6, [edi+24]
-						addss		xmm0, xmm6
-						STORE1( 0, xmm0, xmm7 )
-					}
-					return;
-				}
-				case 2: {		// 2x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm0, xmm1
-						addps		xmm0, xmm1
-						STORE2LO( 0, xmm0, xmm3 )
-					}
-					return;
-				}
-				case 3: {		// 3x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm0, xmm1
-						addps		xmm0, xmm1
-						STORE2LO( 0, xmm0, xmm3 )
-						// row 2
-						movaps		xmm0, [edi+48]
-						movaps		xmm1, [edi+48+16]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						movhlps		xmm1, xmm0
-						addps		xmm0, xmm1
-						movaps		xmm1, xmm0
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
-						addss		xmm0, xmm1
-						STORE1( 8, xmm0, xmm3 )
-					}
-					return;
-				}
-				case 4: {		// 4x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm7, xmm0
-						movlhps		xmm7, xmm2
-						addps		xmm7, xmm1
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm7, xmm0
-						// row 2 and 3
-						movaps		xmm0, [edi+48]
-						movaps		xmm1, [edi+48+16]
-						movaps		xmm2, [edi+48+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						// last 4 additions for the first 4 rows and store result
-						movaps		xmm0, xmm7
-						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm7
-						STORE4( 0, xmm0, xmm4 )
-					}
-					return;
-				}
-				case 5: {		// 5x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm7, xmm0
-						movlhps		xmm7, xmm2
-						addps		xmm7, xmm1
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm7, xmm0
-						// row 2 and 3
-						movaps		xmm0, [edi+48]
-						movaps		xmm1, [edi+48+16]
-						movaps		xmm2, [edi+48+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						// last 4 additions for the first 4 rows and store result
-						movaps		xmm0, xmm7
-						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm7
-						STORE4( 0, xmm0, xmm3 )
-						// row 5
-						movaps		xmm0, [edi+96]
-						movaps		xmm1, [edi+96+16]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						movhlps		xmm1, xmm0
-						addps		xmm0, xmm1
-						movaps		xmm1, xmm0
-						shufps		xmm1, xmm1, 0x01
-						addss		xmm0, xmm1
-						STORE1( 16, xmm0, xmm3 )
-					}
-					return;
-				}
-				case 6: {		// 6x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm7, qword ptr [esi]
-						movlps		xmm6, qword ptr [esi+8]
-						shufps		xmm7, xmm7, 0x44
-						shufps		xmm6, xmm6, 0x44
-						movlps		xmm0, qword ptr [edi    ]
-						movhps		xmm0, qword ptr [edi+ 24]
-						mulps		xmm0, xmm7
-						movlps		xmm3, qword ptr [edi+  8]
-						movhps		xmm3, qword ptr [edi+ 32]
-						mulps		xmm3, xmm6
-						movlps		xmm1, qword ptr [edi+ 48]
-						movhps		xmm1, qword ptr [edi+ 72]
-						mulps		xmm1, xmm7
-						movlps		xmm2, qword ptr [edi+ 96]
-						movhps		xmm2, qword ptr [edi+120]
-						mulps		xmm2, xmm7
-						movlps		xmm4, qword ptr [edi+ 56]
-						movhps		xmm4, qword ptr [edi+ 80]
-						movlps		xmm5, qword ptr [edi+104]
-						movhps		xmm5, qword ptr [edi+128]
-						mulps		xmm4, xmm6
-						movlps		xmm7, qword ptr [esi+16]
-						addps		xmm0, xmm3
-						shufps		xmm7, xmm7, 0x44
-						mulps		xmm5, xmm6
-						addps		xmm1, xmm4
-						movlps		xmm3, qword ptr [edi+ 16]
-						movhps		xmm3, qword ptr [edi+ 40]
-						addps		xmm2, xmm5
-						movlps		xmm4, qword ptr [edi+ 64]
-						movhps		xmm4, qword ptr [edi+ 88]
-						mulps		xmm3, xmm7
-						movlps		xmm5, qword ptr [edi+112]
-						movhps		xmm5, qword ptr [edi+136]
-						addps		xmm0, xmm3
-						mulps		xmm4, xmm7
-						mulps		xmm5, xmm7
-						addps		xmm1, xmm4
-						addps		xmm2, xmm5
-						movaps		xmm6, xmm0
-						shufps		xmm0, xmm1, 0x88
-						shufps		xmm6, xmm1, 0xDD
-						movaps		xmm7, xmm2
-						shufps		xmm7, xmm2, 0x88
-						shufps		xmm2, xmm2, 0xDD
-						addps		xmm0, xmm6
-						addps		xmm2, xmm7
-						STORE4( 0, xmm0, xmm3 )
-						STORE2LO( 16, xmm2, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
-									mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
-						mPtr += 6;
-					}
-					return;
-				}
+		default: {
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
+				mPtr += 4;
 			}
-			break;
+			return;
+		}
+		}
+		break;
+	}
+	case 5: {
+		switch ( numRows ) {
+		case 5: {		// 5x5 * 5x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [edi+5*4]							// xmm0 =  5,  X,  X,  X
+				movhps		xmm0, [edi+0*4]							// xmm0 =  5,  X,  0,  1
+				movss		xmm5, [edi+15*4]						// xmm5 = 15,  X,  X,  X
+				movhps		xmm5, [edi+10*4]						// xmm5 = 15,  X, 10, 11
+				movaps		xmm1, xmm0								// xmm1 =  5,  X,  0,  1
+				shufps		xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 =  0,  5, 10, 15
+				movlps		xmm1, [edi + 6 * 4]							// xmm1 =  6,  7,  0,  1
+				movlps		xmm5, [edi + 16 * 4]						// xmm5 = 16, 17, 10, 11
+				movaps		xmm2, xmm1								// xmm2 =  6,  7,  0,  1
+				shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 =  1,  6, 11, 16
+				movhps		xmm2, [edi + 2 * 4]							// xmm2 =  6,  7,  2,  3
+				movhps		xmm5, [edi + 12 * 4]						// xmm5 = 16, 17, 12, 13
+				movaps		xmm3, xmm2								// xmm3 =  6,  7,  2,  3
+				shufps		xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 =  2,  7, 12, 17
+				movlps		xmm3, [edi + 8 * 4]							// xmm3 =  8,  9,  2,  3
+				movlps		xmm5, [edi + 18 * 4]						// xmm5 = 18, 19, 12, 13
+				movss		xmm4, [edi + 4 * 4]							// xmm4 =  4,  X,  X,  X
+				movlhps		xmm4, xmm3								// xmm4 =  4,  X,  8,  9
+				shufps		xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 =  3,  8, 13, 18
+				movhps		xmm5, [edi + 14 * 4]						// xmm5 = 18, 19, 14, 15
+				shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 =  4,  9, 14, 19
+				movss		xmm7, [esi + 0 * 4]
+				shufps		xmm7, xmm7, 0
+				mulps		xmm0, xmm7
+				movss		xmm5, [esi + 1 * 4]
+				shufps		xmm5, xmm5, 0
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				movss		xmm6, [esi + 2 * 4]
+				shufps		xmm6, xmm6, 0
+				mulps		xmm2, xmm6
+				addps		xmm0, xmm2
+				movss		xmm1, [esi + 3 * 4]
+				shufps		xmm1, xmm1, 0
+				mulps		xmm3, xmm1
+				addps		xmm0, xmm3
+				movss		xmm2, [esi + 4 * 4]
+				shufps		xmm2, xmm2, 0
+				mulps		xmm4, xmm2
+				addps		xmm0, xmm4
+				mulss		xmm7, [edi + 20 * 4]
+				mulss		xmm5, [edi + 21 * 4]
+				addps		xmm7, xmm5
+				mulss		xmm6, [edi + 22 * 4]
+				addps		xmm7, xmm6
+				mulss		xmm1, [edi + 23 * 4]
+				addps		xmm7, xmm1
+				mulss		xmm2, [edi + 24 * 4]
+				addps		xmm7, xmm2
+				STORE4( 0, xmm0, xmm3 )
+				STORE1( 16, xmm7, xmm4 )
+			}
+			return;
+		}
+		case 6: {		// 6x5 * 5x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, [esi]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movlps		xmm7, [esi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movlps		xmm0, [edi]
+				movhps		xmm3, [edi + 8]
+				movaps		xmm1, [edi + 16]
+				movlps		xmm2, [edi + 32]
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
+				shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
+				shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
+				mulps		xmm0, xmm6
+				mulps		xmm3, xmm7
+				movlps		xmm2, [edi + 40]
+				addps		xmm0, xmm3								// xmm0 + xmm1
+				movhps		xmm5, [edi + 40 + 8]
+				movlps		xmm3, [edi + 40 + 16]
+				movhps		xmm3, [edi + 40 + 24]
+				movlps		xmm4, [edi + 40 + 32]
+				shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
+				shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
+				shufps		xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
+				mulps		xmm2, xmm6
+				mulps		xmm5, xmm7
+				addps		xmm2, xmm5								// xmm2 + xmm3
+				movss		xmm5, [esi + 16]
+				shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm4, xmm0
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
+				shufps		xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
+				addps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				STORE4( 0, xmm0, xmm2 )
+				movlps		xmm4, [edi + 80]
+				movhps		xmm3, [edi + 80 + 8]
+				movaps		xmm1, [edi + 80 + 16]
+				movlps		xmm2, [edi + 80 + 32]
+				shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
+				shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
+				shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
+				mulps		xmm4, xmm6
+				mulps		xmm3, xmm7
+				mulps		xmm1, xmm5
+				addps		xmm4, xmm3								// xmm4 + xmm1
+				shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
+				addps		xmm4, xmm1
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
+				addps		xmm4, xmm1
+				STORE2LO( 16, xmm4, xmm2 )
+			}
+			return;
 		}
 		default: {
-			int numColumns = mat.GetNumColumns();
 			for ( int i = 0; i < numRows; i++ ) {
-				float sum = mPtr[0] * vPtr[0];
-				for ( int j = 1; j < numColumns; j++ ) {
-					sum += mPtr[j] * vPtr[j];
-				}
-				dstPtr[i] STOREC sum;
-				mPtr += numColumns;
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
+				mPtr += 5;
 			}
-			break;
+			return;
+		}
+		}
+		break;
+	}
+	case 6: {
+		switch ( numRows ) {
+		case 1: {		// 1x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				mulss		xmm0, [edi]
+				movss		xmm1, [esi+4]
+				mulss		xmm1, [edi+4]
+				movss		xmm2, [esi+8]
+				addss		xmm0, xmm1
+				mulss		xmm2, [edi+8]
+				movss		xmm3, [esi+12]
+				addss		xmm0, xmm2
+				mulss		xmm3, [edi+12]
+				movss		xmm4, [esi+16]
+				addss		xmm0, xmm3
+				mulss		xmm4, [edi+16]
+				movss		xmm5, [esi+20]
+				addss		xmm0, xmm4
+				mulss		xmm5, [edi+20]
+				movss		xmm6, [esi+24]
+				addss		xmm0, xmm5
+				mulss		xmm6, [edi+24]
+				addss		xmm0, xmm6
+				STORE1( 0, xmm0, xmm7 )
+			}
+			return;
+		}
+		case 2: {		// 2x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm0, xmm1
+				addps		xmm0, xmm1
+				STORE2LO( 0, xmm0, xmm3 )
+			}
+			return;
+		}
+		case 3: {		// 3x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm0, xmm1
+				addps		xmm0, xmm1
+				STORE2LO( 0, xmm0, xmm3 )
+				// row 2
+				movaps		xmm0, [edi + 48]
+				movaps		xmm1, [edi + 48 + 16]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				movhlps		xmm1, xmm0
+				addps		xmm0, xmm1
+				movaps		xmm1, xmm0
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
+				addss		xmm0, xmm1
+				STORE1( 8, xmm0, xmm3 )
+			}
+			return;
+		}
+		case 4: {		// 4x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm7, xmm0
+				movlhps		xmm7, xmm2
+				addps		xmm7, xmm1
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm7, xmm0
+				// row 2 and 3
+				movaps		xmm0, [edi + 48]
+				movaps		xmm1, [edi + 48 + 16]
+				movaps		xmm2, [edi + 48 + 32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				// last 4 additions for the first 4 rows and store result
+				movaps		xmm0, xmm7
+				shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm7
+				STORE4( 0, xmm0, xmm4 )
+			}
+			return;
+		}
+		case 5: {		// 5x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm7, xmm0
+				movlhps		xmm7, xmm2
+				addps		xmm7, xmm1
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm7, xmm0
+				// row 2 and 3
+				movaps		xmm0, [edi + 48]
+				movaps		xmm1, [edi + 48 + 16]
+				movaps		xmm2, [edi + 48 + 32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				// last 4 additions for the first 4 rows and store result
+				movaps		xmm0, xmm7
+				shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm7
+				STORE4( 0, xmm0, xmm3 )
+				// row 4
+				movaps		xmm0, [edi + 96]
+				movaps		xmm1, [edi + 96 + 16]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				movhlps		xmm1, xmm0
+				addps		xmm0, xmm1
+				movaps		xmm1, xmm0
+				shufps		xmm1, xmm1, 0x01
+				addss		xmm0, xmm1
+				STORE1( 16, xmm0, xmm3 )
+			}
+			return;
+		}
+		case 6: {		// 6x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm7, qword ptr [esi]
+				movlps		xmm6, qword ptr [esi+8]
+				shufps		xmm7, xmm7, 0x44
+				shufps		xmm6, xmm6, 0x44
+				movlps		xmm0, qword ptr [edi    ]
+				movhps		xmm0, qword ptr [edi+ 24]
+				mulps		xmm0, xmm7
+				movlps		xmm3, qword ptr [edi+  8]
+				movhps		xmm3, qword ptr [edi+ 32]
+				mulps		xmm3, xmm6
+				movlps		xmm1, qword ptr [edi+ 48]
+				movhps		xmm1, qword ptr [edi+ 72]
+				mulps		xmm1, xmm7
+				movlps		xmm2, qword ptr [edi+ 96]
+				movhps		xmm2, qword ptr [edi+120]
+				mulps		xmm2, xmm7
+				movlps		xmm4, qword ptr [edi+ 56]
+				movhps		xmm4, qword ptr [edi+ 80]
+				movlps		xmm5, qword ptr [edi+104]
+				movhps		xmm5, qword ptr [edi+128]
+				mulps		xmm4, xmm6
+				movlps		xmm7, qword ptr [esi+16]
+				addps		xmm0, xmm3
+				shufps		xmm7, xmm7, 0x44
+				mulps		xmm5, xmm6
+				addps		xmm1, xmm4
+				movlps		xmm3, qword ptr [edi+ 16]
+				movhps		xmm3, qword ptr [edi+ 40]
+				addps		xmm2, xmm5
+				movlps		xmm4, qword ptr [edi+ 64]
+				movhps		xmm4, qword ptr [edi+ 88]
+				mulps		xmm3, xmm7
+				movlps		xmm5, qword ptr [edi+112]
+				movhps		xmm5, qword ptr [edi+136]
+				addps		xmm0, xmm3
+				mulps		xmm4, xmm7
+				mulps		xmm5, xmm7
+				addps		xmm1, xmm4
+				addps		xmm2, xmm5
+				movaps		xmm6, xmm0
+				shufps		xmm0, xmm1, 0x88
+				shufps		xmm6, xmm1, 0xDD
+				movaps		xmm7, xmm2
+				shufps		xmm7, xmm2, 0x88
+				shufps		xmm2, xmm2, 0xDD
+				addps		xmm0, xmm6
+				addps		xmm2, xmm7
+				STORE4( 0, xmm0, xmm3 )
+				STORE2LO( 16, xmm2, xmm4 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
+				mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
+				mPtr += 6;
+			}
+			return;
+		}
+		}
+		break;
+	}
+	default: {
+		int numColumns = mat.GetNumColumns();
+		for ( int i = 0; i < numRows; i++ ) {
+			float sum = mPtr[0] * vPtr[0];
+			for ( int j = 1; j < numColumns; j++ ) {
+				sum += mPtr[j] * vPtr[j];
+			}
+			dstPtr[i] STOREC sum;
+			mPtr += numColumns;
 		}
+		break;
+	}
 	}
 
 #undef STOREC
@@ -5866,7 +5216,7 @@ void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, co
 
 /*
 ============
-idSIMD_SSE::MatX_MultiplySubVecX
+idSIMD_SSE::MatX_MultiplyAddVecX
 
 	optimizes the following matrix multiplications:
 
@@ -5877,26 +5227,26 @@ idSIMD_SSE::MatX_MultiplySubVecX
 	with N in the range [1-6]
 ============
 */
-void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
+void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
 #define STORE1( offset, reg1, reg2 )		\
 	__asm movss		reg2, [eax+offset]		\
-	__asm subss		reg2, reg1				\
+	__asm addss		reg2, reg1				\
 	__asm movss		[eax+offset], reg2
 #define STORE2LO( offset, reg1, reg2 )		\
 	__asm movlps	reg2, [eax+offset]		\
-	__asm subps		reg2, reg1				\
+	__asm addps		reg2, reg1				\
 	__asm movlps	[eax+offset], reg2
 #define STORE2HI( offset, reg1, reg2 )		\
 	__asm movhps	reg2, [eax+offset]		\
-	__asm subps		reg2, reg1				\
+	__asm addps		reg2, reg1				\
 	__asm movhps	[eax+offset], reg2
 #define STORE4( offset, reg1, reg2 )		\
 	__asm movlps	reg2, [eax+offset]		\
 	__asm movhps	reg2, [eax+offset+8]	\
-	__asm subps		reg2, reg1				\
+	__asm addps		reg2, reg1				\
 	__asm movlps	[eax+offset], reg2		\
 	__asm movhps	[eax+offset+8], reg2
-#define STOREC		-=
+#define STOREC		+=
 
 	int numRows;
 	const float *mPtr, *vPtr;
@@ -5909,1817 +5259,698 @@ void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, co
 	vPtr = vec.ToFloatPtr();
 	dstPtr = dst.ToFloatPtr();
 	numRows = mat.GetNumRows();
-	switch( mat.GetNumColumns() ) {
-		case 1: {
-			switch( numRows ) {
-				case 1: {		// 1x1 * 1x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						mulss		xmm0, [edi]
-						STORE1( 0, xmm0, xmm1 )
-					}
-					return;
-				}
-				case 6: {		// 6x1 * 1x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm1, xmm0
-						mulps		xmm0, [edi]
-						mulps		xmm1, [edi+16]
-						STORE4( 0, xmm0, xmm2 )
-						STORE2LO( 16, xmm1, xmm2 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0];
-						mPtr++;
-					}
-					return;
-				}
+	switch ( mat.GetNumColumns() ) {
+	case 1: {
+		switch ( numRows ) {
+		case 1: {		// 1x1 * 1x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				mulss		xmm0, [edi]
+				STORE1( 0, xmm0, xmm1 )
 			}
-			break;
+			return;
 		}
-		case 2: {
-			switch( numRows ) {
-				case 2: {		// 2x2 * 2x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						movss		xmm1, [esi+4]
-						movss		xmm2, [edi]
-						mulss		xmm2, xmm0
-						movss		xmm3, [edi+4]
-						mulss		xmm3, xmm1
-						addss		xmm2, xmm3
-						STORE1( 0, xmm2, xmm4 )
-						mulss		xmm0, [edi+8]
-						mulss		xmm1, [edi+8+4]
-						addss		xmm0, xmm1
-						STORE1( 4, xmm0, xmm4 )
-					}
-					return;
-				}
-				case 6: {		// 6x2 * 2x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm7, [esi]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movaps		xmm0, [edi]
-						mulps		xmm0, xmm7
-						movaps		xmm1, [edi+16]
-						mulps		xmm1, xmm7
-						movaps		xmm2, xmm0
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						movaps		xmm3, [edi+32]
-						addps		xmm0, xmm2
-						mulps		xmm3, xmm7
-						STORE4( 0, xmm0, xmm4 )
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm1, xmm3
-						addps		xmm3, xmm1
-						STORE2LO( 16, xmm3, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
-						mPtr += 2;
-					}
-					return;
-				}
+		case 6: {		// 6x1 * 1x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm1, xmm0
+				mulps		xmm0, [edi]
+				mulps		xmm1, [edi + 16]
+				STORE4( 0, xmm0, xmm2 )
+				STORE2LO( 16, xmm1, xmm2 )
 			}
-			break;
+			return;
 		}
-		case 3: {
-			switch( numRows ) {
-				case 3: {		// 3x3 * 3x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						movss		xmm4, [edi]
-						mulss		xmm4, xmm0
-						movss		xmm1, [esi+4]
-						movss		xmm5, [edi+4]
-						mulss		xmm5, xmm1
-						addss		xmm4, xmm5
-						movss		xmm2, [esi+8]
-						movss		xmm6, [edi+8]
-						mulss		xmm6, xmm2
-						addss		xmm4, xmm6
-						movss		xmm3, [edi+12]
-						mulss		xmm3, xmm0
-						STORE1( 0, xmm4, xmm7 );
-						movss		xmm5, [edi+12+4]
-						mulss		xmm5, xmm1
-						addss		xmm3, xmm5
-						movss		xmm6, [edi+12+8]
-						mulss		xmm6, xmm2
-						addss		xmm3, xmm6
-						mulss		xmm0, [edi+24]
-						mulss		xmm1, [edi+24+4]
-						STORE1( 4, xmm3, xmm7 );
-						addss		xmm0, xmm1
-						mulss		xmm2, [edi+24+8]
-						addss		xmm0, xmm2
-						STORE1( 8, xmm0, xmm7 );
-					}
-					return;
-				}
-				case 6: {		// 6x3 * 3x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm5, [esi]
-						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movss		xmm6, [esi+4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movss		xmm7, [esi+8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm0, [edi]								// xmm0 = 0, 1, 2, 3
-						movlps		xmm1, [edi+4*4]
-						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
-						movlps		xmm2, [edi+6*4]
-						movhps		xmm2, [edi+8*4]							// xmm2 = 6, 7, 8, 9
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
-						mulps		xmm0, xmm5
-						movlps		xmm3, [edi+10*4]
-						shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
-						movaps		xmm3, xmm1
-						shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
-						mulps		xmm1, xmm6
-						shufps		xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
-						mulps		xmm3, xmm7
-						addps		xmm0, xmm1
-						addps		xmm0, xmm3
-						STORE4( 0, xmm0, xmm4 )
-						movss		xmm1, [edi+12*4]
-						mulss		xmm1, xmm5
-						movss		xmm2, [edi+13*4]
-						mulss		xmm2, xmm6
-						movss		xmm3, [edi+14*4]
-						mulss		xmm3, xmm7
-						addss		xmm1, xmm2
-						addss		xmm1, xmm3
-						STORE1( 16, xmm1, xmm4 )
-						mulss		xmm5, [edi+15*4]
-						mulss		xmm6, [edi+16*4]
-						mulss		xmm7, [edi+17*4]
-						addss		xmm5, xmm6
-						addss		xmm5, xmm7
-						STORE1( 20, xmm5, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
-						mPtr += 3;
-					}
-					return;
-				}
+		default: {
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0];
+				mPtr++;
 			}
-			break;
+			return;
 		}
-		case 4: {
-			switch( numRows ) {
-				case 4: {		// 4x4 * 4x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, qword ptr [esi ]
-						movlps		xmm0, qword ptr [edi ]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm0, qword ptr [edi+16]
-						mulps		xmm0, xmm6
-						movlps		xmm7, qword ptr [esi+ 8]
-						movlps		xmm2, qword ptr [edi+ 8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm2, qword ptr [edi+24]
-						mulps		xmm2, xmm7
-						movlps		xmm1, qword ptr [edi+32]
-						movhps		xmm1, qword ptr [edi+48]
-						mulps		xmm1, xmm6
-						movlps		xmm3, qword ptr [edi+40]
-						addps		xmm0, xmm2
-						movhps		xmm3, qword ptr [edi+56]
-						mulps		xmm3, xmm7
-						movaps		xmm4, xmm0
-						addps		xmm1, xmm3
-						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm4
-						STORE4( 0, xmm0, xmm2 )
-					}
-					return;
-				}
-				case 6: {		// 6x4 * 4x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, qword ptr [esi+ 0]
-						movlps		xmm0, qword ptr [edi+ 0]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm0, qword ptr [edi+16]
-						mulps		xmm0, xmm6
-						movlps		xmm7, qword ptr [esi+ 8]
-						movlps		xmm2, qword ptr [edi+ 8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movhps		xmm2, qword ptr [edi+24]
-						mulps		xmm2, xmm7
-						movlps		xmm1, qword ptr [edi+32]
-						movhps		xmm1, qword ptr [edi+48]
-						mulps		xmm1, xmm6
-						movlps		xmm3, qword ptr [edi+40]
-						addps		xmm0, xmm2
-						movhps		xmm3, qword ptr [edi+56]
-						mulps		xmm3, xmm7
-						movaps		xmm4, xmm0
-						addps		xmm1, xmm3
-						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm4
-						movlps		xmm1, qword ptr [edi+64]
-						movhps		xmm1, qword ptr [edi+80]
-						STORE4( 0, xmm0, xmm4 )
-						mulps		xmm1, xmm6
-						movlps		xmm2, qword ptr [edi+72]
-						movhps		xmm2, qword ptr [edi+88]
-						mulps		xmm2, xmm7
-						addps		xmm1, xmm2
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm3, xmm1
-						addps		xmm1, xmm3
-						STORE2LO( 16, xmm1, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
-						mPtr += 4;
-					}
-					return;
-				}
-			}
-			break;
 		}
-		case 5: {
-			switch( numRows ) {
-				case 5: {		// 5x5 * 5x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [edi+5*4]							// xmm0 =  5,  X,  X,  X
-						movhps		xmm0, [edi+0*4]							// xmm0 =  5,  X,  0,  1
-						movss		xmm5, [edi+15*4]						// xmm4 = 15,  X,  X,  X
-						movhps		xmm5, [edi+10*4]						// xmm5 = 15,  X, 10, 11
-						movaps		xmm1, xmm0								// xmm1 =  5,  X,  0,  1
-						shufps		xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 =  0,  5, 10, 15
-						movlps		xmm1, [edi+6*4]							// xmm1 =  6,  7,  0,  1
-						movlps		xmm5, [edi+16*4]						// xmm5 = 16, 17, 10, 11
-						movaps		xmm2, xmm1								// xmm2 =  6,  7,  0,  1
-						shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 =  1,  6, 11, 16
-						movhps		xmm2, [edi+2*4]							// xmm2 =  6,  7,  2,  3
-						movhps		xmm5, [edi+12*4]						// xmm5 = 16, 17, 12, 13
-						movaps		xmm3, xmm2								// xmm3 =  6,  7,  2,  3
-						shufps		xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 =  2,  7, 12, 17
-						movlps		xmm3, [edi+8*4]							// xmm3 =  8,  9,  2,  3
-						movlps		xmm5, [edi+18*4]						// xmm5 = 18, 19, 12, 13
-						movss		xmm4, [edi+4*4]							// xmm4 =  4,  X,  X,  X
-						movlhps		xmm4, xmm3								// xmm4 =  4,  X,  8,  9
-						shufps		xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 =  3,  8, 13, 18
-						movhps		xmm5, [edi+14*4]						// xmm6 = 18, 19, 14, 15
-						shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 =  4,  9, 14, 19
-						movss		xmm7, [esi+0*4]
-						shufps		xmm7, xmm7, 0
-						mulps		xmm0, xmm7
-						movss		xmm5, [esi+1*4]
-						shufps		xmm5, xmm5, 0
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						movss		xmm6, [esi+2*4]
-						shufps		xmm6, xmm6, 0
-						mulps		xmm2, xmm6
-						addps		xmm0, xmm2
-						movss		xmm1, [esi+3*4]
-						shufps		xmm1, xmm1, 0
-						mulps		xmm3, xmm1
-						addps		xmm0, xmm3
-						movss		xmm2, [esi+4*4]
-						shufps		xmm2, xmm2, 0
-						mulps		xmm4, xmm2
-						addps		xmm0, xmm4
-						mulss		xmm7, [edi+20*4]
-						mulss		xmm5, [edi+21*4]
-						addps		xmm7, xmm5
-						mulss		xmm6, [edi+22*4]
-						addps		xmm7, xmm6
-						mulss		xmm1, [edi+23*4]
-						addps		xmm7, xmm1
-						mulss		xmm2, [edi+24*4]
-						addps		xmm7, xmm2
-						STORE4( 0, xmm0, xmm3 )
-						STORE1( 16, xmm7, xmm4 )
-					}
-					return;
-				}
-				case 6: {		// 6x5 * 5x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, [esi]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movlps		xmm7, [esi+8]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
-						movlps		xmm0, [edi]
-						movhps		xmm3, [edi+8]
-						movaps		xmm1, [edi+16]
-						movlps		xmm2, [edi+32]
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
-						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
-						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
-						mulps		xmm0, xmm6
-						mulps		xmm3, xmm7
-						movlps		xmm2, [edi+40]
-						addps		xmm0, xmm3								// xmm0 + xmm1
-						movhps		xmm5, [edi+40+8]
-						movlps		xmm3, [edi+40+16]
-						movhps		xmm3, [edi+40+24]
-						movlps		xmm4, [edi+40+32]
-						shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
-						shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
-						shufps		xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
-						mulps		xmm2, xmm6
-						mulps		xmm5, xmm7
-						addps		xmm2, xmm5								// xmm2 + xmm3
-						movss		xmm5, [esi+16]
-						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm4, xmm0
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
-						shufps		xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
-						addps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						STORE4( 0, xmm0, xmm2 )
-						movlps		xmm4, [edi+80]
-						movhps		xmm3, [edi+80+8]
-						movaps		xmm1, [edi+80+16]
-						movlps		xmm2, [edi+80+32]
-						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
-						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
-						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
-						mulps		xmm4, xmm6
-						mulps		xmm3, xmm7
-						mulps		xmm1, xmm5
-						addps		xmm4, xmm3								// xmm4 + xmm1
-						shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
-						addps		xmm4, xmm1
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
-						addps		xmm4, xmm1
-						STORE2LO( 16, xmm4, xmm2 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
-						mPtr += 5;
-					}
-					return;
-				}
+		break;
+	}
+	case 2: {
+		switch ( numRows ) {
+		case 2: {		// 2x2 * 2x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				movss		xmm1, [esi+4]
+				movss		xmm2, [edi]
+				mulss		xmm2, xmm0
+				movss		xmm3, [edi+4]
+				mulss		xmm3, xmm1
+				addss		xmm2, xmm3
+				STORE1( 0, xmm2, xmm4 )
+				mulss		xmm0, [edi + 8]
+				mulss		xmm1, [edi + 8 + 4]
+				addss		xmm0, xmm1
+				STORE1( 4, xmm0, xmm4 )
 			}
-			break;
+			return;
 		}
-		case 6: {
-			switch( numRows ) {
-				case 1: {		// 1x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						mulss		xmm0, [edi]
-						movss		xmm1, [esi+4]
-						mulss		xmm1, [edi+4]
-						movss		xmm2, [esi+8]
-						addss		xmm0, xmm1
-						mulss		xmm2, [edi+8]
-						movss		xmm3, [esi+12]
-						addss		xmm0, xmm2
-						mulss		xmm3, [edi+12]
-						movss		xmm4, [esi+16]
-						addss		xmm0, xmm3
-						mulss		xmm4, [edi+16]
-						movss		xmm5, [esi+20]
-						addss		xmm0, xmm4
-						mulss		xmm5, [edi+20]
-						movss		xmm6, [esi+24]
-						addss		xmm0, xmm5
-						mulss		xmm6, [edi+24]
-						addss		xmm0, xmm6
-						STORE1( 0, xmm0, xmm7 )
-					}
-					return;
-				}
-				case 2: {		// 2x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm0, xmm1
-						addps		xmm0, xmm1
-						STORE2LO( 0, xmm0, xmm3 )
-					}
-					return;
-				}
-				case 3: {		// 3x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
-						movhlps		xmm0, xmm1
-						addps		xmm0, xmm1
-						STORE2LO( 0, xmm0, xmm3 )
-						// row 2
-						movaps		xmm0, [edi+48]
-						movaps		xmm1, [edi+48+16]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						movhlps		xmm1, xmm0
-						addps		xmm0, xmm1
-						movaps		xmm1, xmm0
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
-						addss		xmm0, xmm1
-						STORE1( 8, xmm0, xmm3 )
-					}
-					return;
-				}
-				case 4: {		// 4x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm7, xmm0
-						movlhps		xmm7, xmm2
-						addps		xmm7, xmm1
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm7, xmm0
-						// row 2 and 3
-						movaps		xmm0, [edi+48]
-						movaps		xmm1, [edi+48+16]
-						movaps		xmm2, [edi+48+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						// last 4 additions for the first 4 rows and store result
-						movaps		xmm0, xmm7
-						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm7
-						STORE4( 0, xmm0, xmm4 )
-					}
-					return;
-				}
-				case 5: {		// 5x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						// load idVecX
-						movlps		xmm4, [esi]
-						movhps		xmm4, [esi+8]
-						movlps		xmm5, [esi+16]
-						movlhps		xmm5, xmm4
-						movhlps		xmm6, xmm4
-						movlhps		xmm6, xmm5
-						// row 0 and 1
-						movaps		xmm0, [edi]
-						movaps		xmm1, [edi+16]
-						movaps		xmm2, [edi+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm7, xmm0
-						movlhps		xmm7, xmm2
-						addps		xmm7, xmm1
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm7, xmm0
-						// row 2 and 3
-						movaps		xmm0, [edi+48]
-						movaps		xmm1, [edi+48+16]
-						movaps		xmm2, [edi+48+32]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						mulps		xmm2, xmm6
-						movhlps		xmm3, xmm0
-						movlhps		xmm3, xmm2
-						addps		xmm1, xmm3
-						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
-						addps		xmm1, xmm0
-						// last 4 additions for the first 4 rows and store result
-						movaps		xmm0, xmm7
-						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
-						addps		xmm0, xmm7
-						STORE4( 0, xmm0, xmm3 )
-						// row 5
-						movaps		xmm0, [edi+96]
-						movaps		xmm1, [edi+96+16]
-						mulps		xmm0, xmm4
-						mulps		xmm1, xmm5
-						addps		xmm0, xmm1
-						movhlps		xmm1, xmm0
-						addps		xmm0, xmm1
-						movaps		xmm1, xmm0
-						shufps		xmm1, xmm1, 0x01
-						addss		xmm0, xmm1
-						STORE1( 16, xmm0, xmm3 )
-					}
-					return;
-				}
-				case 6: {		// 6x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm7, qword ptr [esi]
-						movlps		xmm6, qword ptr [esi+8]
-						shufps		xmm7, xmm7, 0x44
-						shufps		xmm6, xmm6, 0x44
-						movlps		xmm0, qword ptr [edi    ]
-						movhps		xmm0, qword ptr [edi+ 24]
-						mulps		xmm0, xmm7
-						movlps		xmm3, qword ptr [edi+  8]
-						movhps		xmm3, qword ptr [edi+ 32]
-						mulps		xmm3, xmm6
-						movlps		xmm1, qword ptr [edi+ 48]
-						movhps		xmm1, qword ptr [edi+ 72]
-						mulps		xmm1, xmm7
-						movlps		xmm2, qword ptr [edi+ 96]
-						movhps		xmm2, qword ptr [edi+120]
-						mulps		xmm2, xmm7
-						movlps		xmm4, qword ptr [edi+ 56]
-						movhps		xmm4, qword ptr [edi+ 80]
-						movlps		xmm5, qword ptr [edi+104]
-						movhps		xmm5, qword ptr [edi+128]
-						mulps		xmm4, xmm6
-						movlps		xmm7, qword ptr [esi+16]
-						addps		xmm0, xmm3
-						shufps		xmm7, xmm7, 0x44
-						mulps		xmm5, xmm6
-						addps		xmm1, xmm4
-						movlps		xmm3, qword ptr [edi+ 16]
-						movhps		xmm3, qword ptr [edi+ 40]
-						addps		xmm2, xmm5
-						movlps		xmm4, qword ptr [edi+ 64]
-						movhps		xmm4, qword ptr [edi+ 88]
-						mulps		xmm3, xmm7
-						movlps		xmm5, qword ptr [edi+112]
-						movhps		xmm5, qword ptr [edi+136]
-						addps		xmm0, xmm3
-						mulps		xmm4, xmm7
-						mulps		xmm5, xmm7
-						addps		xmm1, xmm4
-						addps		xmm2, xmm5
-						movaps		xmm6, xmm0
-						shufps		xmm0, xmm1, 0x88
-						shufps		xmm6, xmm1, 0xDD
-						movaps		xmm7, xmm2
-						shufps		xmm7, xmm2, 0x88
-						shufps		xmm2, xmm2, 0xDD
-						addps		xmm0, xmm6
-						addps		xmm2, xmm7
-						STORE4( 0, xmm0, xmm3 )
-						STORE2LO( 16, xmm2, xmm4 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numRows; i++ ) {
-						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
-									mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
-						mPtr += 6;
-					}
-					return;
-				}
+		case 6: {		// 6x2 * 2x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm7, [esi]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movaps		xmm0, [edi]
+				mulps		xmm0, xmm7
+				movaps		xmm1, [edi + 16]
+				mulps		xmm1, xmm7
+				movaps		xmm2, xmm0
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				movaps		xmm3, [edi + 32]
+				addps		xmm0, xmm2
+				mulps		xmm3, xmm7
+				STORE4( 0, xmm0, xmm4 )
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm1, xmm3
+				addps		xmm3, xmm1
+				STORE2LO( 16, xmm3, xmm4 )
 			}
-			break;
+			return;
 		}
 		default: {
-			int numColumns = mat.GetNumColumns();
 			for ( int i = 0; i < numRows; i++ ) {
-				float sum = mPtr[0] * vPtr[0];
-				for ( int j = 1; j < numColumns; j++ ) {
-					sum += mPtr[j] * vPtr[j];
-				}
-				dstPtr[i] STOREC sum;
-				mPtr += numColumns;
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
+				mPtr += 2;
 			}
-			break;
+			return;
+		}
 		}
+		break;
 	}
-
-#undef STOREC
-#undef STORE4
-#undef STORE2HI
-#undef STORE2LO
-#undef STORE1
-}
-
-/*
-============
-idSIMD_SSE::MatX_TransposeMultiplyVecX
-
-	optimizes the following matrix multiplications:
-
-	Nx6 * Nx1
-	6xN * 6x1
-
-	with N in the range [1-6]
-============
-*/
-void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
-#define STORE1( offset, reg1, reg2 )		\
-	__asm movss		[eax+offset], reg1
-#define STORE2LO( offset, reg1, reg2 )		\
-	__asm movlps	[eax+offset], reg1
-#define STORE2HI( offset, reg1, reg2 )		\
-	__asm movhps	[eax+offset], reg1
-#define STORE4( offset, reg1, reg2 )		\
-	__asm movlps	[eax+offset], reg1		\
-	__asm movhps	[eax+offset+8], reg1
-#define STOREC		=
-
-	int numColumns;
-	const float *mPtr, *vPtr;
-	float *dstPtr;
-
-	assert( vec.GetSize() >= mat.GetNumRows() );
-	assert( dst.GetSize() >= mat.GetNumColumns() );
-
-	mPtr = mat.ToFloatPtr();
-	vPtr = vec.ToFloatPtr();
-	dstPtr = dst.ToFloatPtr();
-	numColumns = mat.GetNumColumns();
-	switch( mat.GetNumRows() ) {
-		case 1:
-			switch( numColumns ) {
-				case 6: {		// 1x6 * 1x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm1, xmm0
-						mulps		xmm0, [edi]
-						mulps		xmm1, [edi+16]
-						STORE4( 0, xmm0, xmm2 )
-						STORE2LO( 16, xmm1, xmm3 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0];
-						mPtr++;
-					}
-					return;
-				}
+	case 3: {
+		switch ( numRows ) {
+		case 3: {		// 3x3 * 3x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				movss		xmm4, [edi]
+				mulss		xmm4, xmm0
+				movss		xmm1, [esi+4]
+				movss		xmm5, [edi+4]
+				mulss		xmm5, xmm1
+				addss		xmm4, xmm5
+				movss		xmm2, [esi+8]
+				movss		xmm6, [edi+8]
+				mulss		xmm6, xmm2
+				addss		xmm4, xmm6
+				movss		xmm3, [edi+12]
+				mulss		xmm3, xmm0
+				STORE1( 0, xmm4, xmm7 );
+				movss		xmm5, [edi + 12 + 4]
+				mulss		xmm5, xmm1
+				addss		xmm3, xmm5
+				movss		xmm6, [edi + 12 + 8]
+				mulss		xmm6, xmm2
+				addss		xmm3, xmm6
+				mulss		xmm0, [edi + 24]
+				mulss		xmm1, [edi + 24 + 4]
+				STORE1( 4, xmm3, xmm7 );
+				addss		xmm0, xmm1
+				mulss		xmm2, [edi + 24 + 8]
+				addss		xmm0, xmm2
+				STORE1( 8, xmm0, xmm7 );
 			}
-			break;
-		case 2:
-			switch( numColumns ) {
-				case 6: {		// 2x6 * 2x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi]
-						movaps		xmm1, xmm0
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
-						movaps		xmm2, [edi]
-						mulps		xmm2, xmm0
-						movlps		xmm3, [edi+24]
-						movhps		xmm3, [edi+32]
-						mulps		xmm3, xmm1
-						addps		xmm2, xmm3
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movlps		xmm4, [edi+16]
-						movhps		xmm4, [edi+40]
-						mulps		xmm4, xmm0
-						movhlps		xmm3, xmm4
-						addps		xmm3, xmm4
-						STORE4( 0, xmm2, xmm5 )
-						STORE2LO( 16, xmm3, xmm6 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		case 6: {		// 6x3 * 3x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm5, [esi]
+				shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movss		xmm6, [esi + 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movss		xmm7, [esi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm0, [edi]								// xmm0 = 0, 1, 2, 3
+				movlps		xmm1, [edi + 4 * 4]
+				shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
+				movlps		xmm2, [edi + 6 * 4]
+				movhps		xmm2, [edi + 8 * 4]							// xmm2 = 6, 7, 8, 9
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
+				mulps		xmm0, xmm5
+				movlps		xmm3, [edi + 10 * 4]
+				shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
+				movaps		xmm3, xmm1
+				shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
+				mulps		xmm1, xmm6
+				shufps		xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
+				mulps		xmm3, xmm7
+				addps		xmm0, xmm1
+				addps		xmm0, xmm3
+				STORE4( 0, xmm0, xmm4 )
+				movss		xmm1, [edi + 12 * 4]
+				mulss		xmm1, xmm5
+				movss		xmm2, [edi + 13 * 4]
+				mulss		xmm2, xmm6
+				movss		xmm3, [edi + 14 * 4]
+				mulss		xmm3, xmm7
+				addss		xmm1, xmm2
+				addss		xmm1, xmm3
+				STORE1( 16, xmm1, xmm4 )
+				mulss		xmm5, [edi + 15 * 4]
+				mulss		xmm6, [edi + 16 * 4]
+				mulss		xmm7, [edi + 17 * 4]
+				addss		xmm5, xmm6
+				addss		xmm5, xmm7
+				STORE1( 20, xmm5, xmm4 )
 			}
-			break;
-		case 3:
-			switch( numColumns ) {
-				case 6: {		// 3x6 * 3x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movss		xmm1, [esi+2*4]
-						movlps		xmm3, [edi+(0*6+0)*4]
-						movhps		xmm3, [edi+(0*6+2)*4]
-						movaps		xmm4, xmm0
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, xmm4
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(2*6+0)*4]
-						movhps		xmm4, [edi+(2*6+2)*4]
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(2*6+4)*4]
-						mulps		xmm5, xmm1
-						addps		xmm3, xmm5
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
+				mPtr += 3;
 			}
-			break;
-		case 4:
-			switch( numColumns ) {
-				case 6: {		// 4x6 * 4x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movlps		xmm1, [esi+2*4]
-						movaps		xmm3, xmm0
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, [edi+(0*6+0)*4]
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(2*6+0)*4]
-						movhps		xmm4, [edi+(2*6+2)*4]
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm6
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(3*6+0)*4]
-						movhps		xmm5, [edi+(3*6+2)*4]
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movlps		xmm4, [edi+(2*6+4)*4]
-						movhps		xmm4, [edi+(3*6+4)*4]
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
-								*(mPtr+3*numColumns) * vPtr[3];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		}
+		break;
+	}
+	case 4: {
+		switch ( numRows ) {
+		case 4: {		// 4x4 * 4x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, qword ptr [esi ]
+				movlps		xmm0, qword ptr [edi ]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm0, qword ptr [edi + 16]
+				mulps		xmm0, xmm6
+				movlps		xmm7, qword ptr [esi + 8]
+				movlps		xmm2, qword ptr [edi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm2, qword ptr [edi + 24]
+				mulps		xmm2, xmm7
+				movlps		xmm1, qword ptr [edi + 32]
+				movhps		xmm1, qword ptr [edi + 48]
+				mulps		xmm1, xmm6
+				movlps		xmm3, qword ptr [edi + 40]
+				addps		xmm0, xmm2
+				movhps		xmm3, qword ptr [edi + 56]
+				mulps		xmm3, xmm7
+				movaps		xmm4, xmm0
+				addps		xmm1, xmm3
+				shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm4
+				STORE4( 0, xmm0, xmm2 )
 			}
-			break;
-		case 5:
-			switch( numColumns ) {
-				case 6: {		// 5x6 * 5x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movlps		xmm1, [esi+2*4]
-						movss		xmm2, [esi+4*4]
-						movaps		xmm3, xmm0
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, [edi+(0*6+0)*4]
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, [edi+(2*6+0)*4]
-						addps		xmm3, xmm6
-						movlps		xmm5, [edi+(3*6+0)*4]
-						movhps		xmm5, [edi+(3*6+2)*4]
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm4, xmm2
-						mulps		xmm4, [edi+(4*6+0)*4]
-						addps		xmm3, xmm4
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movlps		xmm4, [edi+(2*6+4)*4]
-						movhps		xmm4, [edi+(3*6+4)*4]
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(4*6+4)*4]
-						mulps		xmm5, xmm2
-						addps		xmm3, xmm5
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
-								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		case 6: {		// 6x4 * 4x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, qword ptr [esi+ 0]
+				movlps		xmm0, qword ptr [edi+ 0]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm0, qword ptr [edi + 16]
+				mulps		xmm0, xmm6
+				movlps		xmm7, qword ptr [esi + 8]
+				movlps		xmm2, qword ptr [edi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm2, qword ptr [edi + 24]
+				mulps		xmm2, xmm7
+				movlps		xmm1, qword ptr [edi + 32]
+				movhps		xmm1, qword ptr [edi + 48]
+				mulps		xmm1, xmm6
+				movlps		xmm3, qword ptr [edi + 40]
+				addps		xmm0, xmm2
+				movhps		xmm3, qword ptr [edi + 56]
+				mulps		xmm3, xmm7
+				movaps		xmm4, xmm0
+				addps		xmm1, xmm3
+				shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm4
+				movlps		xmm1, qword ptr [edi + 64]
+				movhps		xmm1, qword ptr [edi + 80]
+				STORE4( 0, xmm0, xmm4 )
+				mulps		xmm1, xmm6
+				movlps		xmm2, qword ptr [edi + 72]
+				movhps		xmm2, qword ptr [edi + 88]
+				mulps		xmm2, xmm7
+				addps		xmm1, xmm2
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm3, xmm1
+				addps		xmm1, xmm3
+				STORE2LO( 16, xmm1, xmm4 )
 			}
-			break;
-		case 6:
-			switch( numColumns ) {
-				case 1: {		// 6x1 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi]
-						movhps		xmm0, [esi+8]
-						movlps		xmm1, [esi+16]
-						mulps		xmm0, [edi]
-						mulps		xmm1, [edi+16]
-						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
-						addps		xmm0, xmm1
-						movhlps		xmm2, xmm0
-						addss		xmm2, xmm0
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
-						addss		xmm2, xmm0
-						STORE1( 0, xmm2, xmm3 )
-					}
-					return;
-				}
-				case 2: {		// 6x2 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movaps		xmm6, [edi+0*4]
-						mulps		xmm6, xmm0
-						movlps		xmm1, [esi+2*4]
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movaps		xmm7, [edi+4*4]
-						mulps		xmm7, xmm1
-						addps		xmm6, xmm7
-						movlps		xmm2, [esi+4*4]
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movaps		xmm7, [edi+8*4]
-						mulps		xmm7, xmm2
-						addps		xmm6, xmm7
-						movhlps		xmm3, xmm6
-						addps		xmm3, xmm6
-						STORE2LO( 0, xmm3, xmm7 )
-					}
-					return;
-				}
-				case 3: {		// 6x3 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [edi+(0*3+2)*4]
-						movhps		xmm0, [edi+(0*3+0)*4]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
-						movss		xmm6, [esi+0*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, xmm0
-						movss		xmm1, [edi+(1*3+0)*4]
-						movhps		xmm1, [edi+(1*3+1)*4]
-						movss		xmm7, [esi+1*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm1
-						addps		xmm6, xmm7
-						movss		xmm2, [edi+(2*3+2)*4]
-						movhps		xmm2, [edi+(2*3+0)*4]
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
-						movss		xmm7, [esi+2*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm2
-						addps		xmm6, xmm7
-						movss		xmm3, [edi+(3*3+0)*4]
-						movhps		xmm3, [edi+(3*3+1)*4]
-						movss		xmm7, [esi+3*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm3
-						addps		xmm6, xmm7
-						movss		xmm4, [edi+(4*3+2)*4]
-						movhps		xmm4, [edi+(4*3+0)*4]
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
-						movss		xmm7, [esi+4*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm4
-						addps		xmm6, xmm7
-						movss		xmm5, [edi+(5*3+0)*4]
-						movhps		xmm5, [edi+(5*3+1)*4]
-						movss		xmm7, [esi+5*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm5
-						addps		xmm6, xmm7
-						STORE1( 0, xmm6, xmm7 )
-						STORE2HI( 4, xmm6, xmm7 )
-					}
-					return;
-				}
-				case 4: {		// 6x4 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm3, [edi+(0*4+0)*4]
-						movhps		xmm3, [edi+(0*4+2)*4]
-						movss		xmm4, [esi+0*4]
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, xmm4
-						movlps		xmm5, [edi+(1*4+0)*4]
-						movhps		xmm5, [edi+(1*4+2)*4]
-						movss		xmm6, [esi+1*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(2*4+0)*4]
-						movhps		xmm4, [edi+(2*4+2)*4]
-						movss		xmm6, [esi+2*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm6
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(3*4+0)*4]
-						movhps		xmm5, [edi+(3*4+2)*4]
-						movss		xmm6, [esi+3*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(4*4+0)*4]
-						movhps		xmm4, [edi+(4*4+2)*4]
-						movss		xmm6, [esi+4*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm6
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(5*4+0)*4]
-						movhps		xmm5, [edi+(5*4+2)*4]
-						movss		xmm6, [esi+5*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						STORE4( 0, xmm3, xmm7 )
-					}
-					return;
-				}
-				case 5: {		// 6x5 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, [edi+(0*5+0)*4]
-						movhps		xmm6, [edi+(0*5+2)*4]
-						movss		xmm0, [esi+0*4]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, xmm0
-						movlps		xmm7, [edi+(1*5+0)*4]
-						movhps		xmm7, [edi+(1*5+2)*4]
-						movss		xmm1, [esi+1*4]
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm1
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(2*5+0)*4]
-						movhps		xmm7, [edi+(2*5+2)*4]
-						movss		xmm2, [esi+2*4]
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm2
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(3*5+0)*4]
-						movhps		xmm7, [edi+(3*5+2)*4]
-						movss		xmm3, [esi+3*4]
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm3
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(4*5+0)*4]
-						movhps		xmm7, [edi+(4*5+2)*4]
-						movss		xmm4, [esi+4*4]
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm4
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(5*5+0)*4]
-						movhps		xmm7, [edi+(5*5+2)*4]
-						movss		xmm5, [esi+5*4]
-						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm5
-						addps		xmm6, xmm7
-						STORE4( 0, xmm6, xmm7 )
-						movss		xmm6, [edi+(0*5+4)*4]
-						mulss		xmm6, xmm0
-						movss		xmm7, [edi+(1*5+4)*4]
-						mulss		xmm7, xmm1
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(2*5+4)*4]
-						mulss		xmm7, xmm2
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(3*5+4)*4]
-						mulss		xmm7, xmm3
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(4*5+4)*4]
-						mulss		xmm7, xmm4
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(5*5+4)*4]
-						mulss		xmm7, xmm5
-						addss		xmm6, xmm7
-						STORE1( 16, xmm6, xmm7 )
-					}
-					return;
-				}
-				case 6: {		// 6x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movlps		xmm1, [esi+2*4]
-						movlps		xmm2, [esi+4*4]
-						movaps		xmm3, xmm0
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, [edi+(0*6+0)*4]
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, [edi+(2*6+0)*4]
-						addps		xmm3, xmm6
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						movlps		xmm5, [edi+(3*6+0)*4]
-						movhps		xmm5, [edi+(3*6+2)*4]
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movaps		xmm6, xmm2
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, [edi+(4*6+0)*4]
-						addps		xmm3, xmm6
-						movaps		xmm6, xmm2
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						movlps		xmm5, [edi+(5*6+0)*4]
-						movhps		xmm5, [edi+(5*6+2)*4]
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movlps		xmm4, [edi+(2*6+4)*4]
-						movhps		xmm4, [edi+(3*6+4)*4]
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(4*6+4)*4]
-						movhps		xmm5, [edi+(5*6+4)*4]
-						mulps		xmm5, xmm2
-						addps		xmm3, xmm5
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
-								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
+				mPtr += 4;
 			}
-			break;
-		default:
-			int numRows = mat.GetNumRows();
-			for ( int i = 0; i < numColumns; i++ ) {
-				mPtr = mat.ToFloatPtr() + i;
-				float sum = mPtr[0] * vPtr[0];
-				for ( int j = 1; j < numRows; j++ ) {
-					mPtr += numColumns;
-					sum += mPtr[0] * vPtr[j];
-				}
-				dstPtr[i] STOREC sum;
+			return;
+		}
+		}
+		break;
+	}
+	case 5: {
+		switch ( numRows ) {
+		case 5: {		// 5x5 * 5x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [edi+5*4]							// xmm0 =  5,  X,  X,  X
+				movhps		xmm0, [edi+0*4]							// xmm0 =  5,  X,  0,  1
+				movss		xmm5, [edi+15*4]						// xmm5 = 15,  X,  X,  X
+				movhps		xmm5, [edi+10*4]						// xmm5 = 15,  X, 10, 11
+				movaps		xmm1, xmm0								// xmm1 =  5,  X,  0,  1
+				shufps		xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 =  0,  5, 10, 15
+				movlps		xmm1, [edi + 6 * 4]							// xmm1 =  6,  7,  0,  1
+				movlps		xmm5, [edi + 16 * 4]						// xmm5 = 16, 17, 10, 11
+				movaps		xmm2, xmm1								// xmm2 =  6,  7,  0,  1
+				shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 =  1,  6, 11, 16
+				movhps		xmm2, [edi + 2 * 4]							// xmm2 =  6,  7,  2,  3
+				movhps		xmm5, [edi + 12 * 4]						// xmm5 = 16, 17, 12, 13
+				movaps		xmm3, xmm2								// xmm3 =  6,  7,  2,  3
+				shufps		xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 =  2,  7, 12, 17
+				movlps		xmm3, [edi + 8 * 4]							// xmm3 =  8,  9,  2,  3
+				movlps		xmm5, [edi + 18 * 4]						// xmm5 = 18, 19, 12, 13
+				movss		xmm4, [edi + 4 * 4]							// xmm4 =  4,  X,  X,  X
+				movlhps		xmm4, xmm3								// xmm4 =  4,  X,  8,  9
+				shufps		xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 =  3,  8, 13, 18
+				movhps		xmm5, [edi + 14 * 4]						// xmm5 = 18, 19, 14, 15
+				shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 =  4,  9, 14, 19
+				movss		xmm7, [esi + 0 * 4]
+				shufps		xmm7, xmm7, 0
+				mulps		xmm0, xmm7
+				movss		xmm5, [esi + 1 * 4]
+				shufps		xmm5, xmm5, 0
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				movss		xmm6, [esi + 2 * 4]
+				shufps		xmm6, xmm6, 0
+				mulps		xmm2, xmm6
+				addps		xmm0, xmm2
+				movss		xmm1, [esi + 3 * 4]
+				shufps		xmm1, xmm1, 0
+				mulps		xmm3, xmm1
+				addps		xmm0, xmm3
+				movss		xmm2, [esi + 4 * 4]
+				shufps		xmm2, xmm2, 0
+				mulps		xmm4, xmm2
+				addps		xmm0, xmm4
+				mulss		xmm7, [edi + 20 * 4]
+				mulss		xmm5, [edi + 21 * 4]
+				addps		xmm7, xmm5
+				mulss		xmm6, [edi + 22 * 4]
+				addps		xmm7, xmm6
+				mulss		xmm1, [edi + 23 * 4]
+				addps		xmm7, xmm1
+				mulss		xmm2, [edi + 24 * 4]
+				addps		xmm7, xmm2
+				STORE4( 0, xmm0, xmm3 )
+				STORE1( 16, xmm7, xmm4 )
 			}
-			break;
+			return;
+		}
+		case 6: {		// 6x5 * 5x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, [esi]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movlps		xmm7, [esi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movlps		xmm0, [edi]
+				movhps		xmm3, [edi + 8]
+				movaps		xmm1, [edi + 16]
+				movlps		xmm2, [edi + 32]
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
+				shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
+				shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
+				mulps		xmm0, xmm6
+				mulps		xmm3, xmm7
+				movlps		xmm2, [edi + 40]
+				addps		xmm0, xmm3								// xmm0 + xmm1
+				movhps		xmm5, [edi + 40 + 8]
+				movlps		xmm3, [edi + 40 + 16]
+				movhps		xmm3, [edi + 40 + 24]
+				movlps		xmm4, [edi + 40 + 32]
+				shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
+				shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
+				shufps		xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
+				mulps		xmm2, xmm6
+				mulps		xmm5, xmm7
+				addps		xmm2, xmm5								// xmm2 + xmm3
+				movss		xmm5, [esi + 16]
+				shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm4, xmm0
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
+				shufps		xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
+				addps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				STORE4( 0, xmm0, xmm2 )
+				movlps		xmm4, [edi + 80]
+				movhps		xmm3, [edi + 80 + 8]
+				movaps		xmm1, [edi + 80 + 16]
+				movlps		xmm2, [edi + 80 + 32]
+				shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
+				shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
+				shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
+				mulps		xmm4, xmm6
+				mulps		xmm3, xmm7
+				mulps		xmm1, xmm5
+				addps		xmm4, xmm3								// xmm4 + xmm1
+				shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
+				addps		xmm4, xmm1
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
+				addps		xmm4, xmm1
+				STORE2LO( 16, xmm4, xmm2 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
+				mPtr += 5;
+			}
+			return;
+		}
+		}
+		break;
 	}
-
-#undef STOREC
-#undef STORE4
-#undef STORE2HI
-#undef STORE2LO
-#undef STORE1
-}
-
-/*
-============
-idSIMD_SSE::MatX_TransposeMultiplyAddVecX
-
-	optimizes the following matrix multiplications:
-
-	Nx6 * Nx1
-	6xN * 6x1
-
-	with N in the range [1-6]
-============
-*/
-void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
-#define STORE1( offset, reg1, reg2 )		\
-	__asm movss		reg2, [eax+offset]		\
-	__asm addss		reg2, reg1				\
-	__asm movss		[eax+offset], reg2
-#define STORE2LO( offset, reg1, reg2 )		\
-	__asm movlps	reg2, [eax+offset]		\
-	__asm addps		reg2, reg1				\
-	__asm movlps	[eax+offset], reg2
-#define STORE2HI( offset, reg1, reg2 )		\
-	__asm movhps	reg2, [eax+offset]		\
-	__asm addps		reg2, reg1				\
-	__asm movhps	[eax+offset], reg2
-#define STORE4( offset, reg1, reg2 )		\
-	__asm movlps	reg2, [eax+offset]		\
-	__asm movhps	reg2, [eax+offset+8]	\
-	__asm addps		reg2, reg1				\
-	__asm movlps	[eax+offset], reg2		\
-	__asm movhps	[eax+offset+8], reg2
-#define STOREC		+=
-
-	int numColumns;
-	const float *mPtr, *vPtr;
-	float *dstPtr;
-
-	assert( vec.GetSize() >= mat.GetNumRows() );
-	assert( dst.GetSize() >= mat.GetNumColumns() );
-
-	mPtr = mat.ToFloatPtr();
-	vPtr = vec.ToFloatPtr();
-	dstPtr = dst.ToFloatPtr();
-	numColumns = mat.GetNumColumns();
-	switch( mat.GetNumRows() ) {
-		case 1:
-			switch( numColumns ) {
-				case 6: {		// 1x6 * 1x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm1, xmm0
-						mulps		xmm0, [edi]
-						mulps		xmm1, [edi+16]
-						STORE4( 0, xmm0, xmm2 )
-						STORE2LO( 16, xmm1, xmm3 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0];
-						mPtr++;
-					}
-					return;
-				}
+	case 6: {
+		switch ( numRows ) {
+		case 1: {		// 1x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				mulss		xmm0, [edi]
+				movss		xmm1, [esi+4]
+				mulss		xmm1, [edi+4]
+				movss		xmm2, [esi+8]
+				addss		xmm0, xmm1
+				mulss		xmm2, [edi+8]
+				movss		xmm3, [esi+12]
+				addss		xmm0, xmm2
+				mulss		xmm3, [edi+12]
+				movss		xmm4, [esi+16]
+				addss		xmm0, xmm3
+				mulss		xmm4, [edi+16]
+				movss		xmm5, [esi+20]
+				addss		xmm0, xmm4
+				mulss		xmm5, [edi+20]
+				movss		xmm6, [esi+24]
+				addss		xmm0, xmm5
+				mulss		xmm6, [edi+24]
+				addss		xmm0, xmm6
+				STORE1( 0, xmm0, xmm7 )
 			}
-			break;
-		case 2:
-			switch( numColumns ) {
-				case 6: {		// 2x6 * 2x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi]
-						movaps		xmm1, xmm0
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
-						movaps		xmm2, [edi]
-						mulps		xmm2, xmm0
-						movlps		xmm3, [edi+24]
-						movhps		xmm3, [edi+32]
-						mulps		xmm3, xmm1
-						addps		xmm2, xmm3
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movlps		xmm4, [edi+16]
-						movhps		xmm4, [edi+40]
-						mulps		xmm4, xmm0
-						movhlps		xmm3, xmm4
-						addps		xmm3, xmm4
-						STORE4( 0, xmm2, xmm5 )
-						STORE2LO( 16, xmm3, xmm6 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		case 2: {		// 2x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm0, xmm1
+				addps		xmm0, xmm1
+				STORE2LO( 0, xmm0, xmm3 )
 			}
-			break;
-		case 3:
-			switch( numColumns ) {
-				case 6: {		// 3x6 * 3x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movss		xmm1, [esi+2*4]
-						movlps		xmm3, [edi+(0*6+0)*4]
-						movhps		xmm3, [edi+(0*6+2)*4]
-						movaps		xmm4, xmm0
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, xmm4
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(2*6+0)*4]
-						movhps		xmm4, [edi+(2*6+2)*4]
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(2*6+4)*4]
-						mulps		xmm5, xmm1
-						addps		xmm3, xmm5
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		case 3: {		// 3x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm0, xmm1
+				addps		xmm0, xmm1
+				STORE2LO( 0, xmm0, xmm3 )
+				// row 2
+				movaps		xmm0, [edi + 48]
+				movaps		xmm1, [edi + 48 + 16]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				movhlps		xmm1, xmm0
+				addps		xmm0, xmm1
+				movaps		xmm1, xmm0
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
+				addss		xmm0, xmm1
+				STORE1( 8, xmm0, xmm3 )
 			}
-			break;
-		case 4:
-			switch( numColumns ) {
-				case 6: {		// 4x6 * 4x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movlps		xmm1, [esi+2*4]
-						movaps		xmm3, xmm0
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, [edi+(0*6+0)*4]
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(2*6+0)*4]
-						movhps		xmm4, [edi+(2*6+2)*4]
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm6
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(3*6+0)*4]
-						movhps		xmm5, [edi+(3*6+2)*4]
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movlps		xmm4, [edi+(2*6+4)*4]
-						movhps		xmm4, [edi+(3*6+4)*4]
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
-								*(mPtr+3*numColumns) * vPtr[3];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		case 4: {		// 4x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm7, xmm0
+				movlhps		xmm7, xmm2
+				addps		xmm7, xmm1
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm7, xmm0
+				// row 2 and 3
+				movaps		xmm0, [edi + 48]
+				movaps		xmm1, [edi + 48 + 16]
+				movaps		xmm2, [edi + 48 + 32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				// last 4 additions for the first 4 rows and store result
+				movaps		xmm0, xmm7
+				shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm7
+				STORE4( 0, xmm0, xmm4 )
 			}
-			break;
-		case 5:
-			switch( numColumns ) {
-				case 6: {		// 5x6 * 5x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movlps		xmm1, [esi+2*4]
-						movss		xmm2, [esi+4*4]
-						movaps		xmm3, xmm0
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, [edi+(0*6+0)*4]
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, [edi+(2*6+0)*4]
-						addps		xmm3, xmm6
-						movlps		xmm5, [edi+(3*6+0)*4]
-						movhps		xmm5, [edi+(3*6+2)*4]
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm4, xmm2
-						mulps		xmm4, [edi+(4*6+0)*4]
-						addps		xmm3, xmm4
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movlps		xmm4, [edi+(2*6+4)*4]
-						movhps		xmm4, [edi+(3*6+4)*4]
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(4*6+4)*4]
-						mulps		xmm5, xmm2
-						addps		xmm3, xmm5
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
-								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		case 5: {		// 5x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm7, xmm0
+				movlhps		xmm7, xmm2
+				addps		xmm7, xmm1
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm7, xmm0
+				// row 2 and 3
+				movaps		xmm0, [edi + 48]
+				movaps		xmm1, [edi + 48 + 16]
+				movaps		xmm2, [edi + 48 + 32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				// last 4 additions for the first 4 rows and store result
+				movaps		xmm0, xmm7
+				shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm7
+				STORE4( 0, xmm0, xmm3 )
+				// row 4
+				movaps		xmm0, [edi + 96]
+				movaps		xmm1, [edi + 96 + 16]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				movhlps		xmm1, xmm0
+				addps		xmm0, xmm1
+				movaps		xmm1, xmm0
+				shufps		xmm1, xmm1, 0x01
+				addss		xmm0, xmm1
+				STORE1( 16, xmm0, xmm3 )
 			}
-			break;
-		case 6:
-			switch( numColumns ) {
-				case 1: {		// 6x1 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi]
-						movhps		xmm0, [esi+8]
-						movlps		xmm1, [esi+16]
-						mulps		xmm0, [edi]
-						mulps		xmm1, [edi+16]
-						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
-						addps		xmm0, xmm1
-						movhlps		xmm2, xmm0
-						addss		xmm2, xmm0
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
-						addss		xmm2, xmm0
-						STORE1( 0, xmm2, xmm3 )
-					}
-					return;
-				}
-				case 2: {		// 6x2 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movaps		xmm6, [edi+0*4]
-						mulps		xmm6, xmm0
-						movlps		xmm1, [esi+2*4]
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movaps		xmm7, [edi+4*4]
-						mulps		xmm7, xmm1
-						addps		xmm6, xmm7
-						movlps		xmm2, [esi+4*4]
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movaps		xmm7, [edi+8*4]
-						mulps		xmm7, xmm2
-						addps		xmm6, xmm7
-						movhlps		xmm3, xmm6
-						addps		xmm3, xmm6
-						STORE2LO( 0, xmm3, xmm7 )
-					}
-					return;
-				}
-				case 3: {		// 6x3 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [edi+(0*3+2)*4]
-						movhps		xmm0, [edi+(0*3+0)*4]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
-						movss		xmm6, [esi+0*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, xmm0
-						movss		xmm1, [edi+(1*3+0)*4]
-						movhps		xmm1, [edi+(1*3+1)*4]
-						movss		xmm7, [esi+1*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm1
-						addps		xmm6, xmm7
-						movss		xmm2, [edi+(2*3+2)*4]
-						movhps		xmm2, [edi+(2*3+0)*4]
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
-						movss		xmm7, [esi+2*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm2
-						addps		xmm6, xmm7
-						movss		xmm3, [edi+(3*3+0)*4]
-						movhps		xmm3, [edi+(3*3+1)*4]
-						movss		xmm7, [esi+3*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm3
-						addps		xmm6, xmm7
-						movss		xmm4, [edi+(4*3+2)*4]
-						movhps		xmm4, [edi+(4*3+0)*4]
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
-						movss		xmm7, [esi+4*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm4
-						addps		xmm6, xmm7
-						movss		xmm5, [edi+(5*3+0)*4]
-						movhps		xmm5, [edi+(5*3+1)*4]
-						movss		xmm7, [esi+5*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm5
-						addps		xmm6, xmm7
-						STORE1( 0, xmm6, xmm7 )
-						STORE2HI( 4, xmm6, xmm7 )
-					}
-					return;
-				}
-				case 4: {		// 6x4 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm3, [edi+(0*4+0)*4]
-						movhps		xmm3, [edi+(0*4+2)*4]
-						movss		xmm4, [esi+0*4]
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, xmm4
-						movlps		xmm5, [edi+(1*4+0)*4]
-						movhps		xmm5, [edi+(1*4+2)*4]
-						movss		xmm6, [esi+1*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(2*4+0)*4]
-						movhps		xmm4, [edi+(2*4+2)*4]
-						movss		xmm6, [esi+2*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm6
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(3*4+0)*4]
-						movhps		xmm5, [edi+(3*4+2)*4]
-						movss		xmm6, [esi+3*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(4*4+0)*4]
-						movhps		xmm4, [edi+(4*4+2)*4]
-						movss		xmm6, [esi+4*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm6
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(5*4+0)*4]
-						movhps		xmm5, [edi+(5*4+2)*4]
-						movss		xmm6, [esi+5*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						STORE4( 0, xmm3, xmm7 )
-					}
-					return;
-				}
-				case 5: {		// 6x5 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, [edi+(0*5+0)*4]
-						movhps		xmm6, [edi+(0*5+2)*4]
-						movss		xmm0, [esi+0*4]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, xmm0
-						movlps		xmm7, [edi+(1*5+0)*4]
-						movhps		xmm7, [edi+(1*5+2)*4]
-						movss		xmm1, [esi+1*4]
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm1
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(2*5+0)*4]
-						movhps		xmm7, [edi+(2*5+2)*4]
-						movss		xmm2, [esi+2*4]
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm2
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(3*5+0)*4]
-						movhps		xmm7, [edi+(3*5+2)*4]
-						movss		xmm3, [esi+3*4]
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm3
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(4*5+0)*4]
-						movhps		xmm7, [edi+(4*5+2)*4]
-						movss		xmm4, [esi+4*4]
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm4
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(5*5+0)*4]
-						movhps		xmm7, [edi+(5*5+2)*4]
-						movss		xmm5, [esi+5*4]
-						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm5
-						addps		xmm6, xmm7
-						STORE4( 0, xmm6, xmm7 )
-						movss		xmm6, [edi+(0*5+4)*4]
-						mulss		xmm6, xmm0
-						movss		xmm7, [edi+(1*5+4)*4]
-						mulss		xmm7, xmm1
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(2*5+4)*4]
-						mulss		xmm7, xmm2
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(3*5+4)*4]
-						mulss		xmm7, xmm3
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(4*5+4)*4]
-						mulss		xmm7, xmm4
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(5*5+4)*4]
-						mulss		xmm7, xmm5
-						addss		xmm6, xmm7
-						STORE1( 16, xmm6, xmm7 )
-					}
-					return;
-				}
-				case 6: {		// 6x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movlps		xmm1, [esi+2*4]
-						movlps		xmm2, [esi+4*4]
-						movaps		xmm3, xmm0
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, [edi+(0*6+0)*4]
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, [edi+(2*6+0)*4]
-						addps		xmm3, xmm6
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						movlps		xmm5, [edi+(3*6+0)*4]
-						movhps		xmm5, [edi+(3*6+2)*4]
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movaps		xmm6, xmm2
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, [edi+(4*6+0)*4]
-						addps		xmm3, xmm6
-						movaps		xmm6, xmm2
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						movlps		xmm5, [edi+(5*6+0)*4]
-						movhps		xmm5, [edi+(5*6+2)*4]
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movlps		xmm4, [edi+(2*6+4)*4]
-						movhps		xmm4, [edi+(3*6+4)*4]
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(4*6+4)*4]
-						movhps		xmm5, [edi+(5*6+4)*4]
-						mulps		xmm5, xmm2
-						addps		xmm3, xmm5
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
-								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		case 6: {		// 6x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm7, qword ptr [esi]
+				movlps		xmm6, qword ptr [esi+8]
+				shufps		xmm7, xmm7, 0x44
+				shufps		xmm6, xmm6, 0x44
+				movlps		xmm0, qword ptr [edi    ]
+				movhps		xmm0, qword ptr [edi+ 24]
+				mulps		xmm0, xmm7
+				movlps		xmm3, qword ptr [edi+  8]
+				movhps		xmm3, qword ptr [edi+ 32]
+				mulps		xmm3, xmm6
+				movlps		xmm1, qword ptr [edi+ 48]
+				movhps		xmm1, qword ptr [edi+ 72]
+				mulps		xmm1, xmm7
+				movlps		xmm2, qword ptr [edi+ 96]
+				movhps		xmm2, qword ptr [edi+120]
+				mulps		xmm2, xmm7
+				movlps		xmm4, qword ptr [edi+ 56]
+				movhps		xmm4, qword ptr [edi+ 80]
+				movlps		xmm5, qword ptr [edi+104]
+				movhps		xmm5, qword ptr [edi+128]
+				mulps		xmm4, xmm6
+				movlps		xmm7, qword ptr [esi+16]
+				addps		xmm0, xmm3
+				shufps		xmm7, xmm7, 0x44
+				mulps		xmm5, xmm6
+				addps		xmm1, xmm4
+				movlps		xmm3, qword ptr [edi+ 16]
+				movhps		xmm3, qword ptr [edi+ 40]
+				addps		xmm2, xmm5
+				movlps		xmm4, qword ptr [edi+ 64]
+				movhps		xmm4, qword ptr [edi+ 88]
+				mulps		xmm3, xmm7
+				movlps		xmm5, qword ptr [edi+112]
+				movhps		xmm5, qword ptr [edi+136]
+				addps		xmm0, xmm3
+				mulps		xmm4, xmm7
+				mulps		xmm5, xmm7
+				addps		xmm1, xmm4
+				addps		xmm2, xmm5
+				movaps		xmm6, xmm0
+				shufps		xmm0, xmm1, 0x88
+				shufps		xmm6, xmm1, 0xDD
+				movaps		xmm7, xmm2
+				shufps		xmm7, xmm2, 0x88
+				shufps		xmm2, xmm2, 0xDD
+				addps		xmm0, xmm6
+				addps		xmm2, xmm7
+				STORE4( 0, xmm0, xmm3 )
+				STORE2LO( 16, xmm2, xmm4 )
 			}
-			break;
-		default:
-			int numRows = mat.GetNumRows();
-			for ( int i = 0; i < numColumns; i++ ) {
-				mPtr = mat.ToFloatPtr() + i;
-				float sum = mPtr[0] * vPtr[0];
-				for ( int j = 1; j < numRows; j++ ) {
-					mPtr += numColumns;
-					sum += mPtr[0] * vPtr[j];
-				}
-				dstPtr[i] STOREC sum;
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
+				mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
+				mPtr += 6;
 			}
-			break;
+			return;
+		}
+		}
+		break;
+	}
+	default: {
+		int numColumns = mat.GetNumColumns();
+		for ( int i = 0; i < numRows; i++ ) {
+			float sum = mPtr[0] * vPtr[0];
+			for ( int j = 1; j < numColumns; j++ ) {
+				sum += mPtr[j] * vPtr[j];
+			}
+			dstPtr[i] STOREC sum;
+			mPtr += numColumns;
+		}
+		break;
+	}
 	}
 
 #undef STOREC
@@ -7731,17 +5962,18 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX
 
 /*
 ============
-void idSIMD_SSE::MatX_TransposeMultiplySubVecX
+idSIMD_SSE::MatX_MultiplySubVecX
 
 	optimizes the following matrix multiplications:
 
-	Nx6 * Nx1
-	6xN * 6x1
+	NxN * Nx1
+	Nx6 * 6x1
+	6xN * Nx1
 
 	with N in the range [1-6]
 ============
 */
-void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
+void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {	// computes dst -= mat * vec (STOREC is defined as -= below)
 #define STORE1( offset, reg1, reg2 )		\
 	__asm movss		reg2, [eax+offset]		\
 	__asm subss		reg2, reg1				\
@@ -7762,528 +5994,2392 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX
 	__asm movhps	[eax+offset+8], reg2
 #define STOREC		-=
 
-	int numColumns;
+	int numRows;
 	const float *mPtr, *vPtr;
 	float *dstPtr;
 
-	assert( vec.GetSize() >= mat.GetNumRows() );
-	assert( dst.GetSize() >= mat.GetNumColumns() );
+	assert( vec.GetSize() >= mat.GetNumColumns() );
+	assert( dst.GetSize() >= mat.GetNumRows() );
+
+	mPtr = mat.ToFloatPtr();
+	vPtr = vec.ToFloatPtr();
+	dstPtr = dst.ToFloatPtr();
+	numRows = mat.GetNumRows();
+	switch ( mat.GetNumColumns() ) {	// outer switch: column count; each inner switch: row count
+	case 1: {
+		switch ( numRows ) {
+		case 1: {		// 1x1 * 1x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				mulss		xmm0, [edi]
+				STORE1( 0, xmm0, xmm1 )
+			}
+			return;
+		}
+		case 6: {		// 6x1 * 1x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm1, xmm0
+				mulps		xmm0, [edi]
+				mulps		xmm1, [edi + 16]
+				STORE4( 0, xmm0, xmm2 )
+				STORE2LO( 16, xmm1, xmm2 )
+			}
+			return;
+		}
+		default: {	// remaining row counts: scalar loop, one row per iteration
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;	// not reached: every inner case returns
+	}
+	case 2: {
+		switch ( numRows ) {
+		case 2: {		// 2x2 * 2x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				movss		xmm1, [esi+4]
+				movss		xmm2, [edi]
+				mulss		xmm2, xmm0
+				movss		xmm3, [edi+4]
+				mulss		xmm3, xmm1
+				addss		xmm2, xmm3
+				STORE1( 0, xmm2, xmm4 )
+				mulss		xmm0, [edi + 8]
+				mulss		xmm1, [edi + 8 + 4]
+				addss		xmm0, xmm1
+				STORE1( 4, xmm0, xmm4 )
+			}
+			return;
+		}
+		case 6: {		// 6x2 * 2x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm7, [esi]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movaps		xmm0, [edi]
+				mulps		xmm0, xmm7
+				movaps		xmm1, [edi + 16]
+				mulps		xmm1, xmm7
+				movaps		xmm2, xmm0
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				movaps		xmm3, [edi + 32]
+				addps		xmm0, xmm2
+				mulps		xmm3, xmm7
+				STORE4( 0, xmm0, xmm4 )
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm1, xmm3
+				addps		xmm3, xmm1
+				STORE2LO( 16, xmm3, xmm4 )
+			}
+			return;
+		}
+		default: {	// remaining row counts: scalar loop, one row per iteration
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
+				mPtr += 2;
+			}
+			return;
+		}
+		}
+		break;	// not reached: every inner case returns
+	}
+	case 3: {
+		switch ( numRows ) {
+		case 3: {		// 3x3 * 3x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				movss		xmm4, [edi]
+				mulss		xmm4, xmm0
+				movss		xmm1, [esi+4]
+				movss		xmm5, [edi+4]
+				mulss		xmm5, xmm1
+				addss		xmm4, xmm5
+				movss		xmm2, [esi+8]
+				movss		xmm6, [edi+8]
+				mulss		xmm6, xmm2
+				addss		xmm4, xmm6
+				movss		xmm3, [edi+12]
+				mulss		xmm3, xmm0
+				STORE1( 0, xmm4, xmm7 );
+				movss		xmm5, [edi + 12 + 4]
+				mulss		xmm5, xmm1
+				addss		xmm3, xmm5
+				movss		xmm6, [edi + 12 + 8]
+				mulss		xmm6, xmm2
+				addss		xmm3, xmm6
+				mulss		xmm0, [edi + 24]
+				mulss		xmm1, [edi + 24 + 4]
+				STORE1( 4, xmm3, xmm7 );
+				addss		xmm0, xmm1
+				mulss		xmm2, [edi + 24 + 8]
+				addss		xmm0, xmm2
+				STORE1( 8, xmm0, xmm7 );
+			}
+			return;
+		}
+		case 6: {		// 6x3 * 3x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm5, [esi]
+				shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movss		xmm6, [esi + 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movss		xmm7, [esi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm0, [edi]								// xmm0 = 0, 1, 2, 3
+				movlps		xmm1, [edi + 4 * 4]
+				shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
+				movlps		xmm2, [edi + 6 * 4]
+				movhps		xmm2, [edi + 8 * 4]							// xmm2 = 6, 7, 8, 9
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
+				mulps		xmm0, xmm5
+				movlps		xmm3, [edi + 10 * 4]
+				shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
+				movaps		xmm3, xmm1
+				shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
+				mulps		xmm1, xmm6
+				shufps		xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
+				mulps		xmm3, xmm7
+				addps		xmm0, xmm1
+				addps		xmm0, xmm3
+				STORE4( 0, xmm0, xmm4 )
+				movss		xmm1, [edi + 12 * 4]
+				mulss		xmm1, xmm5
+				movss		xmm2, [edi + 13 * 4]
+				mulss		xmm2, xmm6
+				movss		xmm3, [edi + 14 * 4]
+				mulss		xmm3, xmm7
+				addss		xmm1, xmm2
+				addss		xmm1, xmm3
+				STORE1( 16, xmm1, xmm4 )
+				mulss		xmm5, [edi + 15 * 4]
+				mulss		xmm6, [edi + 16 * 4]
+				mulss		xmm7, [edi + 17 * 4]
+				addss		xmm5, xmm6
+				addss		xmm5, xmm7
+				STORE1( 20, xmm5, xmm4 )
+			}
+			return;
+		}
+		default: {	// remaining row counts: scalar loop, one row per iteration
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
+				mPtr += 3;
+			}
+			return;
+		}
+		}
+		break;	// not reached: every inner case returns
+	}
+	case 4: {
+		switch ( numRows ) {
+		case 4: {		// 4x4 * 4x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, qword ptr [esi ]
+				movlps		xmm0, qword ptr [edi ]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm0, qword ptr [edi + 16]
+				mulps		xmm0, xmm6
+				movlps		xmm7, qword ptr [esi + 8]
+				movlps		xmm2, qword ptr [edi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm2, qword ptr [edi + 24]
+				mulps		xmm2, xmm7
+				movlps		xmm1, qword ptr [edi + 32]
+				movhps		xmm1, qword ptr [edi + 48]
+				mulps		xmm1, xmm6
+				movlps		xmm3, qword ptr [edi + 40]
+				addps		xmm0, xmm2
+				movhps		xmm3, qword ptr [edi + 56]
+				mulps		xmm3, xmm7
+				movaps		xmm4, xmm0
+				addps		xmm1, xmm3
+				shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm4
+				STORE4( 0, xmm0, xmm2 )
+			}
+			return;
+		}
+		case 6: {		// 6x4 * 4x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, qword ptr [esi+ 0]
+				movlps		xmm0, qword ptr [edi+ 0]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm0, qword ptr [edi + 16]
+				mulps		xmm0, xmm6
+				movlps		xmm7, qword ptr [esi + 8]
+				movlps		xmm2, qword ptr [edi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movhps		xmm2, qword ptr [edi + 24]
+				mulps		xmm2, xmm7
+				movlps		xmm1, qword ptr [edi + 32]
+				movhps		xmm1, qword ptr [edi + 48]
+				mulps		xmm1, xmm6
+				movlps		xmm3, qword ptr [edi + 40]
+				addps		xmm0, xmm2
+				movhps		xmm3, qword ptr [edi + 56]
+				mulps		xmm3, xmm7
+				movaps		xmm4, xmm0
+				addps		xmm1, xmm3
+				shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm4
+				movlps		xmm1, qword ptr [edi + 64]
+				movhps		xmm1, qword ptr [edi + 80]
+				STORE4( 0, xmm0, xmm4 )
+				mulps		xmm1, xmm6
+				movlps		xmm2, qword ptr [edi + 72]
+				movhps		xmm2, qword ptr [edi + 88]
+				mulps		xmm2, xmm7
+				addps		xmm1, xmm2
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm3, xmm1
+				addps		xmm1, xmm3
+				STORE2LO( 16, xmm1, xmm4 )
+			}
+			return;
+		}
+		default: {	// remaining row counts: scalar loop, one row per iteration
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
+				mPtr += 4;
+			}
+			return;
+		}
+		}
+		break;	// not reached: every inner case returns
+	}
+	case 5: {
+		switch ( numRows ) {
+		case 5: {		// 5x5 * 5x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [edi+5*4]							// xmm0 =  5,  X,  X,  X
+				movhps		xmm0, [edi+0*4]							// xmm0 =  5,  X,  0,  1
+				movss		xmm5, [edi+15*4]						// xmm5 = 15,  X,  X,  X
+				movhps		xmm5, [edi+10*4]						// xmm5 = 15,  X, 10, 11
+				movaps		xmm1, xmm0								// xmm1 =  5,  X,  0,  1
+				shufps		xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 =  0,  5, 10, 15
+				movlps		xmm1, [edi + 6 * 4]							// xmm1 =  6,  7,  0,  1
+				movlps		xmm5, [edi + 16 * 4]						// xmm5 = 16, 17, 10, 11
+				movaps		xmm2, xmm1								// xmm2 =  6,  7,  0,  1
+				shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 =  1,  6, 11, 16
+				movhps		xmm2, [edi + 2 * 4]							// xmm2 =  6,  7,  2,  3
+				movhps		xmm5, [edi + 12 * 4]						// xmm5 = 16, 17, 12, 13
+				movaps		xmm3, xmm2								// xmm3 =  6,  7,  2,  3
+				shufps		xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 =  2,  7, 12, 17
+				movlps		xmm3, [edi + 8 * 4]							// xmm3 =  8,  9,  2,  3
+				movlps		xmm5, [edi + 18 * 4]						// xmm5 = 18, 19, 12, 13
+				movss		xmm4, [edi + 4 * 4]							// xmm4 =  4,  X,  X,  X
+				movlhps		xmm4, xmm3								// xmm4 =  4,  X,  8,  9
+				shufps		xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 =  3,  8, 13, 18
+				movhps		xmm5, [edi + 14 * 4]						// xmm5 = 18, 19, 14, 15
+				shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 =  4,  9, 14, 19
+				movss		xmm7, [esi + 0 * 4]
+				shufps		xmm7, xmm7, 0
+				mulps		xmm0, xmm7
+				movss		xmm5, [esi + 1 * 4]
+				shufps		xmm5, xmm5, 0
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				movss		xmm6, [esi + 2 * 4]
+				shufps		xmm6, xmm6, 0
+				mulps		xmm2, xmm6
+				addps		xmm0, xmm2
+				movss		xmm1, [esi + 3 * 4]
+				shufps		xmm1, xmm1, 0
+				mulps		xmm3, xmm1
+				addps		xmm0, xmm3
+				movss		xmm2, [esi + 4 * 4]
+				shufps		xmm2, xmm2, 0
+				mulps		xmm4, xmm2
+				addps		xmm0, xmm4
+				mulss		xmm7, [edi + 20 * 4]	// last row (floats 20-24): only lane 0 is stored by STORE1 below
+				mulss		xmm5, [edi + 21 * 4]
+				addps		xmm7, xmm5
+				mulss		xmm6, [edi + 22 * 4]
+				addps		xmm7, xmm6
+				mulss		xmm1, [edi + 23 * 4]
+				addps		xmm7, xmm1
+				mulss		xmm2, [edi + 24 * 4]
+				addps		xmm7, xmm2
+				STORE4( 0, xmm0, xmm3 )
+				STORE1( 16, xmm7, xmm4 )
+			}
+			return;
+		}
+		case 6: {		// 6x5 * 5x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, [esi]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movlps		xmm7, [esi + 8]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
+				movlps		xmm0, [edi]
+				movhps		xmm3, [edi + 8]
+				movaps		xmm1, [edi + 16]
+				movlps		xmm2, [edi + 32]
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
+				shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
+				shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
+				mulps		xmm0, xmm6
+				mulps		xmm3, xmm7
+				movlps		xmm2, [edi + 40]
+				addps		xmm0, xmm3								// xmm0 += xmm3
+				movhps		xmm5, [edi + 40 + 8]
+				movlps		xmm3, [edi + 40 + 16]
+				movhps		xmm3, [edi + 40 + 24]
+				movlps		xmm4, [edi + 40 + 32]
+				shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
+				shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
+				shufps		xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
+				mulps		xmm2, xmm6
+				mulps		xmm5, xmm7
+				addps		xmm2, xmm5								// xmm2 += xmm5
+				movss		xmm5, [esi + 16]
+				shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm4, xmm0
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
+				shufps		xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
+				addps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				STORE4( 0, xmm0, xmm2 )
+				movlps		xmm4, [edi + 80]
+				movhps		xmm3, [edi + 80 + 8]
+				movaps		xmm1, [edi + 80 + 16]
+				movlps		xmm2, [edi + 80 + 32]
+				shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
+				shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
+				shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
+				mulps		xmm4, xmm6
+				mulps		xmm3, xmm7
+				mulps		xmm1, xmm5
+				addps		xmm4, xmm3								// xmm4 += xmm3
+				shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
+				addps		xmm4, xmm1
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
+				addps		xmm4, xmm1
+				STORE2LO( 16, xmm4, xmm2 )
+			}
+			return;
+		}
+		default: {	// remaining row counts: scalar loop, one row per iteration
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
+				mPtr += 5;
+			}
+			return;
+		}
+		}
+		break;	// not reached: every inner case returns
+	}
+	case 6: {
+		switch ( numRows ) {
+		case 1: {		// 1x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				mulss		xmm0, [edi]
+				movss		xmm1, [esi+4]
+				mulss		xmm1, [edi+4]
+				movss		xmm2, [esi+8]
+				addss		xmm0, xmm1
+				mulss		xmm2, [edi+8]
+				movss		xmm3, [esi+12]
+				addss		xmm0, xmm2
+				mulss		xmm3, [edi+12]
+				movss		xmm4, [esi+16]
+				addss		xmm0, xmm3
+				mulss		xmm4, [edi+16]
+				movss		xmm5, [esi+20]
+				addss		xmm0, xmm4
+				mulss		xmm5, [edi+20]
+				movss		xmm6, [esi+24]	// NOTE(review): byte offset 24 is float index 6, a 7th element past the six columns - presumably zero padding; confirm
+				addss		xmm0, xmm5
+				mulss		xmm6, [edi+24]	// NOTE(review): likewise reads float index 6 of the matrix storage; confirm it is padding
+				addss		xmm0, xmm6
+				STORE1( 0, xmm0, xmm7 )
+			}
+			return;
+		}
+		case 2: {		// 2x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]	// xmm4 = v0, v1, v2, v3
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4	// xmm5 = v4, v5, v0, v1
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5	// xmm6 = v2, v3, v4, v5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm0, xmm1
+				addps		xmm0, xmm1
+				STORE2LO( 0, xmm0, xmm3 )
+			}
+			return;
+		}
+		case 3: {		// 3x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]	// xmm4 = v0, v1, v2, v3
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4	// xmm5 = v4, v5, v0, v1
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5	// xmm6 = v2, v3, v4, v5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
+				movhlps		xmm0, xmm1
+				addps		xmm0, xmm1
+				STORE2LO( 0, xmm0, xmm3 )
+				// row 2
+				movaps		xmm0, [edi + 48]
+				movaps		xmm1, [edi + 48 + 16]	// NOTE(review): also loads floats 18-19, past the 3x6 data, and their products are summed in - presumably zero padding; confirm
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				movhlps		xmm1, xmm0
+				addps		xmm0, xmm1
+				movaps		xmm1, xmm0
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
+				addss		xmm0, xmm1
+				STORE1( 8, xmm0, xmm3 )
+			}
+			return;
+		}
+		case 4: {		// 4x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]	// xmm4 = v0, v1, v2, v3
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4	// xmm5 = v4, v5, v0, v1
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5	// xmm6 = v2, v3, v4, v5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm7, xmm0
+				movlhps		xmm7, xmm2
+				addps		xmm7, xmm1
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm7, xmm0
+				// row 2 and 3
+				movaps		xmm0, [edi + 48]
+				movaps		xmm1, [edi + 48 + 16]
+				movaps		xmm2, [edi + 48 + 32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				// last 4 additions for the first 4 rows and store result
+				movaps		xmm0, xmm7
+				shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm7
+				STORE4( 0, xmm0, xmm4 )
+			}
+			return;
+		}
+		case 5: {		// 5x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				// load idVecX
+				movlps		xmm4, [esi]
+				movhps		xmm4, [esi+8]	// xmm4 = v0, v1, v2, v3
+				movlps		xmm5, [esi+16]
+				movlhps		xmm5, xmm4	// xmm5 = v4, v5, v0, v1
+				movhlps		xmm6, xmm4
+				movlhps		xmm6, xmm5	// xmm6 = v2, v3, v4, v5
+				// row 0 and 1
+				movaps		xmm0, [edi]
+				movaps		xmm1, [edi+16]
+				movaps		xmm2, [edi+32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm7, xmm0
+				movlhps		xmm7, xmm2
+				addps		xmm7, xmm1
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm7, xmm0
+				// row 2 and 3
+				movaps		xmm0, [edi + 48]
+				movaps		xmm1, [edi + 48 + 16]
+				movaps		xmm2, [edi + 48 + 32]
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				mulps		xmm2, xmm6
+				movhlps		xmm3, xmm0
+				movlhps		xmm3, xmm2
+				addps		xmm1, xmm3
+				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
+				addps		xmm1, xmm0
+				// last 4 additions for the first 4 rows and store result
+				movaps		xmm0, xmm7
+				shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
+				addps		xmm0, xmm7
+				STORE4( 0, xmm0, xmm3 )
+				// row 4
+				movaps		xmm0, [edi + 96]
+				movaps		xmm1, [edi + 96 + 16]	// NOTE(review): also loads floats 30-31, past the 5x6 data, and their products are summed in - presumably zero padding; confirm
+				mulps		xmm0, xmm4
+				mulps		xmm1, xmm5
+				addps		xmm0, xmm1
+				movhlps		xmm1, xmm0
+				addps		xmm0, xmm1
+				movaps		xmm1, xmm0
+				shufps		xmm1, xmm1, 0x01	// 0x01: lane 1 moved into lane 0
+				addss		xmm0, xmm1
+				STORE1( 16, xmm0, xmm3 )
+			}
+			return;
+		}
+		case 6: {		// 6x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm7, qword ptr [esi]
+				movlps		xmm6, qword ptr [esi+8]
+				shufps		xmm7, xmm7, 0x44	// 0x44 selects lanes 0,1,0,1: low pair duplicated into the high half
+				shufps		xmm6, xmm6, 0x44
+				movlps		xmm0, qword ptr [edi    ]
+				movhps		xmm0, qword ptr [edi+ 24]
+				mulps		xmm0, xmm7
+				movlps		xmm3, qword ptr [edi+  8]
+				movhps		xmm3, qword ptr [edi+ 32]
+				mulps		xmm3, xmm6
+				movlps		xmm1, qword ptr [edi+ 48]
+				movhps		xmm1, qword ptr [edi+ 72]
+				mulps		xmm1, xmm7
+				movlps		xmm2, qword ptr [edi+ 96]
+				movhps		xmm2, qword ptr [edi+120]
+				mulps		xmm2, xmm7
+				movlps		xmm4, qword ptr [edi+ 56]
+				movhps		xmm4, qword ptr [edi+ 80]
+				movlps		xmm5, qword ptr [edi+104]
+				movhps		xmm5, qword ptr [edi+128]
+				mulps		xmm4, xmm6
+				movlps		xmm7, qword ptr [esi+16]
+				addps		xmm0, xmm3
+				shufps		xmm7, xmm7, 0x44
+				mulps		xmm5, xmm6
+				addps		xmm1, xmm4
+				movlps		xmm3, qword ptr [edi+ 16]
+				movhps		xmm3, qword ptr [edi+ 40]
+				addps		xmm2, xmm5
+				movlps		xmm4, qword ptr [edi+ 64]
+				movhps		xmm4, qword ptr [edi+ 88]
+				mulps		xmm3, xmm7
+				movlps		xmm5, qword ptr [edi+112]
+				movhps		xmm5, qword ptr [edi+136]
+				addps		xmm0, xmm3
+				mulps		xmm4, xmm7
+				mulps		xmm5, xmm7
+				addps		xmm1, xmm4
+				addps		xmm2, xmm5
+				movaps		xmm6, xmm0
+				shufps		xmm0, xmm1, 0x88	// 0x88 selects lanes 0,2 of each operand
+				shufps		xmm6, xmm1, 0xDD	// 0xDD selects lanes 1,3 of each operand
+				movaps		xmm7, xmm2
+				shufps		xmm7, xmm2, 0x88
+				shufps		xmm2, xmm2, 0xDD
+				addps		xmm0, xmm6
+				addps		xmm2, xmm7
+				STORE4( 0, xmm0, xmm3 )
+				STORE2LO( 16, xmm2, xmm4 )
+			}
+			return;
+		}
+		default: {	// remaining row counts: scalar loop, one row per iteration
+			for ( int i = 0; i < numRows; i++ ) {
+				dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
+				mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
+				mPtr += 6;
+			}
+			return;
+		}
+		}
+		break;	// not reached: every inner case returns
+	}
+	default: {	// any other column count: generic row-by-row dot product
+		int numColumns = mat.GetNumColumns();
+		for ( int i = 0; i < numRows; i++ ) {
+			float sum = mPtr[0] * vPtr[0];
+			for ( int j = 1; j < numColumns; j++ ) {
+				sum += mPtr[j] * vPtr[j];
+			}
+			dstPtr[i] STOREC sum;
+			mPtr += numColumns;	// advance to the start of the next row
+		}
+		break;
+	}
+	}
+
+#undef STOREC
+#undef STORE4
+#undef STORE2HI
+#undef STORE2LO
+#undef STORE1
+}
+
+/*
+============
+idSIMD_SSE::MatX_TransposeMultiplyVecX
+
+	optimizes the following matrix multiplications:
+
+	Nx6 * Nx1
+	6xN * 6x1
+
+	with N in the range [1-6]
+============
+*/
+void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
+#define STORE1( offset, reg1, reg2 )		\
+	__asm movss		[eax+offset], reg1
+#define STORE2LO( offset, reg1, reg2 )		\
+	__asm movlps	[eax+offset], reg1
+#define STORE2HI( offset, reg1, reg2 )		\
+	__asm movhps	[eax+offset], reg1
+#define STORE4( offset, reg1, reg2 )		\
+	__asm movlps	[eax+offset], reg1		\
+	__asm movhps	[eax+offset+8], reg1
+#define STOREC		=
+
+	int numColumns;
+	const float *mPtr, *vPtr;
+	float *dstPtr;
+
+	assert( vec.GetSize() >= mat.GetNumRows() );
+	assert( dst.GetSize() >= mat.GetNumColumns() );
 
 	mPtr = mat.ToFloatPtr();
 	vPtr = vec.ToFloatPtr();
 	dstPtr = dst.ToFloatPtr();
 	numColumns = mat.GetNumColumns();
-	switch( mat.GetNumRows() ) {
-		case 1:
-			switch( numColumns ) {
-				case 6: {		// 1x6 * 1x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [esi]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm1, xmm0
-						mulps		xmm0, [edi]
-						mulps		xmm1, [edi+16]
-						STORE4( 0, xmm0, xmm2 )
-						STORE2LO( 16, xmm1, xmm3 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0];
-						mPtr++;
-					}
-					return;
-				}
+	switch ( mat.GetNumRows() ) {
+	case 1:
+		switch ( numColumns ) {
+		case 6: {		// 1x6 * 1x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm1, xmm0
+				mulps		xmm0, [edi]
+				mulps		xmm1, [edi + 16]
+				STORE4( 0, xmm0, xmm2 )
+				STORE2LO( 16, xmm1, xmm3 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	case 2:
+		switch ( numColumns ) {
+		case 6: {		// 2x6 * 2x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi]
+				movaps		xmm1, xmm0
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
+				movaps		xmm2, [edi]
+				mulps		xmm2, xmm0
+				movlps		xmm3, [edi + 24]
+				movhps		xmm3, [edi + 32]
+				mulps		xmm3, xmm1
+				addps		xmm2, xmm3
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movlps		xmm4, [edi + 16]
+				movhps		xmm4, [edi + 40]
+				mulps		xmm4, xmm0
+				movhlps		xmm3, xmm4
+				addps		xmm3, xmm4
+				STORE4( 0, xmm2, xmm5 )
+				STORE2LO( 16, xmm3, xmm6 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	case 3:
+		switch ( numColumns ) {
+		case 6: {		// 3x6 * 3x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movss		xmm1, [esi+2*4]
+				movlps		xmm3, [edi+( 0*6+0 )*4]
+				movhps		xmm3, [edi+( 0*6+2 )*4]
+				movaps		xmm4, xmm0
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 2 * 6 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 2 * 6 + 2 ) * 4]
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				STORE4( 0, xmm3, xmm7 )
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 2 * 6 + 4 ) * 4]
+				mulps		xmm5, xmm1
+				addps		xmm3, xmm5
+				STORE2LO( 16, xmm3, xmm7 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	case 4:
+		switch ( numColumns ) {
+		case 6: {		// 4x6 * 4x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movlps		xmm1, [esi+2*4]
+				movaps		xmm3, xmm0
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, [edi + ( 0 * 6 + 0 ) * 4]
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 2 * 6 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 2 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm6
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 3 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				STORE4( 0, xmm3, xmm7 )
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movlps		xmm4, [edi + ( 2 * 6 + 4 ) * 4]
+				movhps		xmm4, [edi + ( 3 * 6 + 4 ) * 4]
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				STORE2LO( 16, xmm3, xmm7 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] +
+				*( mPtr + 3 * numColumns ) * vPtr[3];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	case 5:
+		switch ( numColumns ) {
+		case 6: {		// 5x6 * 5x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movlps		xmm1, [esi+2*4]
+				movss		xmm2, [esi+4*4]
+				movaps		xmm3, xmm0
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, [edi + ( 0 * 6 + 0 ) * 4]
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, [edi + ( 2 * 6 + 0 ) * 4]
+				addps		xmm3, xmm6
+				movlps		xmm5, [edi + ( 3 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm4, xmm2
+				mulps		xmm4, [edi + ( 4 * 6 + 0 ) * 4]
+				addps		xmm3, xmm4
+				STORE4( 0, xmm3, xmm7 )
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movlps		xmm4, [edi + ( 2 * 6 + 4 ) * 4]
+				movhps		xmm4, [edi + ( 3 * 6 + 4 ) * 4]
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 4 * 6 + 4 ) * 4]
+				mulps		xmm5, xmm2
+				addps		xmm3, xmm5
+				STORE2LO( 16, xmm3, xmm7 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] +
+				*( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	case 6:
+		switch ( numColumns ) {
+		case 1: {		// 6x1 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi]
+				movhps		xmm0, [esi+8]
+				movlps		xmm1, [esi+16]
+				mulps		xmm0, [edi]
+				mulps		xmm1, [edi+16]
+				shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
+				addps		xmm0, xmm1
+				movhlps		xmm2, xmm0
+				addss		xmm2, xmm0
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
+				addss		xmm2, xmm0
+				STORE1( 0, xmm2, xmm3 )
+			}
+			return;
+		}
+		case 2: {		// 6x2 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movaps		xmm6, [edi + 0 * 4]
+				mulps		xmm6, xmm0
+				movlps		xmm1, [esi + 2 * 4]
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movaps		xmm7, [edi + 4 * 4]
+				mulps		xmm7, xmm1
+				addps		xmm6, xmm7
+				movlps		xmm2, [esi + 4 * 4]
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movaps		xmm7, [edi + 8 * 4]
+				mulps		xmm7, xmm2
+				addps		xmm6, xmm7
+				movhlps		xmm3, xmm6
+				addps		xmm3, xmm6
+				STORE2LO( 0, xmm3, xmm7 )
+			}
+			return;
+		}
+		case 3: {		// 6x3 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [edi+( 0*3+2 )*4]
+				movhps		xmm0, [edi+( 0*3+0 )*4]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
+				movss		xmm6, [esi + 0 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, xmm0
+				movss		xmm1, [edi + ( 1 * 3 + 0 ) * 4]
+				movhps		xmm1, [edi + ( 1 * 3 + 1 ) * 4]
+				movss		xmm7, [esi + 1 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm1
+				addps		xmm6, xmm7
+				movss		xmm2, [edi + ( 2 * 3 + 2 ) * 4]
+				movhps		xmm2, [edi + ( 2 * 3 + 0 ) * 4]
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
+				movss		xmm7, [esi + 2 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm2
+				addps		xmm6, xmm7
+				movss		xmm3, [edi + ( 3 * 3 + 0 ) * 4]
+				movhps		xmm3, [edi + ( 3 * 3 + 1 ) * 4]
+				movss		xmm7, [esi + 3 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm3
+				addps		xmm6, xmm7
+				movss		xmm4, [edi + ( 4 * 3 + 2 ) * 4]
+				movhps		xmm4, [edi + ( 4 * 3 + 0 ) * 4]
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
+				movss		xmm7, [esi + 4 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm4
+				addps		xmm6, xmm7
+				movss		xmm5, [edi + ( 5 * 3 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 3 + 1 ) * 4]
+				movss		xmm7, [esi + 5 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm5
+				addps		xmm6, xmm7
+				STORE1( 0, xmm6, xmm7 )
+				STORE2HI( 4, xmm6, xmm7 )
+			}
+			return;
+		}
+		case 4: {		// 6x4 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm3, [edi+( 0*4+0 )*4]
+				movhps		xmm3, [edi+( 0*4+2 )*4]
+				movss		xmm4, [esi+0*4]
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 1 * 4 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 1 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 2 * 4 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 2 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 2 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm6
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 3 * 4 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 3 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 4 * 4 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 4 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 4 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm6
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 5 * 4 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 5 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				STORE4( 0, xmm3, xmm7 )
+			}
+			return;
+		}
+		case 5: {		// 6x5 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, [edi+( 0*5+0 )*4]
+				movhps		xmm6, [edi+( 0*5+2 )*4]
+				movss		xmm0, [esi+0*4]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, xmm0
+				movlps		xmm7, [edi + ( 1 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 1 * 5 + 2 ) * 4]
+				movss		xmm1, [esi + 1 * 4]
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm1
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 2 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 2 * 5 + 2 ) * 4]
+				movss		xmm2, [esi + 2 * 4]
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm2
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 3 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 3 * 5 + 2 ) * 4]
+				movss		xmm3, [esi + 3 * 4]
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm3
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 4 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 4 * 5 + 2 ) * 4]
+				movss		xmm4, [esi + 4 * 4]
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm4
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 5 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 5 * 5 + 2 ) * 4]
+				movss		xmm5, [esi + 5 * 4]
+				shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm5
+				addps		xmm6, xmm7
+				STORE4( 0, xmm6, xmm7 )
+				movss		xmm6, [edi + ( 0 * 5 + 4 ) * 4]
+				mulss		xmm6, xmm0
+				movss		xmm7, [edi + ( 1 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm1
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 2 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm2
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 3 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm3
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 4 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm4
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 5 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm5
+				addss		xmm6, xmm7
+				STORE1( 16, xmm6, xmm7 )
+			}
+			return;
+		}
+		case 6: {		// 6x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movlps		xmm1, [esi+2*4]
+				movlps		xmm2, [esi+4*4]
+				movaps		xmm3, xmm0
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, [edi + ( 0 * 6 + 0 ) * 4]
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, [edi + ( 2 * 6 + 0 ) * 4]
+				addps		xmm3, xmm6
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				movlps		xmm5, [edi + ( 3 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 6 + 2 ) * 4]
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movaps		xmm6, xmm2
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, [edi + ( 4 * 6 + 0 ) * 4]
+				addps		xmm3, xmm6
+				movaps		xmm6, xmm2
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				movlps		xmm5, [edi + ( 5 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 6 + 2 ) * 4]
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				STORE4( 0, xmm3, xmm7 )
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movlps		xmm4, [edi + ( 2 * 6 + 4 ) * 4]
+				movhps		xmm4, [edi + ( 3 * 6 + 4 ) * 4]
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 4 * 6 + 4 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 6 + 4 ) * 4]
+				mulps		xmm5, xmm2
+				addps		xmm3, xmm5
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				STORE2LO( 16, xmm3, xmm7 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] +
+				*( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4] + *( mPtr + 5 * numColumns ) * vPtr[5];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	default:
+		int numRows = mat.GetNumRows();
+		for ( int i = 0; i < numColumns; i++ ) {
+			mPtr = mat.ToFloatPtr() + i;
+			float sum = mPtr[0] * vPtr[0];
+			for ( int j = 1; j < numRows; j++ ) {
+				mPtr += numColumns;
+				sum += mPtr[0] * vPtr[j];
+			}
+			dstPtr[i] STOREC sum;
+		}
+		break;
+	}
+
+#undef STOREC
+#undef STORE4
+#undef STORE2HI
+#undef STORE2LO
+#undef STORE1
+}
+
+/*
+============
+idSIMD_SSE::MatX_TransposeMultiplyAddVecX
+
+	Accumulates dst += transpose( mat ) * vec, i.e. dst[i] += sum over rows j of mat[j][i] * vec[j].
+	Hand written SSE paths exist for the following matrix multiplications:
+	Nx6 * Nx1
+	6xN * 6x1
+	with N in the range [1-6]; every other size runs the scalar loops.
+	STORE1, STORE2LO, STORE2HI and STORE4 add the low lane, low half, high half or all of a register into dst at [eax+offset]; STOREC is the C counterpart.
+============
+*/
+void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
+#define STORE1( offset, reg1, reg2 )		\
+	__asm movss		reg2, [eax+offset]		\
+	__asm addss		reg2, reg1				\
+	__asm movss		[eax+offset], reg2
+#define STORE2LO( offset, reg1, reg2 )		\
+	__asm movlps	reg2, [eax+offset]		\
+	__asm addps		reg2, reg1				\
+	__asm movlps	[eax+offset], reg2
+#define STORE2HI( offset, reg1, reg2 )		\
+	__asm movhps	reg2, [eax+offset]		\
+	__asm addps		reg2, reg1				\
+	__asm movhps	[eax+offset], reg2
+#define STORE4( offset, reg1, reg2 )		\
+	__asm movlps	reg2, [eax+offset]		\
+	__asm movhps	reg2, [eax+offset+8]	\
+	__asm addps		reg2, reg1				\
+	__asm movlps	[eax+offset], reg2		\
+	__asm movhps	[eax+offset+8], reg2
+#define STOREC		+=
+
+	int numColumns;
+	const float *mPtr, *vPtr;
+	float *dstPtr;
+
+	assert( vec.GetSize() >= mat.GetNumRows() );	// vec supplies one element per matrix row
+	assert( dst.GetSize() >= mat.GetNumColumns() );	// dst receives one element per matrix column
+
+	mPtr = mat.ToFloatPtr();
+	vPtr = vec.ToFloatPtr();
+	dstPtr = dst.ToFloatPtr();
+	numColumns = mat.GetNumColumns();
+	switch ( mat.GetNumRows() ) {	// pick a specialised path by row count, then by column count
+	case 1:
+		switch ( numColumns ) {
+		case 6: {		// 1x6 * 1x1
+			__asm {
+				mov			esi, vPtr	// esi = vec
+				mov			edi, mPtr	// edi = mat
+				mov			eax, dstPtr	// eax = dst, the base register used by the STORE macros
+				movss		xmm0, [esi]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )	// presumably replicates vec[0] into all four lanes (R_SHUFFLEPS is defined elsewhere)
+				movaps		xmm1, xmm0
+				mulps		xmm0, [edi]	// NOTE(review): a mulps memory operand must be 16 byte aligned - confirm idMatX guarantees this
+				mulps		xmm1, [edi + 16]	// NOTE(review): reads floats 4-7 of a six float matrix, assumes padded storage - confirm
+				STORE4( 0, xmm0, xmm2 )	// dst[0..3]
+				STORE2LO( 16, xmm1, xmm3 )	// dst[4..5]
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {	// scalar path for every other column count
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;	// not reached: both inner cases return
+	case 2:
+		switch ( numColumns ) {
+		case 6: {		// 2x6 * 2x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi]	// vec[0], vec[1]
+				movaps		xmm1, xmm0
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
+				movaps		xmm2, [edi]	// row 0, columns 0-3
+				mulps		xmm2, xmm0
+				movlps		xmm3, [edi + 24]	// byte 24 = float 6: row 1, columns 0-1
+				movhps		xmm3, [edi + 32]	// row 1, columns 2-3
+				mulps		xmm3, xmm1
+				addps		xmm2, xmm3
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movlps		xmm4, [edi + 16]	// row 0, columns 4-5
+				movhps		xmm4, [edi + 40]	// row 1, columns 4-5
+				mulps		xmm4, xmm0
+				movhlps		xmm3, xmm4	// fold the high half onto the low half
+				addps		xmm3, xmm4
+				STORE4( 0, xmm2, xmm5 )	// dst[0..3]
+				STORE2LO( 16, xmm3, xmm6 )	// dst[4..5]
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {	// scalar path: two rows feed each column
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	case 3:
+		switch ( numColumns ) {
+		case 6: {		// 3x6 * 3x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movss		xmm1, [esi+2*4]
+				movlps		xmm3, [edi+( 0*6+0 )*4]	// matrix offsets are written as ( row * 6 + column ) * 4 bytes
+				movhps		xmm3, [edi+( 0*6+2 )*4]
+				movaps		xmm4, xmm0
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 2 * 6 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 2 * 6 + 2 ) * 4]
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				STORE4( 0, xmm3, xmm7 )	// dst[0..3]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 2 * 6 + 4 ) * 4]
+				mulps		xmm5, xmm1
+				addps		xmm3, xmm5
+				STORE2LO( 16, xmm3, xmm7 )	// dst[4..5]
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {	// scalar path: three rows feed each column
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	case 4:
+		switch ( numColumns ) {
+		case 6: {		// 4x6 * 4x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movlps		xmm1, [esi+2*4]
+				movaps		xmm3, xmm0
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, [edi + ( 0 * 6 + 0 ) * 4]
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 2 * 6 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 2 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm6
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 3 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				STORE4( 0, xmm3, xmm7 )	// dst[0..3]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movlps		xmm4, [edi + ( 2 * 6 + 4 ) * 4]
+				movhps		xmm4, [edi + ( 3 * 6 + 4 ) * 4]
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				STORE2LO( 16, xmm3, xmm7 )	// dst[4..5]
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {	// scalar path: four rows feed each column
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] +
+				*( mPtr + 3 * numColumns ) * vPtr[3];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	case 5:
+		switch ( numColumns ) {
+		case 6: {		// 5x6 * 5x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movlps		xmm1, [esi+2*4]
+				movss		xmm2, [esi+4*4]
+				movaps		xmm3, xmm0
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, [edi + ( 0 * 6 + 0 ) * 4]
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, [edi + ( 2 * 6 + 0 ) * 4]
+				addps		xmm3, xmm6
+				movlps		xmm5, [edi + ( 3 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm4, xmm2
+				mulps		xmm4, [edi + ( 4 * 6 + 0 ) * 4]
+				addps		xmm3, xmm4
+				STORE4( 0, xmm3, xmm7 )	// dst[0..3]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movlps		xmm4, [edi + ( 2 * 6 + 4 ) * 4]
+				movhps		xmm4, [edi + ( 3 * 6 + 4 ) * 4]
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 4 * 6 + 4 ) * 4]
+				mulps		xmm5, xmm2
+				addps		xmm3, xmm5
+				STORE2LO( 16, xmm3, xmm7 )	// dst[4..5]
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {	// scalar path: five rows feed each column
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] +
+				*( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	case 6:
+		switch ( numColumns ) {
+		case 1: {		// 6x1 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi]
+				movhps		xmm0, [esi+8]
+				movlps		xmm1, [esi+16]
+				mulps		xmm0, [edi]
+				mulps		xmm1, [edi+16]	// NOTE(review): reads floats 4-7 of a six float matrix, assumes padded storage - confirm
+				shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
+				addps		xmm0, xmm1
+				movhlps		xmm2, xmm0
+				addss		xmm2, xmm0
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
+				addss		xmm2, xmm0
+				STORE1( 0, xmm2, xmm3 )	// dst[0], the only output of a one column matrix
+			}
+			return;
+		}
+		case 2: {		// 6x2 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movaps		xmm6, [edi + 0 * 4]	// rows 0-1, both columns
+				mulps		xmm6, xmm0
+				movlps		xmm1, [esi + 2 * 4]
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movaps		xmm7, [edi + 4 * 4]	// rows 2-3, both columns
+				mulps		xmm7, xmm1
+				addps		xmm6, xmm7
+				movlps		xmm2, [esi + 4 * 4]
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movaps		xmm7, [edi + 8 * 4]	// rows 4-5, both columns
+				mulps		xmm7, xmm2
+				addps		xmm6, xmm7
+				movhlps		xmm3, xmm6	// fold the high half onto the low half
+				addps		xmm3, xmm6
+				STORE2LO( 0, xmm3, xmm7 )	// dst[0..1]
+			}
+			return;
+		}
+		case 3: {		// 6x3 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [edi+( 0*3+2 )*4]	// matrix offsets are written as ( row * 3 + column ) * 4 bytes
+				movhps		xmm0, [edi+( 0*3+0 )*4]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
+				movss		xmm6, [esi + 0 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, xmm0
+				movss		xmm1, [edi + ( 1 * 3 + 0 ) * 4]
+				movhps		xmm1, [edi + ( 1 * 3 + 1 ) * 4]
+				movss		xmm7, [esi + 1 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm1
+				addps		xmm6, xmm7
+				movss		xmm2, [edi + ( 2 * 3 + 2 ) * 4]
+				movhps		xmm2, [edi + ( 2 * 3 + 0 ) * 4]
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
+				movss		xmm7, [esi + 2 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm2
+				addps		xmm6, xmm7
+				movss		xmm3, [edi + ( 3 * 3 + 0 ) * 4]
+				movhps		xmm3, [edi + ( 3 * 3 + 1 ) * 4]
+				movss		xmm7, [esi + 3 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm3
+				addps		xmm6, xmm7
+				movss		xmm4, [edi + ( 4 * 3 + 2 ) * 4]
+				movhps		xmm4, [edi + ( 4 * 3 + 0 ) * 4]
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
+				movss		xmm7, [esi + 4 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm4
+				addps		xmm6, xmm7
+				movss		xmm5, [edi + ( 5 * 3 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 3 + 1 ) * 4]
+				movss		xmm7, [esi + 5 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm5
+				addps		xmm6, xmm7
+				STORE1( 0, xmm6, xmm7 )	// dst[0] comes from the low lane
+				STORE2HI( 4, xmm6, xmm7 )	// dst[1..2] come from the high half
+			}
+			return;
+		}
+		case 4: {		// 6x4 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm3, [edi+( 0*4+0 )*4]
+				movhps		xmm3, [edi+( 0*4+2 )*4]
+				movss		xmm4, [esi+0*4]
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 1 * 4 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 1 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 2 * 4 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 2 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 2 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm6
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 3 * 4 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 3 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 4 * 4 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 4 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 4 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm6
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 5 * 4 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 5 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				STORE4( 0, xmm3, xmm7 )	// dst[0..3]
+			}
+			return;
+		}
+		case 5: {		// 6x5 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, [edi+( 0*5+0 )*4]
+				movhps		xmm6, [edi+( 0*5+2 )*4]
+				movss		xmm0, [esi+0*4]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, xmm0
+				movlps		xmm7, [edi + ( 1 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 1 * 5 + 2 ) * 4]
+				movss		xmm1, [esi + 1 * 4]
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm1
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 2 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 2 * 5 + 2 ) * 4]
+				movss		xmm2, [esi + 2 * 4]
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm2
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 3 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 3 * 5 + 2 ) * 4]
+				movss		xmm3, [esi + 3 * 4]
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm3
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 4 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 4 * 5 + 2 ) * 4]
+				movss		xmm4, [esi + 4 * 4]
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm4
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 5 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 5 * 5 + 2 ) * 4]
+				movss		xmm5, [esi + 5 * 4]
+				shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm5
+				addps		xmm6, xmm7
+				STORE4( 0, xmm6, xmm7 )	// dst[0..3]
+				movss		xmm6, [edi + ( 0 * 5 + 4 ) * 4]	// column 4 is summed separately with scalar ops, reusing vec values kept in xmm0-xmm5
+				mulss		xmm6, xmm0
+				movss		xmm7, [edi + ( 1 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm1
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 2 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm2
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 3 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm3
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 4 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm4
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 5 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm5
+				addss		xmm6, xmm7
+				STORE1( 16, xmm6, xmm7 )	// dst[4]
+			}
+			return;
+		}
+		case 6: {		// 6x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movlps		xmm1, [esi+2*4]
+				movlps		xmm2, [esi+4*4]
+				movaps		xmm3, xmm0
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, [edi + ( 0 * 6 + 0 ) * 4]
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, [edi + ( 2 * 6 + 0 ) * 4]
+				addps		xmm3, xmm6
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				movlps		xmm5, [edi + ( 3 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 6 + 2 ) * 4]
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movaps		xmm6, xmm2
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, [edi + ( 4 * 6 + 0 ) * 4]
+				addps		xmm3, xmm6
+				movaps		xmm6, xmm2
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				movlps		xmm5, [edi + ( 5 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 6 + 2 ) * 4]
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				STORE4( 0, xmm3, xmm7 )	// dst[0..3]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movlps		xmm4, [edi + ( 2 * 6 + 4 ) * 4]
+				movhps		xmm4, [edi + ( 3 * 6 + 4 ) * 4]
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 4 * 6 + 4 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 6 + 4 ) * 4]
+				mulps		xmm5, xmm2
+				addps		xmm3, xmm5
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				STORE2LO( 16, xmm3, xmm7 )	// dst[4..5]
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {	// scalar path: six rows feed each column
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] +
+				*( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4] + *( mPtr + 5 * numColumns ) * vPtr[5];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	default:	// any row count outside 1-6
+		int numRows = mat.GetNumRows();	// declared in the last label of the switch, so no later case jumps over it
+		for ( int i = 0; i < numColumns; i++ ) {
+			mPtr = mat.ToFloatPtr() + i;	// first row of column i
+			float sum = mPtr[0] * vPtr[0];
+			for ( int j = 1; j < numRows; j++ ) {
+				mPtr += numColumns;	// step down one row within the same column
+				sum += mPtr[0] * vPtr[j];
+			}
+			dstPtr[i] STOREC sum;
+		}
+		break;
+	}
+
+#undef STOREC
+#undef STORE4
+#undef STORE2HI
+#undef STORE2LO
+#undef STORE1
+}
+
+/*
+============
+idSIMD_SSE::MatX_TransposeMultiplySubVecX
+
+	optimizes the following matrix multiplications:
+
+	Nx6 * Nx1
+	6xN * 6x1
+
+	with N in the range [1-6]
+============
+*/
+void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
+#define STORE1( offset, reg1, reg2 )		\
+	__asm movss		reg2, [eax+offset]		\
+	__asm subss		reg2, reg1				\
+	__asm movss		[eax+offset], reg2
+#define STORE2LO( offset, reg1, reg2 )		\
+	__asm movlps	reg2, [eax+offset]		\
+	__asm subps		reg2, reg1				\
+	__asm movlps	[eax+offset], reg2
+#define STORE2HI( offset, reg1, reg2 )		\
+	__asm movhps	reg2, [eax+offset]		\
+	__asm subps		reg2, reg1				\
+	__asm movhps	[eax+offset], reg2
+#define STORE4( offset, reg1, reg2 )		\
+	__asm movlps	reg2, [eax+offset]		\
+	__asm movhps	reg2, [eax+offset+8]	\
+	__asm subps		reg2, reg1				\
+	__asm movlps	[eax+offset], reg2		\
+	__asm movhps	[eax+offset+8], reg2
+#define STOREC		-=
+
+	int numColumns;
+	const float *mPtr, *vPtr;
+	float *dstPtr;
+
+	assert( vec.GetSize() >= mat.GetNumRows() );
+	assert( dst.GetSize() >= mat.GetNumColumns() );
+
+	mPtr = mat.ToFloatPtr();
+	vPtr = vec.ToFloatPtr();
+	dstPtr = dst.ToFloatPtr();
+	numColumns = mat.GetNumColumns();
+	switch ( mat.GetNumRows() ) {
+	case 1:
+		switch ( numColumns ) {
+		case 6: {		// 1x6 * 1x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [esi]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm1, xmm0
+				mulps		xmm0, [edi]
+				mulps		xmm1, [edi + 16]
+				STORE4( 0, xmm0, xmm2 )
+				STORE2LO( 16, xmm1, xmm3 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0];
+				mPtr++;
 			}
-			break;
-		case 2:
-			switch( numColumns ) {
-				case 6: {		// 2x6 * 2x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi]
-						movaps		xmm1, xmm0
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
-						movaps		xmm2, [edi]
-						mulps		xmm2, xmm0
-						movlps		xmm3, [edi+24]
-						movhps		xmm3, [edi+32]
-						mulps		xmm3, xmm1
-						addps		xmm2, xmm3
-						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movlps		xmm4, [edi+16]
-						movhps		xmm4, [edi+40]
-						mulps		xmm4, xmm0
-						movhlps		xmm3, xmm4
-						addps		xmm3, xmm4
-						STORE4( 0, xmm2, xmm5 )
-						STORE2LO( 16, xmm3, xmm6 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		}
+		break;
+	case 2:
+		switch ( numColumns ) {
+		case 6: {		// 2x6 * 2x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi]
+				movaps		xmm1, xmm0
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
+				movaps		xmm2, [edi]
+				mulps		xmm2, xmm0
+				movlps		xmm3, [edi + 24]
+				movhps		xmm3, [edi + 32]
+				mulps		xmm3, xmm1
+				addps		xmm2, xmm3
+				shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movlps		xmm4, [edi + 16]
+				movhps		xmm4, [edi + 40]
+				mulps		xmm4, xmm0
+				movhlps		xmm3, xmm4
+				addps		xmm3, xmm4
+				STORE4( 0, xmm2, xmm5 )
+				STORE2LO( 16, xmm3, xmm6 )
 			}
-			break;
-		case 3:
-			switch( numColumns ) {
-				case 6: {		// 3x6 * 3x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movss		xmm1, [esi+2*4]
-						movlps		xmm3, [edi+(0*6+0)*4]
-						movhps		xmm3, [edi+(0*6+2)*4]
-						movaps		xmm4, xmm0
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, xmm4
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(2*6+0)*4]
-						movhps		xmm4, [edi+(2*6+2)*4]
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(2*6+4)*4]
-						mulps		xmm5, xmm1
-						addps		xmm3, xmm5
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1];
+				mPtr++;
 			}
-			break;
-		case 4:
-			switch( numColumns ) {
-				case 6: {		// 4x6 * 4x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movlps		xmm1, [esi+2*4]
-						movaps		xmm3, xmm0
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, [edi+(0*6+0)*4]
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(2*6+0)*4]
-						movhps		xmm4, [edi+(2*6+2)*4]
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm6
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(3*6+0)*4]
-						movhps		xmm5, [edi+(3*6+2)*4]
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movlps		xmm4, [edi+(2*6+4)*4]
-						movhps		xmm4, [edi+(3*6+4)*4]
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
-								*(mPtr+3*numColumns) * vPtr[3];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		}
+		break;
+	case 3:
+		switch ( numColumns ) {
+		case 6: {		// 3x6 * 3x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movss		xmm1, [esi+2*4]
+				movlps		xmm3, [edi+( 0*6+0 )*4]
+				movhps		xmm3, [edi+( 0*6+2 )*4]
+				movaps		xmm4, xmm0
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 2 * 6 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 2 * 6 + 2 ) * 4]
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				STORE4( 0, xmm3, xmm7 )
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 2 * 6 + 4 ) * 4]
+				mulps		xmm5, xmm1
+				addps		xmm3, xmm5
+				STORE2LO( 16, xmm3, xmm7 )
 			}
-			break;
-		case 5:
-			switch( numColumns ) {
-				case 6: {		// 5x6 * 5x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movlps		xmm1, [esi+2*4]
-						movss		xmm2, [esi+4*4]
-						movaps		xmm3, xmm0
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, [edi+(0*6+0)*4]
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, [edi+(2*6+0)*4]
-						addps		xmm3, xmm6
-						movlps		xmm5, [edi+(3*6+0)*4]
-						movhps		xmm5, [edi+(3*6+2)*4]
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
-						movaps		xmm4, xmm2
-						mulps		xmm4, [edi+(4*6+0)*4]
-						addps		xmm3, xmm4
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movlps		xmm4, [edi+(2*6+4)*4]
-						movhps		xmm4, [edi+(3*6+4)*4]
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(4*6+4)*4]
-						mulps		xmm5, xmm2
-						addps		xmm3, xmm5
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
-								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2];
+				mPtr++;
 			}
-			break;
-		case 6:
-			switch( numColumns ) {
-				case 1: {		// 6x1 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi]
-						movhps		xmm0, [esi+8]
-						movlps		xmm1, [esi+16]
-						mulps		xmm0, [edi]
-						mulps		xmm1, [edi+16]
-						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
-						addps		xmm0, xmm1
-						movhlps		xmm2, xmm0
-						addss		xmm2, xmm0
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
-						addss		xmm2, xmm0
-						STORE1( 0, xmm2, xmm3 )
-					}
-					return;
-				}
-				case 2: {		// 6x2 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movaps		xmm6, [edi+0*4]
-						mulps		xmm6, xmm0
-						movlps		xmm1, [esi+2*4]
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movaps		xmm7, [edi+4*4]
-						mulps		xmm7, xmm1
-						addps		xmm6, xmm7
-						movlps		xmm2, [esi+4*4]
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movaps		xmm7, [edi+8*4]
-						mulps		xmm7, xmm2
-						addps		xmm6, xmm7
-						movhlps		xmm3, xmm6
-						addps		xmm3, xmm6
-						STORE2LO( 0, xmm3, xmm7 )
-					}
-					return;
-				}
-				case 3: {		// 6x3 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movss		xmm0, [edi+(0*3+2)*4]
-						movhps		xmm0, [edi+(0*3+0)*4]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
-						movss		xmm6, [esi+0*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, xmm0
-						movss		xmm1, [edi+(1*3+0)*4]
-						movhps		xmm1, [edi+(1*3+1)*4]
-						movss		xmm7, [esi+1*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm1
-						addps		xmm6, xmm7
-						movss		xmm2, [edi+(2*3+2)*4]
-						movhps		xmm2, [edi+(2*3+0)*4]
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
-						movss		xmm7, [esi+2*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm2
-						addps		xmm6, xmm7
-						movss		xmm3, [edi+(3*3+0)*4]
-						movhps		xmm3, [edi+(3*3+1)*4]
-						movss		xmm7, [esi+3*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm3
-						addps		xmm6, xmm7
-						movss		xmm4, [edi+(4*3+2)*4]
-						movhps		xmm4, [edi+(4*3+0)*4]
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
-						movss		xmm7, [esi+4*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm4
-						addps		xmm6, xmm7
-						movss		xmm5, [edi+(5*3+0)*4]
-						movhps		xmm5, [edi+(5*3+1)*4]
-						movss		xmm7, [esi+5*4]
-						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm5
-						addps		xmm6, xmm7
-						STORE1( 0, xmm6, xmm7 )
-						STORE2HI( 4, xmm6, xmm7 )
-					}
-					return;
-				}
-				case 4: {		// 6x4 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm3, [edi+(0*4+0)*4]
-						movhps		xmm3, [edi+(0*4+2)*4]
-						movss		xmm4, [esi+0*4]
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, xmm4
-						movlps		xmm5, [edi+(1*4+0)*4]
-						movhps		xmm5, [edi+(1*4+2)*4]
-						movss		xmm6, [esi+1*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(2*4+0)*4]
-						movhps		xmm4, [edi+(2*4+2)*4]
-						movss		xmm6, [esi+2*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm6
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(3*4+0)*4]
-						movhps		xmm5, [edi+(3*4+2)*4]
-						movss		xmm6, [esi+3*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movlps		xmm4, [edi+(4*4+0)*4]
-						movhps		xmm4, [edi+(4*4+2)*4]
-						movss		xmm6, [esi+4*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm4, xmm6
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(5*4+0)*4]
-						movhps		xmm5, [edi+(5*4+2)*4]
-						movss		xmm6, [esi+5*4]
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						STORE4( 0, xmm3, xmm7 )
-					}
-					return;
-				}
-				case 5: {		// 6x5 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm6, [edi+(0*5+0)*4]
-						movhps		xmm6, [edi+(0*5+2)*4]
-						movss		xmm0, [esi+0*4]
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, xmm0
-						movlps		xmm7, [edi+(1*5+0)*4]
-						movhps		xmm7, [edi+(1*5+2)*4]
-						movss		xmm1, [esi+1*4]
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm1
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(2*5+0)*4]
-						movhps		xmm7, [edi+(2*5+2)*4]
-						movss		xmm2, [esi+2*4]
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm2
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(3*5+0)*4]
-						movhps		xmm7, [edi+(3*5+2)*4]
-						movss		xmm3, [esi+3*4]
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm3
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(4*5+0)*4]
-						movhps		xmm7, [edi+(4*5+2)*4]
-						movss		xmm4, [esi+4*4]
-						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm4
-						addps		xmm6, xmm7
-						movlps		xmm7, [edi+(5*5+0)*4]
-						movhps		xmm7, [edi+(5*5+2)*4]
-						movss		xmm5, [esi+5*4]
-						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm7, xmm5
-						addps		xmm6, xmm7
-						STORE4( 0, xmm6, xmm7 )
-						movss		xmm6, [edi+(0*5+4)*4]
-						mulss		xmm6, xmm0
-						movss		xmm7, [edi+(1*5+4)*4]
-						mulss		xmm7, xmm1
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(2*5+4)*4]
-						mulss		xmm7, xmm2
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(3*5+4)*4]
-						mulss		xmm7, xmm3
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(4*5+4)*4]
-						mulss		xmm7, xmm4
-						addss		xmm6, xmm7
-						movss		xmm7, [edi+(5*5+4)*4]
-						mulss		xmm7, xmm5
-						addss		xmm6, xmm7
-						STORE1( 16, xmm6, xmm7 )
-					}
-					return;
-				}
-				case 6: {		// 6x6 * 6x1
-					__asm {
-						mov			esi, vPtr
-						mov			edi, mPtr
-						mov			eax, dstPtr
-						movlps		xmm0, [esi+0*4]
-						movlps		xmm1, [esi+2*4]
-						movlps		xmm2, [esi+4*4]
-						movaps		xmm3, xmm0
-						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm3, [edi+(0*6+0)*4]
-						movlps		xmm5, [edi+(1*6+0)*4]
-						movhps		xmm5, [edi+(1*6+2)*4]
-						movaps		xmm6, xmm0
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, [edi+(2*6+0)*4]
-						addps		xmm3, xmm6
-						movaps		xmm6, xmm1
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						movlps		xmm5, [edi+(3*6+0)*4]
-						movhps		xmm5, [edi+(3*6+2)*4]
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						movaps		xmm6, xmm2
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
-						mulps		xmm6, [edi+(4*6+0)*4]
-						addps		xmm3, xmm6
-						movaps		xmm6, xmm2
-						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-						movlps		xmm5, [edi+(5*6+0)*4]
-						movhps		xmm5, [edi+(5*6+2)*4]
-						mulps		xmm5, xmm6
-						addps		xmm3, xmm5
-						STORE4( 0, xmm3, xmm7 )
-						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
-						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
-						movlps		xmm3, [edi+(0*6+4)*4]
-						movhps		xmm3, [edi+(1*6+4)*4]
-						mulps		xmm3, xmm0
-						movlps		xmm4, [edi+(2*6+4)*4]
-						movhps		xmm4, [edi+(3*6+4)*4]
-						mulps		xmm4, xmm1
-						addps		xmm3, xmm4
-						movlps		xmm5, [edi+(4*6+4)*4]
-						movhps		xmm5, [edi+(5*6+4)*4]
-						mulps		xmm5, xmm2
-						addps		xmm3, xmm5
-						movhlps		xmm4, xmm3
-						addps		xmm3, xmm4
-						STORE2LO( 16, xmm3, xmm7 )
-					}
-					return;
-				}
-				default: {
-					for ( int i = 0; i < numColumns; i++ ) {
-						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
-								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
-						mPtr++;
-					}
-					return;
-				}
+			return;
+		}
+		}
+		break;
+	case 4:
+		switch ( numColumns ) {
+		case 6: {		// 4x6 * 4x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movlps		xmm1, [esi+2*4]
+				movaps		xmm3, xmm0
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, [edi + ( 0 * 6 + 0 ) * 4]
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 2 * 6 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 2 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm6
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 3 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				STORE4( 0, xmm3, xmm7 )
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movlps		xmm4, [edi + ( 2 * 6 + 4 ) * 4]
+				movhps		xmm4, [edi + ( 3 * 6 + 4 ) * 4]
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				STORE2LO( 16, xmm3, xmm7 )
 			}
-			break;
-		default:
-			int numRows = mat.GetNumRows();
+			return;
+		}
+		default: {
 			for ( int i = 0; i < numColumns; i++ ) {
-				mPtr = mat.ToFloatPtr() + i;
-				float sum = mPtr[0] * vPtr[0];
-				for ( int j = 1; j < numRows; j++ ) {
-					mPtr += numColumns;
-					sum += mPtr[0] * vPtr[j];
-				}
-				dstPtr[i] STOREC sum;
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] +
+				*( mPtr + 3 * numColumns ) * vPtr[3];
+				mPtr++;
 			}
-			break;
+			return;
+		}
+		}
+		break;
+	case 5:
+		switch ( numColumns ) {
+		case 6: {		// 5x6 * 5x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movlps		xmm1, [esi+2*4]
+				movss		xmm2, [esi+4*4]
+				movaps		xmm3, xmm0
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, [edi + ( 0 * 6 + 0 ) * 4]
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, [edi + ( 2 * 6 + 0 ) * 4]
+				addps		xmm3, xmm6
+				movlps		xmm5, [edi + ( 3 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps		xmm4, xmm2
+				mulps		xmm4, [edi + ( 4 * 6 + 0 ) * 4]
+				addps		xmm3, xmm4
+				STORE4( 0, xmm3, xmm7 )
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movlps		xmm4, [edi + ( 2 * 6 + 4 ) * 4]
+				movhps		xmm4, [edi + ( 3 * 6 + 4 ) * 4]
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 4 * 6 + 4 ) * 4]
+				mulps		xmm5, xmm2
+				addps		xmm3, xmm5
+				STORE2LO( 16, xmm3, xmm7 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] +
+				*( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	case 6:
+		switch ( numColumns ) {
+		case 1: {		// 6x1 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi]
+				movhps		xmm0, [esi+8]
+				movlps		xmm1, [esi+16]
+				mulps		xmm0, [edi]
+				mulps		xmm1, [edi+16]
+				shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
+				addps		xmm0, xmm1
+				movhlps		xmm2, xmm0
+				addss		xmm2, xmm0
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
+				addss		xmm2, xmm0
+				STORE1( 0, xmm2, xmm3 )
+			}
+			return;
+		}
+		case 2: {		// 6x2 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movaps		xmm6, [edi + 0 * 4]
+				mulps		xmm6, xmm0
+				movlps		xmm1, [esi + 2 * 4]
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movaps		xmm7, [edi + 4 * 4]
+				mulps		xmm7, xmm1
+				addps		xmm6, xmm7
+				movlps		xmm2, [esi + 4 * 4]
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movaps		xmm7, [edi + 8 * 4]
+				mulps		xmm7, xmm2
+				addps		xmm6, xmm7
+				movhlps		xmm3, xmm6
+				addps		xmm3, xmm6
+				STORE2LO( 0, xmm3, xmm7 )
+			}
+			return;
+		}
+		case 3: {		// 6x3 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movss		xmm0, [edi+( 0*3+2 )*4]
+				movhps		xmm0, [edi+( 0*3+0 )*4]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
+				movss		xmm6, [esi + 0 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, xmm0
+				movss		xmm1, [edi + ( 1 * 3 + 0 ) * 4]
+				movhps		xmm1, [edi + ( 1 * 3 + 1 ) * 4]
+				movss		xmm7, [esi + 1 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm1
+				addps		xmm6, xmm7
+				movss		xmm2, [edi + ( 2 * 3 + 2 ) * 4]
+				movhps		xmm2, [edi + ( 2 * 3 + 0 ) * 4]
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
+				movss		xmm7, [esi + 2 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm2
+				addps		xmm6, xmm7
+				movss		xmm3, [edi + ( 3 * 3 + 0 ) * 4]
+				movhps		xmm3, [edi + ( 3 * 3 + 1 ) * 4]
+				movss		xmm7, [esi + 3 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm3
+				addps		xmm6, xmm7
+				movss		xmm4, [edi + ( 4 * 3 + 2 ) * 4]
+				movhps		xmm4, [edi + ( 4 * 3 + 0 ) * 4]
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
+				movss		xmm7, [esi + 4 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm4
+				addps		xmm6, xmm7
+				movss		xmm5, [edi + ( 5 * 3 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 3 + 1 ) * 4]
+				movss		xmm7, [esi + 5 * 4]
+				shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm5
+				addps		xmm6, xmm7
+				STORE1( 0, xmm6, xmm7 )
+				STORE2HI( 4, xmm6, xmm7 )
+			}
+			return;
+		}
+		case 4: {		// 6x4 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm3, [edi+( 0*4+0 )*4]
+				movhps		xmm3, [edi+( 0*4+2 )*4]
+				movss		xmm4, [esi+0*4]
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 1 * 4 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 1 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 2 * 4 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 2 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 2 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm6
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 3 * 4 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 3 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movlps		xmm4, [edi + ( 4 * 4 + 0 ) * 4]
+				movhps		xmm4, [edi + ( 4 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 4 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm4, xmm6
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 5 * 4 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 4 + 2 ) * 4]
+				movss		xmm6, [esi + 5 * 4]
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				STORE4( 0, xmm3, xmm7 )
+			}
+			return;
+		}
+		case 5: {		// 6x5 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm6, [edi+( 0*5+0 )*4]
+				movhps		xmm6, [edi+( 0*5+2 )*4]
+				movss		xmm0, [esi+0*4]
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, xmm0
+				movlps		xmm7, [edi + ( 1 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 1 * 5 + 2 ) * 4]
+				movss		xmm1, [esi + 1 * 4]
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm1
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 2 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 2 * 5 + 2 ) * 4]
+				movss		xmm2, [esi + 2 * 4]
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm2
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 3 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 3 * 5 + 2 ) * 4]
+				movss		xmm3, [esi + 3 * 4]
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm3
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 4 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 4 * 5 + 2 ) * 4]
+				movss		xmm4, [esi + 4 * 4]
+				shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm4
+				addps		xmm6, xmm7
+				movlps		xmm7, [edi + ( 5 * 5 + 0 ) * 4]
+				movhps		xmm7, [edi + ( 5 * 5 + 2 ) * 4]
+				movss		xmm5, [esi + 5 * 4]
+				shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm7, xmm5
+				addps		xmm6, xmm7
+				STORE4( 0, xmm6, xmm7 )
+				movss		xmm6, [edi + ( 0 * 5 + 4 ) * 4]
+				mulss		xmm6, xmm0
+				movss		xmm7, [edi + ( 1 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm1
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 2 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm2
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 3 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm3
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 4 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm4
+				addss		xmm6, xmm7
+				movss		xmm7, [edi + ( 5 * 5 + 4 ) * 4]
+				mulss		xmm7, xmm5
+				addss		xmm6, xmm7
+				STORE1( 16, xmm6, xmm7 )
+			}
+			return;
+		}
+		case 6: {		// 6x6 * 6x1
+			__asm {
+				mov			esi, vPtr
+				mov			edi, mPtr
+				mov			eax, dstPtr
+				movlps		xmm0, [esi+0*4]
+				movlps		xmm1, [esi+2*4]
+				movlps		xmm2, [esi+4*4]
+				movaps		xmm3, xmm0
+				shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm3, [edi + ( 0 * 6 + 0 ) * 4]
+				movlps		xmm5, [edi + ( 1 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 1 * 6 + 2 ) * 4]
+				movaps		xmm6, xmm0
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, [edi + ( 2 * 6 + 0 ) * 4]
+				addps		xmm3, xmm6
+				movaps		xmm6, xmm1
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				movlps		xmm5, [edi + ( 3 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 3 * 6 + 2 ) * 4]
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				movaps		xmm6, xmm2
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
+				mulps		xmm6, [edi + ( 4 * 6 + 0 ) * 4]
+				addps		xmm3, xmm6
+				movaps		xmm6, xmm2
+				shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+				movlps		xmm5, [edi + ( 5 * 6 + 0 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 6 + 2 ) * 4]
+				mulps		xmm5, xmm6
+				addps		xmm3, xmm5
+				STORE4( 0, xmm3, xmm7 )
+				shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
+				shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
+				movlps		xmm3, [edi + ( 0 * 6 + 4 ) * 4]
+				movhps		xmm3, [edi + ( 1 * 6 + 4 ) * 4]
+				mulps		xmm3, xmm0
+				movlps		xmm4, [edi + ( 2 * 6 + 4 ) * 4]
+				movhps		xmm4, [edi + ( 3 * 6 + 4 ) * 4]
+				mulps		xmm4, xmm1
+				addps		xmm3, xmm4
+				movlps		xmm5, [edi + ( 4 * 6 + 4 ) * 4]
+				movhps		xmm5, [edi + ( 5 * 6 + 4 ) * 4]
+				mulps		xmm5, xmm2
+				addps		xmm3, xmm5
+				movhlps		xmm4, xmm3
+				addps		xmm3, xmm4
+				STORE2LO( 16, xmm3, xmm7 )
+			}
+			return;
+		}
+		default: {
+			for ( int i = 0; i < numColumns; i++ ) {
+				dstPtr[i] STOREC *( mPtr ) * vPtr[0] + *( mPtr + numColumns ) * vPtr[1] + *( mPtr + 2 * numColumns ) * vPtr[2] +
+				*( mPtr + 3 * numColumns ) * vPtr[3] + *( mPtr + 4 * numColumns ) * vPtr[4] + *( mPtr + 5 * numColumns ) * vPtr[5];
+				mPtr++;
+			}
+			return;
+		}
+		}
+		break;
+	default:
+		int numRows = mat.GetNumRows();
+		for ( int i = 0; i < numColumns; i++ ) {
+			mPtr = mat.ToFloatPtr() + i;
+			float sum = mPtr[0] * vPtr[0];
+			for ( int j = 1; j < numRows; j++ ) {
+				mPtr += numColumns;
+				sum += mPtr[0] * vPtr[j];
+			}
+			dstPtr[i] STOREC sum;
+		}
+		break;
 	}
 
 #undef STOREC
@@ -8317,1280 +8413,1280 @@ void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const
 	const float *m1Ptr, *m2Ptr;
 	double sum;
 
-	assert( m1.GetNumColumns() == m2.GetNumRows() );
+	assert( m1.GetNumColumns() == m2.GetNumRows() );
+
+	dstPtr = dst.ToFloatPtr();
+	m1Ptr = m1.ToFloatPtr();
+	m2Ptr = m2.ToFloatPtr();
+	k = m1.GetNumRows();
+	l = m2.GetNumColumns();
+	n = m1.GetNumColumns();
+
+	switch ( n ) {
+	case 1: {
+		if ( !( l ^ 6 ) ) {
+			switch ( k ) {
+			case 1:	{			// 1x1 * 1x6, no precision loss compared to FPU version
+				__asm {
+					mov			esi, m2Ptr
+					mov			edi, m1Ptr
+					mov			eax, dstPtr
+					movss		xmm0, [edi]
+					shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+					movaps		xmm1, [esi]
+					mulps		xmm1, xmm0
+					movaps		[eax], xmm1
+					movlps		xmm2, [esi + 16]
+					mulps		xmm2, xmm0
+					movlps		[eax + 16], xmm2
+				}
+				return;
+			}
+			case 6: {			// 6x1 * 1x6, no precision loss compared to FPU version
+				__asm {
+					mov			esi, m2Ptr
+					mov			edi, m1Ptr
+					mov			eax, dstPtr
+					xorps		xmm1, xmm1
+					movaps		xmm0, [edi]
+					movlps		xmm1, [edi+16]
+					movlhps		xmm1, xmm0
+					movhlps		xmm2, xmm0
+					movlhps		xmm2, xmm1
+					// row 0 and 1
+					movaps		xmm3, [esi]
+					movaps		xmm4, xmm3
+					shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+					movaps		xmm5, xmm3
+					shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
+					movaps		xmm6, xmm3
+					shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
+					mulps		xmm4, xmm0
+					mulps		xmm5, xmm1
+					mulps		xmm6, xmm2
+					movaps		[eax], xmm4
+					movaps		[eax + 16], xmm5
+					movaps		[eax + 32], xmm6
+					// row 2 and 3
+					movaps		xmm4, xmm3
+					shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )
+					movaps		xmm5, xmm3
+					shufps		xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )
+					shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
+					mulps		xmm4, xmm0
+					mulps		xmm5, xmm1
+					mulps		xmm3, xmm2
+					movaps		[eax + 48], xmm4
+					movaps		[eax + 64], xmm5
+					movaps		[eax + 80], xmm3
+					// row 4 and 5
+					movlps		xmm3, [esi + 16]
+					movaps		xmm4, xmm3
+					shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
+					movaps		xmm5, xmm3
+					shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
+					shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 )
+					mulps		xmm4, xmm0
+					mulps		xmm5, xmm1
+					mulps		xmm3, xmm2
+					movaps		[eax + 96], xmm4
+					movaps		[eax + 112], xmm5
+					movaps		[eax + 128], xmm3
+				}
+				return;
+			}
+			}
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0];
+				m2Ptr++;
+			}
+			m1Ptr++;
+		}
+		break;
+	}
+	case 2: {
+		if ( !( l ^ 6 ) ) {
+			switch ( k ) {
+			case 2: {			// 2x2 * 2x6
+
+#define MUL_Nx2_2x6_INIT	/* esi = m2, edi = m1, eax = dst; caches the 2x6 m2 (floats 0-11) in xmm0-xmm2 for MUL_Nx2_2x6_ROW2 */	\
+				__asm mov		esi, m2Ptr								\
+				__asm mov		edi, m1Ptr								\
+				__asm mov		eax, dstPtr								\
+				__asm movaps	xmm0, [esi]	/* m2 row 0, cols 0-3 */							\
+				__asm movlps	xmm1, [esi+16]	/* low half: m2 row 0, cols 4-5 */						\
+				__asm movhps	xmm1, [esi+40]	/* high half: m2 row 1, cols 4-5 */						\
+				__asm movlps	xmm2, [esi+24]	/* m2 row 1, cols 0-1 */						\
+				__asm movhps	xmm2, [esi+32]	/* m2 row 1, cols 2-3 */
+
+#define MUL_Nx2_2x6_ROW2( row )	/* multiplies two 2-float m1 rows (4 floats at edi+row*16) by the m2 cached by MUL_Nx2_2x6_INIT; writes two 6-float dst rows at eax+row*48 */	\
+				__asm movaps	xmm3, [edi+row*16]	/* m1 rows 2*row and 2*row+1, two floats each */					\
+				__asm movaps	xmm5, xmm0								\
+				__asm movaps	xmm4, xmm3								\
+				__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm5, xmm4								\
+				__asm movaps	xmm4, xmm3								\
+				__asm movaps	xmm6, xmm2								\
+				__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 )	\
+				__asm mulps		xmm6, xmm4								\
+				__asm addps		xmm5, xmm6								\
+				__asm movaps	[eax+row*48], xmm5	/* first dst row, cols 0-3 */					\
+				__asm movaps	xmm4, xmm3								\
+				__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+				__asm movaps	xmm7, xmm1								\
+				__asm mulps		xmm7, xmm4								\
+				__asm movaps	xmm4, xmm3								\
+				__asm movaps	xmm5, xmm0								\
+				__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )	\
+				__asm mulps		xmm5, xmm4								\
+				__asm movaps	xmm4, xmm3								\
+				__asm movaps	xmm6, xmm2								\
+				__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 )	\
+				__asm mulps		xmm6, xmm4								\
+				__asm addps		xmm5, xmm6								\
+				__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )	\
+				__asm movaps	xmm6, xmm1								\
+				__asm mulps		xmm6, xmm3								\
+				__asm movaps	xmm4, xmm7								\
+				__asm movlhps	xmm7, xmm6	/* regroup the cols 4-5 partial products of both rows ... */							\
+				__asm movhlps	xmm6, xmm4								\
+				__asm addps		xmm6, xmm7	/* ... so one add finishes cols 4-5 for both rows */							\
+				__asm movlps	[eax+row*48+16], xmm6	/* first dst row, cols 4-5 */				\
+				__asm movlps	[eax+row*48+24], xmm5	/* second dst row, cols 0-1 */				\
+				__asm movhps	[eax+row*48+32], xmm5	/* second dst row, cols 2-3 */				\
+				__asm movhps	[eax+row*48+40], xmm6	/* second dst row, cols 4-5 */
+
+				MUL_Nx2_2x6_INIT
+				MUL_Nx2_2x6_ROW2( 0 )
+
+				return;
+			}
+			case 6: {			// 6x2 * 2x6
+
+				MUL_Nx2_2x6_INIT
+				MUL_Nx2_2x6_ROW2( 0 )
+				MUL_Nx2_2x6_ROW2( 1 )
+				MUL_Nx2_2x6_ROW2( 2 )
+
+				return;
+			}
+			}
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
+				m2Ptr++;
+			}
+			m1Ptr += 2;
+		}
+		break;
+	}
+	case 3: {
+		if ( !( l ^ 6 ) ) {
+			switch ( k ) {
+			case 3: {			// 3x3 * 3x6
+				__asm {
+					mov		esi, m2Ptr
+					mov		edi, m1Ptr
+					mov		eax, dstPtr
+					movaps	xmm5, xmmword ptr [esi]
+					movlps	xmm6, qword ptr [esi+24]
+					movhps	xmm6, qword ptr [esi+32]
+					movaps	xmm7, xmmword ptr [esi+48]
+					movss	xmm0, dword ptr [edi]
+					shufps	xmm0, xmm0, 0
+					mulps	xmm0, xmm5
+					movss	xmm1, dword ptr [edi+4]
+					shufps	xmm1, xmm1, 0
+					mulps	xmm1, xmm6
+					movss	xmm2, dword ptr [edi+8]
+					shufps	xmm2, xmm2, 0
+					mulps	xmm2, xmm7
+					addps	xmm0, xmm1
+					addps	xmm0, xmm2
+					movaps	xmmword ptr [eax], xmm0
+					movss	xmm3, dword ptr [edi+12]
+					shufps	xmm3, xmm3, 0
+					mulps	xmm3, xmm5
+					movss	xmm4, dword ptr [edi+16]
+					shufps	xmm4, xmm4, 0
+					mulps	xmm4, xmm6
+					movss	xmm0, dword ptr [edi+20]
+					shufps	xmm0, xmm0, 0
+					mulps	xmm0, xmm7
+					addps	xmm3, xmm4
+					addps	xmm0, xmm3
+					movlps	qword ptr [eax+24], xmm0
+					movhps	qword ptr [eax+32], xmm0
+					movss	xmm1, dword ptr [edi+24]
+					shufps	xmm1, xmm1, 0
+					mulps	xmm1, xmm5
+					movss	xmm2, dword ptr [edi+28]
+					shufps	xmm2, xmm2, 0
+					mulps	xmm2, xmm6
+					movss	xmm3, dword ptr [edi+32]
+					shufps	xmm3, xmm3, 0
+					mulps	xmm3, xmm7
+					addps	xmm1, xmm2
+					addps	xmm1, xmm3
+					movaps	xmmword ptr [eax+48], xmm1
+					movlps	xmm5, qword ptr [esi+16]
+					movlps	xmm6, qword ptr [esi+40]
+					movlps	xmm7, qword ptr [esi+64]
+					shufps	xmm5, xmm5, 0x44
+					shufps	xmm6, xmm6, 0x44
+					shufps	xmm7, xmm7, 0x44
+					movaps	xmm3, xmmword ptr [edi]
+					movlps	xmm4, qword ptr [edi+16]
+					movaps	xmm0, xmm3
+					shufps	xmm0, xmm0, 0xF0
+					mulps	xmm0, xmm5
+					movaps	xmm1, xmm3
+					shufps	xmm1, xmm4, 0x05
+					mulps	xmm1, xmm6
+					shufps	xmm3, xmm4, 0x5A
+					mulps	xmm3, xmm7
+					addps	xmm1, xmm0
+					addps	xmm1, xmm3
+					movlps	qword ptr [eax+16], xmm1
+					movhps	qword ptr [eax+40], xmm1
+					movss	xmm0, dword ptr [edi+24]
+					shufps	xmm0, xmm0, 0
+					mulps	xmm0, xmm5
+					movss	xmm2, dword ptr [edi+28]
+					shufps	xmm2, xmm2, 0
+					mulps	xmm2, xmm6
+					movss	xmm4, dword ptr [edi+32]
+					shufps	xmm4, xmm4, 0
+					mulps	xmm4, xmm7
+					addps	xmm0, xmm2
+					addps	xmm0, xmm4
+					movlps	qword ptr [eax+64], xmm0
+				}
+				return;
+			}
+			case 6: {			// 6x3 * 3x6
+#define MUL_Nx3_3x6_FIRST4COLUMNS_INIT	/* esi = m2, edi = m1, eax = dst; xmm0-xmm2 = cols 0-3 of m2 rows 0-2 (m2 rows are 6 floats apart) */	\
+				__asm mov			esi, m2Ptr								\
+				__asm mov			edi, m1Ptr								\
+				__asm mov			eax, dstPtr								\
+				__asm movlps		xmm0, [esi+ 0*4]						\
+				__asm movhps		xmm0, [esi+ 2*4]						\
+				__asm movlps		xmm1, [esi+ 6*4]						\
+				__asm movhps		xmm1, [esi+ 8*4]						\
+				__asm movlps		xmm2, [esi+12*4]						\
+				__asm movhps		xmm2, [esi+14*4]
+
+#define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row )	/* dst[row][0..3] = m1[row][0]*xmm0 + m1[row][1]*xmm1 + m1[row][2]*xmm2; m1 rows are 3 floats wide, dst rows 6 floats wide */	\
+				__asm movss			xmm3, [edi+(row*3+0)*4]					\
+				__asm shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm3, xmm0								\
+				__asm movss			xmm4, [edi+(row*3+1)*4]					\
+				__asm shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm4, xmm1								\
+				__asm addps			xmm3, xmm4								\
+				__asm movss			xmm5, [edi+(row*3+2)*4]					\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm5, xmm2								\
+				__asm addps			xmm3, xmm5								\
+				__asm movlps		[eax+(row*6+0)*4], xmm3	/* dst cols 0-1 */				\
+				__asm movhps		[eax+(row*6+2)*4], xmm3	/* dst cols 2-3 */
+
+#define MUL_Nx3_3x6_LAST2COLUMNS_ROW6	/* cols 4-5 of all six dst rows of the 6x3 * 3x6 product, two rows per pass; overwrites xmm0-xmm2 set by MUL_Nx3_3x6_FIRST4COLUMNS_INIT */	\
+				__asm movlps		xmm0, [esi+ 4*4]	/* m2 row 0, cols 4-5 */					\
+				__asm movlps		xmm1, [esi+10*4]	/* m2 row 1, cols 4-5 */					\
+				__asm movlps		xmm2, [esi+16*4]	/* m2 row 2, cols 4-5 */					\
+				__asm shufps		xmm0, xmm0, 0x44	/* 0x44 copies the low pair into the high half */					\
+				__asm shufps		xmm1, xmm1, 0x44						\
+				__asm shufps		xmm2, xmm2, 0x44						\
+				__asm movlps		xmm3, [edi+0*4]	/* pass 1: m1 floats 0-3 */						\
+				__asm movhps		xmm3, [edi+2*4]							\
+				__asm movaps		xmm4, xmm3								\
+				__asm movaps		xmm5, xmm3								\
+				__asm shufps		xmm3, xmm3, 0xF0						\
+				__asm mulps			xmm3, xmm0								\
+				__asm movlps		xmm6, [edi+4*4]	/* m1 floats 4-7 */						\
+				__asm movhps		xmm6, [edi+6*4]							\
+				__asm shufps		xmm4, xmm6, 0x05						\
+				__asm mulps			xmm4, xmm1								\
+				__asm addps			xmm3, xmm4								\
+				__asm shufps		xmm5, xmm6, 0x5A						\
+				__asm mulps			xmm5, xmm2								\
+				__asm addps			xmm3, xmm5								\
+				__asm movlps		[eax+4*4], xmm3	/* dst row 0, cols 4-5 */						\
+				__asm movhps		[eax+10*4], xmm3	/* dst row 1, cols 4-5 */					\
+				__asm movaps		xmm5, xmm6								\
+				__asm movlps		xmm3, [edi+8*4]	/* pass 2: m1 floats 8-11 */						\
+				__asm movhps		xmm3, [edi+10*4]						\
+				__asm movaps		xmm4, xmm3								\
+				__asm shufps		xmm5, xmm3, 0x5A						\
+				__asm mulps			xmm5, xmm0								\
+				__asm shufps		xmm6, xmm3, 0xAF						\
+				__asm mulps			xmm6, xmm1								\
+				__asm addps			xmm5, xmm6								\
+				__asm shufps		xmm4, xmm4, 0xF0						\
+				__asm mulps			xmm4, xmm2								\
+				__asm addps			xmm4, xmm5								\
+				__asm movlps		[eax+16*4], xmm4	/* dst row 2, cols 4-5 */					\
+				__asm movhps		[eax+22*4], xmm4	/* dst row 3, cols 4-5 */					\
+				__asm movlps		xmm6, [edi+12*4]	/* pass 3: m1 floats 12-15 */					\
+				__asm movhps		xmm6, [edi+14*4]						\
+				__asm movaps		xmm5, xmm6								\
+				__asm movaps		xmm4, xmm6								\
+				__asm shufps		xmm6, xmm6, 0xF0						\
+				__asm mulps			xmm6, xmm0								\
+				__asm movlps		xmm3, [edi+16*4]	/* m1 floats 16-17; only the low half of xmm3 is used below */					\
+				__asm shufps		xmm5, xmm3, 0x05						\
+				__asm mulps			xmm5, xmm1								\
+				__asm addps			xmm5, xmm6								\
+				__asm shufps		xmm4, xmm3, 0x5A						\
+				__asm mulps			xmm4, xmm2								\
+				__asm addps			xmm4, xmm5								\
+				__asm movlps		[eax+28*4], xmm4	/* dst row 4, cols 4-5 */					\
+				__asm movhps		[eax+34*4], xmm4	/* dst row 5, cols 4-5 */
+
+				MUL_Nx3_3x6_FIRST4COLUMNS_INIT
+				MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 )
+				MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 )
+				MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 )
+				MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 )
+				MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 )
+				MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 )
+				MUL_Nx3_3x6_LAST2COLUMNS_ROW6
+
+				return;
+			}
+			}
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2 * l];
+				m2Ptr++;
+			}
+			m1Ptr += 3;
+		}
+		break;
+	}
+	case 4: {
+		if ( !( l ^ 6 ) ) {
+			switch ( k ) {
+			case 4: {			// 4x4 * 4x6
+
+#define MUL_Nx4_4x6_FIRST4COLUMNS_INIT	/* esi = m2, edi = m1, eax = dst; xmm0-xmm3 = cols 0-3 of m2 rows 0-3 (m2 rows are 6 floats apart) */	\
+				__asm mov			esi, m2Ptr								\
+				__asm mov			edi, m1Ptr								\
+				__asm mov			eax, dstPtr								\
+				__asm movlps		xmm0, [esi+ 0*4]						\
+				__asm movhps		xmm0, [esi+ 2*4]						\
+				__asm movlps		xmm1, [esi+ 6*4]						\
+				__asm movhps		xmm1, [esi+ 8*4]						\
+				__asm movlps		xmm2, [esi+12*4]						\
+				__asm movhps		xmm2, [esi+14*4]						\
+				__asm movlps		xmm3, [esi+18*4]						\
+				__asm movhps		xmm3, [esi+20*4]
+
+#define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row )	/* dst[row][0..3] = m1[row][0]*xmm0 + ... + m1[row][3]*xmm3; m1 row stride is 16 bytes, dst row stride is 24 bytes */	\
+				__asm movss			xmm4, [edi+row*16+0*4]					\
+				__asm shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm4, xmm0								\
+				__asm movss			xmm5, [edi+row*16+1*4]					\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm5, xmm1								\
+				__asm addps			xmm4, xmm5								\
+				__asm movss			xmm6, [edi+row*16+2*4]					\
+				__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm6, xmm2								\
+				__asm addps			xmm4, xmm6								\
+				__asm movss			xmm7, [edi+row*16+3*4]					\
+				__asm shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm7, xmm3								\
+				__asm addps			xmm4, xmm7								\
+				__asm movlps		[eax+row*24+0], xmm4	/* dst cols 0-1 */				\
+				__asm movhps		[eax+row*24+8], xmm4	/* dst cols 2-3 */
+
+#define MUL_Nx4_4x6_LAST2COLUMNS_INIT	/* reloads xmm0-xmm3 with cols 4-5 of m2 rows 0-3, then pairs rows 0/1 and 2/3 across register halves for MUL_Nx4_4x6_LAST2COLUMNS_ROW2 */	\
+				__asm movlps		xmm0, [esi+ 4*4]	/* m2 row 0, cols 4-5 */					\
+				__asm movlps		xmm1, [esi+10*4]	/* m2 row 1, cols 4-5 */					\
+				__asm movlps		xmm2, [esi+16*4]	/* m2 row 2, cols 4-5 */					\
+				__asm movlps		xmm3, [esi+22*4]	/* m2 row 3, cols 4-5 */					\
+				__asm shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+				__asm shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+				__asm shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+				__asm shufps		xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
+
+#define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row )	/* cols 4-5 for two m1 rows at once (8 floats at edi+row*32); needs the register layout from MUL_Nx4_4x6_LAST2COLUMNS_INIT */	\
+				__asm movlps		xmm7, [edi+row*32+ 0*4]	/* first row, elements 0-1 */				\
+				__asm movhps		xmm7, [edi+row*32+ 4*4]	/* second row, elements 0-1 */				\
+				__asm movaps		xmm6, xmm7								\
+				__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 )	\
+				__asm mulps			xmm6, xmm0								\
+				__asm shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 )	\
+				__asm mulps			xmm7, xmm1								\
+				__asm addps			xmm6, xmm7								\
+				__asm movlps		xmm4, [edi+row*32+ 2*4]	/* first row, elements 2-3 */				\
+				__asm movhps		xmm4, [edi+row*32+ 6*4]	/* second row, elements 2-3 */				\
+				__asm movaps		xmm5, xmm4								\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 )	\
+				__asm mulps			xmm5, xmm2								\
+				__asm addps			xmm6, xmm5								\
+				__asm shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 )	\
+				__asm mulps			xmm4, xmm3								\
+				__asm addps			xmm6, xmm4								\
+				__asm movlps		[eax+row*48+ 4*4], xmm6	/* first dst row, cols 4-5 */				\
+				__asm movhps		[eax+row*48+10*4], xmm6	/* second dst row, cols 4-5 */
+
+				MUL_Nx4_4x6_FIRST4COLUMNS_INIT
+				MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
+				MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
+				MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
+				MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
+				MUL_Nx4_4x6_LAST2COLUMNS_INIT
+				MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
+				MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
+
+				return;
+			}
+			case 6: {			// 6x4 * 4x6
+
+				MUL_Nx4_4x6_FIRST4COLUMNS_INIT
+				MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
+				MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
+				MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
+				MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
+				MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 )
+				MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 )
+				MUL_Nx4_4x6_LAST2COLUMNS_INIT
+				MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
+				MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
+				MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 )
+
+				return;
+			}
+			}
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2 * l] +
+				            m1Ptr[3] * m2Ptr[3 * l];
+				m2Ptr++;
+			}
+			m1Ptr += 4;
+		}
+		break;
+	}
+	case 5: {
+		if ( !( l ^ 6 ) ) {
+			switch ( k ) {
+			case 5: {			// 5x5 * 5x6
+
+#define MUL_Nx5_5x6_FIRST4COLUMNS_INIT	/* esi = m2, edi = m1, eax = dst; xmm0-xmm4 = cols 0-3 of m2 rows 0-4 (m2 rows are 6 floats apart) */	\
+				__asm mov			esi, m2Ptr								\
+				__asm mov			edi, m1Ptr								\
+				__asm mov			eax, dstPtr								\
+				__asm movlps		xmm0, [esi+ 0*4]						\
+				__asm movhps		xmm0, [esi+ 2*4]						\
+				__asm movlps		xmm1, [esi+ 6*4]						\
+				__asm movhps		xmm1, [esi+ 8*4]						\
+				__asm movlps		xmm2, [esi+12*4]						\
+				__asm movhps		xmm2, [esi+14*4]						\
+				__asm movlps		xmm3, [esi+18*4]						\
+				__asm movhps		xmm3, [esi+20*4]						\
+				__asm movlps		xmm4, [esi+24*4]						\
+				__asm movhps		xmm4, [esi+26*4]
+
+#define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row )	/* dst[row][0..3] = m1[row][0]*xmm0 + ... + m1[row][4]*xmm4, accumulated in xmm6 with xmm5 as scratch; m1 row stride 20 bytes, dst row stride 24 bytes */	\
+				__asm movss			xmm6, [edi+row*20+0*4]					\
+				__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm6, xmm0								\
+				__asm movss			xmm5, [edi+row*20+1*4]					\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm5, xmm1								\
+				__asm addps			xmm6, xmm5								\
+				__asm movss			xmm5, [edi+row*20+2*4]					\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm5, xmm2								\
+				__asm addps			xmm6, xmm5								\
+				__asm movss			xmm5, [edi+row*20+3*4]					\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm5, xmm3								\
+				__asm addps			xmm6, xmm5								\
+				__asm movss			xmm5, [edi+row*20+4*4]					\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps			xmm5, xmm4								\
+				__asm addps			xmm6, xmm5								\
+				__asm movlps		[eax+row*24+0], xmm6	/* dst cols 0-1 */				\
+				__asm movhps		[eax+row*24+8], xmm6	/* dst cols 2-3 */
+
+#define MUL_Nx5_5x6_LAST2COLUMNS_INIT	/* reloads xmm0-xmm4 with cols 4-5 of m2 rows 0-4, then pairs each row with the next one (row 4 wraps to row 0) across register halves */	\
+				__asm movlps		xmm0, [esi+ 4*4]	/* m2 row 0, cols 4-5 */					\
+				__asm movlps		xmm1, [esi+10*4]	/* m2 row 1, cols 4-5 */					\
+				__asm movlps		xmm2, [esi+16*4]	/* m2 row 2, cols 4-5 */					\
+				__asm movlps		xmm3, [esi+22*4]	/* m2 row 3, cols 4-5 */					\
+				__asm movlps		xmm4, [esi+28*4]	/* m2 row 4, cols 4-5 */					\
+				__asm shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+				__asm shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+				__asm shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+				__asm shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+				__asm shufps		xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )
+
+#define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row )	/* cols 4-5 for two 5-float m1 rows at once (10 floats at edi+row*40); needs the register layout from MUL_Nx5_5x6_LAST2COLUMNS_INIT */	\
+				__asm movlps		xmm7, [edi+row*40+ 0*4]	/* first row, elements 0-1 */				\
+				__asm movhps		xmm7, [edi+row*40+ 6*4]	/* second row, elements 1-2 */				\
+				__asm movaps		xmm6, xmm7								\
+				__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 )	\
+				__asm mulps			xmm6, xmm0								\
+				__asm movaps		xmm5, xmm7								\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 )	\
+				__asm mulps			xmm5, xmm1								\
+				__asm addps			xmm6, xmm5								\
+				__asm movlps		xmm7, [edi+row*40+ 2*4]	/* first row, elements 2-3 */				\
+				__asm movhps		xmm7, [edi+row*40+ 8*4]	/* second row, elements 3-4 */				\
+				__asm movaps		xmm5, xmm7								\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 )	\
+				__asm mulps			xmm5, xmm2								\
+				__asm addps			xmm6, xmm5								\
+				__asm movaps		xmm5, xmm7								\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 )	\
+				__asm mulps			xmm5, xmm3								\
+				__asm addps			xmm6, xmm5								\
+				__asm movlps		xmm5, [edi+row*40+ 4*4]	/* first row element 4, second row element 0 */				\
+				__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+				__asm mulps			xmm5, xmm4								\
+				__asm addps			xmm6, xmm5								\
+				__asm movlps		[eax+row*48+ 4*4], xmm6	/* first dst row, cols 4-5 */				\
+				__asm movhps		[eax+row*48+10*4], xmm6	/* second dst row, cols 4-5 */
+
+#define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row )	/* cols 4-5 of ONE dst row: dst[row][4..5] = m1[row][0..4] * m2[0..4][4..5]; needs the xmm0/xmm2/xmm4 layout from MUL_Nx5_5x6_LAST2COLUMNS_INIT. The m1 row is addressed as row*20 so source and destination both follow the row argument. */	\
+				__asm movlps		xmm6, [edi+row*20+0*4]	/* m1[row][0..1] */				\
+				__asm unpcklps		xmm6, xmm6	/* duplicate each element: x0, x0, x1, x1 */							\
+				__asm mulps			xmm6, xmm0								\
+				__asm movlps		xmm5, [edi+row*20+2*4]	/* m1[row][2..3] */				\
+				__asm unpcklps		xmm5, xmm5								\
+				__asm mulps			xmm5, xmm2								\
+				__asm addps			xmm6, xmm5								\
+				__asm movss			xmm5, [edi+row*20+4*4]	/* m1[row][4]; movss from memory zeroes the upper lanes */				\
+				__asm unpcklps		xmm5, xmm5								\
+				__asm mulps			xmm5, xmm4								\
+				__asm addps			xmm6, xmm5								\
+				__asm movhlps		xmm7, xmm6	/* fold the high pair onto the low pair */							\
+				__asm addps			xmm6, xmm7								\
+				__asm movlps		[eax+row*24+4*4], xmm6	/* dst[row][4..5]; only the low half is stored */
+
+				MUL_Nx5_5x6_FIRST4COLUMNS_INIT
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
+				MUL_Nx5_5x6_LAST2COLUMNS_INIT
+				MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
+				MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
+				MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 )
+
+				return;
+			}
+			case 6: {			// 6x5 * 5x6
+
+				MUL_Nx5_5x6_FIRST4COLUMNS_INIT
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
+				MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 )
+				MUL_Nx5_5x6_LAST2COLUMNS_INIT
+				MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
+				MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
+				MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 )
+
+				return;
+			}
+			}
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2 * l] +
+				            m1Ptr[3] * m2Ptr[3 * l] + m1Ptr[4] * m2Ptr[4 * l];
+				m2Ptr++;
+			}
+			m1Ptr += 5;
+		}
+		break;
+	}
+	case 6: {
+		switch ( k ) {
+		case 1: {
+			if ( !( l ^ 1 ) ) {		// 1x6 * 6x1
+				dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
+				            m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
+				return;
+			}
+			break;
+		}
+		case 2: {
+			if ( !( l ^ 2 ) ) {		// 2x6 * 6x2
+
+#define MUL_Nx6_6x2_INIT	/* esi = m2, edi = m1, eax = dst; caches the 6x2 m2 (12 floats) in xmm0-xmm2, two m2 rows per register */	\
+				__asm mov		esi, m2Ptr								\
+				__asm mov		edi, m1Ptr								\
+				__asm mov		eax, dstPtr								\
+				__asm movaps	xmm0, [esi]	/* m2 rows 0-1 */							\
+				__asm movaps	xmm1, [esi+16]	/* m2 rows 2-3 */						\
+				__asm movaps	xmm2, [esi+32]	/* m2 rows 4-5 */
+
+#define MUL_Nx6_6x2_ROW2( row )	/* multiplies two 6-float m1 rows (12 floats at edi+row*48) by the m2 cached by MUL_Nx6_6x2_INIT; writes two 2-float dst rows (4 floats) at eax+row*16 */	\
+				__asm movaps	xmm7, [edi+row*48+0*4]	/* m1 floats 0-3 of the row pair */				\
+				__asm movaps	xmm6, xmm7								\
+				__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+				__asm mulps		xmm7, xmm0								\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 )	\
+				__asm mulps		xmm6, xmm1								\
+				__asm addps		xmm7, xmm6								\
+				__asm movaps	xmm6, [edi+row*48+4*4]	/* m1 floats 4-7: end of first row, start of second */				\
+				__asm movaps	xmm5, xmm6								\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+				__asm mulps		xmm6, xmm2								\
+				__asm addps		xmm7, xmm6								\
+				__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )	\
+				__asm mulps		xmm5, xmm0								\
+				__asm movaps	xmm6, [edi+row*48+24+2*4]	/* m1 floats 8-11: rest of second row */			\
+				__asm movaps	xmm4, xmm6								\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+				__asm mulps		xmm6, xmm1								\
+				__asm addps		xmm5, xmm6								\
+				__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 )	\
+				__asm mulps		xmm4, xmm2								\
+				__asm addps		xmm5, xmm4								\
+				__asm movaps	xmm4, xmm5								\
+				__asm movhlps	xmm5, xmm7	/* regroup the partial sums of both rows ... */							\
+				__asm movlhps	xmm7, xmm4								\
+				__asm addps		xmm7, xmm5	/* ... so one add finishes both dst rows */							\
+				__asm movaps	[eax+row*16], xmm7
+
+				MUL_Nx6_6x2_INIT
+				MUL_Nx6_6x2_ROW2( 0 )
+
+				return;
+			}
+			break;
+		}
+		case 3: {
+			if ( !( l ^ 3 ) ) {		// 3x6 * 6x3
+
+#define MUL_Nx6_6x3_INIT	/* esi = m2, edi = m1, eax = dst; xmm0-xmm5 = the six 3-float m2 rows, each as col 0 in the lowest lane and cols 1-2 in the high half */	\
+				__asm mov		esi, m2Ptr								\
+				__asm mov		edi, m1Ptr								\
+				__asm mov		eax, dstPtr								\
+				__asm movss		xmm0, [esi+ 0*4]	/* m2 row 0 col 0; movss from memory zeroes the upper lanes */					\
+				__asm movhps	xmm0, [esi+ 1*4]	/* m2 row 0 cols 1-2 into the high half */					\
+				__asm movss		xmm1, [esi+ 3*4]						\
+				__asm movhps	xmm1, [esi+ 4*4]						\
+				__asm movss		xmm2, [esi+ 6*4]						\
+				__asm movhps	xmm2, [esi+ 7*4]						\
+				__asm movss		xmm3, [esi+ 9*4]						\
+				__asm movhps	xmm3, [esi+10*4]						\
+				__asm movss		xmm4, [esi+12*4]						\
+				__asm movhps	xmm4, [esi+13*4]						\
+				__asm movss		xmm5, [esi+15*4]						\
+				__asm movhps	xmm5, [esi+16*4]
+
+#define MUL_Nx6_6x3_ROW( row )	/* dst[row][0..2] = sum over j = 0..5 of m1[row][j] * xmm<j>; m1 row stride 24 bytes, dst row stride 12 bytes */	\
+				__asm movss		xmm7, [edi+row*24+0]					\
+				__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm7, xmm0								\
+				__asm movss		xmm6, [edi+row*24+4]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm1								\
+				__asm addps		xmm7, xmm6								\
+				__asm movss		xmm6, [edi+row*24+8]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm2								\
+				__asm addps		xmm7, xmm6								\
+				__asm movss		xmm6, [edi+row*24+12]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm3								\
+				__asm addps		xmm7, xmm6								\
+				__asm movss		xmm6, [edi+row*24+16]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm4								\
+				__asm addps		xmm7, xmm6								\
+				__asm movss		xmm6, [edi+row*24+20]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm5								\
+				__asm addps		xmm7, xmm6								\
+				__asm movss		[eax+row*12+0], xmm7	/* dst col 0 from the lowest lane */				\
+				__asm movhps	[eax+row*12+4], xmm7	/* dst cols 1-2 from the high half */
+
+				MUL_Nx6_6x3_INIT
+				MUL_Nx6_6x3_ROW( 0 )
+				MUL_Nx6_6x3_ROW( 1 )
+				MUL_Nx6_6x3_ROW( 2 )
+
+				return;
+			}
+			break;
+		}
+		case 4: {
+			if ( !( l ^ 4 ) ) {		// 4x6 * 6x4
+
+#define MUL_Nx6_6x4_INIT	/* esi = m2, edi = m1, eax = dst; xmm0-xmm5 = the six 4-float rows of the 6x4 m2, loaded with movaps */	\
+				__asm mov		esi, m2Ptr								\
+				__asm mov		edi, m1Ptr								\
+				__asm mov		eax, dstPtr								\
+				__asm movaps	xmm0, [esi]								\
+				__asm movaps	xmm1, [esi+16]							\
+				__asm movaps	xmm2, [esi+32]							\
+				__asm movaps	xmm3, [esi+48]							\
+				__asm movaps	xmm4, [esi+64]							\
+				__asm movaps	xmm5, [esi+80]
+
+#define MUL_Nx6_6x4_ROW( row )	/* dst[row][0..3] = sum over j = 0..5 of m1[row][j] * xmm<j>; m1 row stride 24 bytes, dst row stride 16 bytes */	\
+				__asm movss		xmm7, [edi+row*24+0]					\
+				__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm7, xmm0								\
+				__asm movss		xmm6, [edi+row*24+4]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm1								\
+				__asm addps		xmm7, xmm6								\
+				__asm movss		xmm6, [edi+row*24+8]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm2								\
+				__asm addps		xmm7, xmm6								\
+				__asm movss		xmm6, [edi+row*24+12]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm3								\
+				__asm addps		xmm7, xmm6								\
+				__asm movss		xmm6, [edi+row*24+16]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm4								\
+				__asm addps		xmm7, xmm6								\
+				__asm movss		xmm6, [edi+row*24+20]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm5								\
+				__asm addps		xmm7, xmm6								\
+				__asm movaps	[eax+row*16], xmm7	/* whole 4-float dst row in one store */
 
-	dstPtr = dst.ToFloatPtr();
-	m1Ptr = m1.ToFloatPtr();
-	m2Ptr = m2.ToFloatPtr();
-	k = m1.GetNumRows();
-	l = m2.GetNumColumns();
-	n = m1.GetNumColumns();
+				MUL_Nx6_6x4_INIT
+				MUL_Nx6_6x4_ROW( 0 )
+				MUL_Nx6_6x4_ROW( 1 )
+				MUL_Nx6_6x4_ROW( 2 )
+				MUL_Nx6_6x4_ROW( 3 )
 
-	switch( n ) {
-		case 1: {
-			if ( !(l^6) ) {
-				switch( k ) {
-					case 1:	{			// 1x1 * 1x6, no precision loss compared to FPU version
-						__asm {
-							mov			esi, m2Ptr
-							mov			edi, m1Ptr
-							mov			eax, dstPtr
-							movss		xmm0, [edi]
-							shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-							movaps		xmm1, [esi]
-							mulps		xmm1, xmm0
-							movaps		[eax], xmm1
-							movlps		xmm2, [esi+16]
-							mulps		xmm2, xmm0
-							movlps		[eax+16], xmm2
-						}
-						return;
-					}
-					case 6: {			// 6x1 * 1x6, no precision loss compared to FPU version
-						__asm {
-							mov			esi, m2Ptr
-							mov			edi, m1Ptr
-							mov			eax, dstPtr
-							xorps		xmm1, xmm1
-							movaps		xmm0, [edi]
-							movlps		xmm1, [edi+16]
-							movlhps		xmm1, xmm0
-							movhlps		xmm2, xmm0
-							movlhps		xmm2, xmm1
-							// row 0 and 1
-							movaps		xmm3, [esi]
-							movaps		xmm4, xmm3
-							shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-							movaps		xmm5, xmm3
-							shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
-							movaps		xmm6, xmm3
-							shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
-							mulps		xmm4, xmm0
-							mulps		xmm5, xmm1
-							mulps		xmm6, xmm2
-							movaps		[eax], xmm4
-							movaps		[eax+16], xmm5
-							movaps		[eax+32], xmm6
-							// row 2 and 3
-							movaps		xmm4, xmm3
-							shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )
-							movaps		xmm5, xmm3
-							shufps		xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )
-							shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
-							mulps		xmm4, xmm0
-							mulps		xmm5, xmm1
-							mulps		xmm3, xmm2
-							movaps		[eax+48], xmm4
-							movaps		[eax+64], xmm5
-							movaps		[eax+80], xmm3
-							// row 4 and 5
-							movlps		xmm3, [esi+16]
-							movaps		xmm4, xmm3
-							shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-							movaps		xmm5, xmm3
-							shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
-							shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 )
-							mulps		xmm4, xmm0
-							mulps		xmm5, xmm1
-							mulps		xmm3, xmm2
-							movaps		[eax+96], xmm4
-							movaps		[eax+112], xmm5
-							movaps		[eax+128], xmm3
-						}
-						return;
-					}
-				}
-			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0];
-					m2Ptr++;
-				}
-				m1Ptr++;
+				return;
 			}
 			break;
 		}
-		case 2: {
-			if ( !(l^6) ) {
-				switch( k ) {
-					case 2: {			// 2x2 * 2x6
-
-						#define MUL_Nx2_2x6_INIT								\
-						__asm mov		esi, m2Ptr								\
-						__asm mov		edi, m1Ptr								\
-						__asm mov		eax, dstPtr								\
-						__asm movaps	xmm0, [esi]								\
-						__asm movlps	xmm1, [esi+16]							\
-						__asm movhps	xmm1, [esi+40]							\
-						__asm movlps	xmm2, [esi+24]							\
-						__asm movhps	xmm2, [esi+32]
-
-						#define MUL_Nx2_2x6_ROW2( row )							\
-						__asm movaps	xmm3, [edi+row*16]						\
-						__asm movaps	xmm5, xmm0								\
-						__asm movaps	xmm4, xmm3								\
-						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm5, xmm4								\
-						__asm movaps	xmm4, xmm3								\
-						__asm movaps	xmm6, xmm2								\
-						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 )	\
-						__asm mulps		xmm6, xmm4								\
-						__asm addps		xmm5, xmm6								\
-						__asm movaps	[eax+row*48], xmm5						\
-						__asm movaps	xmm4, xmm3								\
-						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm movaps	xmm7, xmm1								\
-						__asm mulps		xmm7, xmm4								\
-						__asm movaps	xmm4, xmm3								\
-						__asm movaps	xmm5, xmm0								\
-						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )	\
-						__asm mulps		xmm5, xmm4								\
-						__asm movaps	xmm4, xmm3								\
-						__asm movaps	xmm6, xmm2								\
-						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 )	\
-						__asm mulps		xmm6, xmm4								\
-						__asm addps		xmm5, xmm6								\
-						__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )	\
-						__asm movaps	xmm6, xmm1								\
-						__asm mulps		xmm6, xmm3								\
-						__asm movaps	xmm4, xmm7								\
-						__asm movlhps	xmm7, xmm6								\
-						__asm movhlps	xmm6, xmm4								\
-						__asm addps		xmm6, xmm7								\
-						__asm movlps	[eax+row*48+16], xmm6					\
-						__asm movlps	[eax+row*48+24], xmm5					\
-						__asm movhps	[eax+row*48+32], xmm5					\
-						__asm movhps	[eax+row*48+40], xmm6
-
-						MUL_Nx2_2x6_INIT
-						MUL_Nx2_2x6_ROW2( 0 )
-
-						return;
-					}
-					case 6: {			// 6x2 * 2x6
-
-						MUL_Nx2_2x6_INIT
-						MUL_Nx2_2x6_ROW2( 0 )
-						MUL_Nx2_2x6_ROW2( 1 )
-						MUL_Nx2_2x6_ROW2( 2 )
-
-						return;
-					}
-				}
-			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
-					m2Ptr++;
-				}
-				m1Ptr += 2;
+		case 5: {
+			if ( !( l ^ 5 ) ) {		// 5x6 * 6x5
+
+#define MUL_Nx6_6x5_INIT	/* esi = m2, edi = m1, eax = dst; xmm0-xmm5 = cols 0-3 of m2 rows 0-5 (m2 rows are 20 bytes apart); col 4 is left to the x87 code in MUL_Nx6_6x5_ROW */	\
+				__asm mov		esi, m2Ptr								\
+				__asm mov		edi, m1Ptr								\
+				__asm mov		eax, dstPtr								\
+				__asm movaps	xmm0, [esi]	/* row 0: byte offset 0 is a multiple of 16, so movaps is used */							\
+				__asm movlps	xmm1, [esi+20]							\
+				__asm movhps	xmm1, [esi+28]							\
+				__asm movlps	xmm2, [esi+40]							\
+				__asm movhps	xmm2, [esi+48]							\
+				__asm movlps	xmm3, [esi+60]							\
+				__asm movhps	xmm3, [esi+68]							\
+				__asm movaps	xmm4, [esi+80]	/* row 4: byte offset 80 is a multiple of 16, so movaps is used */						\
+				__asm movlps	xmm5, [esi+100]							\
+				__asm movhps	xmm5, [esi+108]
+
+#define MUL_Nx6_6x5_ROW( row )	/* one dst row of the Nx6 * 6x5 product: SSE accumulates cols 0-3 in xmm7 while interleaved x87 code accumulates col 4 on the FPU stack; m1 row stride 24 bytes, dst row stride 20 bytes */	\
+				__asm movss		xmm7, [edi+row*24+0]					\
+				__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm7, xmm0								\
+				__asm fld		dword ptr [edi+(row*6+0)*4]	/* x87: m1[row][0] ... */			\
+				__asm fmul		dword ptr [esi+(4+0*5)*4]	/* ... times m2[0][4] starts the col 4 sum */			\
+				__asm movss		xmm6, [edi+row*24+4]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm1								\
+				__asm addps		xmm7, xmm6								\
+				__asm fld		dword ptr [edi+(row*6+1)*4]				\
+				__asm fmul		dword ptr [esi+(4+1*5)*4]				\
+				__asm faddp		st(1),st								\
+				__asm movss		xmm6, [edi+row*24+8]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm2								\
+				__asm addps		xmm7, xmm6								\
+				__asm fld		dword ptr [edi+(row*6+2)*4]				\
+				__asm fmul		dword ptr [esi+(4+2*5)*4]				\
+				__asm faddp		st(1),st								\
+				__asm movss		xmm6, [edi+row*24+12]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm3								\
+				__asm addps		xmm7, xmm6								\
+				__asm fld		dword ptr [edi+(row*6+3)*4]				\
+				__asm fmul		dword ptr [esi+(4+3*5)*4]				\
+				__asm faddp		st(1),st								\
+				__asm movss		xmm6, [edi+row*24+16]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm4								\
+				__asm addps		xmm7, xmm6								\
+				__asm fld		dword ptr [edi+(row*6+4)*4]				\
+				__asm fmul		dword ptr [esi+(4+4*5)*4]				\
+				__asm faddp		st(1),st								\
+				__asm movss		xmm6, [edi+row*24+20]					\
+				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+				__asm mulps		xmm6, xmm5								\
+				__asm addps		xmm7, xmm6								\
+				__asm fld		dword ptr [edi+(row*6+5)*4]				\
+				__asm fmul		dword ptr [esi+(4+5*5)*4]				\
+				__asm faddp		st(1),st								\
+				__asm fstp		dword ptr [eax+(row*5+4)*4]	/* dst[row][4]; pops the x87 stack */			\
+				__asm movlps	[eax+row*20], xmm7	/* dst[row][0..1] */					\
+				__asm movhps	[eax+row*20+8], xmm7	/* dst[row][2..3] */
+
+				MUL_Nx6_6x5_INIT
+				MUL_Nx6_6x5_ROW( 0 )
+				MUL_Nx6_6x5_ROW( 1 )
+				MUL_Nx6_6x5_ROW( 2 )
+				MUL_Nx6_6x5_ROW( 3 )
+				MUL_Nx6_6x5_ROW( 4 )
+
+				return;
 			}
 			break;
 		}
-		case 3: {
-			if ( !(l^6) ) {
-				switch( k ) {
-					case 3: {			// 3x3 * 3x6
-						__asm {
-							mov		esi, m2Ptr
-							mov		edi, m1Ptr
-							mov		eax, dstPtr
-							movaps	xmm5, xmmword ptr [esi]
-							movlps	xmm6, qword ptr [esi+24]
-							movhps	xmm6, qword ptr [esi+32]
-							movaps	xmm7, xmmword ptr [esi+48]
-							movss	xmm0, dword ptr [edi]
-							shufps	xmm0, xmm0, 0
-							mulps	xmm0, xmm5
-							movss	xmm1, dword ptr [edi+4]
-							shufps	xmm1, xmm1, 0
-							mulps	xmm1, xmm6
-							movss	xmm2, dword ptr [edi+8]
-							shufps	xmm2, xmm2, 0
-							mulps	xmm2, xmm7
-							addps	xmm0, xmm1
-							addps	xmm0, xmm2
-							movaps	xmmword ptr [eax], xmm0
-							movss	xmm3, dword ptr [edi+12]
-							shufps	xmm3, xmm3, 0
-							mulps	xmm3, xmm5
-							movss	xmm4, dword ptr [edi+16]
-							shufps	xmm4, xmm4, 0
-							mulps	xmm4, xmm6
-							movss	xmm0, dword ptr [edi+20]
-							shufps	xmm0, xmm0, 0
-							mulps	xmm0, xmm7
-							addps	xmm3, xmm4
-							addps	xmm0, xmm3
-							movlps	qword ptr [eax+24], xmm0
-							movhps	qword ptr [eax+32], xmm0
-							movss	xmm1, dword ptr [edi+24]
-							shufps	xmm1, xmm1, 0
-							mulps	xmm1, xmm5
-							movss	xmm2, dword ptr [edi+28]
-							shufps	xmm2, xmm2, 0
-							mulps	xmm2, xmm6
-							movss	xmm3, dword ptr [edi+32]
-							shufps	xmm3, xmm3, 0
-							mulps	xmm3, xmm7
-							addps	xmm1, xmm2
-							addps	xmm1, xmm3
-							movaps	xmmword ptr [eax+48], xmm1
-							movlps	xmm5, qword ptr [esi+16]
-							movlps	xmm6, qword ptr [esi+40]
-							movlps	xmm7, qword ptr [esi+64]
-							shufps	xmm5, xmm5, 0x44
-							shufps	xmm6, xmm6, 0x44
-							shufps	xmm7, xmm7, 0x44
-							movaps	xmm3, xmmword ptr [edi]
-							movlps	xmm4, qword ptr [edi+16]
-							movaps	xmm0, xmm3
-							shufps	xmm0, xmm0, 0xF0
-							mulps	xmm0, xmm5
-							movaps	xmm1, xmm3
-							shufps	xmm1, xmm4, 0x05
-							mulps	xmm1, xmm6
-							shufps	xmm3, xmm4, 0x5A
-							mulps	xmm3, xmm7
-							addps	xmm1, xmm0
-							addps	xmm1, xmm3
-							movlps	qword ptr [eax+16], xmm1
-							movhps	qword ptr [eax+40], xmm1
-							movss	xmm0, dword ptr [edi+24]
-							shufps	xmm0, xmm0, 0
-							mulps	xmm0, xmm5
-							movss	xmm2, dword ptr [edi+28]
-							shufps	xmm2, xmm2, 0
-							mulps	xmm2, xmm6
-							movss	xmm4, dword ptr [edi+32]
-							shufps	xmm4, xmm4, 0
-							mulps	xmm4, xmm7
-							addps	xmm0, xmm2
-							addps	xmm0, xmm4
-							movlps	qword ptr [eax+64], xmm0
-						}
-						return;
-					}
-					case 6: {			// 6x3 * 3x6
-						#define MUL_Nx3_3x6_FIRST4COLUMNS_INIT						\
-						__asm mov			esi, m2Ptr								\
-						__asm mov			edi, m1Ptr								\
-						__asm mov			eax, dstPtr								\
-						__asm movlps		xmm0, [esi+ 0*4]						\
-						__asm movhps		xmm0, [esi+ 2*4]						\
-						__asm movlps		xmm1, [esi+ 6*4]						\
-						__asm movhps		xmm1, [esi+ 8*4]						\
-						__asm movlps		xmm2, [esi+12*4]						\
-						__asm movhps		xmm2, [esi+14*4]
-
-						#define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row )				\
-						__asm movss			xmm3, [edi+(row*3+0)*4]					\
-						__asm shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm3, xmm0								\
-						__asm movss			xmm4, [edi+(row*3+1)*4]					\
-						__asm shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm4, xmm1								\
-						__asm addps			xmm3, xmm4								\
-						__asm movss			xmm5, [edi+(row*3+2)*4]					\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm5, xmm2								\
-						__asm addps			xmm3, xmm5								\
-						__asm movlps		[eax+(row*6+0)*4], xmm3					\
-						__asm movhps		[eax+(row*6+2)*4], xmm3
-
-						#define MUL_Nx3_3x6_LAST2COLUMNS_ROW6						\
-						__asm movlps		xmm0, [esi+ 4*4]						\
-						__asm movlps		xmm1, [esi+10*4]						\
-						__asm movlps		xmm2, [esi+16*4]						\
-						__asm shufps		xmm0, xmm0, 0x44						\
-						__asm shufps		xmm1, xmm1, 0x44						\
-						__asm shufps		xmm2, xmm2, 0x44						\
-						__asm movlps		xmm3, [edi+0*4]							\
-						__asm movhps		xmm3, [edi+2*4]							\
-						__asm movaps		xmm4, xmm3								\
-						__asm movaps		xmm5, xmm3								\
-						__asm shufps		xmm3, xmm3, 0xF0						\
-						__asm mulps			xmm3, xmm0								\
-						__asm movlps		xmm6, [edi+4*4]							\
-						__asm movhps		xmm6, [edi+6*4]							\
-						__asm shufps		xmm4, xmm6, 0x05						\
-						__asm mulps			xmm4, xmm1								\
-						__asm addps			xmm3, xmm4								\
-						__asm shufps		xmm5, xmm6, 0x5A						\
-						__asm mulps			xmm5, xmm2								\
-						__asm addps			xmm3, xmm5								\
-						__asm movlps		[eax+4*4], xmm3							\
-						__asm movhps		[eax+10*4], xmm3						\
-						__asm movaps		xmm5, xmm6								\
-						__asm movlps		xmm3, [edi+8*4]							\
-						__asm movhps		xmm3, [edi+10*4]						\
-						__asm movaps		xmm4, xmm3								\
-						__asm shufps		xmm5, xmm3, 0x5A						\
-						__asm mulps			xmm5, xmm0								\
-						__asm shufps		xmm6, xmm3, 0xAF						\
-						__asm mulps			xmm6, xmm1								\
-						__asm addps			xmm5, xmm6								\
-						__asm shufps		xmm4, xmm4, 0xF0						\
-						__asm mulps			xmm4, xmm2								\
-						__asm addps			xmm4, xmm5								\
-						__asm movlps		[eax+16*4], xmm4						\
-						__asm movhps		[eax+22*4], xmm4						\
-						__asm movlps		xmm6, [edi+12*4]						\
-						__asm movhps		xmm6, [edi+14*4]						\
-						__asm movaps		xmm5, xmm6								\
-						__asm movaps		xmm4, xmm6								\
-						__asm shufps		xmm6, xmm6, 0xF0						\
-						__asm mulps			xmm6, xmm0								\
-						__asm movlps		xmm3, [edi+16*4]						\
-						__asm shufps		xmm5, xmm3, 0x05						\
-						__asm mulps			xmm5, xmm1								\
-						__asm addps			xmm5, xmm6								\
-						__asm shufps		xmm4, xmm3, 0x5A						\
-						__asm mulps			xmm4, xmm2								\
-						__asm addps			xmm4, xmm5								\
-						__asm movlps		[eax+28*4], xmm4						\
-						__asm movhps		[eax+34*4], xmm4
-
-						MUL_Nx3_3x6_FIRST4COLUMNS_INIT
-						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 )
-						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 )
-						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 )
-						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 )
-						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 )
-						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 )
-						MUL_Nx3_3x6_LAST2COLUMNS_ROW6
-
-						return;
-					}
+		case 6: {
+			switch ( l ) {
+			case 1: {		// 6x6 * 6x1
+				__asm {
+					mov			esi, m2Ptr
+					mov			edi, m1Ptr
+					mov			eax, dstPtr
+					movlps		xmm7, qword ptr [esi]
+					movlps		xmm6, qword ptr [esi+8]
+					shufps		xmm7, xmm7, 0x44
+					shufps		xmm6, xmm6, 0x44
+					movlps		xmm0, qword ptr [edi    ]
+					movhps		xmm0, qword ptr [edi+ 24]
+					mulps		xmm0, xmm7
+					movlps		xmm3, qword ptr [edi+  8]
+					movhps		xmm3, qword ptr [edi+ 32]
+					mulps		xmm3, xmm6
+					movlps		xmm1, qword ptr [edi+ 48]
+					movhps		xmm1, qword ptr [edi+ 72]
+					mulps		xmm1, xmm7
+					movlps		xmm2, qword ptr [edi+ 96]
+					movhps		xmm2, qword ptr [edi+120]
+					mulps		xmm2, xmm7
+					movlps		xmm4, qword ptr [edi+ 56]
+					movhps		xmm4, qword ptr [edi+ 80]
+					movlps		xmm5, qword ptr [edi+104]
+					movhps		xmm5, qword ptr [edi+128]
+					mulps		xmm4, xmm6
+					movlps		xmm7, qword ptr [esi+16]
+					addps		xmm0, xmm3
+					shufps		xmm7, xmm7, 0x44
+					mulps		xmm5, xmm6
+					addps		xmm1, xmm4
+					movlps		xmm3, qword ptr [edi+ 16]
+					movhps		xmm3, qword ptr [edi+ 40]
+					addps		xmm2, xmm5
+					movlps		xmm4, qword ptr [edi+ 64]
+					movhps		xmm4, qword ptr [edi+ 88]
+					mulps		xmm3, xmm7
+					movlps		xmm5, qword ptr [edi+112]
+					movhps		xmm5, qword ptr [edi+136]
+					addps		xmm0, xmm3
+					mulps		xmm4, xmm7
+					mulps		xmm5, xmm7
+					addps		xmm1, xmm4
+					addps		xmm2, xmm5
+					movaps		xmm6, xmm0
+					shufps		xmm0, xmm1, 0x88
+					shufps		xmm6, xmm1, 0xDD
+					movaps		xmm7, xmm2
+					shufps		xmm7, xmm2, 0x88
+					shufps		xmm2, xmm2, 0xDD
+					addps		xmm0, xmm6
+					addps		xmm2, xmm7
+					movlps		[eax], xmm0
+					movhps		[eax+8], xmm0
+					movlps		[eax+16], xmm2
 				}
+				return;
 			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
-					m2Ptr++;
-				}
-				m1Ptr += 3;
+			case 2: {		// 6x6 * 6x2
+
+				MUL_Nx6_6x2_INIT
+				MUL_Nx6_6x2_ROW2( 0 )
+				MUL_Nx6_6x2_ROW2( 1 )
+				MUL_Nx6_6x2_ROW2( 2 )
+
+				return;
 			}
-			break;
-		}
-		case 4: {
-			if ( !(l^6) ) {
-				switch( k ) {
-					case 4: {			// 4x4 * 4x6
-
-						#define MUL_Nx4_4x6_FIRST4COLUMNS_INIT						\
-						__asm mov			esi, m2Ptr								\
-						__asm mov			edi, m1Ptr								\
-						__asm mov			eax, dstPtr								\
-						__asm movlps		xmm0, [esi+ 0*4]						\
-						__asm movhps		xmm0, [esi+ 2*4]						\
-						__asm movlps		xmm1, [esi+ 6*4]						\
-						__asm movhps		xmm1, [esi+ 8*4]						\
-						__asm movlps		xmm2, [esi+12*4]						\
-						__asm movhps		xmm2, [esi+14*4]						\
-						__asm movlps		xmm3, [esi+18*4]						\
-						__asm movhps		xmm3, [esi+20*4]
-
-						#define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row )				\
-						__asm movss			xmm4, [edi+row*16+0*4]					\
-						__asm shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm4, xmm0								\
-						__asm movss			xmm5, [edi+row*16+1*4]					\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm5, xmm1								\
-						__asm addps			xmm4, xmm5								\
-						__asm movss			xmm6, [edi+row*16+2*4]					\
-						__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm6, xmm2								\
-						__asm addps			xmm4, xmm6								\
-						__asm movss			xmm7, [edi+row*16+3*4]					\
-						__asm shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm7, xmm3								\
-						__asm addps			xmm4, xmm7								\
-						__asm movlps		[eax+row*24+0], xmm4					\
-						__asm movhps		[eax+row*24+8], xmm4
-
-						#define MUL_Nx4_4x6_LAST2COLUMNS_INIT						\
-						__asm movlps		xmm0, [esi+ 4*4]						\
-						__asm movlps		xmm1, [esi+10*4]						\
-						__asm movlps		xmm2, [esi+16*4]						\
-						__asm movlps		xmm3, [esi+22*4]						\
-						__asm shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm shufps		xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
-
-						#define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row )				\
-						__asm movlps		xmm7, [edi+row*32+ 0*4]					\
-						__asm movhps		xmm7, [edi+row*32+ 4*4]					\
-						__asm movaps		xmm6, xmm7								\
-						__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 )	\
-						__asm mulps			xmm6, xmm0								\
-						__asm shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 )	\
-						__asm mulps			xmm7, xmm1								\
-						__asm addps			xmm6, xmm7								\
-						__asm movlps		xmm4, [edi+row*32+ 2*4]					\
-						__asm movhps		xmm4, [edi+row*32+ 6*4]					\
-						__asm movaps		xmm5, xmm4								\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 )	\
-						__asm mulps			xmm5, xmm2								\
-						__asm addps			xmm6, xmm5								\
-						__asm shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 )	\
-						__asm mulps			xmm4, xmm3								\
-						__asm addps			xmm6, xmm4								\
-						__asm movlps		[eax+row*48+ 4*4], xmm6					\
-						__asm movhps		[eax+row*48+10*4], xmm6
-
-						MUL_Nx4_4x6_FIRST4COLUMNS_INIT
-						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
-						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
-						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
-						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
-						MUL_Nx4_4x6_LAST2COLUMNS_INIT
-						MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
-						MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
-
-						return;
-					}
-					case 6: {			// 6x4 * 4x6
-
-						MUL_Nx4_4x6_FIRST4COLUMNS_INIT
-						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
-						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
-						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
-						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
-						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 )
-						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 )
-						MUL_Nx4_4x6_LAST2COLUMNS_INIT
-						MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
-						MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
-						MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 )
-
-						return;
-					}
-				}
+			case 3: {		// 6x6 * 6x3
+
+				MUL_Nx6_6x3_INIT
+				MUL_Nx6_6x3_ROW( 0 )
+				MUL_Nx6_6x3_ROW( 1 )
+				MUL_Nx6_6x3_ROW( 2 )
+				MUL_Nx6_6x3_ROW( 3 )
+				MUL_Nx6_6x3_ROW( 4 )
+				MUL_Nx6_6x3_ROW( 5 )
+
+				return;
 			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
-									 m1Ptr[3] * m2Ptr[3*l];
-					m2Ptr++;
-				}
-				m1Ptr += 4;
+			case 4: {		// 6x6 * 6x4
+
+				MUL_Nx6_6x4_INIT
+				MUL_Nx6_6x4_ROW( 0 )
+				MUL_Nx6_6x4_ROW( 1 )
+				MUL_Nx6_6x4_ROW( 2 )
+				MUL_Nx6_6x4_ROW( 3 )
+				MUL_Nx6_6x4_ROW( 4 )
+				MUL_Nx6_6x4_ROW( 5 )
+
+				return;
 			}
-			break;
-		}
-		case 5: {
-			if ( !(l^6) ) {
-				switch( k ) {
-					case 5: {			// 5x5 * 5x6
-
-						#define MUL_Nx5_5x6_FIRST4COLUMNS_INIT						\
-						__asm mov			esi, m2Ptr								\
-						__asm mov			edi, m1Ptr								\
-						__asm mov			eax, dstPtr								\
-						__asm movlps		xmm0, [esi+ 0*4]						\
-						__asm movhps		xmm0, [esi+ 2*4]						\
-						__asm movlps		xmm1, [esi+ 6*4]						\
-						__asm movhps		xmm1, [esi+ 8*4]						\
-						__asm movlps		xmm2, [esi+12*4]						\
-						__asm movhps		xmm2, [esi+14*4]						\
-						__asm movlps		xmm3, [esi+18*4]						\
-						__asm movhps		xmm3, [esi+20*4]						\
-						__asm movlps		xmm4, [esi+24*4]						\
-						__asm movhps		xmm4, [esi+26*4]
-
-						#define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row )				\
-						__asm movss			xmm6, [edi+row*20+0*4]					\
-						__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm6, xmm0								\
-						__asm movss			xmm5, [edi+row*20+1*4]					\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm5, xmm1								\
-						__asm addps			xmm6, xmm5								\
-						__asm movss			xmm5, [edi+row*20+2*4]					\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm5, xmm2								\
-						__asm addps			xmm6, xmm5								\
-						__asm movss			xmm5, [edi+row*20+3*4]					\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm5, xmm3								\
-						__asm addps			xmm6, xmm5								\
-						__asm movss			xmm5, [edi+row*20+4*4]					\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps			xmm5, xmm4								\
-						__asm addps			xmm6, xmm5								\
-						__asm movlps		[eax+row*24+0], xmm6					\
-						__asm movhps		[eax+row*24+8], xmm6
-
-						#define MUL_Nx5_5x6_LAST2COLUMNS_INIT						\
-						__asm movlps		xmm0, [esi+ 4*4]						\
-						__asm movlps		xmm1, [esi+10*4]						\
-						__asm movlps		xmm2, [esi+16*4]						\
-						__asm movlps		xmm3, [esi+22*4]						\
-						__asm movlps		xmm4, [esi+28*4]						\
-						__asm shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm shufps		xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )
-
-						#define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row )				\
-						__asm movlps		xmm7, [edi+row*40+ 0*4]					\
-						__asm movhps		xmm7, [edi+row*40+ 6*4]					\
-						__asm movaps		xmm6, xmm7								\
-						__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 )	\
-						__asm mulps			xmm6, xmm0								\
-						__asm movaps		xmm5, xmm7								\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 )	\
-						__asm mulps			xmm5, xmm1								\
-						__asm addps			xmm6, xmm5								\
-						__asm movlps		xmm7, [edi+row*40+ 2*4]					\
-						__asm movhps		xmm7, [edi+row*40+ 8*4]					\
-						__asm movaps		xmm5, xmm7								\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 )	\
-						__asm mulps			xmm5, xmm2								\
-						__asm addps			xmm6, xmm5								\
-						__asm movaps		xmm5, xmm7								\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 )	\
-						__asm mulps			xmm5, xmm3								\
-						__asm addps			xmm6, xmm5								\
-						__asm movlps		xmm5, [edi+row*40+ 4*4]					\
-						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm mulps			xmm5, xmm4								\
-						__asm addps			xmm6, xmm5								\
-						__asm movlps		[eax+row*48+ 4*4], xmm6					\
-						__asm movhps		[eax+row*48+10*4], xmm6
-
-						#define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row )					\
-						__asm movlps		xmm6, [edi+20*4+0*4]					\
-						__asm unpcklps		xmm6, xmm6								\
-						__asm mulps			xmm6, xmm0								\
-						__asm movlps		xmm5, [edi+20*4+2*4]					\
-						__asm unpcklps		xmm5, xmm5								\
-						__asm mulps			xmm5, xmm2								\
-						__asm addps			xmm6, xmm5								\
-						__asm movss			xmm5, [edi+20*4+4*4]					\
-						__asm unpcklps		xmm5, xmm5								\
-						__asm mulps			xmm5, xmm4								\
-						__asm addps			xmm6, xmm5								\
-						__asm movhlps		xmm7, xmm6								\
-						__asm addps			xmm6, xmm7								\
-						__asm movlps		[eax+row*24+4*4], xmm6
-
-						MUL_Nx5_5x6_FIRST4COLUMNS_INIT
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
-						MUL_Nx5_5x6_LAST2COLUMNS_INIT
-						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
-						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
-						MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 )
-
-						return;
-					}
-					case 6: {			// 6x5 * 5x6
-
-						MUL_Nx5_5x6_FIRST4COLUMNS_INIT
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
-						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 )
-						MUL_Nx5_5x6_LAST2COLUMNS_INIT
-						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
-						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
-						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 )
-
-						return;
-					}
-				}
+			case 5: {		// 6x6 * 6x5
+
+				MUL_Nx6_6x5_INIT
+				MUL_Nx6_6x5_ROW( 0 )
+				MUL_Nx6_6x5_ROW( 1 )
+				MUL_Nx6_6x5_ROW( 2 )
+				MUL_Nx6_6x5_ROW( 3 )
+				MUL_Nx6_6x5_ROW( 4 )
+				MUL_Nx6_6x5_ROW( 5 )
+
+				return;
 			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
-									 m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
-					m2Ptr++;
+			case 6: {		// 6x6 * 6x6
+				__asm {
+					mov			ecx, dword ptr m2Ptr
+					movlps		xmm3, qword ptr [ecx+72]
+					mov			edx, dword ptr m1Ptr
+					// Loading first 4 columns (upper 4 rows) of m2Ptr.
+					movaps		xmm0, xmmword ptr [ecx]
+					movlps		xmm1, qword ptr [ecx+24]
+					movhps		xmm1, qword ptr [ecx+32]
+					movaps		xmm2, xmmword ptr [ecx+48]
+					movhps		xmm3, qword ptr [ecx+80]
+					// Calculating first 4 elements in the first row of the destination matrix.
+					movss		xmm4, dword ptr [edx]
+					movss		xmm5, dword ptr [edx+4]
+					mov			eax, dword ptr dstPtr
+					shufps		xmm4, xmm4, 0
+					movss		xmm6, dword ptr [edx+8]
+					shufps		xmm5, xmm5, 0
+					movss		xmm7, dword ptr [edx+12]
+					mulps		xmm4, xmm0
+					shufps		xmm6, xmm6, 0
+					shufps		xmm7, xmm7, 0
+					mulps		xmm5, xmm1
+					mulps		xmm6, xmm2
+					addps		xmm5, xmm4
+					mulps		xmm7, xmm3
+					addps		xmm6, xmm5
+					addps		xmm7, xmm6
+					movaps		xmmword ptr [eax], xmm7
+					// Calculating first 4 elements in the second row of the destination matrix.
+					movss		xmm4, dword ptr [edx+24]
+					shufps		xmm4, xmm4, 0
+					mulps		xmm4, xmm0
+					movss		xmm5, dword ptr [edx+28]
+					shufps		xmm5, xmm5, 0
+					mulps		xmm5, xmm1
+					movss		xmm6, dword ptr [edx+32]
+					shufps		xmm6, xmm6, 0
+					movss		xmm7, dword ptr [edx+36]
+					shufps		xmm7, xmm7, 0
+					mulps		xmm6, xmm2
+					mulps		xmm7, xmm3
+					addps		xmm7, xmm6
+					addps		xmm5, xmm4
+					addps		xmm7, xmm5
+					// Calculating first 4 elements in the third row of the destination matrix.
+					movss		xmm4, dword ptr [edx+48]
+					movss		xmm5, dword ptr [edx+52]
+					movlps		qword ptr [eax+24], xmm7 ; save 2nd
+					movhps		qword ptr [eax+32], xmm7 ; row
+					movss		xmm6, dword ptr [edx+56]
+					movss		xmm7, dword ptr [edx+60]
+					shufps		xmm4, xmm4, 0
+					shufps		xmm5, xmm5, 0
+					shufps		xmm6, xmm6, 0
+					shufps		xmm7, xmm7, 0
+					mulps		xmm4, xmm0
+					mulps		xmm5, xmm1
+					mulps		xmm6, xmm2
+					mulps		xmm7, xmm3
+					addps		xmm5, xmm4
+					addps		xmm7, xmm6
+					addps		xmm7, xmm5
+					movaps		xmmword ptr [eax+48], xmm7
+					// Calculating first 4 elements in the fourth row of the destination matrix.
+					movss		xmm4, dword ptr [edx+72]
+					movss		xmm5, dword ptr [edx+76]
+					movss		xmm6, dword ptr [edx+80]
+					movss		xmm7, dword ptr [edx+84]
+					shufps		xmm4, xmm4, 0
+					shufps		xmm5, xmm5, 0
+					shufps		xmm6, xmm6, 0
+					shufps		xmm7, xmm7, 0
+					mulps		xmm4, xmm0
+					mulps		xmm5, xmm1
+					mulps		xmm6, xmm2
+					mulps		xmm7, xmm3
+					addps		xmm4, xmm5
+					addps		xmm6, xmm4
+					addps		xmm7, xmm6
+					movlps		qword ptr [eax+72], xmm7
+					movhps		qword ptr [eax+80], xmm7
+					// Calculating first 4 elements in the fifth row of the destination matrix.
+					movss		xmm4, dword ptr [edx+96]
+					movss		xmm5, dword ptr [edx+100]
+					movss		xmm6, dword ptr [edx+104]
+					movss		xmm7, dword ptr [edx+108]
+					shufps		xmm4, xmm4, 0
+					shufps		xmm5, xmm5, 0
+					shufps		xmm6, xmm6, 0
+					shufps		xmm7, xmm7, 0
+					mulps		xmm4, xmm0
+					mulps		xmm5, xmm1
+					mulps		xmm6, xmm2
+					mulps		xmm7, xmm3
+					addps		xmm5, xmm4
+					addps		xmm7, xmm6
+					addps		xmm7, xmm5
+					movaps		xmmword ptr [eax+96], xmm7
+					// Calculating first 4 elements in the sixth row of the destination matrix.
+					movss		xmm4, dword ptr [edx+120]
+					movss		xmm5, dword ptr [edx+124]
+					movss		xmm6, dword ptr [edx+128]
+					movss		xmm7, dword ptr [edx+132]
+					shufps		xmm4, xmm4, 0
+					shufps		xmm5, xmm5, 0
+					shufps		xmm6, xmm6, 0
+					shufps		xmm7, xmm7, 0
+					mulps		xmm4, xmm0
+					mulps		xmm5, xmm1
+					mulps		xmm6, xmm2
+					mulps		xmm7, xmm3
+					addps		xmm4, xmm5
+					addps		xmm6, xmm4
+					addps		xmm7, xmm6
+					movhps		qword ptr [eax+128], xmm7
+					movlps		qword ptr [eax+120], xmm7
+					// Loading first 4 columns (lower 2 rows) of m2Ptr.
+					movlps		xmm0, qword ptr [ecx+96]
+					movhps		xmm0, qword ptr [ecx+104]
+					movlps		xmm1, qword ptr [ecx+120]
+					movhps		xmm1, qword ptr [ecx+128]
+					// Calculating first 4 elements in the first and second rows of the destination matrix.
+					movss		xmm2, dword ptr [edx+16]
+					shufps		xmm2, xmm2, 0
+					movss		xmm4, dword ptr [edx+40]
+					movss		xmm3, dword ptr [edx+20]
+					movss		xmm5, dword ptr [edx+44]
+					movaps		xmm6, xmmword ptr [eax]
+					movlps		xmm7, qword ptr [eax+24]
+					shufps		xmm3, xmm3, 0
+					shufps		xmm5, xmm5, 0
+					movhps		xmm7, qword ptr [eax+32]
+					shufps		xmm4, xmm4, 0
+					mulps		xmm5, xmm1
+					mulps		xmm2, xmm0
+					mulps		xmm3, xmm1
+					mulps		xmm4, xmm0
+					addps		xmm6, xmm2
+					addps		xmm7, xmm4
+					addps		xmm7, xmm5
+					addps		xmm6, xmm3
+					movlps		qword ptr [eax+24], xmm7
+					movaps		xmmword ptr [eax], xmm6
+					movhps		qword ptr [eax+32], xmm7
+					// Calculating first 4 elements in the third and fourth rows of the destination matrix.
+					movss		xmm2, dword ptr [edx+64]
+					movss		xmm4, dword ptr [edx+88]
+					movss		xmm5, dword ptr [edx+92]
+					movss		xmm3, dword ptr [edx+68]
+					movaps		xmm6, xmmword ptr [eax+48]
+					movlps		xmm7, qword ptr [eax+72]
+					movhps		xmm7, qword ptr [eax+80]
+					shufps		xmm2, xmm2, 0
+					shufps		xmm4, xmm4, 0
+					shufps		xmm5, xmm5, 0
+					shufps		xmm3, xmm3, 0
+					mulps		xmm2, xmm0
+					mulps		xmm4, xmm0
+					mulps		xmm5, xmm1
+					mulps		xmm3, xmm1
+					addps		xmm6, xmm2
+					addps		xmm6, xmm3
+					addps		xmm7, xmm4
+					addps		xmm7, xmm5
+					movlps		qword ptr [eax+72], xmm7
+					movaps		xmmword ptr [eax+48], xmm6
+					movhps		qword ptr [eax+80], xmm7
+					// Calculating first 4 elements in the fifth row of the destination matrix.
+					movss		xmm2, dword ptr [edx+112]
+					movss		xmm3, dword ptr [edx+116]
+					movaps		xmm6, xmmword ptr [eax+96]
+					shufps		xmm2, xmm2, 0
+					shufps		xmm3, xmm3, 0
+					mulps		xmm2, xmm0
+					mulps		xmm3, xmm1
+					addps		xmm6, xmm2
+					addps		xmm6, xmm3
+					movaps		xmmword ptr [eax+96], xmm6
+					// Calculating first 4 elements in the sixth row of the destination matrix.
+					movss		xmm4, dword ptr [edx+136]
+					movss		xmm5, dword ptr [edx+140]
+					movhps		xmm7, qword ptr [eax+128]
+					movlps		xmm7, qword ptr [eax+120]
+					shufps		xmm4, xmm4, 0
+					shufps		xmm5, xmm5, 0
+					mulps		xmm4, xmm0
+					mulps		xmm5, xmm1
+					addps		xmm7, xmm4
+					addps		xmm7, xmm5
+					// Calculating last 2 columns of the destination matrix.
+					movlps		xmm0, qword ptr [ecx+16]
+					movhps		xmm0, qword ptr [ecx+40]
+					movhps		qword ptr [eax+128], xmm7
+					movlps		qword ptr [eax+120], xmm7
+					movlps		xmm2, qword ptr [ecx+64]
+					movhps		xmm2, qword ptr [ecx+88]
+					movaps		xmm3, xmm2
+					shufps		xmm3, xmm3, 4Eh
+					movlps		xmm4, qword ptr [ecx+112]
+					movhps		xmm4, qword ptr [ecx+136]
+					movaps		xmm5, xmm4
+					shufps		xmm5, xmm5, 4Eh
+					movlps		xmm6, qword ptr [edx]
+					movhps		xmm6, qword ptr [edx+24]
+					movaps		xmm7, xmm6
+					shufps		xmm7, xmm7, 0F0h
+					mulps		xmm7, xmm0
+					shufps		xmm6, xmm6, 0A5h
+					movaps		xmm1, xmm0
+					shufps		xmm1, xmm1, 4Eh
+					mulps		xmm1, xmm6
+					addps		xmm7, xmm1
+					movlps		xmm6, qword ptr [edx+8]
+					movhps		xmm6, qword ptr [edx+32]
+					movaps		xmm1, xmm6
+					shufps		xmm1, xmm1, 0F0h
+					shufps		xmm6, xmm6, 0A5h
+					mulps		xmm1, xmm2
+					mulps		xmm6, xmm3
+					addps		xmm7, xmm1
+					addps		xmm7, xmm6
+					movhps		xmm6, qword ptr [edx+40]
+					movlps		xmm6, qword ptr [edx+16]
+					movaps		xmm1, xmm6
+					shufps		xmm1, xmm1, 0F0h
+					shufps		xmm6, xmm6, 0A5h
+					mulps		xmm1, xmm4
+					mulps		xmm6, xmm5
+					addps		xmm7, xmm1
+					addps		xmm7, xmm6
+					movlps		qword ptr [eax+16], xmm7
+					movhps		qword ptr [eax+40], xmm7
+					movlps		xmm6, qword ptr [edx+48]
+					movhps		xmm6, qword ptr [edx+72]
+					movaps		xmm7, xmm6
+					shufps		xmm7, xmm7, 0F0h
+					mulps		xmm7, xmm0
+					shufps		xmm6, xmm6, 0A5h
+					movaps		xmm1, xmm0
+					shufps		xmm1, xmm1, 4Eh
+					mulps		xmm1, xmm6
+					addps		xmm7, xmm1
+					movhps		xmm6, qword ptr [edx+80]
+					movlps		xmm6, qword ptr [edx+56]
+					movaps		xmm1, xmm6
+					shufps		xmm1, xmm1, 0F0h
+					shufps		xmm6, xmm6, 0A5h
+					mulps		xmm1, xmm2
+					mulps		xmm6, xmm3
+					addps		xmm7, xmm1
+					addps		xmm7, xmm6
+					movlps		xmm6, qword ptr [edx+64]
+					movhps		xmm6, qword ptr [edx+88]
+					movaps		xmm1, xmm6
+					shufps		xmm1, xmm1, 0F0h
+					shufps		xmm6, xmm6, 0A5h
+					mulps		xmm1, xmm4
+					mulps		xmm6, xmm5
+					addps		xmm7, xmm1
+					addps		xmm7, xmm6
+					movlps		qword ptr [eax+64], xmm7
+					movhps		qword ptr [eax+88], xmm7
+					movlps		xmm6, qword ptr [edx+96]
+					movhps		xmm6, qword ptr [edx+120]
+					movaps		xmm7, xmm6
+					shufps		xmm7, xmm7, 0F0h
+					mulps		xmm7, xmm0
+					shufps		xmm6, xmm6, 0A5h
+					movaps		xmm1, xmm0
+					shufps		xmm1, xmm1, 4Eh
+					mulps		xmm1, xmm6
+					addps		xmm7, xmm1
+					movlps		xmm6, qword ptr [edx+104]
+					movhps		xmm6, qword ptr [edx+128]
+					movaps		xmm1, xmm6
+					shufps		xmm1, xmm1, 0F0h
+					shufps		xmm6, xmm6, 0A5h
+					mulps		xmm1, xmm2
+					mulps		xmm6, xmm3
+					addps		xmm7, xmm1
+					addps		xmm7, xmm6
+					movlps		xmm6, qword ptr [edx+112]
+					movhps		xmm6, qword ptr [edx+136]
+					movaps		xmm1, xmm6
+					shufps		xmm1, xmm1, 0F0h
+					shufps		xmm6, xmm6, 0A5h
+					mulps		xmm1, xmm4
+					mulps		xmm6, xmm5
+					addps		xmm7, xmm1
+					addps		xmm7, xmm6
+					movlps		qword ptr [eax+112], xmm7
+					movhps		qword ptr [eax+136], xmm7
 				}
-				m1Ptr += 5;
+				return;
 			}
-			break;
-		}
-		case 6: {
-			switch( k ) {
-				case 1: {
-					if ( !(l^1) ) {		// 1x6 * 6x1
-						dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
-									 m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
-						return;
-					}
-					break;
-				}
-				case 2: {
-					if ( !(l^2) ) {		// 2x6 * 6x2
-
-						#define MUL_Nx6_6x2_INIT								\
-						__asm mov		esi, m2Ptr								\
-						__asm mov		edi, m1Ptr								\
-						__asm mov		eax, dstPtr								\
-						__asm movaps	xmm0, [esi]								\
-						__asm movaps	xmm1, [esi+16]							\
-						__asm movaps	xmm2, [esi+32]
-
-						#define MUL_Nx6_6x2_ROW2( row )							\
-						__asm movaps	xmm7, [edi+row*48+0*4]					\
-						__asm movaps	xmm6, xmm7								\
-						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm mulps		xmm7, xmm0								\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 )	\
-						__asm mulps		xmm6, xmm1								\
-						__asm addps		xmm7, xmm6								\
-						__asm movaps	xmm6, [edi+row*48+4*4]					\
-						__asm movaps	xmm5, xmm6								\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm mulps		xmm6, xmm2								\
-						__asm addps		xmm7, xmm6								\
-						__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )	\
-						__asm mulps		xmm5, xmm0								\
-						__asm movaps	xmm6, [edi+row*48+24+2*4]				\
-						__asm movaps	xmm4, xmm6								\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm mulps		xmm6, xmm1								\
-						__asm addps		xmm5, xmm6								\
-						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 )	\
-						__asm mulps		xmm4, xmm2								\
-						__asm addps		xmm5, xmm4								\
-						__asm movaps	xmm4, xmm5								\
-						__asm movhlps	xmm5, xmm7								\
-						__asm movlhps	xmm7, xmm4								\
-						__asm addps		xmm7, xmm5								\
-						__asm movaps	[eax+row*16], xmm7
-
-						MUL_Nx6_6x2_INIT
-						MUL_Nx6_6x2_ROW2( 0 )
-
-						return;
-					}
-					break;
-				}
-				case 3: {
-					if ( !(l^3) ) {		// 3x6 * 6x3
-
-						#define MUL_Nx6_6x3_INIT								\
-						__asm mov		esi, m2Ptr								\
-						__asm mov		edi, m1Ptr								\
-						__asm mov		eax, dstPtr								\
-						__asm movss		xmm0, [esi+ 0*4]						\
-						__asm movhps	xmm0, [esi+ 1*4]						\
-						__asm movss		xmm1, [esi+ 3*4]						\
-						__asm movhps	xmm1, [esi+ 4*4]						\
-						__asm movss		xmm2, [esi+ 6*4]						\
-						__asm movhps	xmm2, [esi+ 7*4]						\
-						__asm movss		xmm3, [esi+ 9*4]						\
-						__asm movhps	xmm3, [esi+10*4]						\
-						__asm movss		xmm4, [esi+12*4]						\
-						__asm movhps	xmm4, [esi+13*4]						\
-						__asm movss		xmm5, [esi+15*4]						\
-						__asm movhps	xmm5, [esi+16*4]
-
-						#define MUL_Nx6_6x3_ROW( row )							\
-						__asm movss		xmm7, [edi+row*24+0]					\
-						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm7, xmm0								\
-						__asm movss		xmm6, [edi+row*24+4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm1								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+row*24+8]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm2								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+row*24+12]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm3								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+row*24+16]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm4								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+row*24+20]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm5								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		[eax+row*12+0], xmm7					\
-						__asm movhps	[eax+row*12+4], xmm7
-
-						MUL_Nx6_6x3_INIT
-						MUL_Nx6_6x3_ROW( 0 )
-						MUL_Nx6_6x3_ROW( 1 )
-						MUL_Nx6_6x3_ROW( 2 )
-
-						return;
-					}
-					break;
-				}
-				case 4: {
-					if ( !(l^4) ) {		// 4x6 * 6x4
-
-						#define MUL_Nx6_6x4_INIT								\
-						__asm mov		esi, m2Ptr								\
-						__asm mov		edi, m1Ptr								\
-						__asm mov		eax, dstPtr								\
-						__asm movaps	xmm0, [esi]								\
-						__asm movaps	xmm1, [esi+16]							\
-						__asm movaps	xmm2, [esi+32]							\
-						__asm movaps	xmm3, [esi+48]							\
-						__asm movaps	xmm4, [esi+64]							\
-						__asm movaps	xmm5, [esi+80]
-
-						#define MUL_Nx6_6x4_ROW( row )							\
-						__asm movss		xmm7, [edi+row*24+0]					\
-						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm7, xmm0								\
-						__asm movss		xmm6, [edi+row*24+4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm1								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+row*24+8]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm2								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+row*24+12]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm3								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+row*24+16]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm4								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+row*24+20]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm5								\
-						__asm addps		xmm7, xmm6								\
-						__asm movaps	[eax+row*16], xmm7
-
-						MUL_Nx6_6x4_INIT
-						MUL_Nx6_6x4_ROW( 0 )
-						MUL_Nx6_6x4_ROW( 1 )
-						MUL_Nx6_6x4_ROW( 2 )
-						MUL_Nx6_6x4_ROW( 3 )
-
-						return;
-					}
-					break;
-				}
-				case 5: {
-					if ( !(l^5) ) {		// 5x6 * 6x5
-
-						#define MUL_Nx6_6x5_INIT								\
-						__asm mov		esi, m2Ptr								\
-						__asm mov		edi, m1Ptr								\
-						__asm mov		eax, dstPtr								\
-						__asm movaps	xmm0, [esi]								\
-						__asm movlps	xmm1, [esi+20]							\
-						__asm movhps	xmm1, [esi+28]							\
-						__asm movlps	xmm2, [esi+40]							\
-						__asm movhps	xmm2, [esi+48]							\
-						__asm movlps	xmm3, [esi+60]							\
-						__asm movhps	xmm3, [esi+68]							\
-						__asm movaps	xmm4, [esi+80]							\
-						__asm movlps	xmm5, [esi+100]							\
-						__asm movhps	xmm5, [esi+108]
-
-						#define MUL_Nx6_6x5_ROW( row )							\
-						__asm movss		xmm7, [edi+row*24+0]					\
-						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm7, xmm0								\
-						__asm fld		dword ptr [edi+(row*6+0)*4]				\
-						__asm fmul		dword ptr [esi+(4+0*5)*4]				\
-						__asm movss		xmm6, [edi+row*24+4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm1								\
-						__asm addps		xmm7, xmm6								\
-						__asm fld		dword ptr [edi+(row*6+1)*4]				\
-						__asm fmul		dword ptr [esi+(4+1*5)*4]				\
-						__asm faddp		st(1),st								\
-						__asm movss		xmm6, [edi+row*24+8]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm2								\
-						__asm addps		xmm7, xmm6								\
-						__asm fld		dword ptr [edi+(row*6+2)*4]				\
-						__asm fmul		dword ptr [esi+(4+2*5)*4]				\
-						__asm faddp		st(1),st								\
-						__asm movss		xmm6, [edi+row*24+12]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm3								\
-						__asm addps		xmm7, xmm6								\
-						__asm fld		dword ptr [edi+(row*6+3)*4]				\
-						__asm fmul		dword ptr [esi+(4+3*5)*4]				\
-						__asm faddp		st(1),st								\
-						__asm movss		xmm6, [edi+row*24+16]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm4								\
-						__asm addps		xmm7, xmm6								\
-						__asm fld		dword ptr [edi+(row*6+4)*4]				\
-						__asm fmul		dword ptr [esi+(4+4*5)*4]				\
-						__asm faddp		st(1),st								\
-						__asm movss		xmm6, [edi+row*24+20]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm5								\
-						__asm addps		xmm7, xmm6								\
-						__asm fld		dword ptr [edi+(row*6+5)*4]				\
-						__asm fmul		dword ptr [esi+(4+5*5)*4]				\
-						__asm faddp		st(1),st								\
-						__asm fstp		dword ptr [eax+(row*5+4)*4]				\
-						__asm movlps	[eax+row*20], xmm7						\
-						__asm movhps	[eax+row*20+8], xmm7
-
-						MUL_Nx6_6x5_INIT
-						MUL_Nx6_6x5_ROW( 0 )
-						MUL_Nx6_6x5_ROW( 1 )
-						MUL_Nx6_6x5_ROW( 2 )
-						MUL_Nx6_6x5_ROW( 3 )
-						MUL_Nx6_6x5_ROW( 4 )
-
-						return;
-					}
-					break;
-				}
-				case 6: {
-					switch( l ) {
-						case 1: {		// 6x6 * 6x1
-							__asm {
-								mov			esi, m2Ptr
-								mov			edi, m1Ptr
-								mov			eax, dstPtr
-								movlps		xmm7, qword ptr [esi]
-								movlps		xmm6, qword ptr [esi+8]
-								shufps		xmm7, xmm7, 0x44
-								shufps		xmm6, xmm6, 0x44
-								movlps		xmm0, qword ptr [edi    ]
-								movhps		xmm0, qword ptr [edi+ 24]
-								mulps		xmm0, xmm7
-								movlps		xmm3, qword ptr [edi+  8]
-								movhps		xmm3, qword ptr [edi+ 32]
-								mulps		xmm3, xmm6
-								movlps		xmm1, qword ptr [edi+ 48]
-								movhps		xmm1, qword ptr [edi+ 72]
-								mulps		xmm1, xmm7
-								movlps		xmm2, qword ptr [edi+ 96]
-								movhps		xmm2, qword ptr [edi+120]
-								mulps		xmm2, xmm7
-								movlps		xmm4, qword ptr [edi+ 56]
-								movhps		xmm4, qword ptr [edi+ 80]
-								movlps		xmm5, qword ptr [edi+104]
-								movhps		xmm5, qword ptr [edi+128]
-								mulps		xmm4, xmm6
-								movlps		xmm7, qword ptr [esi+16]
-								addps		xmm0, xmm3
-								shufps		xmm7, xmm7, 0x44
-								mulps		xmm5, xmm6
-								addps		xmm1, xmm4
-								movlps		xmm3, qword ptr [edi+ 16]
-								movhps		xmm3, qword ptr [edi+ 40]
-								addps		xmm2, xmm5
-								movlps		xmm4, qword ptr [edi+ 64]
-								movhps		xmm4, qword ptr [edi+ 88]
-								mulps		xmm3, xmm7
-								movlps		xmm5, qword ptr [edi+112]
-								movhps		xmm5, qword ptr [edi+136]
-								addps		xmm0, xmm3
-								mulps		xmm4, xmm7
-								mulps		xmm5, xmm7
-								addps		xmm1, xmm4
-								addps		xmm2, xmm5
-								movaps		xmm6, xmm0
-								shufps		xmm0, xmm1, 0x88
-								shufps		xmm6, xmm1, 0xDD
-								movaps		xmm7, xmm2
-								shufps		xmm7, xmm2, 0x88
-								shufps		xmm2, xmm2, 0xDD
-								addps		xmm0, xmm6
-								addps		xmm2, xmm7
-								movlps		[eax], xmm0
-								movhps		[eax+8], xmm0
-								movlps		[eax+16], xmm2
-							}
-							return;
-						}
-						case 2: {		// 6x6 * 6x2
-
-							MUL_Nx6_6x2_INIT
-							MUL_Nx6_6x2_ROW2( 0 )
-							MUL_Nx6_6x2_ROW2( 1 )
-							MUL_Nx6_6x2_ROW2( 2 )
-
-							return;
-						}
-						case 3: {		// 6x6 * 6x3
-
-							MUL_Nx6_6x3_INIT
-							MUL_Nx6_6x3_ROW( 0 )
-							MUL_Nx6_6x3_ROW( 1 )
-							MUL_Nx6_6x3_ROW( 2 )
-							MUL_Nx6_6x3_ROW( 3 )
-							MUL_Nx6_6x3_ROW( 4 )
-							MUL_Nx6_6x3_ROW( 5 )
-
-							return;
-						}
-						case 4: {		// 6x6 * 6x4
-
-							MUL_Nx6_6x4_INIT
-							MUL_Nx6_6x4_ROW( 0 )
-							MUL_Nx6_6x4_ROW( 1 )
-							MUL_Nx6_6x4_ROW( 2 )
-							MUL_Nx6_6x4_ROW( 3 )
-							MUL_Nx6_6x4_ROW( 4 )
-							MUL_Nx6_6x4_ROW( 5 )
-
-							return;
-						}
-						case 5: {		// 6x6 * 6x5
-
-							MUL_Nx6_6x5_INIT
-							MUL_Nx6_6x5_ROW( 0 )
-							MUL_Nx6_6x5_ROW( 1 )
-							MUL_Nx6_6x5_ROW( 2 )
-							MUL_Nx6_6x5_ROW( 3 )
-							MUL_Nx6_6x5_ROW( 4 )
-							MUL_Nx6_6x5_ROW( 5 )
-
-							return;
-						}
-						case 6: {		// 6x6 * 6x6
-							__asm {
-								mov			ecx, dword ptr m2Ptr
-								movlps		xmm3, qword ptr [ecx+72]
-								mov			edx, dword ptr m1Ptr
-								// Loading first 4 columns (upper 4 rows) of m2Ptr.
-								movaps		xmm0, xmmword ptr [ecx]
-								movlps		xmm1, qword ptr [ecx+24]
-								movhps		xmm1, qword ptr [ecx+32]
-								movaps		xmm2, xmmword ptr [ecx+48]
-								movhps		xmm3, qword ptr [ecx+80]
-								// Calculating first 4 elements in the first row of the destination matrix.
-								movss		xmm4, dword ptr [edx]
-								movss		xmm5, dword ptr [edx+4]
-								mov			eax, dword ptr dstPtr
-								shufps		xmm4, xmm4, 0
-								movss		xmm6, dword ptr [edx+8]
-								shufps		xmm5, xmm5, 0
-								movss		xmm7, dword ptr [edx+12]
-								mulps		xmm4, xmm0
-								shufps		xmm6, xmm6, 0
-								shufps		xmm7, xmm7, 0
-								mulps		xmm5, xmm1
-								mulps		xmm6, xmm2
-								addps		xmm5, xmm4
-								mulps		xmm7, xmm3
-								addps		xmm6, xmm5
-								addps		xmm7, xmm6
-								movaps		xmmword ptr [eax], xmm7
-								// Calculating first 4 elements in the second row of the destination matrix.
-								movss		xmm4, dword ptr [edx+24]
-								shufps		xmm4, xmm4, 0
-								mulps		xmm4, xmm0
-								movss		xmm5, dword ptr [edx+28]
-								shufps		xmm5, xmm5, 0
-								mulps		xmm5, xmm1
-								movss		xmm6, dword ptr [edx+32]
-								shufps		xmm6, xmm6, 0
-								movss		xmm7, dword ptr [edx+36]
-								shufps		xmm7, xmm7, 0
-								mulps		xmm6, xmm2
-								mulps		xmm7, xmm3
-								addps		xmm7, xmm6
-								addps		xmm5, xmm4
-								addps		xmm7, xmm5
-								// Calculating first 4 elements in the third row of the destination matrix.
-								movss		xmm4, dword ptr [edx+48]
-								movss		xmm5, dword ptr [edx+52]
-								movlps		qword ptr [eax+24], xmm7 ; save 2nd
-								movhps		qword ptr [eax+32], xmm7 ; row
-								movss		xmm6, dword ptr [edx+56]
-								movss		xmm7, dword ptr [edx+60]
-								shufps		xmm4, xmm4, 0
-								shufps		xmm5, xmm5, 0
-								shufps		xmm6, xmm6, 0
-								shufps		xmm7, xmm7, 0
-								mulps		xmm4, xmm0
-								mulps		xmm5, xmm1
-								mulps		xmm6, xmm2
-								mulps		xmm7, xmm3
-								addps		xmm5, xmm4
-								addps		xmm7, xmm6
-								addps		xmm7, xmm5
-								movaps		xmmword ptr [eax+48], xmm7
-								// Calculating first 4 elements in the fourth row of the destination matrix.
-								movss		xmm4, dword ptr [edx+72]
-								movss		xmm5, dword ptr [edx+76]
-								movss		xmm6, dword ptr [edx+80]
-								movss		xmm7, dword ptr [edx+84]
-								shufps		xmm4, xmm4, 0
-								shufps		xmm5, xmm5, 0
-								shufps		xmm6, xmm6, 0
-								shufps		xmm7, xmm7, 0
-								mulps		xmm4, xmm0
-								mulps		xmm5, xmm1
-								mulps		xmm6, xmm2
-								mulps		xmm7, xmm3
-								addps		xmm4, xmm5
-								addps		xmm6, xmm4
-								addps		xmm7, xmm6
-								movlps		qword ptr [eax+72], xmm7
-								movhps		qword ptr [eax+80], xmm7
-								// Calculating first 4 elements in the fifth row of the destination matrix.
-								movss		xmm4, dword ptr [edx+96]
-								movss		xmm5, dword ptr [edx+100]
-								movss		xmm6, dword ptr [edx+104]
-								movss		xmm7, dword ptr [edx+108]
-								shufps		xmm4, xmm4, 0
-								shufps		xmm5, xmm5, 0
-								shufps		xmm6, xmm6, 0
-								shufps		xmm7, xmm7, 0
-								mulps		xmm4, xmm0
-								mulps		xmm5, xmm1
-								mulps		xmm6, xmm2
-								mulps		xmm7, xmm3
-								addps		xmm5, xmm4
-								addps		xmm7, xmm6
-								addps		xmm7, xmm5
-								movaps		xmmword ptr [eax+96], xmm7
-								// Calculating first 4 elements in the sixth row of the destination matrix.
-								movss		xmm4, dword ptr [edx+120]
-								movss		xmm5, dword ptr [edx+124]
-								movss		xmm6, dword ptr [edx+128]
-								movss		xmm7, dword ptr [edx+132]
-								shufps		xmm4, xmm4, 0
-								shufps		xmm5, xmm5, 0
-								shufps		xmm6, xmm6, 0
-								shufps		xmm7, xmm7, 0
-								mulps		xmm4, xmm0
-								mulps		xmm5, xmm1
-								mulps		xmm6, xmm2
-								mulps		xmm7, xmm3
-								addps		xmm4, xmm5
-								addps		xmm6, xmm4
-								addps		xmm7, xmm6
-								movhps		qword ptr [eax+128], xmm7
-								movlps		qword ptr [eax+120], xmm7
-								// Loading first 4 columns (lower 2 rows) of m2Ptr.
-								movlps		xmm0, qword ptr [ecx+96]
-								movhps		xmm0, qword ptr [ecx+104]
-								movlps		xmm1, qword ptr [ecx+120]
-								movhps		xmm1, qword ptr [ecx+128]
-								// Calculating first 4 elements in the first row of the destination matrix.
-								movss		xmm2, dword ptr [edx+16]
-								shufps		xmm2, xmm2, 0
-								movss		xmm4, dword ptr [edx+40]
-								movss		xmm3, dword ptr [edx+20]
-								movss		xmm5, dword ptr [edx+44]
-								movaps		xmm6, xmmword ptr [eax]
-								movlps		xmm7, qword ptr [eax+24]
-								shufps		xmm3, xmm3, 0
-								shufps		xmm5, xmm5, 0
-								movhps		xmm7, qword ptr [eax+32]
-								shufps		xmm4, xmm4, 0
-								mulps		xmm5, xmm1
-								mulps		xmm2, xmm0
-								mulps		xmm3, xmm1
-								mulps		xmm4, xmm0
-								addps		xmm6, xmm2
-								addps		xmm7, xmm4
-								addps		xmm7, xmm5
-								addps		xmm6, xmm3
-								movlps		qword ptr [eax+24], xmm7
-								movaps		xmmword ptr [eax], xmm6
-								movhps		qword ptr [eax+32], xmm7
-								// Calculating first 4 elements in the third row of the destination matrix.
-								movss		xmm2, dword ptr [edx+64]
-								movss		xmm4, dword ptr [edx+88]
-								movss		xmm5, dword ptr [edx+92]
-								movss		xmm3, dword ptr [edx+68]
-								movaps		xmm6, xmmword ptr [eax+48]
-								movlps		xmm7, qword ptr [eax+72]
-								movhps		xmm7, qword ptr [eax+80]
-								shufps		xmm2, xmm2, 0
-								shufps		xmm4, xmm4, 0
-								shufps		xmm5, xmm5, 0
-								shufps		xmm3, xmm3, 0
-								mulps		xmm2, xmm0
-								mulps		xmm4, xmm0
-								mulps		xmm5, xmm1
-								mulps		xmm3, xmm1
-								addps		xmm6, xmm2
-								addps		xmm6, xmm3
-								addps		xmm7, xmm4
-								addps		xmm7, xmm5
-								movlps		qword ptr [eax+72], xmm7
-								movaps		xmmword ptr [eax+48], xmm6
-								movhps		qword ptr [eax+80], xmm7
-								// Calculating first 4 elements in the fifth row of the destination matrix.
-								movss		xmm2, dword ptr [edx+112]
-								movss		xmm3, dword ptr [edx+116]
-								movaps		xmm6, xmmword ptr [eax+96]
-								shufps		xmm2, xmm2, 0
-								shufps		xmm3, xmm3, 0
-								mulps		xmm2, xmm0
-								mulps		xmm3, xmm1
-								addps		xmm6, xmm2
-								addps		xmm6, xmm3
-								movaps		xmmword ptr [eax+96], xmm6
-								// Calculating first 4 elements in the sixth row of the destination matrix.
-								movss		xmm4, dword ptr [edx+136]
-								movss		xmm5, dword ptr [edx+140]
-								movhps		xmm7, qword ptr [eax+128]
-								movlps		xmm7, qword ptr [eax+120]
-								shufps		xmm4, xmm4, 0
-								shufps		xmm5, xmm5, 0
-								mulps		xmm4, xmm0
-								mulps		xmm5, xmm1
-								addps		xmm7, xmm4
-								addps		xmm7, xmm5
-								// Calculating last 2 columns of the destination matrix.
-								movlps		xmm0, qword ptr [ecx+16]
-								movhps		xmm0, qword ptr [ecx+40]
-								movhps		qword ptr [eax+128], xmm7
-								movlps		qword ptr [eax+120], xmm7
-								movlps		xmm2, qword ptr [ecx+64]
-								movhps		xmm2, qword ptr [ecx+88]
-								movaps		xmm3, xmm2
-								shufps		xmm3, xmm3, 4Eh
-								movlps		xmm4, qword ptr [ecx+112]
-								movhps		xmm4, qword ptr [ecx+136]
-								movaps		xmm5, xmm4
-								shufps		xmm5, xmm5, 4Eh
-								movlps		xmm6, qword ptr [edx]
-								movhps		xmm6, qword ptr [edx+24]
-								movaps		xmm7, xmm6
-								shufps		xmm7, xmm7, 0F0h
-								mulps		xmm7, xmm0
-								shufps		xmm6, xmm6, 0A5h
-								movaps		xmm1, xmm0
-								shufps		xmm1, xmm1, 4Eh
-								mulps		xmm1, xmm6
-								addps		xmm7, xmm1
-								movlps		xmm6, qword ptr [edx+8]
-								movhps		xmm6, qword ptr [edx+32]
-								movaps		xmm1, xmm6
-								shufps		xmm1, xmm1, 0F0h
-								shufps		xmm6, xmm6, 0A5h
-								mulps		xmm1, xmm2
-								mulps		xmm6, xmm3
-								addps		xmm7, xmm1
-								addps		xmm7, xmm6
-								movhps		xmm6, qword ptr [edx+40]
-								movlps		xmm6, qword ptr [edx+16]
-								movaps		xmm1, xmm6
-								shufps		xmm1, xmm1, 0F0h
-								shufps		xmm6, xmm6, 0A5h
-								mulps		xmm1, xmm4
-								mulps		xmm6, xmm5
-								addps		xmm7, xmm1
-								addps		xmm7, xmm6
-								movlps		qword ptr [eax+16], xmm7
-								movhps		qword ptr [eax+40], xmm7
-								movlps		xmm6, qword ptr [edx+48]
-								movhps		xmm6, qword ptr [edx+72]
-								movaps		xmm7, xmm6
-								shufps		xmm7, xmm7, 0F0h
-								mulps		xmm7, xmm0
-								shufps		xmm6, xmm6, 0A5h
-								movaps		xmm1, xmm0
-								shufps		xmm1, xmm1, 4Eh
-								mulps		xmm1, xmm6
-								addps		xmm7, xmm1
-								movhps		xmm6, qword ptr [edx+80]
-								movlps		xmm6, qword ptr [edx+56]
-								movaps		xmm1, xmm6
-								shufps		xmm1, xmm1, 0F0h
-								shufps		xmm6, xmm6, 0A5h
-								mulps		xmm1, xmm2
-								mulps		xmm6, xmm3
-								addps		xmm7, xmm1
-								addps		xmm7, xmm6
-								movlps		xmm6, qword ptr [edx+64]
-								movhps		xmm6, qword ptr [edx+88]
-								movaps		xmm1, xmm6
-								shufps		xmm1, xmm1, 0F0h
-								shufps		xmm6, xmm6, 0A5h
-								mulps		xmm1, xmm4
-								mulps		xmm6, xmm5
-								addps		xmm7, xmm1
-								addps		xmm7, xmm6
-								movlps		qword ptr [eax+64], xmm7
-								movhps		qword ptr [eax+88], xmm7
-								movlps		xmm6, qword ptr [edx+96]
-								movhps		xmm6, qword ptr [edx+120]
-								movaps		xmm7, xmm6
-								shufps		xmm7, xmm7, 0F0h
-								mulps		xmm7, xmm0
-								shufps		xmm6, xmm6, 0A5h
-								movaps		xmm1, xmm0
-								shufps		xmm1, xmm1, 4Eh
-								mulps		xmm1, xmm6
-								addps		xmm7, xmm1
-								movlps		xmm6, qword ptr [edx+104]
-								movhps		xmm6, qword ptr [edx+128]
-								movaps		xmm1, xmm6
-								shufps		xmm1, xmm1, 0F0h
-								shufps		xmm6, xmm6, 0A5h
-								mulps		xmm1, xmm2
-								mulps		xmm6, xmm3
-								addps		xmm7, xmm1
-								addps		xmm7, xmm6
-								movlps		xmm6, qword ptr [edx+112]
-								movhps		xmm6, qword ptr [edx+136]
-								movaps		xmm1, xmm6
-								shufps		xmm1, xmm1, 0F0h
-								shufps		xmm6, xmm6, 0A5h
-								mulps		xmm1, xmm4
-								mulps		xmm6, xmm5
-								addps		xmm7, xmm1
-								addps		xmm7, xmm6
-								movlps		qword ptr [eax+112], xmm7
-								movhps		qword ptr [eax+136], xmm7
-							}
-							return;
-						}
-					}
-				}
 			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
-									 m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
-					m2Ptr++;
-				}
-				m1Ptr += 6;
+		}
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2 * l] +
+				            m1Ptr[3] * m2Ptr[3 * l] + m1Ptr[4] * m2Ptr[4 * l] + m1Ptr[5] * m2Ptr[5 * l];
+				m2Ptr++;
 			}
-			break;
+			m1Ptr += 6;
 		}
-		default: {
-			for ( i = 0; i < k; i++ ) {
-				for ( j = 0; j < l; j++ ) {
-					m2Ptr = m2.ToFloatPtr() + j;
-					sum = m1Ptr[0] * m2Ptr[0];
-					for ( n = 1; n < m1.GetNumColumns(); n++ ) {
-						m2Ptr += l;
-						sum += m1Ptr[n] * m2Ptr[0];
-					}
-					*dstPtr++ = sum;
+		break;
+	}
+	default: {
+		for ( i = 0; i < k; i++ ) {
+			for ( j = 0; j < l; j++ ) {
+				m2Ptr = m2.ToFloatPtr() + j;
+				sum = m1Ptr[0] * m2Ptr[0];
+				for ( n = 1; n < m1.GetNumColumns(); n++ ) {
+					m2Ptr += l;
+					sum += m1Ptr[n] * m2Ptr[0];
 				}
-				m1Ptr += m1.GetNumColumns();
+				*dstPtr++ = sum;
 			}
-			break;
+			m1Ptr += m1.GetNumColumns();
 		}
+		break;
+	}
 	}
 }
 
@@ -9620,35 +9716,35 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m
 	k = m1.GetNumColumns();
 	l = m2.GetNumColumns();
 
-	switch( m1.GetNumRows() ) {
-		case 1:
-			if ( !((k^6)|(l^1)) ) {			// 1x6 * 1x1
-				__asm {
-					mov		esi, m2Ptr
-					mov		edi, m1Ptr
-					mov		eax, dstPtr
-					movss	xmm0, [esi]
-					shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-					movaps	xmm1, xmm0
-					mulps	xmm0, [edi]
-					mulps	xmm1, [edi+16]
-					movaps	[eax], xmm0
-					movlps	[eax+16], xmm1
-				}
-				return;
+	switch ( m1.GetNumRows() ) {
+	case 1:
+		if ( !( ( k ^ 6 ) | ( l ^ 1 ) ) ) {			// 1x6 * 1x1
+			__asm {
+				mov		esi, m2Ptr
+				mov		edi, m1Ptr
+				mov		eax, dstPtr
+				movss	xmm0, [esi]
+				shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
+				movaps	xmm1, xmm0
+				mulps	xmm0, [edi]
+				mulps	xmm1, [edi + 16]
+				movaps	[eax], xmm0
+				movlps	[eax + 16], xmm1
 			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0];
-					m2Ptr++;
-				}
-				m1Ptr++;
+			return;
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0];
+				m2Ptr++;
 			}
-			break;
-		case 2:
-			if ( !((k^6)|(l^2)) ) {			// 2x6 * 2x2
-				#define MUL_2xN_2x2_INIT								\
+			m1Ptr++;
+		}
+		break;
+	case 2:
+		if ( !( ( k ^ 6 ) | ( l ^ 2 ) ) ) {			// 2x6 * 2x2
+#define MUL_2xN_2x2_INIT								\
 				__asm mov		esi, m2Ptr								\
 				__asm mov		edi, m1Ptr								\
 				__asm mov		eax, dstPtr								\
@@ -9657,7 +9753,7 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m
 				__asm movlps	xmm1, [esi+8]							\
 				__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )
 
-				#define MUL_2xN_2x2_ROW2( N, row )						\
+#define MUL_2xN_2x2_ROW2( N, row )						\
 				__asm movlps	xmm6, [edi+(row+0*N)*4]					\
 				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
 				__asm movlps	xmm7, [edi+(row+1*N)*4]					\
@@ -9667,26 +9763,26 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m
 				__asm addps		xmm6, xmm7								\
 				__asm movaps	[eax+(row*2)*4], xmm6
 
-				MUL_2xN_2x2_INIT
-				MUL_2xN_2x2_ROW2( 6, 0 )
-				MUL_2xN_2x2_ROW2( 6, 2 )
-				MUL_2xN_2x2_ROW2( 6, 4 )
+			MUL_2xN_2x2_INIT
+			MUL_2xN_2x2_ROW2( 6, 0 )
+			MUL_2xN_2x2_ROW2( 6, 2 )
+			MUL_2xN_2x2_ROW2( 6, 4 )
 
-				return;
-			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
-					m2Ptr++;
-				}
-				m1Ptr++;
+			return;
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
+				m2Ptr++;
 			}
-			break;
-		case 3:
-			if ( !((k^6)|(l^3)) ) {			// 3x6 * 3x3
+			m1Ptr++;
+		}
+		break;
+	case 3:
+		if ( !( ( k ^ 6 ) | ( l ^ 3 ) ) ) {			// 3x6 * 3x3
 
-				#define MUL_3xN_3x3_INIT								\
+#define MUL_3xN_3x3_INIT								\
 				__asm mov		esi, m2Ptr								\
 				__asm mov		edi, m1Ptr								\
 				__asm mov		eax, dstPtr								\
@@ -9697,12 +9793,12 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m
 				__asm movss		xmm2, [esi+(2*3+0)*4]					\
 				__asm movhps	xmm2, [esi+(2*3+1)*4]
 
-				#define MUL_3xN_3x3_INIT_ROW4							\
+#define MUL_3xN_3x3_INIT_ROW4							\
 				__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 )	\
 				__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 )	\
 				__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 )
 
-				#define MUL_3xN_3x3_ROW4( N, row )						\
+#define MUL_3xN_3x3_ROW4( N, row )						\
 				__asm movlps	xmm3, [edi+(row+0*N+0)*4]				\
 				__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 )	\
 				__asm movlps	xmm4, [edi+(row+1*N+0)*4]				\
@@ -9746,17 +9842,17 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m
 				__asm addps		xmm3, xmm5								\
 				__asm movaps	[eax+(row*3+8)*4], xmm3
 
-				#define MUL_3xN_3x3_INIT_ROW4_ROW4						\
+#define MUL_3xN_3x3_INIT_ROW4_ROW4						\
 				__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )	\
 				__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )	\
 				__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-				#define MUL_3xN_3x3_INIT_ROW4_ROW						\
+#define MUL_3xN_3x3_INIT_ROW4_ROW						\
 				__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 )	\
 				__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 )	\
 				__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 )
 
-				#define MUL_3xN_3x3_ROW( N, row )						\
+#define MUL_3xN_3x3_ROW( N, row )						\
 				__asm movss		xmm3, [edi+(row+0*N)*4]					\
 				__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )	\
 				__asm movss		xmm4, [edi+(row+1*N)*4]					\
@@ -9771,28 +9867,28 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m
 				__asm movss		[eax+(row*3+0)*4], xmm3					\
 				__asm movhps	[eax+(row*3+1)*4], xmm3
 
-				MUL_3xN_3x3_INIT
-				MUL_3xN_3x3_INIT_ROW4
-				MUL_3xN_3x3_ROW4( 6, 0 )
-				MUL_3xN_3x3_INIT_ROW4_ROW
-				MUL_3xN_3x3_ROW( 6, 4 )
-				MUL_3xN_3x3_ROW( 6, 5 )
+			MUL_3xN_3x3_INIT
+			MUL_3xN_3x3_INIT_ROW4
+			MUL_3xN_3x3_ROW4( 6, 0 )
+			MUL_3xN_3x3_INIT_ROW4_ROW
+			MUL_3xN_3x3_ROW( 6, 4 )
+			MUL_3xN_3x3_ROW( 6, 5 )
 
-				return;
-			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
-					m2Ptr++;
-				}
-				m1Ptr++;
+			return;
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2 * k] * m2Ptr[2 * l];
+				m2Ptr++;
 			}
-			break;
-		case 4:
-			if ( !((k^6)|(l^4)) ) {			// 4x6 * 4x4
+			m1Ptr++;
+		}
+		break;
+	case 4:
+		if ( !( ( k ^ 6 ) | ( l ^ 4 ) ) ) {			// 4x6 * 4x4
 
-				#define MUL_4xN_4x4_INIT								\
+#define MUL_4xN_4x4_INIT								\
 				__asm mov		esi, m2Ptr								\
 				__asm mov		edi, m1Ptr								\
 				__asm mov		eax, dstPtr								\
@@ -9801,7 +9897,7 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m
 				__asm movaps	xmm2, [esi+32]							\
 				__asm movaps	xmm3, [esi+48]
 
-				#define MUL_4xN_4x4_ROW( N, row )						\
+#define MUL_4xN_4x4_ROW( N, row )						\
 				__asm movss		xmm7, [edi+(row+0*N)*4]					\
 				__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
 				__asm mulps		xmm7, xmm0								\
@@ -9819,30 +9915,30 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m
 				__asm addps		xmm7, xmm6								\
 				__asm movaps	[eax+row*16], xmm7
 
-				MUL_4xN_4x4_INIT
-				MUL_4xN_4x4_ROW( 6, 0 )
-				MUL_4xN_4x4_ROW( 6, 1 )
-				MUL_4xN_4x4_ROW( 6, 2 )
-				MUL_4xN_4x4_ROW( 6, 3 )
-				MUL_4xN_4x4_ROW( 6, 4 )
-				MUL_4xN_4x4_ROW( 6, 5 )
+			MUL_4xN_4x4_INIT
+			MUL_4xN_4x4_ROW( 6, 0 )
+			MUL_4xN_4x4_ROW( 6, 1 )
+			MUL_4xN_4x4_ROW( 6, 2 )
+			MUL_4xN_4x4_ROW( 6, 3 )
+			MUL_4xN_4x4_ROW( 6, 4 )
+			MUL_4xN_4x4_ROW( 6, 5 )
 
-				return;
-			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
-									m1Ptr[3*k] * m2Ptr[3*l];
-					m2Ptr++;
-				}
-				m1Ptr++;
+			return;
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2 * k] * m2Ptr[2 * l] +
+				            m1Ptr[3 * k] * m2Ptr[3 * l];
+				m2Ptr++;
 			}
-			break;
-		case 5:
-			if ( !((k^6)|(l^5)) ) {			// 5x6 * 5x5
+			m1Ptr++;
+		}
+		break;
+	case 5:
+		if ( !( ( k ^ 6 ) | ( l ^ 5 ) ) ) {			// 5x6 * 5x5
 
-				#define MUL_5xN_5x5_INIT								\
+#define MUL_5xN_5x5_INIT								\
 				__asm mov		esi, m2Ptr								\
 				__asm mov		edi, m1Ptr								\
 				__asm mov		eax, dstPtr								\
@@ -9857,7 +9953,7 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m
 				__asm movlps	xmm4, [esi+20*4]						\
 				__asm movhps	xmm4, [esi+22*4]
 
-				#define MUL_5xN_5x5_ROW( N, row )						\
+#define MUL_5xN_5x5_ROW( N, row )						\
 				__asm movss		xmm6, [edi+(row+0*N)*4]					\
 				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
 				__asm mulps		xmm6, xmm0								\
@@ -9895,240 +9991,240 @@ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m
 				__asm movlps	[eax+(row*5+0)*4], xmm6					\
 				__asm movhps	[eax+(row*5+2)*4], xmm6
 
-				MUL_5xN_5x5_INIT
-				MUL_5xN_5x5_ROW( 6, 0 )
-				MUL_5xN_5x5_ROW( 6, 1 )
-				MUL_5xN_5x5_ROW( 6, 2 )
-				MUL_5xN_5x5_ROW( 6, 3 )
-				MUL_5xN_5x5_ROW( 6, 4 )
-				MUL_5xN_5x5_ROW( 6, 5 )
+			MUL_5xN_5x5_INIT
+			MUL_5xN_5x5_ROW( 6, 0 )
+			MUL_5xN_5x5_ROW( 6, 1 )
+			MUL_5xN_5x5_ROW( 6, 2 )
+			MUL_5xN_5x5_ROW( 6, 3 )
+			MUL_5xN_5x5_ROW( 6, 4 )
+			MUL_5xN_5x5_ROW( 6, 5 )
+
+			return;
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2 * k] * m2Ptr[2 * l] +
+				            m1Ptr[3 * k] * m2Ptr[3 * l] + m1Ptr[4 * k] * m2Ptr[4 * l];
+				m2Ptr++;
+			}
+			m1Ptr++;
+		}
+		break;
+	case 6:
+		if ( !( l ^ 6 ) ) {
+			switch ( k ) {
+			case 1: {					// 6x1 * 6x6
+#define MUL_6xN_6x6_FIRST4COLUMNS_INIT					\
+					__asm mov		esi, m2Ptr								\
+					__asm mov		edi, m1Ptr								\
+					__asm mov		eax, dstPtr								\
+					__asm movlps	xmm0, [esi+ 0*4]						\
+					__asm movhps	xmm0, [esi+ 2*4]						\
+					__asm movlps	xmm1, [esi+ 6*4]						\
+					__asm movhps	xmm1, [esi+ 8*4]						\
+					__asm movlps	xmm2, [esi+12*4]						\
+					__asm movhps	xmm2, [esi+14*4]						\
+					__asm movlps	xmm3, [esi+18*4]						\
+					__asm movhps	xmm3, [esi+20*4]						\
+					__asm movlps	xmm4, [esi+24*4]						\
+					__asm movhps	xmm4, [esi+26*4]						\
+					__asm movlps	xmm5, [esi+30*4]						\
+					__asm movhps	xmm5, [esi+32*4]
+
+#define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row )			\
+					__asm movss		xmm7, [edi+(row+0*N)*4]					\
+					__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm7, xmm0								\
+					__asm movss		xmm6, [edi+(row+1*N)*4]					\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm6, xmm1								\
+					__asm addps		xmm7, xmm6								\
+					__asm movss		xmm6, [edi+(row+2*N)*4]					\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm6, xmm2								\
+					__asm addps		xmm7, xmm6								\
+					__asm movss		xmm6, [edi+(row+3*N)*4]					\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm6, xmm3								\
+					__asm addps		xmm7, xmm6								\
+					__asm movss		xmm6, [edi+(row+4*N)*4]					\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm6, xmm4								\
+					__asm addps		xmm7, xmm6								\
+					__asm movss		xmm6, [edi+(row+5*N)*4]					\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm6, xmm5								\
+					__asm addps		xmm7, xmm6								\
+					__asm movlps	[eax+(row*6+0)*4], xmm7					\
+					__asm movhps	[eax+(row*6+2)*4], xmm7
+
+#define MUL_6xN_6x6_LAST2COLUMNS_INIT					\
+					__asm movlps	xmm0, [esi+ 4*4]						\
+					__asm movlps	xmm1, [esi+10*4]						\
+					__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+					__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+					__asm movlps	xmm2, [esi+16*4]						\
+					__asm movlps	xmm3, [esi+22*4]						\
+					__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+					__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+					__asm movlps	xmm4, [esi+28*4]						\
+					__asm movlps	xmm5, [esi+34*4]						\
+					__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 )	\
+					__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
+
+#define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row )			\
+					__asm movlps	xmm7, [edi+(row*2+0*N)*4]				\
+					__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+					__asm mulps		xmm7, xmm0								\
+					__asm movlps	xmm6, [edi+(row*2+1*N)*4]				\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+					__asm mulps		xmm6, xmm1								\
+					__asm addps		xmm7, xmm6								\
+					__asm movlps	xmm6, [edi+(row*2+2*N)*4]				\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+					__asm mulps		xmm6, xmm2								\
+					__asm addps		xmm7, xmm6								\
+					__asm movlps	xmm6, [edi+(row*2+3*N)*4]				\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+					__asm mulps		xmm6, xmm3								\
+					__asm addps		xmm7, xmm6								\
+					__asm movlps	xmm6, [edi+(row*2+4*N)*4]				\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+					__asm mulps		xmm6, xmm4								\
+					__asm addps		xmm7, xmm6								\
+					__asm movlps	xmm6, [edi+(row*2+5*N)*4]				\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
+					__asm mulps		xmm6, xmm5								\
+					__asm addps		xmm7, xmm6								\
+					__asm movlps	[eax+(row*12+ 4)*4], xmm7				\
+					__asm movhps	[eax+(row*12+10)*4], xmm7
+
+#define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row )			\
+					__asm movss		xmm7, [edi+(1*N-1)*4]					\
+					__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm7, xmm0								\
+					__asm movss		xmm6, [edi+(2*N-1)*4]					\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm6, xmm1								\
+					__asm addps		xmm7, xmm6								\
+					__asm movss		xmm6, [edi+(3*N-1)*4]					\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm6, xmm2								\
+					__asm addps		xmm7, xmm6								\
+					__asm movss		xmm6, [edi+(4*N-1)*4]					\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm6, xmm3								\
+					__asm addps		xmm7, xmm6								\
+					__asm movss		xmm6, [edi+(5*N-1)*4]					\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm6, xmm4								\
+					__asm addps		xmm7, xmm6								\
+					__asm movss		xmm6, [edi+(6*N-1)*4]					\
+					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
+					__asm mulps		xmm6, xmm5								\
+					__asm addps		xmm7, xmm6								\
+					__asm movlps	[eax+(row*6+4)*4], xmm7
+
+				MUL_6xN_6x6_FIRST4COLUMNS_INIT
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 )
+				MUL_6xN_6x6_LAST2COLUMNS_INIT
+				MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 )
+
+				return;
+			}
+			case 2: {					// 6x2 * 6x6
+
+				MUL_6xN_6x6_FIRST4COLUMNS_INIT
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 )
+				MUL_6xN_6x6_LAST2COLUMNS_INIT
+				MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 )
 
 				return;
 			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
-									m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
-					m2Ptr++;
-				}
-				m1Ptr++;
+			case 3: {					// 6x3 * 6x6
+
+				MUL_6xN_6x6_FIRST4COLUMNS_INIT
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 )
+				MUL_6xN_6x6_LAST2COLUMNS_INIT
+				MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 )
+				MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 )
+
+				return;
 			}
-			break;
-		case 6:
-			if ( !(l^6) ) {
-				switch( k ) {
-					case 1: {					// 6x1 * 6x6
-						#define MUL_6xN_6x6_FIRST4COLUMNS_INIT					\
-						__asm mov		esi, m2Ptr								\
-						__asm mov		edi, m1Ptr								\
-						__asm mov		eax, dstPtr								\
-						__asm movlps	xmm0, [esi+ 0*4]						\
-						__asm movhps	xmm0, [esi+ 2*4]						\
-						__asm movlps	xmm1, [esi+ 6*4]						\
-						__asm movhps	xmm1, [esi+ 8*4]						\
-						__asm movlps	xmm2, [esi+12*4]						\
-						__asm movhps	xmm2, [esi+14*4]						\
-						__asm movlps	xmm3, [esi+18*4]						\
-						__asm movhps	xmm3, [esi+20*4]						\
-						__asm movlps	xmm4, [esi+24*4]						\
-						__asm movhps	xmm4, [esi+26*4]						\
-						__asm movlps	xmm5, [esi+30*4]						\
-						__asm movhps	xmm5, [esi+32*4]
-
-						#define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row )			\
-						__asm movss		xmm7, [edi+(row+0*N)*4]					\
-						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm7, xmm0								\
-						__asm movss		xmm6, [edi+(row+1*N)*4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm1								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+(row+2*N)*4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm2								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+(row+3*N)*4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm3								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+(row+4*N)*4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm4								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+(row+5*N)*4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm5								\
-						__asm addps		xmm7, xmm6								\
-						__asm movlps	[eax+(row*6+0)*4], xmm7					\
-						__asm movhps	[eax+(row*6+2)*4], xmm7
-
-						#define MUL_6xN_6x6_LAST2COLUMNS_INIT					\
-						__asm movlps	xmm0, [esi+ 4*4]						\
-						__asm movlps	xmm1, [esi+10*4]						\
-						__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm movlps	xmm2, [esi+16*4]						\
-						__asm movlps	xmm3, [esi+22*4]						\
-						__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm movlps	xmm4, [esi+28*4]						\
-						__asm movlps	xmm5, [esi+34*4]						\
-						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 )	\
-						__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
-
-						#define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row )			\
-						__asm movlps	xmm7, [edi+(row*2+0*N)*4]				\
-						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm mulps		xmm7, xmm0								\
-						__asm movlps	xmm6, [edi+(row*2+1*N)*4]				\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm mulps		xmm6, xmm1								\
-						__asm addps		xmm7, xmm6								\
-						__asm movlps	xmm6, [edi+(row*2+2*N)*4]				\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm mulps		xmm6, xmm2								\
-						__asm addps		xmm7, xmm6								\
-						__asm movlps	xmm6, [edi+(row*2+3*N)*4]				\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm mulps		xmm6, xmm3								\
-						__asm addps		xmm7, xmm6								\
-						__asm movlps	xmm6, [edi+(row*2+4*N)*4]				\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm mulps		xmm6, xmm4								\
-						__asm addps		xmm7, xmm6								\
-						__asm movlps	xmm6, [edi+(row*2+5*N)*4]				\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
-						__asm mulps		xmm6, xmm5								\
-						__asm addps		xmm7, xmm6								\
-						__asm movlps	[eax+(row*12+ 4)*4], xmm7				\
-						__asm movhps	[eax+(row*12+10)*4], xmm7
-
-						#define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row )			\
-						__asm movss		xmm7, [edi+(1*N-1)*4]					\
-						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm7, xmm0								\
-						__asm movss		xmm6, [edi+(2*N-1)*4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm1								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+(3*N-1)*4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm2								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+(4*N-1)*4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm3								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+(5*N-1)*4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm4								\
-						__asm addps		xmm7, xmm6								\
-						__asm movss		xmm6, [edi+(6*N-1)*4]					\
-						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
-						__asm mulps		xmm6, xmm5								\
-						__asm addps		xmm7, xmm6								\
-						__asm movlps	[eax+(row*6+4)*4], xmm7
-
-						MUL_6xN_6x6_FIRST4COLUMNS_INIT
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 )
-						MUL_6xN_6x6_LAST2COLUMNS_INIT
-						MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 )
-
-						return;
-					}
-					case 2: {					// 6x2 * 6x6
-
-						MUL_6xN_6x6_FIRST4COLUMNS_INIT
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 )
-						MUL_6xN_6x6_LAST2COLUMNS_INIT
-						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 )
-
-						return;
-					}
-					case 3: {					// 6x3 * 6x6
-
-						MUL_6xN_6x6_FIRST4COLUMNS_INIT
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 )
-						MUL_6xN_6x6_LAST2COLUMNS_INIT
-						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 )
-						MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 )
-
-						return;
-					}
-					case 4: {					// 6x4 * 6x6
-
-						MUL_6xN_6x6_FIRST4COLUMNS_INIT
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 )
-						MUL_6xN_6x6_LAST2COLUMNS_INIT
-						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 )
-						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 )
-
-						return;
-					}
-					case 5: {					// 6x5 * 6x6
-
-						MUL_6xN_6x6_FIRST4COLUMNS_INIT
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 )
-						MUL_6xN_6x6_LAST2COLUMNS_INIT
-						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 )
-						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 )
-						MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 )
-
-						return;
-					}
-					case 6: {					// 6x6 * 6x6
-
-						MUL_6xN_6x6_FIRST4COLUMNS_INIT
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 )
-						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 )
-						MUL_6xN_6x6_LAST2COLUMNS_INIT
-						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 )
-						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 )
-						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 )
-
-						return;
-					}
-				}
+			case 4: {					// 6x4 * 6x6
+
+				MUL_6xN_6x6_FIRST4COLUMNS_INIT
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 )
+				MUL_6xN_6x6_LAST2COLUMNS_INIT
+				MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 )
+				MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 )
+
+				return;
 			}
-			for ( i = 0; i < k; i++ ) {
-				m2Ptr = m2.ToFloatPtr();
-				for ( j = 0; j < l; j++ ) {
-					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
-									m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
-					m2Ptr++;
-				}
-				m1Ptr++;
+			case 5: {					// 6x5 * 6x6
+
+				MUL_6xN_6x6_FIRST4COLUMNS_INIT
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 )
+				MUL_6xN_6x6_LAST2COLUMNS_INIT
+				MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 )
+				MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 )
+				MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 )
+
+				return;
 			}
-			break;
-		default:
-			for ( i = 0; i < k; i++ ) {
-				for ( j = 0; j < l; j++ ) {
-					m1Ptr = m1.ToFloatPtr() + i;
-					m2Ptr = m2.ToFloatPtr() + j;
-					sum = m1Ptr[0] * m2Ptr[0];
-					for ( n = 1; n < m1.GetNumRows(); n++ ) {
-						m1Ptr += k;
-						m2Ptr += l;
-						sum += m1Ptr[0] * m2Ptr[0];
-					}
-					*dstPtr++ = sum;
+			case 6: {					// 6x6 * 6x6
+
+				MUL_6xN_6x6_FIRST4COLUMNS_INIT
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 )
+				MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 )
+				MUL_6xN_6x6_LAST2COLUMNS_INIT
+				MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 )
+				MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 )
+				MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 )
+
+				return;
+			}
+			}
+		}
+		for ( i = 0; i < k; i++ ) {
+			m2Ptr = m2.ToFloatPtr();
+			for ( j = 0; j < l; j++ ) {
+				*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2 * k] * m2Ptr[2 * l] +
+				            m1Ptr[3 * k] * m2Ptr[3 * l] + m1Ptr[4 * k] * m2Ptr[4 * l] + m1Ptr[5 * k] * m2Ptr[5 * l];
+				m2Ptr++;
+			}
+			m1Ptr++;
+		}
+		break;
+	default:
+		for ( i = 0; i < k; i++ ) {
+			for ( j = 0; j < l; j++ ) {
+				m1Ptr = m1.ToFloatPtr() + i;
+				m2Ptr = m2.ToFloatPtr() + j;
+				sum = m1Ptr[0] * m2Ptr[0];
+				for ( n = 1; n < m1.GetNumRows(); n++ ) {
+					m1Ptr += k;
+					m2Ptr += l;
+					sum += m1Ptr[0] * m2Ptr[0];
 				}
+				*dstPtr++ = sum;
 			}
+		}
 		break;
 	}
 }
@@ -10156,54 +10252,86 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, co
 
 	// unrolled cases for n < 8
 	if ( n < 8 ) {
-		#define NSKIP( n, s )	((n<<3)|(s&7))
-		switch( NSKIP( n, skip ) ) {
-			case NSKIP( 1, 0 ): x[0] = b[0];
-				return;
-			case NSKIP( 2, 0 ): x[0] = b[0];
-			case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
-				return;
-			case NSKIP( 3, 0 ): x[0] = b[0];
-			case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
-			case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
-				return;
-			case NSKIP( 4, 0 ): x[0] = b[0];
-			case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
-			case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
-			case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
-				return;
-			case NSKIP( 5, 0 ): x[0] = b[0];
-			case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
-			case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
-			case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
-			case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
-				return;
-			case NSKIP( 6, 0 ): x[0] = b[0];
-			case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
-			case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
-			case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
-			case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
-			case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
-				return;
-			case NSKIP( 7, 0 ): x[0] = b[0];
-			case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
-			case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
-			case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
-			case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
-			case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
-			case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
-				return;
+#define NSKIP( n, s )	((n<<3)|(s&7))
+		switch ( NSKIP( n, skip ) ) {
+		case NSKIP( 1, 0 ):
+			x[0] = b[0];
+			return;
+		case NSKIP( 2, 0 ):
+			x[0] = b[0];
+		case NSKIP( 2, 1 ):
+			x[1] = b[1] - lptr[1 * nc + 0] * x[0];
+			return;
+		case NSKIP( 3, 0 ):
+			x[0] = b[0];
+		case NSKIP( 3, 1 ):
+			x[1] = b[1] - lptr[1 * nc + 0] * x[0];
+		case NSKIP( 3, 2 ):
+			x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1];
+			return;
+		case NSKIP( 4, 0 ):
+			x[0] = b[0];
+		case NSKIP( 4, 1 ):
+			x[1] = b[1] - lptr[1 * nc + 0] * x[0];
+		case NSKIP( 4, 2 ):
+			x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1];
+		case NSKIP( 4, 3 ):
+			x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2];
+			return;
+		case NSKIP( 5, 0 ):
+			x[0] = b[0];
+		case NSKIP( 5, 1 ):
+			x[1] = b[1] - lptr[1 * nc + 0] * x[0];
+		case NSKIP( 5, 2 ):
+			x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1];
+		case NSKIP( 5, 3 ):
+			x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2];
+		case NSKIP( 5, 4 ):
+			x[4] = b[4] - lptr[4 * nc + 0] * x[0] - lptr[4 * nc + 1] * x[1] - lptr[4 * nc + 2] * x[2] - lptr[4 * nc + 3] * x[3];
+			return;
+		case NSKIP( 6, 0 ):
+			x[0] = b[0];
+		case NSKIP( 6, 1 ):
+			x[1] = b[1] - lptr[1 * nc + 0] * x[0];
+		case NSKIP( 6, 2 ):
+			x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1];
+		case NSKIP( 6, 3 ):
+			x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2];
+		case NSKIP( 6, 4 ):
+			x[4] = b[4] - lptr[4 * nc + 0] * x[0] - lptr[4 * nc + 1] * x[1] - lptr[4 * nc + 2] * x[2] - lptr[4 * nc + 3] * x[3];
+		case NSKIP( 6, 5 ):
+			x[5] = b[5] - lptr[5 * nc + 0] * x[0] - lptr[5 * nc + 1] * x[1] - lptr[5 * nc + 2] * x[2] - lptr[5 * nc + 3] * x[3] - lptr[5 * nc + 4] * x[4];
+			return;
+		case NSKIP( 7, 0 ):
+			x[0] = b[0];
+		case NSKIP( 7, 1 ):
+			x[1] = b[1] - lptr[1 * nc + 0] * x[0];
+		case NSKIP( 7, 2 ):
+			x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1];
+		case NSKIP( 7, 3 ):
+			x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2];
+		case NSKIP( 7, 4 ):
+			x[4] = b[4] - lptr[4 * nc + 0] * x[0] - lptr[4 * nc + 1] * x[1] - lptr[4 * nc + 2] * x[2] - lptr[4 * nc + 3] * x[3];
+		case NSKIP( 7, 5 ):
+			x[5] = b[5] - lptr[5 * nc + 0] * x[0] - lptr[5 * nc + 1] * x[1] - lptr[5 * nc + 2] * x[2] - lptr[5 * nc + 3] * x[3] - lptr[5 * nc + 4] * x[4];
+		case NSKIP( 7, 6 ):
+			x[6] = b[6] - lptr[6 * nc + 0] * x[0] - lptr[6 * nc + 1] * x[1] - lptr[6 * nc + 2] * x[2] - lptr[6 * nc + 3] * x[3] - lptr[6 * nc + 4] * x[4] - lptr[6 * nc + 5] * x[5];
+			return;
 		}
 		return;
 	}
 
 	// process first 4 rows
-	switch( skip ) {
-		case 0: x[0] = b[0];
-		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
-		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
-		case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
-				skip = 4;
+	switch ( skip ) {
+	case 0:
+		x[0] = b[0];
+	case 1:
+		x[1] = b[1] - lptr[1 * nc + 0] * x[0];
+	case 2:
+		x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1];
+	case 3:
+		x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2];
+		skip = 4;
 	}
 
 	lptr = L[skip];
@@ -10230,56 +10358,56 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, co
 		jnz			loopurow
 
 		// aligned
-	looprow:
+		looprow:
 		mov			ecx, eax
 		neg			ecx
 		movaps		xmm0, [esi+ecx]
 		mulps		xmm0, [edi+ecx]
 		add			ecx, 12*4
 		jg			donedot8
-	dot8:
-		movaps		xmm1, [esi+ecx-(8*4)]
-		mulps		xmm1, [edi+ecx-(8*4)]
+		dot8:
+		movaps		xmm1, [esi+ecx-( 8*4 )]
+		mulps		xmm1, [edi+ecx-( 8*4 )]
 		addps		xmm0, xmm1
-		movaps		xmm3, [esi+ecx-(4*4)]
-		mulps		xmm3, [edi+ecx-(4*4)]
+		movaps		xmm3, [esi+ecx-( 4*4 )]
+		mulps		xmm3, [edi+ecx-( 4*4 )]
 		addps		xmm0, xmm3
 		add			ecx, 8*4
 		jle			dot8
-	donedot8:
+		donedot8:
 		sub			ecx, 4*4
 		jg			donedot4
-	//dot4:
-		movaps		xmm1, [esi+ecx-(4*4)]
-		mulps		xmm1, [edi+ecx-(4*4)]
+		//dot4:
+		movaps		xmm1, [esi+ecx-( 4*4 )]
+		mulps		xmm1, [edi+ecx-( 4*4 )]
 		addps		xmm0, xmm1
 		add			ecx, 4*4
-	donedot4:
+		donedot4:
 		movhlps		xmm1, xmm0
 		addps		xmm0, xmm1
 		movaps		xmm1, xmm0
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
 		addss		xmm0, xmm1
-		sub			ecx, 4*4
+		sub			ecx, 4 * 4
 		jz			dot0
 		add			ecx, 4
 		jz			dot1
 		add			ecx, 4
 		jz			dot2
-	//dot3:
-		movss		xmm1, [esi-(3*4)]
-		mulss		xmm1, [edi-(3*4)]
+		//dot3:
+		movss		xmm1, [esi - ( 3 * 4 )]
+		mulss		xmm1, [edi - ( 3 * 4 )]
 		addss		xmm0, xmm1
-	dot2:
-		movss		xmm3, [esi-(2*4)]
-		mulss		xmm3, [edi-(2*4)]
+		dot2:
+		movss		xmm3, [esi - ( 2 * 4 )]
+		mulss		xmm3, [edi - ( 2 * 4 )]
 		addss		xmm0, xmm3
-	dot1:
-		movss		xmm5, [esi-(1*4)]
-		mulss		xmm5, [edi-(1*4)]
+		dot1:
+		movss		xmm5, [esi - ( 1 * 4 )]
+		mulss		xmm5, [edi - ( 1 * 4 )]
 		addss		xmm0, xmm5
-	dot0:
-		movss		xmm1, [ebx+eax]
+		dot0:
+		movss		xmm1, [ebx + eax]
 		subss		xmm1, xmm0
 		movss		[esi], xmm1
 		add			eax, 4
@@ -10293,63 +10421,63 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, co
 		jmp			looprow
 
 		// unaligned
-	loopurow:
+		loopurow:
 		mov			ecx, eax
 		neg			ecx
-		movups		xmm0, [esi+ecx]
-		movups		xmm1, [edi+ecx]
+		movups		xmm0, [esi + ecx]
+		movups		xmm1, [edi + ecx]
 		mulps		xmm0, xmm1
-		add			ecx, 12*4
+		add			ecx, 12 * 4
 		jg			doneudot8
-	udot8:
-		movups		xmm1, [esi+ecx-(8*4)]
-		movups		xmm2, [edi+ecx-(8*4)]
+		udot8:
+		movups		xmm1, [esi + ecx - ( 8 * 4 )]
+		movups		xmm2, [edi + ecx - ( 8 * 4 )]
 		mulps		xmm1, xmm2
 		addps		xmm0, xmm1
-		movups		xmm3, [esi+ecx-(4*4)]
-		movups		xmm4, [edi+ecx-(4*4)]
+		movups		xmm3, [esi + ecx - ( 4 * 4 )]
+		movups		xmm4, [edi + ecx - ( 4 * 4 )]
 		mulps		xmm3, xmm4
 		addps		xmm0, xmm3
-		add			ecx, 8*4
+		add			ecx, 8 * 4
 		jle			udot8
-	doneudot8:
-		sub			ecx, 4*4
+		doneudot8:
+		sub			ecx, 4 * 4
 		jg			doneudot4
-	//udot4:
-		movups		xmm1, [esi+ecx-(4*4)]
-		movups		xmm2, [edi+ecx-(4*4)]
+		//udot4:
+		movups		xmm1, [esi + ecx - ( 4 * 4 )]
+		movups		xmm2, [edi + ecx - ( 4 * 4 )]
 		mulps		xmm1, xmm2
 		addps		xmm0, xmm1
-		add			ecx, 4*4
-	doneudot4:
+		add			ecx, 4 * 4
+		doneudot4:
 		movhlps		xmm1, xmm0
 		addps		xmm0, xmm1
 		movaps		xmm1, xmm0
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
 		addss		xmm0, xmm1
-		sub			ecx, 4*4
+		sub			ecx, 4 * 4
 		jz			udot0
 		add			ecx, 4
 		jz			udot1
 		add			ecx, 4
 		jz			udot2
-	//udot3:
-		movss		xmm1, [esi-(3*4)]
-		movss		xmm2, [edi-(3*4)]
+		//udot3:
+		movss		xmm1, [esi - ( 3 * 4 )]
+		movss		xmm2, [edi - ( 3 * 4 )]
 		mulss		xmm1, xmm2
 		addss		xmm0, xmm1
-	udot2:
-		movss		xmm3, [esi-(2*4)]
-		movss		xmm4, [edi-(2*4)]
+		udot2:
+		movss		xmm3, [esi - ( 2 * 4 )]
+		movss		xmm4, [edi - ( 2 * 4 )]
 		mulss		xmm3, xmm4
 		addss		xmm0, xmm3
-	udot1:
-		movss		xmm5, [esi-(1*4)]
-		movss		xmm6, [edi-(1*4)]
+		udot1:
+		movss		xmm5, [esi - ( 1 * 4 )]
+		movss		xmm6, [edi - ( 1 * 4 )]
 		mulss		xmm5, xmm6
 		addss		xmm0, xmm5
-	udot0:
-		movss		xmm1, [ebx+eax]
+		udot0:
+		movss		xmm1, [ebx + eax]
 		subss		xmm1, xmm0
 		movss		[esi], xmm1
 		add			eax, 4
@@ -10361,7 +10489,7 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, co
 		add			edi, ecx
 		add			edi, 4
 		jmp			loopurow
-	done:
+		done:
 		pop			ebx
 	}
 }
@@ -10384,51 +10512,51 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo
 
 	// unrolled cases for n < 8
 	if ( n < 8 ) {
-		switch( n ) {
-			case 0:
-				return;
-			case 1:
-				x[0] = b[0];
-				return;
-			case 2:
-				x[1] = b[1];
-				x[0] = b[0] - lptr[1*nc+0] * x[1];
-				return;
-			case 3:
-				x[2] = b[2];
-				x[1] = b[1] - lptr[2*nc+1] * x[2];
-				x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
-				return;
-			case 4:
-				x[3] = b[3];
-				x[2] = b[2] - lptr[3*nc+2] * x[3];
-				x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
-				x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
-				return;
-			case 5:
-				x[4] = b[4];
-				x[3] = b[3] - lptr[4*nc+3] * x[4];
-				x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
-				x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
-				x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
-				return;
-			case 6:
-				x[5] = b[5];
-				x[4] = b[4] - lptr[5*nc+4] * x[5];
-				x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
-				x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
-				x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
-				x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
-				return;
-			case 7:
-				x[6] = b[6];
-				x[5] = b[5] - lptr[6*nc+5] * x[6];
-				x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
-				x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
-				x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
-				x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
-				x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
-				return;
+		switch ( n ) {
+		case 0:
+			return;
+		case 1:
+			x[0] = b[0];
+			return;
+		case 2:
+			x[1] = b[1];
+			x[0] = b[0] - lptr[1 * nc + 0] * x[1];
+			return;
+		case 3:
+			x[2] = b[2];
+			x[1] = b[1] - lptr[2 * nc + 1] * x[2];
+			x[0] = b[0] - lptr[2 * nc + 0] * x[2] - lptr[1 * nc + 0] * x[1];
+			return;
+		case 4:
+			x[3] = b[3];
+			x[2] = b[2] - lptr[3 * nc + 2] * x[3];
+			x[1] = b[1] - lptr[3 * nc + 1] * x[3] - lptr[2 * nc + 1] * x[2];
+			x[0] = b[0] - lptr[3 * nc + 0] * x[3] - lptr[2 * nc + 0] * x[2] - lptr[1 * nc + 0] * x[1];
+			return;
+		case 5:
+			x[4] = b[4];
+			x[3] = b[3] - lptr[4 * nc + 3] * x[4];
+			x[2] = b[2] - lptr[4 * nc + 2] * x[4] - lptr[3 * nc + 2] * x[3];
+			x[1] = b[1] - lptr[4 * nc + 1] * x[4] - lptr[3 * nc + 1] * x[3] - lptr[2 * nc + 1] * x[2];
+			x[0] = b[0] - lptr[4 * nc + 0] * x[4] - lptr[3 * nc + 0] * x[3] - lptr[2 * nc + 0] * x[2] - lptr[1 * nc + 0] * x[1];
+			return;
+		case 6:
+			x[5] = b[5];
+			x[4] = b[4] - lptr[5 * nc + 4] * x[5];
+			x[3] = b[3] - lptr[5 * nc + 3] * x[5] - lptr[4 * nc + 3] * x[4];
+			x[2] = b[2] - lptr[5 * nc + 2] * x[5] - lptr[4 * nc + 2] * x[4] - lptr[3 * nc + 2] * x[3];
+			x[1] = b[1] - lptr[5 * nc + 1] * x[5] - lptr[4 * nc + 1] * x[4] - lptr[3 * nc + 1] * x[3] - lptr[2 * nc + 1] * x[2];
+			x[0] = b[0] - lptr[5 * nc + 0] * x[5] - lptr[4 * nc + 0] * x[4] - lptr[3 * nc + 0] * x[3] - lptr[2 * nc + 0] * x[2] - lptr[1 * nc + 0] * x[1];
+			return;
+		case 7:
+			x[6] = b[6];
+			x[5] = b[5] - lptr[6 * nc + 5] * x[6];
+			x[4] = b[4] - lptr[6 * nc + 4] * x[6] - lptr[5 * nc + 4] * x[5];
+			x[3] = b[3] - lptr[6 * nc + 3] * x[6] - lptr[5 * nc + 3] * x[5] - lptr[4 * nc + 3] * x[4];
+			x[2] = b[2] - lptr[6 * nc + 2] * x[6] - lptr[5 * nc + 2] * x[5] - lptr[4 * nc + 2] * x[4] - lptr[3 * nc + 2] * x[3];
+			x[1] = b[1] - lptr[6 * nc + 1] * x[6] - lptr[5 * nc + 1] * x[5] - lptr[4 * nc + 1] * x[4] - lptr[3 * nc + 1] * x[3] - lptr[2 * nc + 1] * x[2];
+			x[0] = b[0] - lptr[6 * nc + 0] * x[6] - lptr[5 * nc + 0] * x[5] - lptr[4 * nc + 0] * x[4] - lptr[3 * nc + 0] * x[3] - lptr[2 * nc + 0] * x[2] - lptr[1 * nc + 0] * x[1];
+			return;
 		}
 		return;
 	}
@@ -10458,49 +10586,49 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo
 			mov			ebx, b					// ebx = b
 			mov			edx, nc					// edx = nc*sizeof(float)
 			shl			edx, 2
-		process4rows_1:
+			process4rows_1:
 			movlps		xmm0, [ebx+eax*4-16]	// load b[i-2], b[i-1]
 			movhps		xmm0, [ebx+eax*4-8]		// load b[i-4], b[i-3]
 			xor			ecx, ecx
 			sub			eax, m
 			neg			eax
 			jz			done4x4_1
-		process4x4_1:	// process 4x4 blocks
+			process4x4_1:	// process 4x4 blocks
 			movlps		xmm2, [edi+0]
 			movhps		xmm2, [edi+8]
 			add			edi, edx
 			movss		xmm1, [esi+4*ecx+0]
 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-			movlps		xmm3, [edi+0]
-			movhps		xmm3, [edi+8]
+			movlps		xmm3, [edi + 0]
+			movhps		xmm3, [edi + 8]
 			add			edi, edx
 			mulps		xmm1, xmm2
 			subps		xmm0, xmm1
-			movss		xmm1, [esi+4*ecx+4]
+			movss		xmm1, [esi + 4 * ecx + 4]
 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-			movlps		xmm4, [edi+0]
-			movhps		xmm4, [edi+8]
+			movlps		xmm4, [edi + 0]
+			movhps		xmm4, [edi + 8]
 			add			edi, edx
 			mulps		xmm1, xmm3
 			subps		xmm0, xmm1
-			movss		xmm1, [esi+4*ecx+8]
+			movss		xmm1, [esi + 4 * ecx + 8]
 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-			movlps		xmm5, [edi+0]
-			movhps		xmm5, [edi+8]
+			movlps		xmm5, [edi + 0]
+			movhps		xmm5, [edi + 8]
 			add			edi, edx
 			mulps		xmm1, xmm4
 			subps		xmm0, xmm1
-			movss		xmm1, [esi+4*ecx+12]
+			movss		xmm1, [esi + 4 * ecx + 12]
 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
 			add			ecx, 4
 			cmp			ecx, eax
 			mulps		xmm1, xmm5
 			subps		xmm0, xmm1
 			jl			process4x4_1
-		done4x4_1:		// process left over of the 4 rows
-			movlps		xmm2, [edi+0]
-			movhps		xmm2, [edi+8]
-			movss		xmm1, [esi+4*ecx]
+			done4x4_1:		// process left over of the 4 rows
+			movlps		xmm2, [edi + 0]
+			movhps		xmm2, [edi + 8]
+			movss		xmm1, [esi + 4 * ecx]
 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
 			mulps		xmm1, xmm2
 			subps		xmm0, xmm1
@@ -10517,34 +10645,34 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo
 			movaps		xmm3, xmm0
 			shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
 			sub			edi, edx
-			movss		[esi-4], xmm3			// xptr[-1] = s3
+			movss		[esi - 4], xmm3			// xptr[-1] = s3
 			movss		xmm4, xmm3
 			movss		xmm5, xmm3
-			mulss		xmm3, [edi+8]			// lptr[-1*nc+2] * s3
-			mulss		xmm4, [edi+4]			// lptr[-1*nc+1] * s3
-			mulss		xmm5, [edi+0]			// lptr[-1*nc+0] * s3
+			mulss		xmm3, [edi + 8]			// lptr[-1*nc+2] * s3
+			mulss		xmm4, [edi + 4]			// lptr[-1*nc+1] * s3
+			mulss		xmm5, [edi + 0]			// lptr[-1*nc+0] * s3
 			subss		xmm2, xmm3
-			movss		[esi-8], xmm2			// xptr[-2] = s2
+			movss		[esi - 8], xmm2			// xptr[-2] = s2
 			movss		xmm6, xmm2
 			sub			edi, edx
 			subss		xmm0, xmm5
 			subss		xmm1, xmm4
-			mulss		xmm2, [edi+4]			// lptr[-2*nc+1] * s2
-			mulss		xmm6, [edi+0]			// lptr[-2*nc+0] * s2
+			mulss		xmm2, [edi + 4]			// lptr[-2*nc+1] * s2
+			mulss		xmm6, [edi + 0]			// lptr[-2*nc+0] * s2
 			subss		xmm1, xmm2
-			movss		[esi-12], xmm1			// xptr[-3] = s1
+			movss		[esi - 12], xmm1			// xptr[-3] = s1
 			subss		xmm0, xmm6
 			sub			edi, edx
 			cmp			eax, 4
-			mulss		xmm1, [edi+0]			// lptr[-3*nc+0] * s1
+			mulss		xmm1, [edi + 0]			// lptr[-3*nc+0] * s1
 			subss		xmm0, xmm1
-			movss		[esi-16], xmm0			// xptr[-4] = s0
+			movss		[esi - 16], xmm0			// xptr[-4] = s0
 			jl			done4rows_1
 			sub			edi, edx
 			sub			edi, 16
 			sub			esi, 16
 			jmp			process4rows_1
-		done4rows_1:
+			done4rows_1:
 			pop			ebx
 		}
 
@@ -10560,39 +10688,39 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo
 			mov			ebx, b					// ebx = b
 			mov			edx, nc					// edx = nc*sizeof(float)
 			shl			edx, 2
-		process4rows:
+			process4rows:
 			movlps		xmm0, [ebx+eax*4-16]	// load b[i-2], b[i-1]
 			movhps		xmm0, [ebx+eax*4-8]		// load b[i-4], b[i-3]
 			sub			eax, m
 			jz			done4x4
 			neg			eax
 			xor			ecx, ecx
-		process4x4:		// process 4x4 blocks
+			process4x4:		// process 4x4 blocks
 			movlps		xmm2, [edi+0]
 			movhps		xmm2, [edi+8]
 			add			edi, edx
 			movss		xmm1, [esi+4*ecx+0]
 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-			movlps		xmm3, [edi+0]
-			movhps		xmm3, [edi+8]
+			movlps		xmm3, [edi + 0]
+			movhps		xmm3, [edi + 8]
 			add			edi, edx
 			mulps		xmm1, xmm2
 			subps		xmm0, xmm1
-			movss		xmm1, [esi+4*ecx+4]
+			movss		xmm1, [esi + 4 * ecx + 4]
 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-			movlps		xmm4, [edi+0]
-			movhps		xmm4, [edi+8]
+			movlps		xmm4, [edi + 0]
+			movhps		xmm4, [edi + 8]
 			add			edi, edx
 			mulps		xmm1, xmm3
 			subps		xmm0, xmm1
-			movss		xmm1, [esi+4*ecx+8]
+			movss		xmm1, [esi + 4 * ecx + 8]
 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-			movlps		xmm5, [edi+0]
-			movhps		xmm5, [edi+8]
+			movlps		xmm5, [edi + 0]
+			movhps		xmm5, [edi + 8]
 			add			edi, edx
 			mulps		xmm1, xmm4
 			subps		xmm0, xmm1
-			movss		xmm1, [esi+4*ecx+12]
+			movss		xmm1, [esi + 4 * ecx + 12]
 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
 			add			ecx, 4
 			cmp			ecx, eax
@@ -10602,7 +10730,7 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo
 			imul		ecx, edx
 			sub			edi, ecx
 			neg			eax
-		done4x4:		// process left over of the 4 rows
+			done4x4:		// process left over of the 4 rows
 			add			eax, m
 			sub			eax, 4
 			movaps		xmm1, xmm0
@@ -10612,44 +10740,44 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo
 			movaps		xmm3, xmm0
 			shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
 			sub			edi, edx
-			movss		[esi-4], xmm3			// xptr[-1] = s3
+			movss		[esi - 4], xmm3			// xptr[-1] = s3
 			movss		xmm4, xmm3
 			movss		xmm5, xmm3
-			mulss		xmm3, [edi+8]			// lptr[-1*nc+2] * s3
-			mulss		xmm4, [edi+4]			// lptr[-1*nc+1] * s3
-			mulss		xmm5, [edi+0]			// lptr[-1*nc+0] * s3
+			mulss		xmm3, [edi + 8]			// lptr[-1*nc+2] * s3
+			mulss		xmm4, [edi + 4]			// lptr[-1*nc+1] * s3
+			mulss		xmm5, [edi + 0]			// lptr[-1*nc+0] * s3
 			subss		xmm2, xmm3
-			movss		[esi-8], xmm2			// xptr[-2] = s2
+			movss		[esi - 8], xmm2			// xptr[-2] = s2
 			movss		xmm6, xmm2
 			sub			edi, edx
 			subss		xmm0, xmm5
 			subss		xmm1, xmm4
-			mulss		xmm2, [edi+4]			// lptr[-2*nc+1] * s2
-			mulss		xmm6, [edi+0]			// lptr[-2*nc+0] * s2
+			mulss		xmm2, [edi + 4]			// lptr[-2*nc+1] * s2
+			mulss		xmm6, [edi + 0]			// lptr[-2*nc+0] * s2
 			subss		xmm1, xmm2
-			movss		[esi-12], xmm1			// xptr[-3] = s1
+			movss		[esi - 12], xmm1			// xptr[-3] = s1
 			subss		xmm0, xmm6
 			sub			edi, edx
 			cmp			eax, 4
-			mulss		xmm1, [edi+0]			// lptr[-3*nc+0] * s1
+			mulss		xmm1, [edi + 0]			// lptr[-3*nc+0] * s1
 			subss		xmm0, xmm1
-			movss		[esi-16], xmm0			// xptr[-4] = s0
+			movss		[esi - 16], xmm0			// xptr[-4] = s0
 			jl			done4rows
 			sub			edi, edx
 			sub			edi, 16
 			sub			esi, 16
 			jmp			process4rows
-		done4rows:
+			done4rows:
 			pop			ebx
 		}
 	}
 
 	// process left over rows
-	for ( i = (m&3)-1; i >= 0; i-- ) {
+	for ( i = ( m & 3 ) - 1; i >= 0; i-- ) {
 		s0 = b[i];
 		lptr = L[0] + i;
 		for ( j = i + 1; j < n; j++ ) {
-			s0 -= lptr[j*nc] * x[j];
+			s0 -= lptr[j * nc] * x[j];
 		}
 		x[i] = s0;
 	}
@@ -10671,14 +10799,14 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo
 		xptr = x + m;
 		// process 4 rows at a time
 		for ( i = m; i >= 4; i -= 4 ) {
-			s0 = b[i-4];
-			s1 = b[i-3];
-			s2 = b[i-2];
-			s3 = b[i-1];
+			s0 = b[i - 4];
+			s1 = b[i - 3];
+			s2 = b[i - 2];
+			s3 = b[i - 1];
 			// process 4x4 blocks
 			xptr2 = xptr;	// x + i;
 			lptr2 = lptr;	// ptr = L[i] + i - 4;
-			for ( j = 0; j < m-i; j += 4 ) {
+			for ( j = 0; j < m - i; j += 4 ) {
 				t = xptr2[0];
 				s0 -= lptr2[0] * t;
 				s1 -= lptr2[1] * t;
@@ -10740,14 +10868,14 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo
 		xptr = x + m;
 		// process 4 rows at a time
 		for ( i = m; i >= 4; i -= 4 ) {
-			s0 = b[i-4];
-			s1 = b[i-3];
-			s2 = b[i-2];
-			s3 = b[i-1];
+			s0 = b[i - 4];
+			s1 = b[i - 3];
+			s2 = b[i - 2];
+			s3 = b[i - 1];
 			// process 4x4 blocks
 			xptr2 = xptr;	// x + i;
 			lptr2 = lptr;	// ptr = L[i] + i - 4;
-			for ( j = 0; j < m-i; j += 4 ) {
+			for ( j = 0; j < m - i; j += 4 ) {
 				t = xptr2[0];
 				s0 -= lptr2[0] * t;
 				s1 -= lptr2[1] * t;
@@ -10803,7 +10931,7 @@ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, flo
 		s0 = b[i];
 		lptr = L[0] + i;
 		for ( j = i + 1; j < m; j++ ) {
-			s0 -= lptr[j*nc] * x[j];
+			s0 -= lptr[j * nc] * x[j];
 		}
 		x[i] = s0;
 	}
@@ -10827,8 +10955,8 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 	float *v, *diag, *invDiagPtr, *mptr;
 	double s0, s1, s2, sum, d;
 
-	v = (float *) _alloca16( n * sizeof( float ) );
-	diag = (float *) _alloca16( n * sizeof( float ) );
+	v = ( float * ) _alloca16( n * sizeof( float ) );
+	diag = ( float * ) _alloca16( n * sizeof( float ) );
 	invDiagPtr = invDiag.ToFloatPtr();
 
 	nc = mat.GetNumColumns();
@@ -10856,12 +10984,13 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 
 	mptr = mat[0];
 	for ( j = 1; j < n; j++ ) {
-		mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
+		mptr[j * nc + 0] = ( mptr[j * nc + 0] ) * d;
 	}
 
 	mptr = mat[1];
 
-	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
+	v[0] = diag[0] * mptr[0];
+	s0 = v[0] * mptr[0];
 	sum = mptr[1] - s0;
 
 	if ( sum == 0.0f ) {
@@ -10878,13 +11007,15 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 
 	mptr = mat[0];
 	for ( j = 2; j < n; j++ ) {
-		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
+		mptr[j * nc + 1] = ( mptr[j * nc + 1] - v[0] * mptr[j * nc + 0] ) * d;
 	}
 
 	mptr = mat[2];
 
-	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
-	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
+	v[0] = diag[0] * mptr[0];
+	s0 = v[0] * mptr[0];
+	v[1] = diag[1] * mptr[1];
+	s1 = v[1] * mptr[1];
 	sum = mptr[2] - s0 - s1;
 
 	if ( sum == 0.0f ) {
@@ -10901,14 +11032,17 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 
 	mptr = mat[0];
 	for ( j = 3; j < n; j++ ) {
-		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
+		mptr[j * nc + 2] = ( mptr[j * nc + 2] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] ) * d;
 	}
 
 	mptr = mat[3];
 
-	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
-	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
-	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
+	v[0] = diag[0] * mptr[0];
+	s0 = v[0] * mptr[0];
+	v[1] = diag[1] * mptr[1];
+	s1 = v[1] * mptr[1];
+	v[2] = diag[2] * mptr[2];
+	s2 = v[2] * mptr[2];
 	sum = mptr[3] - s0 - s1 - s2;
 
 	if ( sum == 0.0f ) {
@@ -10925,7 +11059,7 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 
 	mptr = mat[0];
 	for ( j = 4; j < n; j++ ) {
-		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
+		mptr[j * nc + 3] = ( mptr[j * nc + 3] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] - v[2] * mptr[j * nc + 2] ) * d;
 	}
 
 	int ncf = nc * sizeof( float );
@@ -10939,158 +11073,158 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 		push		ebx
 		mov			ebx, 4
 
-	loopRow:
-			cmp			ebx, n
-			jge			done
-
-			mov			ecx, ebx				// esi = i
-			shl			ecx, 2					// esi = i * 4
-			mov			edx, diag				// edx = diag
-			add			edx, ecx				// edx = &diag[i]
-			mov			edi, ebx				// edi = i
-			imul		edi, ncf				// edi = i * nc * sizeof( float )
-			add			edi, mptr				// edi = mat[i]
-			add			edi, ecx				// edi = &mat[i][i]
-			mov			esi, v					// ecx = v
-			add			esi, ecx				// ecx = &v[i]
-			mov			eax, invDiagPtr			// eax = invDiagPtr
-			add			eax, ecx				// eax = &invDiagPtr[i]
-			neg			ecx
-
-			movaps		xmm0, [edx+ecx]
-			mulps		xmm0, [edi+ecx]
-			movaps		[esi+ecx], xmm0
-			mulps		xmm0, [edi+ecx]
-			add			ecx, 12*4
-			jg			doneDot8
+		loopRow:
+		cmp			ebx, n
+		jge			done
+
+		mov			ecx, ebx				// ecx = i
+		shl			ecx, 2					// ecx = i * 4
+		mov			edx, diag				// edx = diag
+		add			edx, ecx				// edx = &diag[i]
+		mov			edi, ebx				// edi = i
+		imul		edi, ncf				// edi = i * nc * sizeof( float )
+		add			edi, mptr				// edi = mat[i]
+		add			edi, ecx				// edi = &mat[i][i]
+		mov			esi, v					// esi = v
+		add			esi, ecx				// esi = &v[i]
+		mov			eax, invDiagPtr			// eax = invDiagPtr
+		add			eax, ecx				// eax = &invDiagPtr[i]
+		neg			ecx
+
+		movaps		xmm0, [edx+ecx]
+		mulps		xmm0, [edi+ecx]
+		movaps		[esi+ecx], xmm0
+		mulps		xmm0, [edi+ecx]
+		add			ecx, 12*4
+		jg			doneDot8
 		dot8:
-			movaps		xmm1, [edx+ecx-(8*4)]
-			mulps		xmm1, [edi+ecx-(8*4)]
-			movaps		[esi+ecx-(8*4)], xmm1
-			mulps		xmm1, [edi+ecx-(8*4)]
-			addps		xmm0, xmm1
-			movaps		xmm2, [edx+ecx-(4*4)]
-			mulps		xmm2, [edi+ecx-(4*4)]
-			movaps		[esi+ecx-(4*4)], xmm2
-			mulps		xmm2, [edi+ecx-(4*4)]
-			addps		xmm0, xmm2
-			add			ecx, 8*4
-			jle			dot8
+		movaps		xmm1, [edx+ecx-( 8*4 )]
+		mulps		xmm1, [edi+ecx-( 8*4 )]
+		movaps		[esi+ecx-( 8*4 )], xmm1
+		mulps		xmm1, [edi+ecx-( 8*4 )]
+		addps		xmm0, xmm1
+		movaps		xmm2, [edx+ecx-( 4*4 )]
+		mulps		xmm2, [edi+ecx-( 4*4 )]
+		movaps		[esi+ecx-( 4*4 )], xmm2
+		mulps		xmm2, [edi+ecx-( 4*4 )]
+		addps		xmm0, xmm2
+		add			ecx, 8*4
+		jle			dot8
 		doneDot8:
-			sub			ecx, 4*4
-			jg			doneDot4
-			movaps		xmm1, [edx+ecx-(4*4)]
-			mulps		xmm1, [edi+ecx-(4*4)]
-			movaps		[esi+ecx-(4*4)], xmm1
-			mulps		xmm1, [edi+ecx-(4*4)]
-			addps		xmm0, xmm1
-			add			ecx, 4*4
+		sub			ecx, 4*4
+		jg			doneDot4
+		movaps		xmm1, [edx+ecx-( 4*4 )]
+		mulps		xmm1, [edi+ecx-( 4*4 )]
+		movaps		[esi+ecx-( 4*4 )], xmm1
+		mulps		xmm1, [edi+ecx-( 4*4 )]
+		addps		xmm0, xmm1
+		add			ecx, 4*4
 		doneDot4:
-			sub			ecx, 2*4
-			jg			doneDot2
-			movlps		xmm3, [edx+ecx-(2*4)]
-			movlps		xmm4, [edi+ecx-(2*4)]
-			mulps		xmm3, xmm4
-			movlps		[esi+ecx-(2*4)], xmm3
-			mulps		xmm3, xmm4
-			addps		xmm0, xmm3
-			add			ecx, 2*4
+		sub			ecx, 2*4
+		jg			doneDot2
+		movlps		xmm3, [edx+ecx-( 2*4 )]
+		movlps		xmm4, [edi+ecx-( 2*4 )]
+		mulps		xmm3, xmm4
+		movlps		[esi+ecx-( 2*4 )], xmm3
+		mulps		xmm3, xmm4
+		addps		xmm0, xmm3
+		add			ecx, 2*4
 		doneDot2:
-			sub			ecx, 1*4
-			jg			doneDot1
-			movss		xmm3, [edx+ecx-(1*4)]
-			movss		xmm4, [edi+ecx-(1*4)]
-			mulss		xmm3, xmm4
-			movss		[esi+ecx-(1*4)], xmm3
-			mulss		xmm3, xmm4
-			addss		xmm0, xmm3
+		sub			ecx, 1*4
+		jg			doneDot1
+		movss		xmm3, [edx+ecx-( 1*4 )]
+		movss		xmm4, [edi+ecx-( 1*4 )]
+		mulss		xmm3, xmm4
+		movss		[esi+ecx-( 1*4 )], xmm3
+		mulss		xmm3, xmm4
+		addss		xmm0, xmm3
 		doneDot1:
-			movhlps		xmm2, xmm0
-			addps		xmm0, xmm2
-			movaps		xmm2, xmm0
-			shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
-			addss		xmm0, xmm2
-			movss		xmm1, [edi]
-			subss		xmm1, xmm0
-			movss		[edi], xmm1				// mptr[i] = sum;
-			movss		[edx], xmm1				// diag[i] = sum;
-
-			// if ( sum == 0.0f ) return false;
-			movaps		xmm2, xmm1
-			cmpeqss		xmm2, SIMD_SP_zero
-			andps		xmm2, SIMD_SP_tiny
-			orps		xmm1, xmm2
-
-			rcpss		xmm7, xmm1
-			mulss		xmm1, xmm7
-			mulss		xmm1, xmm7
-			addss		xmm7, xmm7
-			subss		xmm7, xmm1
-			movss		[eax], xmm7				// invDiagPtr[i] = 1.0f / sum;
-
-			mov			edx, n					// edx = n
-			sub			edx, ebx				// edx = n - i
-			dec			edx						// edx = n - i - 1
-			jle			doneSubRow				// if ( i + 1 >= n ) return true;
-
-			mov			eax, ebx				// eax = i
-			shl			eax, 2					// eax = i * 4
-			neg			eax
+		movhlps		xmm2, xmm0
+		addps		xmm0, xmm2
+		movaps		xmm2, xmm0
+		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
+		addss		xmm0, xmm2
+		movss		xmm1, [edi]
+		subss		xmm1, xmm0
+		movss		[edi], xmm1				// mptr[i] = sum;
+		movss		[edx], xmm1				// diag[i] = sum;
+
+		// if ( sum == 0.0f ) OR SIMD_SP_tiny into sum before taking the reciprocal (no branch / early return here)
+		movaps		xmm2, xmm1
+		cmpeqss		xmm2, SIMD_SP_zero
+		andps		xmm2, SIMD_SP_tiny
+		orps		xmm1, xmm2
+
+		rcpss		xmm7, xmm1
+		mulss		xmm1, xmm7
+		mulss		xmm1, xmm7
+		addss		xmm7, xmm7
+		subss		xmm7, xmm1
+		movss		[eax], xmm7				// invDiagPtr[i] = 1.0f / sum;
+
+		mov			edx, n					// edx = n
+		sub			edx, ebx				// edx = n - i
+		dec			edx						// edx = n - i - 1
+		jle			doneSubRow				// if ( i + 1 >= n ) return true;
+
+		mov			eax, ebx				// eax = i
+		shl			eax, 2					// eax = i * 4
+		neg			eax
 
 		loopSubRow:
-				add			edi, ncf
-				mov			ecx, eax
-				movaps		xmm0, [esi+ecx]
-				mulps		xmm0, [edi+ecx]
-				add			ecx, 12*4
-				jg			doneSubDot8
-			subDot8:
-				movaps		xmm1, [esi+ecx-(8*4)]
-				mulps		xmm1, [edi+ecx-(8*4)]
-				addps		xmm0, xmm1
-				movaps		xmm2, [esi+ecx-(4*4)]
-				mulps		xmm2, [edi+ecx-(4*4)]
-				addps		xmm0, xmm2
-				add			ecx, 8*4
-				jle			subDot8
-			doneSubDot8:
-				sub			ecx, 4*4
-				jg			doneSubDot4
-				movaps		xmm1, [esi+ecx-(4*4)]
-				mulps		xmm1, [edi+ecx-(4*4)]
-				addps		xmm0, xmm1
-				add			ecx, 4*4
-			doneSubDot4:
-				sub			ecx, 2*4
-				jg			doneSubDot2
-				movlps		xmm3, [esi+ecx-(2*4)]
-				movlps		xmm4, [edi+ecx-(2*4)]
-				mulps		xmm3, xmm4
-				addps		xmm0, xmm3
-				add			ecx, 2*4
-			doneSubDot2:
-				sub			ecx, 1*4
-				jg			doneSubDot1
-				movss		xmm3, [esi+ecx-(1*4)]
-				movss		xmm4, [edi+ecx-(1*4)]
-				mulss		xmm3, xmm4
-				addss		xmm0, xmm3
-			doneSubDot1:
-				movhlps		xmm2, xmm0
-				addps		xmm0, xmm2
-				movaps		xmm2, xmm0
-				shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
-				addss		xmm0, xmm2
-				movss		xmm1, [edi]
-				subss		xmm1, xmm0
-				mulss		xmm1, xmm7
-				movss		[edi], xmm1
-				dec			edx
-				jg			loopSubRow
+		add			edi, ncf
+		mov			ecx, eax
+		movaps		xmm0, [esi + ecx]
+		mulps		xmm0, [edi + ecx]
+		add			ecx, 12 * 4
+		jg			doneSubDot8
+		subDot8:
+		movaps		xmm1, [esi + ecx - ( 8 * 4 )]
+		mulps		xmm1, [edi + ecx - ( 8 * 4 )]
+		addps		xmm0, xmm1
+		movaps		xmm2, [esi + ecx - ( 4 * 4 )]
+		mulps		xmm2, [edi + ecx - ( 4 * 4 )]
+		addps		xmm0, xmm2
+		add			ecx, 8 * 4
+		jle			subDot8
+		doneSubDot8:
+		sub			ecx, 4 * 4
+		jg			doneSubDot4
+		movaps		xmm1, [esi + ecx - ( 4 * 4 )]
+		mulps		xmm1, [edi + ecx - ( 4 * 4 )]
+		addps		xmm0, xmm1
+		add			ecx, 4 * 4
+		doneSubDot4:
+		sub			ecx, 2 * 4
+		jg			doneSubDot2
+		movlps		xmm3, [esi + ecx - ( 2 * 4 )]
+		movlps		xmm4, [edi + ecx - ( 2 * 4 )]
+		mulps		xmm3, xmm4
+		addps		xmm0, xmm3
+		add			ecx, 2 * 4
+		doneSubDot2:
+		sub			ecx, 1 * 4
+		jg			doneSubDot1
+		movss		xmm3, [esi + ecx - ( 1 * 4 )]
+		movss		xmm4, [edi + ecx - ( 1 * 4 )]
+		mulss		xmm3, xmm4
+		addss		xmm0, xmm3
+		doneSubDot1:
+		movhlps		xmm2, xmm0
+		addps		xmm0, xmm2
+		movaps		xmm2, xmm0
+		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
+		addss		xmm0, xmm2
+		movss		xmm1, [edi]
+		subss		xmm1, xmm0
+		mulss		xmm1, xmm7
+		movss		[edi], xmm1
+		dec			edx
+		jg			loopSubRow
 		doneSubRow:
-			inc		ebx
-			jmp		loopRow
-	done:
+		inc		ebx
+		jmp		loopRow
+		done:
 		pop		ebx
 	}
 
@@ -11102,8 +11236,8 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 	float *v, *diag, *mptr;
 	double s0, s1, s2, s3, sum, d;
 
-	v = (float *) _alloca16( n * sizeof( float ) );
-	diag = (float *) _alloca16( n * sizeof( float ) );
+	v = ( float * ) _alloca16( n * sizeof( float ) );
+	diag = ( float * ) _alloca16( n * sizeof( float ) );
 
 	nc = mat.GetNumColumns();
 
@@ -11128,12 +11262,13 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 
 	mptr = mat[0];
 	for ( j = 1; j < n; j++ ) {
-		mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
+		mptr[j * nc + 0] = ( mptr[j * nc + 0] ) * d;
 	}
 
 	mptr = mat[1];
 
-	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
+	v[0] = diag[0] * mptr[0];
+	s0 = v[0] * mptr[0];
 	sum = mptr[1] - s0;
 
 	if ( sum == 0.0f ) {
@@ -11150,13 +11285,15 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 
 	mptr = mat[0];
 	for ( j = 2; j < n; j++ ) {
-		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
+		mptr[j * nc + 1] = ( mptr[j * nc + 1] - v[0] * mptr[j * nc + 0] ) * d;
 	}
 
 	mptr = mat[2];
 
-	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
-	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
+	v[0] = diag[0] * mptr[0];
+	s0 = v[0] * mptr[0];
+	v[1] = diag[1] * mptr[1];
+	s1 = v[1] * mptr[1];
 	sum = mptr[2] - s0 - s1;
 
 	if ( sum == 0.0f ) {
@@ -11173,14 +11310,17 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 
 	mptr = mat[0];
 	for ( j = 3; j < n; j++ ) {
-		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
+		mptr[j * nc + 2] = ( mptr[j * nc + 2] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] ) * d;
 	}
 
 	mptr = mat[3];
 
-	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
-	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
-	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
+	v[0] = diag[0] * mptr[0];
+	s0 = v[0] * mptr[0];
+	v[1] = diag[1] * mptr[1];
+	s1 = v[1] * mptr[1];
+	v[2] = diag[2] * mptr[2];
+	s2 = v[2] * mptr[2];
 	sum = mptr[3] - s0 - s1 - s2;
 
 	if ( sum == 0.0f ) {
@@ -11197,27 +11337,41 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 
 	mptr = mat[0];
 	for ( j = 4; j < n; j++ ) {
-		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
+		mptr[j * nc + 3] = ( mptr[j * nc + 3] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] - v[2] * mptr[j * nc + 2] ) * d;
 	}
 
 	for ( i = 4; i < n; i++ ) {
 
 		mptr = mat[i];
 
-		v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
-		v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
-		v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
-		v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
-		for ( k = 4; k < i-3; k += 4 ) {
-			v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
-			v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
-			v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
-			v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
+		v[0] = diag[0] * mptr[0];
+		s0 = v[0] * mptr[0];
+		v[1] = diag[1] * mptr[1];
+		s1 = v[1] * mptr[1];
+		v[2] = diag[2] * mptr[2];
+		s2 = v[2] * mptr[2];
+		v[3] = diag[3] * mptr[3];
+		s3 = v[3] * mptr[3];
+		for ( k = 4; k < i - 3; k += 4 ) {
+			v[k + 0] = diag[k + 0] * mptr[k + 0];
+			s0 += v[k + 0] * mptr[k + 0];
+			v[k + 1] = diag[k + 1] * mptr[k + 1];
+			s1 += v[k + 1] * mptr[k + 1];
+			v[k + 2] = diag[k + 2] * mptr[k + 2];
+			s2 += v[k + 2] * mptr[k + 2];
+			v[k + 3] = diag[k + 3] * mptr[k + 3];
+			s3 += v[k + 3] * mptr[k + 3];
 		}
-		switch( i - k ) {
-			case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
-			case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
-			case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
+		switch ( i - k ) {
+		case 3:
+			v[k + 2] = diag[k + 2] * mptr[k + 2];
+			s0 += v[k + 2] * mptr[k + 2];
+		case 2:
+			v[k + 1] = diag[k + 1] * mptr[k + 1];
+			s1 += v[k + 1] * mptr[k + 1];
+		case 1:
+			v[k + 0] = diag[k + 0] * mptr[k + 0];
+			s2 += v[k + 0] * mptr[k + 0];
 		}
 		sum = s3;
 		sum += s2;
@@ -11237,30 +11391,37 @@ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int
 			return true;
 		}
 
-		mptr = mat[i+1];
-		for ( j = i+1; j < n; j++ ) {
+		mptr = mat[i + 1];
+		for ( j = i + 1; j < n; j++ ) {
 			s0 = mptr[0] * v[0];
 			s1 = mptr[1] * v[1];
 			s2 = mptr[2] * v[2];
 			s3 = mptr[3] * v[3];
-			for ( k = 4; k < i-7; k += 8 ) {
-				s0 += mptr[k+0] * v[k+0];
-				s1 += mptr[k+1] * v[k+1];
-				s2 += mptr[k+2] * v[k+2];
-				s3 += mptr[k+3] * v[k+3];
-				s0 += mptr[k+4] * v[k+4];
-				s1 += mptr[k+5] * v[k+5];
-				s2 += mptr[k+6] * v[k+6];
-				s3 += mptr[k+7] * v[k+7];
+			for ( k = 4; k < i - 7; k += 8 ) {
+				s0 += mptr[k + 0] * v[k + 0];
+				s1 += mptr[k + 1] * v[k + 1];
+				s2 += mptr[k + 2] * v[k + 2];
+				s3 += mptr[k + 3] * v[k + 3];
+				s0 += mptr[k + 4] * v[k + 4];
+				s1 += mptr[k + 5] * v[k + 5];
+				s2 += mptr[k + 6] * v[k + 6];
+				s3 += mptr[k + 7] * v[k + 7];
 			}
-			switch( i - k ) {
-				case 7: s0 += mptr[k+6] * v[k+6];
-				case 6: s1 += mptr[k+5] * v[k+5];
-				case 5: s2 += mptr[k+4] * v[k+4];
-				case 4: s3 += mptr[k+3] * v[k+3];
-				case 3: s0 += mptr[k+2] * v[k+2];
-				case 2: s1 += mptr[k+1] * v[k+1];
-				case 1: s2 += mptr[k+0] * v[k+0];
+			switch ( i - k ) {
+			case 7:
+				s0 += mptr[k + 6] * v[k + 6];
+			case 6:
+				s1 += mptr[k + 5] * v[k + 5];
+			case 5:
+				s2 += mptr[k + 4] * v[k + 4];
+			case 4:
+				s3 += mptr[k + 3] * v[k + 3];
+			case 3:
+				s0 += mptr[k + 2] * v[k + 2];
+			case 2:
+				s1 += mptr[k + 1] * v[k + 1];
+			case 1:
+				s2 += mptr[k + 0] * v[k + 0];
 			}
 			sum = s3;
 			sum += s2;
@@ -11313,7 +11474,7 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble
 		ALIGN16( float blendQuat3[4] );
 
 		for ( int j = 0; j < 4; j++ ) {
-			int n = index[i+j];
+			int n = index[i + j];
 
 			jointVert0[j] = joints[n].t[0];
 			jointVert1[j] = joints[n].t[1];
@@ -11551,15 +11712,15 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble
 		cosom[2] += jointQuat3[2] * blendQuat3[2];
 		cosom[3] += jointQuat3[3] * blendQuat3[3];
 
-		signBit[0] = (*(unsigned int *)&cosom[0]) & ( 1 << 31 );
-		signBit[1] = (*(unsigned int *)&cosom[1]) & ( 1 << 31 );
-		signBit[2] = (*(unsigned int *)&cosom[2]) & ( 1 << 31 );
-		signBit[3] = (*(unsigned int *)&cosom[3]) & ( 1 << 31 );
+		signBit[0] = ( *( unsigned int * )&cosom[0] ) & ( 1 << 31 );
+		signBit[1] = ( *( unsigned int * )&cosom[1] ) & ( 1 << 31 );
+		signBit[2] = ( *( unsigned int * )&cosom[2] ) & ( 1 << 31 );
+		signBit[3] = ( *( unsigned int * )&cosom[3] ) & ( 1 << 31 );
 
-		(*(unsigned int *)&cosom[0]) ^= signBit[0];
-		(*(unsigned int *)&cosom[1]) ^= signBit[1];
-		(*(unsigned int *)&cosom[2]) ^= signBit[2];
-		(*(unsigned int *)&cosom[3]) ^= signBit[3];
+		( *( unsigned int * )&cosom[0] ) ^= signBit[0];
+		( *( unsigned int * )&cosom[1] ) ^= signBit[1];
+		( *( unsigned int * )&cosom[2] ) ^= signBit[2];
+		( *( unsigned int * )&cosom[3] ) ^= signBit[3];
 
 		scale0[0] = 1.0f - cosom[0] * cosom[0];
 		scale0[1] = 1.0f - cosom[1] * cosom[1];
@@ -11606,10 +11767,10 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble
 		scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2];
 		scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3];
 
-		(*(unsigned int *)&scale1[0]) ^= signBit[0];
-		(*(unsigned int *)&scale1[1]) ^= signBit[1];
-		(*(unsigned int *)&scale1[2]) ^= signBit[2];
-		(*(unsigned int *)&scale1[3]) ^= signBit[3];
+		( *( unsigned int * )&scale1[0] ) ^= signBit[0];
+		( *( unsigned int * )&scale1[1] ) ^= signBit[1];
+		( *( unsigned int * )&scale1[2] ) ^= signBit[2];
+		( *( unsigned int * )&scale1[3] ) ^= signBit[3];
 
 		jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0];
 		jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1];
@@ -11634,7 +11795,7 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble
 #endif
 
 		for ( int j = 0; j < 4; j++ ) {
-			int n = index[i+j];
+			int n = index[i + j];
 
 			joints[n].t[0] = jointVert0[j];
 			joints[n].t[1] = jointVert1[j];
@@ -11669,9 +11830,9 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble
 
 		cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;
 
-		signBit = (*(unsigned int *)&cosom) & ( 1 << 31 );
+		signBit = ( *( unsigned int * )&cosom ) & ( 1 << 31 );
 
-		(*(unsigned int *)&cosom) ^= signBit;
+		( *( unsigned int * )&cosom ) ^= signBit;
 
 		scale0 = 1.0f - cosom * cosom;
 		scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0;
@@ -11680,7 +11841,7 @@ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *ble
 		scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
 		scale1 = idMath::Sin16( lerp * omega ) * sinom;
 
-		(*(unsigned int *)&scale1) ^= signBit;
+		( *( unsigned int * )&scale1 ) ^= signBit;
 
 		jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
 		jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
@@ -11698,16 +11859,16 @@ void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, con
 
 	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
 	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
-	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );
+	assert( ( int )( &( ( idJointQuat * )0 )->t ) == ( int )( &( ( idJointQuat * )0 )->q ) + ( int )sizeof( ( ( idJointQuat * )0 )->q ) );
 
 	for ( int i = 0; i < numJoints; i++ ) {
 
 		const float *q = jointQuats[i].q.ToFloatPtr();
 		float *m = jointMats[i].ToFloatPtr();
 
-		m[0*4+3] = q[4];
-		m[1*4+3] = q[5];
-		m[2*4+3] = q[6];
+		m[0 * 4 + 3] = q[4];
+		m[1 * 4 + 3] = q[5];
+		m[2 * 4 + 3] = q[6];
 
 		float x2 = q[0] + q[0];
 		float y2 = q[1] + q[1];
@@ -11718,33 +11879,33 @@ void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, con
 			float yy = q[1] * y2;
 			float zz = q[2] * z2;
 
-			m[0*4+0] = 1.0f - yy - zz;
-			m[1*4+1] = 1.0f - xx - zz;
-			m[2*4+2] = 1.0f - xx - yy;
+			m[0 * 4 + 0] = 1.0f - yy - zz;
+			m[1 * 4 + 1] = 1.0f - xx - zz;
+			m[2 * 4 + 2] = 1.0f - xx - yy;
 		}
 
 		{
 			float yz = q[1] * z2;
 			float wx = q[3] * x2;
 
-			m[2*4+1] = yz - wx;
-			m[1*4+2] = yz + wx;
+			m[2 * 4 + 1] = yz - wx;
+			m[1 * 4 + 2] = yz + wx;
 		}
 
 		{
 			float xy = q[0] * y2;
 			float wz = q[3] * z2;
 
-			m[1*4+0] = xy - wz;
-			m[0*4+1] = xy + wz;
+			m[1 * 4 + 0] = xy - wz;
+			m[0 * 4 + 1] = xy + wz;
 		}
 
 		{
 			float xz = q[0] * z2;
 			float wy = q[3] * y2;
 
-			m[0*4+2] = xz - wy;
-			m[2*4+0] = xz + wy;
+			m[0 * 4 + 2] = xz - wy;
+			m[2 * 4 + 0] = xz + wy;
 		}
 	}
 }
@@ -11758,7 +11919,7 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 
 	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
 	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
-	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );
+	assert( ( int )( &( ( idJointQuat * )0 )->t ) == ( int )( &( ( idJointQuat * )0 )->q ) + ( int )sizeof( ( ( idJointQuat * )0 )->q ) );
 
 #if 1
 
@@ -11774,7 +11935,7 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 		add			esi, eax
 		neg			eax
 
-	loopMat4:
+		loopMat4:
 		movss		xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4]
 		movss		xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4]
 		movss		xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4]
@@ -11783,9 +11944,9 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
 
-		movss		xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4]
-		movss		xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4]
-		movss		xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4]
+		movss		xmm0, [esi + eax + 2 * JOINTMAT_SIZE + 0 * 16 + 0 * 4]
+		movss		xmm1, [esi + eax + 2 * JOINTMAT_SIZE + 1 * 16 + 1 * 4]
+		movss		xmm2, [esi + eax + 2 * JOINTMAT_SIZE + 2 * 16 + 2 * 4]
 
 		movss		xmm5, xmm0
 		movss		xmm6, xmm1
@@ -11795,9 +11956,9 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
 
-		movss		xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4]
-		movss		xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4]
-		movss		xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4]
+		movss		xmm0, [esi + eax + 1 * JOINTMAT_SIZE + 0 * 16 + 0 * 4]
+		movss		xmm1, [esi + eax + 1 * JOINTMAT_SIZE + 1 * 16 + 1 * 4]
+		movss		xmm2, [esi + eax + 1 * JOINTMAT_SIZE + 2 * 16 + 2 * 4]
 
 		movss		xmm5, xmm0
 		movss		xmm6, xmm1
@@ -11807,9 +11968,9 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
 
-		movss		xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
-		movss		xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
-		movss		xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]
+		movss		xmm0, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 0 * 4]
+		movss		xmm1, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 1 * 4]
+		movss		xmm2, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 2 * 4]
 
 		movss		xmm5, xmm0
 		movss		xmm6, xmm1
@@ -11881,38 +12042,38 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 
 		// -------------------
 
-		add			edi, 4*JOINTQUAT_SIZE
+		add			edi, 4 * JOINTQUAT_SIZE
 
-		movzx		ecx, byte ptr shuffle[0*4+0]			// ecx = k0
-		movss		[edi+ecx*4-4*JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;
+		movzx		ecx, byte ptr shuffle[0 * 4 + 0]			// ecx = k0
+		movss		[edi + ecx * 4 - 4 * JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;
 
-		movzx		edx, byte ptr shuffle[0*4+1]			// edx = k1
-		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
+		movzx		edx, byte ptr shuffle[0 * 4 + 1]			// edx = k1
+		movss		xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 0 * 4]
 		xorps		xmm4, xmm2
-		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
+		subss		xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 1 * 4]
 		mulss		xmm4, xmm6
-		movss		[edi+edx*4-4*JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
+		movss		[edi + edx * 4 - 4 * JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
 
-		movzx		ecx, byte ptr shuffle[0*4+2]			// ecx = k2
-		movss		xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
+		movzx		ecx, byte ptr shuffle[0 * 4 + 2]			// ecx = k2
+		movss		xmm3, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 2 * 4]
 		xorps		xmm3, xmm1
-		subss		xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
+		subss		xmm3, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 0 * 4]
 		mulss		xmm3, xmm6
-		movss		[edi+ecx*4-4*JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
+		movss		[edi + ecx * 4 - 4 * JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
 
-		movzx		edx, byte ptr shuffle[0*4+3]			// edx = k3
-		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
+		movzx		edx, byte ptr shuffle[0 * 4 + 3]			// edx = k3
+		movss		xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 1 * 4]
 		xorps		xmm4, xmm0
-		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
+		subss		xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 2 * 4]
 		mulss		xmm4, xmm6
-		movss		[edi+edx*4-4*JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
+		movss		[edi + edx * 4 - 4 * JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
 
-		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
-		mov			[edi-4*JOINTQUAT_SIZE+16], ecx			// q[4] = m[0 * 4 + 3];
-		mov			edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
-		mov			[edi-4*JOINTQUAT_SIZE+20], edx			// q[5] = m[1 * 4 + 3];
-		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
-		mov			[edi-4*JOINTQUAT_SIZE+24], ecx			// q[6] = m[2 * 4 + 3];
+		mov			ecx, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 3 * 4]
+		mov			[edi - 4 * JOINTQUAT_SIZE + 16], ecx			// q[4] = m[0 * 4 + 3];
+		mov			edx, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 3 * 4]
+		mov			[edi - 4 * JOINTQUAT_SIZE + 20], edx			// q[5] = m[1 * 4 + 3];
+		mov			ecx, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 3 * 4]
+		mov			[edi - 4 * JOINTQUAT_SIZE + 24], ecx			// q[6] = m[2 * 4 + 3];
 
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
@@ -11920,36 +12081,36 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movzx		ecx, byte ptr shuffle[1*4+0]			// ecx = k0
-		movss		[edi+ecx*4-3*JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;
+		movzx		ecx, byte ptr shuffle[1 * 4 + 0]			// ecx = k0
+		movss		[edi + ecx * 4 - 3 * JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;
 
-		movzx		edx, byte ptr shuffle[1*4+1]			// edx = k1
-		movss		xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4]
+		movzx		edx, byte ptr shuffle[1 * 4 + 1]			// edx = k1
+		movss		xmm4, [esi + eax + 1 * JOINTMAT_SIZE + 1 * 16 + 0 * 4]
 		xorps		xmm4, xmm2
-		subss		xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4]
+		subss		xmm4, [esi + eax + 1 * JOINTMAT_SIZE + 0 * 16 + 1 * 4]
 		mulss		xmm4, xmm6
-		movss		[edi+edx*4-3*JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
+		movss		[edi + edx * 4 - 3 * JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
 
-		movzx		ecx, byte ptr shuffle[1*4+2]			// ecx = k2
-		movss		xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4]
+		movzx		ecx, byte ptr shuffle[1 * 4 + 2]			// ecx = k2
+		movss		xmm3, [esi + eax + 1 * JOINTMAT_SIZE + 0 * 16 + 2 * 4]
 		xorps		xmm3, xmm1
-		subss		xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4]
+		subss		xmm3, [esi + eax + 1 * JOINTMAT_SIZE + 2 * 16 + 0 * 4]
 		mulss		xmm3, xmm6
-		movss		[edi+ecx*4-3*JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
+		movss		[edi + ecx * 4 - 3 * JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
 
-		movzx		edx, byte ptr shuffle[1*4+3]			// edx = k3
-		movss		xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4]
+		movzx		edx, byte ptr shuffle[1 * 4 + 3]			// edx = k3
+		movss		xmm4, [esi + eax + 1 * JOINTMAT_SIZE + 2 * 16 + 1 * 4]
 		xorps		xmm4, xmm0
-		subss		xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4]
+		subss		xmm4, [esi + eax + 1 * JOINTMAT_SIZE + 1 * 16 + 2 * 4]
 		mulss		xmm4, xmm6
-		movss		[edi+edx*4-3*JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
+		movss		[edi + edx * 4 - 3 * JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
 
-		mov			ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4]
-		mov			[edi-3*JOINTQUAT_SIZE+16], ecx			// q[4] = m[0 * 4 + 3];
-		mov			edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4]
-		mov			[edi-3*JOINTQUAT_SIZE+20], edx			// q[5] = m[1 * 4 + 3];
-		mov			ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4]
-		mov			[edi-3*JOINTQUAT_SIZE+24], ecx			// q[6] = m[2 * 4 + 3];
+		mov			ecx, [esi + eax + 1 * JOINTMAT_SIZE + 0 * 16 + 3 * 4]
+		mov			[edi - 3 * JOINTQUAT_SIZE + 16], ecx			// q[4] = m[0 * 4 + 3];
+		mov			edx, [esi + eax + 1 * JOINTMAT_SIZE + 1 * 16 + 3 * 4]
+		mov			[edi - 3 * JOINTQUAT_SIZE + 20], edx			// q[5] = m[1 * 4 + 3];
+		mov			ecx, [esi + eax + 1 * JOINTMAT_SIZE + 2 * 16 + 3 * 4]
+		mov			[edi - 3 * JOINTQUAT_SIZE + 24], ecx			// q[6] = m[2 * 4 + 3];
 
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
@@ -11957,36 +12118,36 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movzx		ecx, byte ptr shuffle[2*4+0]			// ecx = k0
-		movss		[edi+ecx*4-2*JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;
+		movzx		ecx, byte ptr shuffle[2 * 4 + 0]			// ecx = k0
+		movss		[edi + ecx * 4 - 2 * JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;
 
-		movzx		edx, byte ptr shuffle[2*4+1]			// edx = k1
-		movss		xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4]
+		movzx		edx, byte ptr shuffle[2 * 4 + 1]			// edx = k1
+		movss		xmm4, [esi + eax + 2 * JOINTMAT_SIZE + 1 * 16 + 0 * 4]
 		xorps		xmm4, xmm2
-		subss		xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4]
+		subss		xmm4, [esi + eax + 2 * JOINTMAT_SIZE + 0 * 16 + 1 * 4]
 		mulss		xmm4, xmm6
-		movss		[edi+edx*4-2*JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
+		movss		[edi + edx * 4 - 2 * JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
 
-		movzx		ecx, byte ptr shuffle[2*4+2]			// ecx = k2
-		movss		xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4]
+		movzx		ecx, byte ptr shuffle[2 * 4 + 2]			// ecx = k2
+		movss		xmm3, [esi + eax + 2 * JOINTMAT_SIZE + 0 * 16 + 2 * 4]
 		xorps		xmm3, xmm1
-		subss		xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4]
+		subss		xmm3, [esi + eax + 2 * JOINTMAT_SIZE + 2 * 16 + 0 * 4]
 		mulss		xmm3, xmm6
-		movss		[edi+ecx*4-2*JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
+		movss		[edi + ecx * 4 - 2 * JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
 
-		movzx		edx, byte ptr shuffle[2*4+3]			// edx = k3
-		movss		xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4]
+		movzx		edx, byte ptr shuffle[2 * 4 + 3]			// edx = k3
+		movss		xmm4, [esi + eax + 2 * JOINTMAT_SIZE + 2 * 16 + 1 * 4]
 		xorps		xmm4, xmm0
-		subss		xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4]
+		subss		xmm4, [esi + eax + 2 * JOINTMAT_SIZE + 1 * 16 + 2 * 4]
 		mulss		xmm4, xmm6
-		movss		[edi+edx*4-2*JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
+		movss		[edi + edx * 4 - 2 * JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
 
-		mov			ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4]
-		mov			[edi-2*JOINTQUAT_SIZE+16], ecx			// q[4] = m[0 * 4 + 3];
-		mov			edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4]
-		mov			[edi-2*JOINTQUAT_SIZE+20], edx			// q[5] = m[1 * 4 + 3];
-		mov			ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4]
-		mov			[edi-2*JOINTQUAT_SIZE+24], ecx			// q[6] = m[2 * 4 + 3];
+		mov			ecx, [esi + eax + 2 * JOINTMAT_SIZE + 0 * 16 + 3 * 4]
+		mov			[edi - 2 * JOINTQUAT_SIZE + 16], ecx			// q[4] = m[0 * 4 + 3];
+		mov			edx, [esi + eax + 2 * JOINTMAT_SIZE + 1 * 16 + 3 * 4]
+		mov			[edi - 2 * JOINTQUAT_SIZE + 20], edx			// q[5] = m[1 * 4 + 3];
+		mov			ecx, [esi + eax + 2 * JOINTMAT_SIZE + 2 * 16 + 3 * 4]
+		mov			[edi - 2 * JOINTQUAT_SIZE + 24], ecx			// q[6] = m[2 * 4 + 3];
 
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
@@ -11994,41 +12155,41 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movzx		ecx, byte ptr shuffle[3*4+0]			// ecx = k0
-		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;
+		movzx		ecx, byte ptr shuffle[3 * 4 + 0]			// ecx = k0
+		movss		[edi + ecx * 4 - 1 * JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;
 
-		movzx		edx, byte ptr shuffle[3*4+1]			// edx = k1
-		movss		xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4]
+		movzx		edx, byte ptr shuffle[3 * 4 + 1]			// edx = k1
+		movss		xmm4, [esi + eax + 3 * JOINTMAT_SIZE + 1 * 16 + 0 * 4]
 		xorps		xmm4, xmm2
-		subss		xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4]
+		subss		xmm4, [esi + eax + 3 * JOINTMAT_SIZE + 0 * 16 + 1 * 4]
 		mulss		xmm4, xmm6
-		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
+		movss		[edi + edx * 4 - 1 * JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
 
-		movzx		ecx, byte ptr shuffle[3*4+2]			// ecx = k2
-		movss		xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4]
+		movzx		ecx, byte ptr shuffle[3 * 4 + 2]			// ecx = k2
+		movss		xmm3, [esi + eax + 3 * JOINTMAT_SIZE + 0 * 16 + 2 * 4]
 		xorps		xmm3, xmm1
-		subss		xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4]
+		subss		xmm3, [esi + eax + 3 * JOINTMAT_SIZE + 2 * 16 + 0 * 4]
 		mulss		xmm3, xmm6
-		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
+		movss		[edi + ecx * 4 - 1 * JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
 
-		movzx		edx, byte ptr shuffle[3*4+3]			// edx = k3
-		movss		xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4]
+		movzx		edx, byte ptr shuffle[3 * 4 + 3]			// edx = k3
+		movss		xmm4, [esi + eax + 3 * JOINTMAT_SIZE + 2 * 16 + 1 * 4]
 		xorps		xmm4, xmm0
-		subss		xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4]
+		subss		xmm4, [esi + eax + 3 * JOINTMAT_SIZE + 1 * 16 + 2 * 4]
 		mulss		xmm4, xmm6
-		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
+		movss		[edi + edx * 4 - 1 * JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
 
-		mov			ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4]
-		mov			[edi-1*JOINTQUAT_SIZE+16], ecx			// q[4] = m[0 * 4 + 3];
-		mov			edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4]
-		mov			[edi-1*JOINTQUAT_SIZE+20], edx			// q[5] = m[1 * 4 + 3];
-		mov			ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4]
-		mov			[edi-1*JOINTQUAT_SIZE+24], ecx			// q[6] = m[2 * 4 + 3];
+		mov			ecx, [esi + eax + 3 * JOINTMAT_SIZE + 0 * 16 + 3 * 4]
+		mov			[edi - 1 * JOINTQUAT_SIZE + 16], ecx			// q[4] = m[0 * 4 + 3];
+		mov			edx, [esi + eax + 3 * JOINTMAT_SIZE + 1 * 16 + 3 * 4]
+		mov			[edi - 1 * JOINTQUAT_SIZE + 20], edx			// q[5] = m[1 * 4 + 3];
+		mov			ecx, [esi + eax + 3 * JOINTMAT_SIZE + 2 * 16 + 3 * 4]
+		mov			[edi - 1 * JOINTQUAT_SIZE + 24], ecx			// q[6] = m[2 * 4 + 3];
 
-		add			eax, 4*JOINTMAT_SIZE
+		add			eax, 4 * JOINTMAT_SIZE
 		jl			loopMat4
 
-	done4:
+		done4:
 		mov			eax, numJoints
 		and			eax, 3
 		jz			done1
@@ -12036,10 +12197,10 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 		add			esi, eax
 		neg			eax
 
-	loopMat1:
-		movss		xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
-		movss		xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
-		movss		xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]
+		loopMat1:
+		movss		xmm5, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 0 * 4]
+		movss		xmm6, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 1 * 4]
+		movss		xmm7, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 2 * 4]
 
 		// -------------------
 
@@ -12109,40 +12270,40 @@ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, c
 
 		movzx		ecx, byte ptr shuffle[0]				// ecx = k0
 		add			edi, JOINTQUAT_SIZE
-		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;
+		movss		[edi + ecx * 4 - 1 * JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;
 
 		movzx		edx, byte ptr shuffle[1]				// edx = k1
-		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
+		movss		xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 0 * 4]
 		xorps		xmm4, xmm2
-		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
+		subss		xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 1 * 4]
 		mulss		xmm4, xmm6
-		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
+		movss		[edi + edx * 4 - 1 * JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
 
 		movzx		ecx, byte ptr shuffle[2]				// ecx = k2
-		movss		xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
+		movss		xmm3, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 2 * 4]
 		xorps		xmm3, xmm1
-		subss		xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
+		subss		xmm3, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 0 * 4]
 		mulss		xmm3, xmm6
-		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
+		movss		[edi + ecx * 4 - 1 * JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
 
 		movzx		edx, byte ptr shuffle[3]				// edx = k3
-		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
+		movss		xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 1 * 4]
 		xorps		xmm4, xmm0
-		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
+		subss		xmm4, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 2 * 4]
 		mulss		xmm4, xmm6
-		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
+		movss		[edi + edx * 4 - 1 * JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
 
-		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
-		mov			[edi-1*JOINTQUAT_SIZE+16], ecx			// q[4] = m[0 * 4 + 3];
-		mov			edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
-		mov			[edi-1*JOINTQUAT_SIZE+20], edx			// q[5] = m[1 * 4 + 3];
-		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
-		mov			[edi-1*JOINTQUAT_SIZE+24], ecx			// q[6] = m[2 * 4 + 3];
+		mov			ecx, [esi + eax + 0 * JOINTMAT_SIZE + 0 * 16 + 3 * 4]
+		mov			[edi - 1 * JOINTQUAT_SIZE + 16], ecx			// q[4] = m[0 * 4 + 3];
+		mov			edx, [esi + eax + 0 * JOINTMAT_SIZE + 1 * 16 + 3 * 4]
+		mov			[edi - 1 * JOINTQUAT_SIZE + 20], edx			// q[5] = m[1 * 4 + 3];
+		mov			ecx, [esi + eax + 0 * JOINTMAT_SIZE + 2 * 16 + 3 * 4]
+		mov			[edi - 1 * JOINTQUAT_SIZE + 24], ecx			// q[6] = m[2 * 4 + 3];
 
 		add			eax, JOINTMAT_SIZE
 		jl			loopMat1
 
-	done1:
+		done1:
 	}
 
 #elif 0
@@ -12291,7 +12452,7 @@ void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *paren
 		add			edi, eax
 		neg			eax
 
-	loopJoint:
+		loopJoint:
 
 		movaps		xmm0, [esi+ecx+ 0]						// xmm0 = m0, m1, m2, t0
 		mov			edx, [edi+eax]
@@ -12303,70 +12464,70 @@ void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *paren
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm4, xmm0
 
-		movss		xmm5, [esi+edx+ 4]
+		movss		xmm5, [esi + edx + 4]
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm5, xmm1
 		addps		xmm4, xmm5
-		movss		xmm6, [esi+edx+ 8]
+		movss		xmm6, [esi + edx + 8]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm6, xmm2
 		addps		xmm4, xmm6
 
-		movss		xmm5, [esi+edx+16]
+		movss		xmm5, [esi + edx + 16]
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm5, xmm0
 
-		movss		xmm7, [esi+edx+12]
+		movss		xmm7, [esi + edx + 12]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
 		addps		xmm4, xmm7
 
-		movaps		[esi+ecx+ 0], xmm4
+		movaps		[esi + ecx + 0], xmm4
 
-		movss		xmm6, [esi+edx+20]
+		movss		xmm6, [esi + edx + 20]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm6, xmm1
 		addps		xmm5, xmm6
-		movss		xmm7, [esi+edx+24]
+		movss		xmm7, [esi + edx + 24]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm7, xmm2
 		addps		xmm5, xmm7
 
-		movss		xmm6, [esi+edx+32]
+		movss		xmm6, [esi + edx + 32]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm6, xmm0
 
-		movss		xmm3, [esi+edx+28]
+		movss		xmm3, [esi + edx + 28]
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
 		addps		xmm5, xmm3
 
-		movaps		[esi+ecx+16], xmm5
+		movaps		[esi + ecx + 16], xmm5
 
-		movss		xmm7, [esi+edx+36]
+		movss		xmm7, [esi + edx + 36]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm7, xmm1
 		addps		xmm6, xmm7
-		movss		xmm3, [esi+edx+40]
+		movss		xmm3, [esi + edx + 40]
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm3, xmm2
 		addps		xmm6, xmm3
 
-		movss		xmm7, [esi+edx+44]
+		movss		xmm7, [esi + edx + 44]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
 		addps		xmm6, xmm7
 
-		movaps		[esi+ecx+32], xmm6
+		movaps		[esi + ecx + 32], xmm6
 
 		add			ecx, JOINTMAT_SIZE
 		add			eax, 4
 		jle			loopJoint
-	done:
+		done:
 	}
 
 #else
 
 	int i;
 
-	for( i = firstJoint; i <= lastJoint; i++ ) {
+	for ( i = firstJoint; i <= lastJoint; i++ ) {
 		assert( parents[i] < i );
 		jointMats[i] *= jointMats[parents[i]];
 	}
@@ -12398,7 +12559,7 @@ void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *par
 		add			edi, edx
 		imul		eax, 4
 
-	loopJoint:
+		loopJoint:
 
 		movaps		xmm0, [esi+ecx+ 0]						// xmm0 = m0, m1, m2, t0
 		mov			edx, [edi+eax]
@@ -12409,66 +12570,66 @@ void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *par
 		movss		xmm6, [esi+edx+12]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
 		subps		xmm0, xmm6
-		movss		xmm7, [esi+edx+28]
+		movss		xmm7, [esi + edx + 28]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
 		subps		xmm1, xmm7
-		movss		xmm3, [esi+edx+44]
+		movss		xmm3, [esi + edx + 44]
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
 		subps		xmm2, xmm3
 
-		movss		xmm4, [esi+edx+ 0]
+		movss		xmm4, [esi + edx + 0]
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm4, xmm0
-		movss		xmm5, [esi+edx+16]
+		movss		xmm5, [esi + edx + 16]
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm5, xmm1
 		addps		xmm4, xmm5
-		movss		xmm6, [esi+edx+32]
+		movss		xmm6, [esi + edx + 32]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm6, xmm2
 		addps		xmm4, xmm6
 
-		movaps		[esi+ecx+ 0], xmm4
+		movaps		[esi + ecx + 0], xmm4
 
-		movss		xmm5, [esi+edx+ 4]
+		movss		xmm5, [esi + edx + 4]
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm5, xmm0
-		movss		xmm6, [esi+edx+20]
+		movss		xmm6, [esi + edx + 20]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm6, xmm1
 		addps		xmm5, xmm6
-		movss		xmm7, [esi+edx+36]
+		movss		xmm7, [esi + edx + 36]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm7, xmm2
 		addps		xmm5, xmm7
 
-		movaps		[esi+ecx+16], xmm5
+		movaps		[esi + ecx + 16], xmm5
 
-		movss		xmm6, [esi+edx+ 8]
+		movss		xmm6, [esi + edx + 8]
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm6, xmm0
-		movss		xmm7, [esi+edx+24]
+		movss		xmm7, [esi + edx + 24]
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm7, xmm1
 		addps		xmm6, xmm7
-		movss		xmm3, [esi+edx+40]
+		movss		xmm3, [esi + edx + 40]
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm3, xmm2
 		addps		xmm6, xmm3
 
-		movaps		[esi+ecx+32], xmm6
+		movaps		[esi + ecx + 32], xmm6
 
 		sub			ecx, JOINTMAT_SIZE
 		sub			eax, 4
 		jge			loopJoint
-	done:
+		done:
 	}
 
 #else
 
 	int i;
 
-	for( i = lastJoint; i >= firstJoint; i-- ) {
+	for ( i = lastJoint; i >= firstJoint; i-- ) {
 		assert( parents[i] < i );
 		jointMats[i] /= jointMats[parents[i]];
 	}
@@ -12485,12 +12646,11 @@ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, c
 #if 1
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
 	assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
 	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
 
-	__asm
-	{
+	__asm {
 		mov			eax, numVerts
 		test		eax, eax
 		jz			done
@@ -12504,7 +12664,7 @@ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, c
 		add			ecx, eax
 		neg			eax
 
-	loopVert:
+		loopVert:
 		mov			ebx, [edx]
 		movaps		xmm2, [esi]
 		add			edx, 8
@@ -12520,7 +12680,7 @@ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, c
 
 		jne			doneWeight
 
-	loopWeight:
+		loopWeight:
 		mov			ebx, [edx]
 		movaps		xmm5, [esi]
 		add			edx, 8
@@ -12540,7 +12700,7 @@ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, c
 
 		je			loopWeight
 
-	doneWeight:
+		doneWeight:
 		add			eax, DRAWVERT_SIZE
 
 		movaps		xmm6, xmm0								// xmm6 =    m0,    m1,          m2,          t0
@@ -12559,24 +12719,24 @@ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, c
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 )	// xmm5 = m7+t2, m6+m8
 		addss		xmm5, xmm6								// xmm5 = m6+m8+m7+t2
 
-		movss		[ecx+eax-DRAWVERT_SIZE+8], xmm5
+		movss		[ecx + eax - DRAWVERT_SIZE + 8], xmm5
 
 		jl			loopVert
-	done:
+		done:
 	}
 
 #else
 
 	int i, j;
-	const byte *jointsPtr = (byte *)joints;
+	const byte *jointsPtr = ( byte * )joints;
 
-	for( j = i = 0; i < numVerts; i++ ) {
+	for ( j = i = 0; i < numVerts; i++ ) {
 		idVec3 v;
 
-		v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
-		while( index[j*2+1] == 0 ) {
+		v = ( *( idJointMat * )( jointsPtr + index[j * 2 + 0] ) ) * weights[j];
+		while ( index[j * 2 + 1] == 0 ) {
 			j++;
-			v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
+			v += ( *( idJointMat * )( jointsPtr + index[j * 2 + 0] ) ) * weights[j];
 		}
 		j++;
 
@@ -12595,7 +12755,7 @@ void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const flo
 #if 1
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
 
 	__asm {
 		push		ebx
@@ -12628,13 +12788,13 @@ void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const flo
 		add			esi, eax
 		neg			eax
 
-	loopVert:
-		movss		xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
+		loopVert:
+		movss		xmm4, [esi + eax + DRAWVERT_XYZ_OFFSET + 0]
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm5, [esi + eax + DRAWVERT_XYZ_OFFSET + 4]
 		mulps		xmm4, xmm0
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movss		xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm6, [esi + eax + DRAWVERT_XYZ_OFFSET + 8]
 		mulps		xmm5, xmm1
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
 		addps		xmm4, xmm5
@@ -12652,10 +12812,10 @@ void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const flo
 		inc			edi
 		or			dl, cl
 		add			eax, DRAWVERT_SIZE
-		mov			byte ptr [edi-1], cl
+		mov			byte ptr [edi - 1], cl
 		jl			loopVert
 
-	done:
+		done:
 		mov			esi, totalOr
 		mov			byte ptr [esi], dl
 		pop			ebx
@@ -12725,7 +12885,7 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c
 	ALIGN16( float p7[4] );
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
 
 	__asm {
 		mov			ecx, planes
@@ -12749,13 +12909,13 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c
 		movaps		p2, xmm2
 		movaps		p3, xmm3
 
-		movlps		xmm4, [ecx+64]							// xmm4 = p40, p41,   X,   X
-		movhps		xmm4, [ecx+80]							// xmm4 = p40, p41, p50, p51
+		movlps		xmm4, [ecx + 64]							// xmm4 = p40, p41,   X,   X
+		movhps		xmm4, [ecx + 80]							// xmm4 = p40, p41, p50, p51
 		movaps		xmm5, xmm4								// xmm5 = p40, p41, p50, p51
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )	// xmm4 = p40, p50, p40, p50
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 )	// xmm5 = p41, p51, p41, p51
-		movlps		xmm6, [ecx+72]							// xmm6 = p42, p43,   X,   X
-		movhps		xmm6, [ecx+88]							// xmm6 = p42, p43, p52, p53
+		movlps		xmm6, [ecx + 72]							// xmm6 = p42, p43,   X,   X
+		movhps		xmm6, [ecx + 88]							// xmm6 = p42, p43, p52, p53
 		movaps		xmm7, xmm6								// xmm7 = p42, p43, p52, p53
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )	// xmm6 = p42, p52, p42, p52
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 )	// xmm7 = p43, p53, p43, p53
@@ -12774,18 +12934,18 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c
 		add			esi, eax
 		neg			eax
 
-	loopVert2:
+		loopVert2:
 		movaps		xmm6, p0
-		movss		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm0, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm6, xmm0
 		movaps		xmm7, p1
-		movss		xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm1, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm7, xmm1
 		addps		xmm6, xmm7
 		movaps		xmm7, p2
-		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm2, [esi + eax + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm7, xmm2
 		addps		xmm6, xmm7
@@ -12795,16 +12955,16 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c
 		movmskps	ecx, xmm6
 
 		movaps		xmm6, p0
-		movss		xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm3, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm6, xmm3
 		movaps		xmm7, p1
-		movss		xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm4, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm7, xmm4
 		addps		xmm6, xmm7
 		movaps		xmm7, p2
-		movss		xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm5, [esi + eax + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm7, xmm5
 		addps		xmm6, xmm7
@@ -12832,30 +12992,30 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c
 		mov			dh, dl
 		shl			dl, 4
 		shl			dh, 2
-		and			edx, (3<<4)|(3<<12)
+		and			edx, ( 3 << 4 ) | ( 3 << 12 )
 		or			ecx, edx
 
-		add			eax, 2*DRAWVERT_SIZE
-		mov			word ptr [edi-2], cx
+		add			eax, 2 * DRAWVERT_SIZE
+		mov			word ptr [edi - 2], cx
 		jl			loopVert2
 
-	done2:
+		done2:
 
 		mov			eax, numVerts
 		and			eax, 1
 		jz			done
 
 		movaps		xmm6, p0
-		movss		xmm0, [esi+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm0, [esi + DRAWVERT_XYZ_OFFSET + 0]
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm6, xmm0
 		movaps		xmm7, p1
-		movss		xmm1, [esi+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm1, [esi + DRAWVERT_XYZ_OFFSET + 4]
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm7, xmm1
 		addps		xmm6, xmm7
 		movaps		xmm7, p2
-		movss		xmm2, [esi+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm2, [esi + DRAWVERT_XYZ_OFFSET + 8]
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm7, xmm2
 		addps		xmm6, xmm7
@@ -12880,7 +13040,7 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c
 
 		mov			byte ptr [edi], cl
 
-	done:
+		done:
 	}
 
 
@@ -12891,8 +13051,8 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c
 	for ( i = 0; i < numVerts; i += 2 ) {
 		unsigned short bits0, bits1;
 		float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
-		const idVec3 &v0 = verts[i+0].xyz;
-		const idVec3 &v1 = verts[i+1].xyz;
+		const idVec3 &v0 = verts[i + 0].xyz;
+		const idVec3 &v1 = verts[i + 1].xyz;
 
 		d0  = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
 		d1  = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
@@ -12909,21 +13069,21 @@ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, c
 		d8  = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
 		d9  = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
 
-		bits0  = FLOATSIGNBITSET( d0  ) << (0+0);
-		bits0 |= FLOATSIGNBITSET( d1  ) << (0+1);
-		bits0 |= FLOATSIGNBITSET( d2  ) << (0+2);
-		bits0 |= FLOATSIGNBITSET( d3  ) << (0+3);
-		bits0 |= FLOATSIGNBITSET( d4  ) << (0+4);
-		bits0 |= FLOATSIGNBITSET( d5  ) << (0+5);
-
-		bits1  = FLOATSIGNBITSET( d6  ) << (8+0);
-		bits1 |= FLOATSIGNBITSET( d7  ) << (8+1);
-		bits1 |= FLOATSIGNBITSET( d8  ) << (8+2);
-		bits1 |= FLOATSIGNBITSET( d9  ) << (8+3);
-		bits1 |= FLOATSIGNBITSET( d10 ) << (8+4);
-		bits1 |= FLOATSIGNBITSET( d11 ) << (8+5);
-
-		*(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F;
+		bits0  = FLOATSIGNBITSET( d0 ) << ( 0 + 0 );
+		bits0 |= FLOATSIGNBITSET( d1 ) << ( 0 + 1 );
+		bits0 |= FLOATSIGNBITSET( d2 ) << ( 0 + 2 );
+		bits0 |= FLOATSIGNBITSET( d3 ) << ( 0 + 3 );
+		bits0 |= FLOATSIGNBITSET( d4 ) << ( 0 + 4 );
+		bits0 |= FLOATSIGNBITSET( d5 ) << ( 0 + 5 );
+
+		bits1  = FLOATSIGNBITSET( d6 ) << ( 8 + 0 );
+		bits1 |= FLOATSIGNBITSET( d7 ) << ( 8 + 1 );
+		bits1 |= FLOATSIGNBITSET( d8 ) << ( 8 + 2 );
+		bits1 |= FLOATSIGNBITSET( d9 ) << ( 8 + 3 );
+		bits1 |= FLOATSIGNBITSET( d10 ) << ( 8 + 4 );
+		bits1 |= FLOATSIGNBITSET( d11 ) << ( 8 + 5 );
+
+		*( unsigned short * )( cullBits + i ) = ( bits0 | bits1 ) ^ 0x3F3F;
 	}
 
 	if ( numVerts & 1 ) {
@@ -12962,7 +13122,7 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con
 #if 1
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
 
 	__asm {
 		mov			eax, numVerts
@@ -12975,16 +13135,16 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con
 		movss		xmm5, [ecx+16]
 		shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
-		movss		xmm5, [ecx+ 4]
-		movss		xmm6, [ecx+20]
+		movss		xmm5, [ecx + 4]
+		movss		xmm6, [ecx + 20]
 		shufps		xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
-		movss		xmm6, [ecx+ 8]
-		movss		xmm7, [ecx+24]
+		movss		xmm6, [ecx + 8]
+		movss		xmm7, [ecx + 24]
 		shufps		xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )
-		movss		xmm7, [ecx+12]
-		movss		xmm0, [ecx+28]
+		movss		xmm7, [ecx + 12]
+		movss		xmm0, [ecx + 28]
 		shufps		xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 )
 
@@ -12993,17 +13153,17 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con
 		add			edi, eax
 		neg			eax
 
-	loopVert2:
-		movss		xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
-		movss		xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		loopVert2:
+		movss		xmm0, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
+		movss		xmm1, [edx + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm0, xmm4
-		movss		xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
-		movss		xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm1, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
+		movss		xmm2, [edx + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
 		shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm1, xmm5
-		movss		xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
-		movss		xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm2, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
+		movss		xmm3, [edx + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
 		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm2, xmm6
 		addps		xmm0, xmm1
@@ -13015,27 +13175,27 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con
 		subps		xmm2, xmm0
 		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
 		shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 )
-		add			edx, 2*DRAWVERT_SIZE
+		add			edx, 2 * DRAWVERT_SIZE
 		movmskps	ecx, xmm0
-		mov			byte ptr [edi+eax+0], cl
-		add			esi, 4*4
+		mov			byte ptr [edi + eax + 0], cl
+		add			esi, 4 * 4
 		movmskps	ecx, xmm1
-		mov			byte ptr [edi+eax+1], cl
+		mov			byte ptr [edi + eax + 1], cl
 		add			eax, 2
 		jl			loopVert2
 
-	done2:
+		done2:
 		mov			eax, numVerts
 		and			eax, 1
 		jz			done
 
-		movss		xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm0, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm0, xmm4
-		movss		xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm1, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm1, xmm5
-		movss		xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm2, [edx + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm2, xmm6
 		addps		xmm0, xmm1
@@ -13049,7 +13209,7 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con
 		movmskps	ecx, xmm0
 		mov			byte ptr [edi], cl
 
-	done:
+		done:
 	}
 
 #else
@@ -13061,18 +13221,18 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con
 		unsigned short bits;
 		float d0, d1, d2, d3;
 
-		const idVec3 &v0 = verts[i+0].xyz;
-		const idVec3 &v1 = verts[i+1].xyz;
+		const idVec3 &v0 = verts[i + 0].xyz;
+		const idVec3 &v1 = verts[i + 1].xyz;
 
 		d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
 		d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
 		d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3];
 		d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3];
 
-		texCoords[i+0][0] = d0;
-		texCoords[i+0][1] = d1;
-		texCoords[i+1][0] = d2;
-		texCoords[i+1][1] = d3;
+		texCoords[i + 0][0] = d0;
+		texCoords[i + 0][1] = d1;
+		texCoords[i + 1][0] = d2;
+		texCoords[i + 1][1] = d3;
 
 		bits  = FLOATSIGNBITSET( d0 ) << 0;
 		bits |= FLOATSIGNBITSET( d1 ) << 1;
@@ -13089,7 +13249,7 @@ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, con
 		bits |= FLOATSIGNBITSET( d2 ) << 10;
 		bits |= FLOATSIGNBITSET( d3 ) << 11;
 
-		*(unsigned short *)(cullBits + i) = bits;
+		*( unsigned short * )( cullBits + i ) = bits;
 	}
 
 	if ( numVerts & 1 ) {
@@ -13130,7 +13290,7 @@ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *vert
 #if 1
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
 
 	__asm {
 		mov			eax, numIndexes
@@ -13145,7 +13305,7 @@ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *vert
 		add			eax, 4*12
 		jge			done4
 
-	loopPlane4:
+		loopPlane4:
 		mov			ebx, [edi+eax-4*12+4]
 		imul		ebx, DRAWVERT_SIZE
 		mov			ecx, [edi+eax-4*12+0]
@@ -13167,129 +13327,129 @@ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *vert
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
 
-		movss		xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
-		subss		xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm3, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0]
+		subss		xmm3, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
 
-		movss		xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
-		subss		xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm4, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4]
+		subss		xmm4, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
 
-		movss		xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
-		subss		xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm5, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8]
+		subss		xmm5, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 
-		mov			ebx, [edi+eax-3*12+4]
+		mov			ebx, [edi + eax - 3 * 12 + 4]
 		imul		ebx, DRAWVERT_SIZE
-		mov			ecx, [edi+eax-3*12+0]
+		mov			ecx, [edi + eax - 3 * 12 + 0]
 		imul		ecx, DRAWVERT_SIZE
 
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
 
-		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
-		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0]
+		subss		xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
 		movss		xmm0, xmm6
 
-		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
-		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4]
+		subss		xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
 		movss		xmm1, xmm7
 
-		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
-		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8]
+		subss		xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 		movss		xmm2, xmm6
 
-		mov			ebx, [edi+eax-3*12+8]
+		mov			ebx, [edi + eax - 3 * 12 + 8]
 		imul		ebx, DRAWVERT_SIZE
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
 
-		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
-		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0]
+		subss		xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
 		movss		xmm3, xmm7
 
-		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
-		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4]
+		subss		xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
 		movss		xmm4, xmm6
 
-		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
-		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8]
+		subss		xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 		movss		xmm5, xmm7
 
-		mov			ebx, [edi+eax-2*12+4]
+		mov			ebx, [edi + eax - 2 * 12 + 4]
 		imul		ebx, DRAWVERT_SIZE
-		mov			ecx, [edi+eax-2*12+0]
+		mov			ecx, [edi + eax - 2 * 12 + 0]
 		imul		ecx, DRAWVERT_SIZE
 
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
 
-		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
-		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0]
+		subss		xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
 		movss		xmm0, xmm6
 
-		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
-		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4]
+		subss		xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
 		movss		xmm1, xmm7
 
-		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
-		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8]
+		subss		xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 		movss		xmm2, xmm6
 
-		mov			ebx, [edi+eax-2*12+8]
+		mov			ebx, [edi + eax - 2 * 12 + 8]
 		imul		ebx, DRAWVERT_SIZE
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
 
-		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
-		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0]
+		subss		xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
 		movss		xmm3, xmm7
 
-		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
-		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4]
+		subss		xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
 		movss		xmm4, xmm6
 
-		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
-		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8]
+		subss		xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 		movss		xmm5, xmm7
 
-		mov			ebx, [edi+eax-1*12+4]
+		mov			ebx, [edi + eax - 1 * 12 + 4]
 		imul		ebx, DRAWVERT_SIZE
-		mov			ecx, [edi+eax-1*12+0]
+		mov			ecx, [edi + eax - 1 * 12 + 0]
 		imul		ecx, DRAWVERT_SIZE
 
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
 
-		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
-		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0]
+		subss		xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
 		movss		xmm0, xmm6
 
-		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
-		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4]
+		subss		xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
 		movss		xmm1, xmm7
 
-		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
-		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8]
+		subss		xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 		movss		xmm2, xmm6
 
-		mov			ebx, [edi+eax-1*12+8]
+		mov			ebx, [edi + eax - 1 * 12 + 8]
 		imul		ebx, DRAWVERT_SIZE
 
-		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
-		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0]
+		subss		xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
 		movss		xmm3, xmm7
 
-		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
-		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm6, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4]
+		subss		xmm6, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
 		movss		xmm4, xmm6
 
-		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
-		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm7, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8]
+		subss		xmm7, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 		movss		xmm5, xmm7
 
 		movaps		xmm6, xmm4
@@ -13317,121 +13477,121 @@ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *vert
 		addps		xmm3, xmm6
 		rsqrtps		xmm3, xmm3
 
-		add			edx, 4*16
-		mov			ecx, [edi+eax-1*12+0]
+		add			edx, 4 * 16
+		mov			ecx, [edi + eax - 1 * 12 + 0]
 		imul		ecx, DRAWVERT_SIZE
 
 		mulps		xmm0, xmm3
 		mulps		xmm1, xmm3
 		mulps		xmm2, xmm3
 
-		movss		[edx-1*16+0], xmm0
-		movss		[edx-1*16+4], xmm1
-		movss		[edx-1*16+8], xmm2
+		movss		[edx - 1 * 16 + 0], xmm0
+		movss		[edx - 1 * 16 + 4], xmm1
+		movss		[edx - 1 * 16 + 8], xmm2
 
-		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
-		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
-		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		mulss		xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
+		mulss		xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
+		mulss		xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 
 		xorps		xmm0, SIMD_SP_singleSignBitMask
 		subss		xmm0, xmm1
 		subss		xmm0, xmm2
-		movss		[edx-1*16+12], xmm0
+		movss		[edx - 1 * 16 + 12], xmm0
 
-		mov			ecx, [edi+eax-2*12+0]
+		mov			ecx, [edi + eax - 2 * 12 + 0]
 		imul		ecx, DRAWVERT_SIZE
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[edx-2*16+0], xmm0
-		movss		[edx-2*16+4], xmm1
-		movss		[edx-2*16+8], xmm2
+		movss		[edx - 2 * 16 + 0], xmm0
+		movss		[edx - 2 * 16 + 4], xmm1
+		movss		[edx - 2 * 16 + 8], xmm2
 
-		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
-		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
-		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		mulss		xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
+		mulss		xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
+		mulss		xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 
 		xorps		xmm0, SIMD_SP_singleSignBitMask
 		subss		xmm0, xmm1
 		subss		xmm0, xmm2
-		movss		[edx-2*16+12], xmm0
+		movss		[edx - 2 * 16 + 12], xmm0
 
-		mov			ecx, [edi+eax-3*12+0]
+		mov			ecx, [edi + eax - 3 * 12 + 0]
 		imul		ecx, DRAWVERT_SIZE
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[edx-3*16+0], xmm0
-		movss		[edx-3*16+4], xmm1
-		movss		[edx-3*16+8], xmm2
+		movss		[edx - 3 * 16 + 0], xmm0
+		movss		[edx - 3 * 16 + 4], xmm1
+		movss		[edx - 3 * 16 + 8], xmm2
 
-		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
-		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
-		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		mulss		xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
+		mulss		xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
+		mulss		xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 
 		xorps		xmm0, SIMD_SP_singleSignBitMask
 		subss		xmm0, xmm1
 		subss		xmm0, xmm2
-		movss		[edx-3*16+12], xmm0
+		movss		[edx - 3 * 16 + 12], xmm0
 
-		mov			ecx, [edi+eax-4*12+0]
+		mov			ecx, [edi + eax - 4 * 12 + 0]
 		imul		ecx, DRAWVERT_SIZE
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[edx-4*16+0], xmm0
-		movss		[edx-4*16+4], xmm1
-		movss		[edx-4*16+8], xmm2
+		movss		[edx - 4 * 16 + 0], xmm0
+		movss		[edx - 4 * 16 + 4], xmm1
+		movss		[edx - 4 * 16 + 8], xmm2
 
-		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
-		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
-		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		mulss		xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
+		mulss		xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
+		mulss		xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 
 		xorps		xmm0, SIMD_SP_singleSignBitMask
 		subss		xmm0, xmm1
 		subss		xmm0, xmm2
-		movss		[edx-4*16+12], xmm0
+		movss		[edx - 4 * 16 + 12], xmm0
 
-		add			eax, 4*12
+		add			eax, 4 * 12
 		jle			loopPlane4
 
-	done4:
+		done4:
 
-		sub			eax, 4*12
+		sub			eax, 4 * 12
 		jge			done
 
-	loopPlane1:
-		mov			ebx, [edi+eax+4]
+		loopPlane1:
+		mov			ebx, [edi + eax + 4]
 		imul		ebx, DRAWVERT_SIZE
-		mov			ecx, [edi+eax+0]
+		mov			ecx, [edi + eax + 0]
 		imul		ecx, DRAWVERT_SIZE
 
-		movss		xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
-		subss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm0, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0]
+		subss		xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
 
-		movss		xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
-		subss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm1, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4]
+		subss		xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
 
-		movss		xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
-		subss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm2, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8]
+		subss		xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 
-		mov			ebx, [edi+eax+8]
+		mov			ebx, [edi + eax + 8]
 		imul		ebx, DRAWVERT_SIZE
 
-		movss		xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
-		subss		xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm3, [esi + ebx + DRAWVERT_XYZ_OFFSET + 0]
+		subss		xmm3, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
 
-		movss		xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
-		subss		xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm4, [esi + ebx + DRAWVERT_XYZ_OFFSET + 4]
+		subss		xmm4, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
 
-		movss		xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
-		subss		xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		movss		xmm5, [esi + ebx + DRAWVERT_XYZ_OFFSET + 8]
+		subss		xmm5, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 
 		movss		xmm6, xmm4
 		mulss		xmm6, xmm2
@@ -13458,29 +13618,29 @@ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *vert
 		addss		xmm3, xmm6
 		rsqrtss		xmm3, xmm3
 
-		add			edx, 1*16
+		add			edx, 1 * 16
 
 		mulss		xmm0, xmm3
 		mulss		xmm1, xmm3
 		mulss		xmm2, xmm3
 
-		movss		[edx-1*16+0], xmm0
-		movss		[edx-1*16+4], xmm1
-		movss		[edx-1*16+8], xmm2
+		movss		[edx - 1 * 16 + 0], xmm0
+		movss		[edx - 1 * 16 + 4], xmm1
+		movss		[edx - 1 * 16 + 8], xmm2
 
-		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
-		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
-		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
+		mulss		xmm0, [esi + ecx + DRAWVERT_XYZ_OFFSET + 0]
+		mulss		xmm1, [esi + ecx + DRAWVERT_XYZ_OFFSET + 4]
+		mulss		xmm2, [esi + ecx + DRAWVERT_XYZ_OFFSET + 8]
 
 		xorps		xmm0, SIMD_SP_singleSignBitMask
 		subss		xmm0, xmm1
 		subss		xmm0, xmm2
-		movss		[edx-1*16+12], xmm0
+		movss		[edx - 1 * 16 + 12], xmm0
 
-		add			eax, 1*12
+		add			eax, 1 * 12
 		jl			loopPlane1
 
-	done:
+		done:
 	}
 
 #else
@@ -13646,9 +13806,9 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons
 	int i;
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
-	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
-	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->normal == DRAWVERT_NORMAL_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
 
 	assert( planes != NULL );
 	assert( verts != NULL );
@@ -13661,7 +13821,7 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons
 	}
 #endif
 
-	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
+	bool *used = ( bool * )_alloca16( numVerts * sizeof( used[0] ) );
 	memset( used, 0, numVerts * sizeof( used[0] ) );
 
 	for ( i = 0; i <= numIndexes - 12; i += 12 ) {
@@ -13969,10 +14129,10 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons
 		tmp[2] -= d4[2] * d8[2];
 		tmp[3] -= d4[3] * d8[3];
 
-		signBit[0] = ( *(unsigned int *)&tmp[0] ) & ( 1 << 31 );
-		signBit[1] = ( *(unsigned int *)&tmp[1] ) & ( 1 << 31 );
-		signBit[2] = ( *(unsigned int *)&tmp[2] ) & ( 1 << 31 );
-		signBit[3] = ( *(unsigned int *)&tmp[3] ) & ( 1 << 31 );
+		signBit[0] = ( *( unsigned int * )&tmp[0] ) & ( 1 << 31 );
+		signBit[1] = ( *( unsigned int * )&tmp[1] ) & ( 1 << 31 );
+		signBit[2] = ( *( unsigned int * )&tmp[2] ) & ( 1 << 31 );
+		signBit[3] = ( *( unsigned int * )&tmp[3] ) & ( 1 << 31 );
 
 		// first tangent
 		t0[0] = d0[0] * d9[0];
@@ -14025,10 +14185,10 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons
 		tmp[2] = idMath::RSqrt( tmp[2] );
 		tmp[3] = idMath::RSqrt( tmp[3] );
 
-		*(unsigned int *)&tmp[0] ^= signBit[0];
-		*(unsigned int *)&tmp[1] ^= signBit[1];
-		*(unsigned int *)&tmp[2] ^= signBit[2];
-		*(unsigned int *)&tmp[3] ^= signBit[3];
+		*( unsigned int * )&tmp[0] ^= signBit[0];
+		*( unsigned int * )&tmp[1] ^= signBit[1];
+		*( unsigned int * )&tmp[2] ^= signBit[2];
+		*( unsigned int * )&tmp[3] ^= signBit[3];
 
 		t0[0] *= tmp[0];
 		t0[1] *= tmp[1];
@@ -14096,10 +14256,10 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons
 		tmp[2] = idMath::RSqrt( tmp[2] );
 		tmp[3] = idMath::RSqrt( tmp[3] );
 
-		*(unsigned int *)&tmp[0] ^= signBit[0];
-		*(unsigned int *)&tmp[1] ^= signBit[1];
-		*(unsigned int *)&tmp[2] ^= signBit[2];
-		*(unsigned int *)&tmp[3] ^= signBit[3];
+		*( unsigned int * )&tmp[0] ^= signBit[0];
+		*( unsigned int * )&tmp[1] ^= signBit[1];
+		*( unsigned int * )&tmp[2] ^= signBit[2];
+		*( unsigned int * )&tmp[3] ^= signBit[3];
 
 		t3[0] *= tmp[0];
 		t3[1] *= tmp[1];
@@ -14448,7 +14608,7 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons
 
 		// area sign bit
 		tmp = d3 * d9 - d4 * d8;
-		signBit[0] = ( *(unsigned int *)&tmp ) & ( 1 << 31 );
+		signBit[0] = ( *( unsigned int * )&tmp ) & ( 1 << 31 );
 
 		// first tangent
 		t0 = d0 * d9 - d4 * d5;
@@ -14456,7 +14616,7 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons
 		t2 = d2 * d9 - d4 * d7;
 
 		tmp = idMath::RSqrt( t0 * t0 + t1 * t1 + t2 * t2 );
-		*(unsigned int *)&tmp ^= signBit[0];
+		*( unsigned int * )&tmp ^= signBit[0];
 
 		t0 *= tmp;
 		t1 *= tmp;
@@ -14468,7 +14628,7 @@ void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, cons
 		t5 = d3 * d7 - d2 * d8;
 
 		tmp = idMath::RSqrt( t3 * t3 + t4 * t4 + t5 * t5 );
-		*(unsigned int *)&tmp ^= signBit[0];
+		*( unsigned int * )&tmp ^= signBit[0];
 
 		t3 *= tmp;
 		t4 *= tmp;
@@ -14605,7 +14765,7 @@ void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const domin
 		for ( j = 0; j < 4; j++ ) {
 			const idDrawVert *a, *b, *c;
 
-			const dominantTri_s &dt = dominantTris[i+j];
+			const dominantTri_s &dt = dominantTris[i + j];
 
 			s0[j] = dt.normalizationScale[0];
 			s1[j] = dt.normalizationScale[1];
@@ -15097,9 +15257,9 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts
 	ALIGN16( float normal[12] );
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
-	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
-	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->normal == DRAWVERT_NORMAL_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
 
 	assert( verts != NULL );
 	assert( numVerts >= 0 );
@@ -15122,7 +15282,7 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts
 		sub			eax, DRAWVERT_SIZE*4
 		jl			loopVert1
 
-	loopVert4:
+		loopVert4:
 
 		sub			eax, DRAWVERT_SIZE*4
 
@@ -15172,48 +15332,48 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts
 
 		// save the 4 idDrawVert::normal to project the tangents
 
-		movaps		[normal+ 0], xmm0
-		movaps		[normal+16], xmm1
-		movaps		[normal+32], xmm2
+		movaps		[normal + 0], xmm0
+		movaps		[normal + 16], xmm1
+		movaps		[normal + 32], xmm2
 
-		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_NORMAL_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_NORMAL_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_NORMAL_OFFSET + 8], xmm2
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_NORMAL_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_NORMAL_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_NORMAL_OFFSET + 8], xmm2
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_NORMAL_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_NORMAL_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_NORMAL_OFFSET + 8], xmm2
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_NORMAL_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_NORMAL_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_NORMAL_OFFSET + 8], xmm2
 
 		// project and normalize 4 idDrawVert::tangent[0]
 
-		movss		xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0]	//  0,  X,  X,  X
-		movhps		xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0]	//  0,  X,  3,  4
-		movss		xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8]	//  5,  X,  X,  X
-		movhps		xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4]	//	5,  X,  1,  2
-		movss		xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0]	//  6,  X,  X,  X
-		movhps		xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0]	//  6,  X,  9, 10
-		movss		xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8]	// 11,  X,  X,  X
-		movhps		xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4]	// 11,  X,  7,  8
+		movss		xmm0, [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT0_OFFSET + 0]	//  0,  X,  X,  X
+		movhps		xmm0, [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT0_OFFSET + 0]	//  0,  X,  3,  4
+		movss		xmm2, [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT0_OFFSET + 8]	//  5,  X,  X,  X
+		movhps		xmm2, [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT0_OFFSET + 4]	//  5,  X,  1,  2
+		movss		xmm4, [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT0_OFFSET + 0]	//  6,  X,  X,  X
+		movhps		xmm4, [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT0_OFFSET + 0]	//  6,  X,  9, 10
+		movss		xmm3, [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT0_OFFSET + 8]	// 11,  X,  X,  X
+		movhps		xmm3, [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT0_OFFSET + 4]	// 11,  X,  7,  8
 
 		movaps		xmm1, xmm0
 		movaps		xmm5, xmm2
@@ -15227,17 +15387,17 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts
 		movaps		xmm4, xmm1
 		movaps		xmm5, xmm2
 
-		mulps		xmm3, [normal+ 0]
-		mulps		xmm4, [normal+16]
-		mulps		xmm5, [normal+32]
+		mulps		xmm3, [normal + 0]
+		mulps		xmm4, [normal + 16]
+		mulps		xmm5, [normal + 32]
 		addps		xmm3, xmm4
 		addps		xmm3, xmm5
 
 		movaps		xmm4, xmm3
 		movaps		xmm5, xmm3
-		mulps		xmm3, [normal+ 0]
-		mulps		xmm4, [normal+16]
-		mulps		xmm5, [normal+32]
+		mulps		xmm3, [normal + 0]
+		mulps		xmm4, [normal + 16]
+		mulps		xmm5, [normal + 32]
 		subps		xmm0, xmm3
 		subps		xmm1, xmm4
 		subps		xmm2, xmm5
@@ -15267,44 +15427,44 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts
 		mulps		xmm1, xmm3
 		mulps		xmm2, xmm3
 
-		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT0_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT0_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT0_OFFSET + 8], xmm2
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT0_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT0_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT0_OFFSET + 8], xmm2
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT0_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT0_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT0_OFFSET + 8], xmm2
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT0_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT0_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT0_OFFSET + 8], xmm2
 
 		// project and normalize 4 idDrawVert::tangent[1]
 
-		movss		xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0]	//  0,  X,  X,  X
-		movhps		xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0]	//  0,  X,  3,  4
-		movss		xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8]	//  5,  X,  X,  X
-		movhps		xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4]	//	5,  X,  1,  2
-		movss		xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0]	//  6,  X,  X,  X
-		movhps		xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0]	//  6,  X,  9, 10
-		movss		xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8]	// 11,  X,  X,  X
-		movhps		xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4]	// 11,  X,  7,  8
+		movss		xmm0, [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT1_OFFSET + 0]	//  0,  X,  X,  X
+		movhps		xmm0, [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT1_OFFSET + 0]	//  0,  X,  3,  4
+		movss		xmm2, [esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT1_OFFSET + 8]	//  5,  X,  X,  X
+		movhps		xmm2, [esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT1_OFFSET + 4]	//  5,  X,  1,  2
+		movss		xmm4, [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT1_OFFSET + 0]	//  6,  X,  X,  X
+		movhps		xmm4, [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT1_OFFSET + 0]	//  6,  X,  9, 10
+		movss		xmm3, [esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT1_OFFSET + 8]	// 11,  X,  X,  X
+		movhps		xmm3, [esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT1_OFFSET + 4]	// 11,  X,  7,  8
 
 		movaps		xmm1, xmm0
 		movaps		xmm5, xmm2
@@ -15318,17 +15478,17 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts
 		movaps		xmm4, xmm1
 		movaps		xmm5, xmm2
 
-		mulps		xmm3, [normal+ 0]
-		mulps		xmm4, [normal+16]
-		mulps		xmm5, [normal+32]
+		mulps		xmm3, [normal + 0]
+		mulps		xmm4, [normal + 16]
+		mulps		xmm5, [normal + 32]
 		addps		xmm3, xmm4
 		addps		xmm3, xmm5
 
 		movaps		xmm4, xmm3
 		movaps		xmm5, xmm3
-		mulps		xmm3, [normal+ 0]
-		mulps		xmm4, [normal+16]
-		mulps		xmm5, [normal+32]
+		mulps		xmm3, [normal + 0]
+		mulps		xmm4, [normal + 16]
+		mulps		xmm5, [normal + 32]
 		subps		xmm0, xmm3
 		subps		xmm1, xmm4
 		subps		xmm2, xmm5
@@ -15358,48 +15518,48 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts
 		mulps		xmm1, xmm3
 		mulps		xmm2, xmm3
 
-		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT1_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT1_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 0 + DRAWVERT_TANGENT1_OFFSET + 8], xmm2
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT1_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT1_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 1 + DRAWVERT_TANGENT1_OFFSET + 8], xmm2
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT1_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT1_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 2 + DRAWVERT_TANGENT1_OFFSET + 8], xmm2
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT1_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT1_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_SIZE * 3 + DRAWVERT_TANGENT1_OFFSET + 8], xmm2
 
-		add			eax, DRAWVERT_SIZE*8
+		add			eax, DRAWVERT_SIZE * 8
 
 		jle			loopVert4
 
-		sub			eax, DRAWVERT_SIZE*4
+		sub			eax, DRAWVERT_SIZE * 4
 		jge			done
 
-	loopVert1:
+		loopVert1:
 
 		// normalize one idDrawVert::normal
 
-		movss		xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
-		movss		xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
-		movss		xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
+		movss		xmm0, [esi + eax + DRAWVERT_NORMAL_OFFSET + 0]
+		movss		xmm1, [esi + eax + DRAWVERT_NORMAL_OFFSET + 4]
+		movss		xmm2, [esi + eax + DRAWVERT_NORMAL_OFFSET + 8]
 		movss		xmm3, xmm0
 		movss		xmm4, xmm1
 		movss		xmm5, xmm2
@@ -15425,30 +15585,30 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts
 		mulss		xmm1, xmm3
 		mulss		xmm2, xmm3
 
-		movss		[esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_NORMAL_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_NORMAL_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_NORMAL_OFFSET + 8], xmm2
 
 		// project and normalize one idDrawVert::tangent[0]
 
-		movss		xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0]
-		movss		xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4]
-		movss		xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8]
+		movss		xmm0, [esi + eax + DRAWVERT_TANGENT0_OFFSET + 0]
+		movss		xmm1, [esi + eax + DRAWVERT_TANGENT0_OFFSET + 4]
+		movss		xmm2, [esi + eax + DRAWVERT_TANGENT0_OFFSET + 8]
 		movss		xmm3, xmm0
 		movss		xmm4, xmm1
 		movss		xmm5, xmm2
 
-		mulss		xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
-		mulss		xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
-		mulss		xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
+		mulss		xmm3, [esi + eax + DRAWVERT_NORMAL_OFFSET + 0]
+		mulss		xmm4, [esi + eax + DRAWVERT_NORMAL_OFFSET + 4]
+		mulss		xmm5, [esi + eax + DRAWVERT_NORMAL_OFFSET + 8]
 		addss		xmm3, xmm4
 		addss		xmm3, xmm5
 
 		movss		xmm4, xmm3
 		movss		xmm5, xmm3
-		mulss		xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
-		mulss		xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
-		mulss		xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
+		mulss		xmm3, [esi + eax + DRAWVERT_NORMAL_OFFSET + 0]
+		mulss		xmm4, [esi + eax + DRAWVERT_NORMAL_OFFSET + 4]
+		mulss		xmm5, [esi + eax + DRAWVERT_NORMAL_OFFSET + 8]
 		subss		xmm0, xmm3
 		subss		xmm1, xmm4
 		subss		xmm2, xmm5
@@ -15478,30 +15638,30 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts
 		mulss		xmm1, xmm3
 		mulss		xmm2, xmm3
 
-		movss		[esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_TANGENT0_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_TANGENT0_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_TANGENT0_OFFSET + 8], xmm2
 
 		// project and normalize one idDrawVert::tangent[1]
 
-		movss		xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0]
-		movss		xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4]
-		movss		xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8]
+		movss		xmm0, [esi + eax + DRAWVERT_TANGENT1_OFFSET + 0]
+		movss		xmm1, [esi + eax + DRAWVERT_TANGENT1_OFFSET + 4]
+		movss		xmm2, [esi + eax + DRAWVERT_TANGENT1_OFFSET + 8]
 		movss		xmm3, xmm0
 		movss		xmm4, xmm1
 		movss		xmm5, xmm2
 
-		mulss		xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
-		mulss		xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
-		mulss		xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
+		mulss		xmm3, [esi + eax + DRAWVERT_NORMAL_OFFSET + 0]
+		mulss		xmm4, [esi + eax + DRAWVERT_NORMAL_OFFSET + 4]
+		mulss		xmm5, [esi + eax + DRAWVERT_NORMAL_OFFSET + 8]
 		addss		xmm3, xmm4
 		addss		xmm3, xmm5
 
 		movss		xmm4, xmm3
 		movss		xmm5, xmm3
-		mulss		xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
-		mulss		xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
-		mulss		xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
+		mulss		xmm3, [esi + eax + DRAWVERT_NORMAL_OFFSET + 0]
+		mulss		xmm4, [esi + eax + DRAWVERT_NORMAL_OFFSET + 4]
+		mulss		xmm5, [esi + eax + DRAWVERT_NORMAL_OFFSET + 8]
 		subss		xmm0, xmm3
 		subss		xmm1, xmm4
 		subss		xmm2, xmm5
@@ -15531,14 +15691,14 @@ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts
 		mulss		xmm1, xmm3
 		mulss		xmm2, xmm3
 
-		movss		[esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0
-		movss		[esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1
-		movss		[esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2
+		movss		[esi + eax + DRAWVERT_TANGENT1_OFFSET + 0], xmm0
+		movss		[esi + eax + DRAWVERT_TANGENT1_OFFSET + 4], xmm1
+		movss		[esi + eax + DRAWVERT_TANGENT1_OFFSET + 8], xmm2
 
 		add			eax, DRAWVERT_SIZE
 
 		jl			loopVert1
-	done:
+		done:
 	}
 }
 
@@ -15550,12 +15710,12 @@ idSIMD_SSE::CreateTextureSpaceLightVectors
 void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
-	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
-	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
-	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->normal == DRAWVERT_NORMAL_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
 
-	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
+	bool *used = ( bool * )_alloca16( numVerts * sizeof( used[0] ) );
 	memset( used, 0, numVerts * sizeof( used[0] ) );
 
 	for ( int i = numIndexes - 1; i >= 0; i-- ) {
@@ -15584,7 +15744,7 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co
 		mov			ecx, lightVectors
 		sub			ecx, 3*4
 
-	loopVert:
+		loopVert:
 		inc			eax
 		jge			done
 
@@ -15625,13 +15785,13 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co
 
 		addps		xmm5, xmm4
 		addps		xmm5, xmm2
-		movlps		[ecx+0], xmm5
+		movlps		[ecx + 0], xmm5
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
-		movss		[ecx+8], xmm5
+		movss		[ecx + 8], xmm5
 
 		jmp			loopVert
 
-	done:
+		done:
 	}
 
 #elif 1
@@ -15684,7 +15844,7 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co
 		neg			eax
 		dec			eax
 
-	loopVert4:
+		loopVert4:
 		inc			eax
 		jge			done4
 
@@ -15783,9 +15943,9 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co
 		imul		edx, 12
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[ecx+edx+0], xmm5
-		movss		[ecx+edx+4], xmm6
-		movss		[ecx+edx+8], xmm0
+		movss		[ecx + edx + 0], xmm5
+		movss		[ecx + edx + 4], xmm6
+		movss		[ecx + edx + 8], xmm0
 
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
 		mov			edx, usedVertNums[8]
@@ -15793,9 +15953,9 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co
 		imul		edx, 12
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[ecx+edx+0], xmm5
-		movss		[ecx+edx+4], xmm6
-		movss		[ecx+edx+8], xmm0
+		movss		[ecx + edx + 0], xmm5
+		movss		[ecx + edx + 4], xmm6
+		movss		[ecx + edx + 8], xmm0
 
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
 		mov			edx, usedVertNums[12]
@@ -15803,14 +15963,14 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co
 		imul		edx, 12
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[ecx+edx+0], xmm5
-		movss		[ecx+edx+4], xmm6
-		movss		[ecx+edx+8], xmm0
+		movss		[ecx + edx + 0], xmm5
+		movss		[ecx + edx + 4], xmm6
+		movss		[ecx + edx + 8], xmm0
 
 		xor			ecx, ecx
 		jmp			loopVert4
 
-	done4:
+		done4:
 		test		ecx, ecx
 		jz			done
 		xor			eax, eax
@@ -15818,49 +15978,49 @@ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, co
 		imul		edi, 12
 		add			edi, lightVectors
 
-	loopVert1:
-		movss		xmm0, lightDir0[eax*4]
-		movss		xmm1, lightDir1[eax*4]
-		movss		xmm2, lightDir2[eax*4]
+		loopVert1:
+		movss		xmm0, lightDir0[eax * 4]
+		movss		xmm1, lightDir1[eax * 4]
+		movss		xmm2, lightDir2[eax * 4]
 
-		mov			edx, usedVertNums[eax*4]
+		mov			edx, usedVertNums[eax * 4]
 		imul		edx, 12
 
-		movss		xmm3, tangent0[eax*4]
+		movss		xmm3, tangent0[eax * 4]
 		mulss		xmm3, xmm0
-		movss		xmm4, tangent1[eax*4]
+		movss		xmm4, tangent1[eax * 4]
 		mulss		xmm4, xmm1
-		movss		xmm5, tangent2[eax*4]
+		movss		xmm5, tangent2[eax * 4]
 		mulss		xmm5, xmm2
 
 		addss		xmm3, xmm4
 		addss		xmm5, xmm3
-		movss		[edi+edx+0], xmm5
+		movss		[edi + edx + 0], xmm5
 
-		movss		xmm3, tangent3[eax*4]
+		movss		xmm3, tangent3[eax * 4]
 		mulss		xmm3, xmm0
-		movss		xmm4, tangent4[eax*4]
+		movss		xmm4, tangent4[eax * 4]
 		mulss		xmm4, xmm1
-		movss		xmm6, tangent5[eax*4]
+		movss		xmm6, tangent5[eax * 4]
 		mulss		xmm6, xmm2
 
 		addss		xmm3, xmm4
 		addss		xmm6, xmm3
-		movss		[edi+edx+4], xmm6
+		movss		[edi + edx + 4], xmm6
 
-		mulss		xmm0, normal0[eax*4]
-		mulss		xmm1, normal1[eax*4]
-		mulss		xmm2, normal2[eax*4]
+		mulss		xmm0, normal0[eax * 4]
+		mulss		xmm1, normal1[eax * 4]
+		mulss		xmm2, normal2[eax * 4]
 
 		addss		xmm0, xmm1
 		addss		xmm0, xmm2
-		movss		[edi+edx+8], xmm0
+		movss		[edi + edx + 8], xmm0
 
 		inc			eax
 		dec			ecx
 		jg			loopVert1
 
-	done:
+		done:
 	}
 
 #else
@@ -15978,12 +16138,12 @@ idSIMD_SSE::CreateSpecularTextureCoords
 void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
 
 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
-	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
-	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
-	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
-	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->xyz == DRAWVERT_XYZ_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->normal == DRAWVERT_NORMAL_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
+	assert( ( int ) & ( ( idDrawVert * )0 )->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
 
-	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
+	bool *used = ( bool * )_alloca16( numVerts * sizeof( used[0] ) );
 	memset( used, 0, numVerts * sizeof( used[0] ) );
 
 	for ( int i = numIndexes - 1; i >= 0; i-- ) {
@@ -16016,7 +16176,7 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id
 		mov			ecx, texCoords
 		sub			ecx, 4*4
 
-	loopVert:
+		loopVert:
 		inc			eax
 		jge			done
 
@@ -16059,16 +16219,16 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id
 		mulps		xmm1, xmm5
 		addps		xmm0, xmm1
 
-		movss		xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
-		movhps		xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
+		movss		xmm2, [edi + DRAWVERT_TANGENT0_OFFSET + 0]
+		movhps		xmm2, [edi + DRAWVERT_TANGENT0_OFFSET + 4]
 		mulps		xmm2, xmm0
 
-		movss		xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
-		movhps		xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
+		movss		xmm3, [edi + DRAWVERT_TANGENT1_OFFSET + 0]
+		movhps		xmm3, [edi + DRAWVERT_TANGENT1_OFFSET + 4]
 		mulps		xmm3, xmm0
 
-		movss		xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
-		movhps		xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
+		movss		xmm4, [edi + DRAWVERT_NORMAL_OFFSET + 0]
+		movhps		xmm4, [edi + DRAWVERT_NORMAL_OFFSET + 4]
 		mulps		xmm4, xmm0
 
 		movaps		xmm5, xmm2								// xmm5 = 0,  X,  1,  2
@@ -16083,12 +16243,12 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id
 
 		addps		xmm5, xmm4
 		addps		xmm5, xmm2
-		movaps		[ecx+0], xmm5
-		movss		[ecx+12], xmm3
+		movaps		[ecx + 0], xmm5
+		movss		[ecx + 12], xmm3
 
 		jmp			loopVert
 
-	done:
+		done:
 	}
 
 #elif 0
@@ -16159,7 +16319,7 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id
 		neg			eax
 		dec			eax
 
-	loopVert4:
+		loopVert4:
 		inc			eax
 		jge			done4
 
@@ -16302,10 +16462,10 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id
 		shl			edx, 4
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[ecx+edx+0], xmm5
-		movss		[ecx+edx+4], xmm6
-		movss		[ecx+edx+8], xmm0
-		movss		[ecx+edx+12], xmm3
+		movss		[ecx + edx + 0], xmm5
+		movss		[ecx + edx + 4], xmm6
+		movss		[ecx + edx + 8], xmm0
+		movss		[ecx + edx + 12], xmm3
 
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
 		mov			edx, usedVertNums[8]
@@ -16313,10 +16473,10 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id
 		shl			edx, 4
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[ecx+edx+0], xmm5
-		movss		[ecx+edx+4], xmm6
-		movss		[ecx+edx+8], xmm0
-		movss		[ecx+edx+12], xmm3
+		movss		[ecx + edx + 0], xmm5
+		movss		[ecx + edx + 4], xmm6
+		movss		[ecx + edx + 8], xmm0
+		movss		[ecx + edx + 12], xmm3
 
 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
 		mov			edx, usedVertNums[12]
@@ -16324,15 +16484,15 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id
 		shl			edx, 4
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
 
-		movss		[ecx+edx+0], xmm5
-		movss		[ecx+edx+4], xmm6
-		movss		[ecx+edx+8], xmm0
-		movss		[ecx+edx+12], xmm3
+		movss		[ecx + edx + 0], xmm5
+		movss		[ecx + edx + 4], xmm6
+		movss		[ecx + edx + 8], xmm0
+		movss		[ecx + edx + 12], xmm3
 
 		xor			ecx, ecx
 		jmp			loopVert4
 
-	done4:
+		done4:
 		test		ecx, ecx
 		jz			done
 		xor			eax, eax
@@ -16340,15 +16500,15 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id
 		shl			edi, 4
 		add			edi, texCoords
 
-	loopVert1:
-		movss		xmm6, lightDir0[eax*4]
+		loopVert1:
+		movss		xmm6, lightDir0[eax * 4]
 		movss		xmm0, xmm6
 		mulss		xmm6, xmm6
-		movss		xmm7, lightDir1[eax*4]
+		movss		xmm7, lightDir1[eax * 4]
 		movss		xmm1, xmm7
 		mulss		xmm7, xmm7
 		addss		xmm6, xmm7
-		movss		xmm5, lightDir2[eax*4]
+		movss		xmm5, lightDir2[eax * 4]
 		movss		xmm2, xmm5
 		mulss		xmm5, xmm5
 		addss		xmm6, xmm5
@@ -16358,14 +16518,14 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id
 		mulss		xmm1, xmm6
 		mulss		xmm2, xmm6
 
-		movss		xmm3, viewDir0[eax*4]
+		movss		xmm3, viewDir0[eax * 4]
 		movss		xmm7, xmm3
 		mulss		xmm7, xmm7
-		movss		xmm4, viewDir1[eax*4]
+		movss		xmm4, viewDir1[eax * 4]
 		movss		xmm6, xmm4
 		mulss		xmm6, xmm6
 		addss		xmm7, xmm6
-		movss		xmm5, viewDir2[eax*4]
+		movss		xmm5, viewDir2[eax * 4]
 		movss		xmm6, xmm5
 		mulss		xmm6, xmm6
 		addss		xmm7, xmm6
@@ -16378,44 +16538,44 @@ void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const id
 		mulss		xmm5, xmm7
 		addss		xmm2, xmm5
 
-		mov			edx, usedVertNums[eax*4]
+		mov			edx, usedVertNums[eax * 4]
 		shl			edx, 4
 
-		movss		xmm3, tangent0[eax*4]
+		movss		xmm3, tangent0[eax * 4]
 		mulss		xmm3, xmm0
-		movss		xmm4, tangent1[eax*4]
+		movss		xmm4, tangent1[eax * 4]
 		mulss		xmm4, xmm1
 		addss		xmm3, xmm4
-		movss		xmm5, tangent2[eax*4]
+		movss		xmm5, tangent2[eax * 4]
 		mulss		xmm5, xmm2
 		addss		xmm5, xmm3
-		movss		[edi+edx+0], xmm5
+		movss		[edi + edx + 0], xmm5
 
-		movss		xmm3, tangent3[eax*4]
+		movss		xmm3, tangent3[eax * 4]
 		mulss		xmm3, xmm0
-		movss		xmm4, tangent4[eax*4]
+		movss		xmm4, tangent4[eax * 4]
 		mulss		xmm4, xmm1
 		addss		xmm3, xmm4
-		movss		xmm6, tangent5[eax*4]
+		movss		xmm6, tangent5[eax * 4]
 		mulss		xmm6, xmm2
 		addss		xmm6, xmm3
-		movss		[edi+edx+4], xmm6
+		movss		[edi + edx + 4], xmm6
 
-		mulss		xmm0, normal0[eax*4]
-		mulss		xmm1, normal1[eax*4]
+		mulss		xmm0, normal0[eax * 4]
+		mulss		xmm1, normal1[eax * 4]
 		addss		xmm0, xmm1
-		mulss		xmm2, normal2[eax*4]
+		mulss		xmm2, normal2[eax * 4]
 		addss		xmm0, xmm2
-		movss		[edi+edx+8], xmm0
+		movss		[edi + edx + 8], xmm0
 
 		movss		xmm3, SIMD_SP_one
-		movss		[edi+edx+12], xmm3
+		movss		[edi + edx + 12], xmm3
 
 		inc			eax
 		dec			ecx
 		jg			loopVert1
 
-	done:
+		done:
 	}
 
 #else
@@ -16687,75 +16847,75 @@ int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, c
 		add			edx, eax
 		neg			eax
 
-	loop4:
-		prefetchnta	[edx+128]
-		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
+		loop4:
+		prefetchnta	[edx + 128]
+		prefetchnta	[esi + 4 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET]
 
-		cmp         dword ptr [edx+eax+0], ebx
+		cmp         dword ptr [edx + eax + 0], ebx
 		jne         skip1
 
-		mov			dword ptr [edx+eax+0], ecx
-		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		mov			dword ptr [edx + eax + 0], ecx
+		movss		xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		add			ecx, 2
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
 		orps		xmm0, xmm5
-		movaps		[edi+0*16], xmm0
+		movaps		[edi + 0 * 16], xmm0
 		subps		xmm0, xmm6
-		movaps		[edi+1*16], xmm0
-		add			edi, 2*16
+		movaps		[edi + 1 * 16], xmm0
+		add			edi, 2 * 16
 
-	skip1:
-		cmp         dword ptr [edx+eax+4], ebx
+		skip1:
+		cmp         dword ptr [edx + eax + 4], ebx
 		jne         skip2
 
-		mov			dword ptr [edx+eax+4], ecx
-		movss		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
-		movhps		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
+		mov			dword ptr [edx + eax + 4], ecx
+		movss		xmm1, [esi + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
+		movhps		xmm1, [esi + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
 		add			ecx, 2
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
 		orps		xmm1, xmm5
-		movaps		[edi+0*16], xmm1
+		movaps		[edi + 0 * 16], xmm1
 		subps		xmm1, xmm7
-		movaps		[edi+1*16], xmm1
-		add			edi, 2*16
+		movaps		[edi + 1 * 16], xmm1
+		add			edi, 2 * 16
 
-	skip2:
-		cmp         dword ptr [edx+eax+8], ebx
+		skip2:
+		cmp         dword ptr [edx + eax + 8], ebx
 		jne         skip3
 
-		mov			dword ptr [edx+eax+8], ecx
-		movss		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		mov			dword ptr [edx + eax + 8], ecx
+		movss		xmm2, [esi + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm2, [esi + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		add			ecx, 2
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
 		orps		xmm2, xmm5
-		movaps		[edi+0*16], xmm2
+		movaps		[edi + 0 * 16], xmm2
 		subps		xmm2, xmm6
-		movaps		[edi+1*16], xmm2
-		add			edi, 2*16
+		movaps		[edi + 1 * 16], xmm2
+		add			edi, 2 * 16
 
-	skip3:
-		cmp         dword ptr [edx+eax+12], ebx
+		skip3:
+		cmp         dword ptr [edx + eax + 12], ebx
 		jne         skip4
 
-		mov			dword ptr [edx+eax+12], ecx
-		movss		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
-		movhps		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
+		mov			dword ptr [edx + eax + 12], ecx
+		movss		xmm3, [esi + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
+		movhps		xmm3, [esi + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
 		add			ecx, 2
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
 		orps		xmm3, xmm5
-		movaps		[edi+0*16], xmm3
+		movaps		[edi + 0 * 16], xmm3
 		subps		xmm3, xmm7
-		movaps		[edi+1*16], xmm3
-		add			edi, 2*16
+		movaps		[edi + 1 * 16], xmm3
+		add			edi, 2 * 16
 
-	skip4:
-		add			esi, 4*DRAWVERT_SIZE
-		add			eax, 4*4
+		skip4:
+		add			esi, 4 * DRAWVERT_SIZE
+		add			eax, 4 * 4
 		jl			loop4
 
-	done4:
+		done4:
 		mov			eax, numVerts
 		and			eax, 3
 		jz			done1
@@ -16763,28 +16923,28 @@ int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, c
 		add			edx, eax
 		neg			eax
 
-	loop1:
-		cmp         dword ptr [edx+eax+0], ebx
+		loop1:
+		cmp         dword ptr [edx + eax + 0], ebx
 		jne         skip0
 
-		mov			dword ptr [edx+eax+0], ecx
-		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		mov			dword ptr [edx + eax + 0], ecx
+		movss		xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		add			ecx, 2
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
 		orps		xmm0, xmm5
-		movaps		[edi+0*16], xmm0
+		movaps		[edi + 0 * 16], xmm0
 		subps		xmm0, xmm6
-		movaps		[edi+1*16], xmm0
-		add			edi, 2*16
+		movaps		[edi + 1 * 16], xmm0
+		add			edi, 2 * 16
 
-	skip0:
+		skip0:
 
 		add			esi, DRAWVERT_SIZE
 		add			eax, 4
 		jl			loop1
 
-	done1:
+		done1:
 		pop			ebx
 		mov			outVerts, ecx
 	}
@@ -16798,18 +16958,18 @@ int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, c
 			continue;
 		}
 		const float *v = verts[i].xyz.ToFloatPtr();
-		vertexCache[outVerts+0][0] = v[0];
-		vertexCache[outVerts+0][1] = v[1];
-		vertexCache[outVerts+0][2] = v[2];
-		vertexCache[outVerts+0][3] = 1.0f;
+		vertexCache[outVerts + 0][0] = v[0];
+		vertexCache[outVerts + 0][1] = v[1];
+		vertexCache[outVerts + 0][2] = v[2];
+		vertexCache[outVerts + 0][3] = 1.0f;
 
 		// R_SetupProjection() builds the projection matrix with a slight crunch
 		// for depth, which keeps this w=0 division from rasterizing right at the
 		// wrap around point and causing depth fighting with the rear caps
-		vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
-		vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
-		vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
-		vertexCache[outVerts+1][3] = 0.0f;
+		vertexCache[outVerts + 1][0] = v[0] - lightOrigin[0];
+		vertexCache[outVerts + 1][1] = v[1] - lightOrigin[1];
+		vertexCache[outVerts + 1][2] = v[2] - lightOrigin[2];
+		vertexCache[outVerts + 1][3] = 0.0f;
 		vertRemap[i] = outVerts;
 		outVerts += 2;
 	}
@@ -16841,42 +17001,42 @@ int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, cons
 		add			edi, eax
 		neg			eax
 
-	loop4:
+		loop4:
 		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
 
 		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
 		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
-		movaps		[edi+eax+1*16], xmm0
+		movaps		[edi + eax + 1 * 16], xmm0
 		orps		xmm0, xmm4
-		movaps		[edi+eax+0*16], xmm0
+		movaps		[edi + eax + 0 * 16], xmm0
 
-		movss		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
-		movhps		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm1, [esi + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
+		movhps		xmm1, [esi + 1 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
-		movaps		[edi+eax+3*16], xmm1
+		movaps		[edi + eax + 3 * 16], xmm1
 		orps		xmm1, xmm5
-		movaps		[edi+eax+2*16], xmm1
+		movaps		[edi + eax + 2 * 16], xmm1
 
-		movss		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		movss		xmm2, [esi + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm2, [esi + 2 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
-		movaps		[edi+eax+5*16], xmm2
+		movaps		[edi + eax + 5 * 16], xmm2
 		orps		xmm2, xmm6
-		movaps		[edi+eax+4*16], xmm2
+		movaps		[edi + eax + 4 * 16], xmm2
 
-		movss		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
-		movhps		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
+		movss		xmm3, [esi + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
+		movhps		xmm3, [esi + 3 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 4]
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
-		movaps		[edi+eax+7*16], xmm3
+		movaps		[edi + eax + 7 * 16], xmm3
 		orps		xmm3, xmm7
-		movaps		[edi+eax+6*16], xmm3
+		movaps		[edi + eax + 6 * 16], xmm3
 
-		add			esi, 4*DRAWVERT_SIZE
-		add			eax, 4*8*4
+		add			esi, 4 * DRAWVERT_SIZE
+		add			eax, 4 * 8 * 4
 		jl			loop4
 
-	done4:
+		done4:
 		mov			eax, numVerts
 		and			eax, 3
 		jz			done1
@@ -16884,19 +17044,19 @@ int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, cons
 		add			edi, eax
 		neg			eax
 
-	loop1:
-		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
-		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
+		loop1:
+		movss		xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 8]
+		movhps		xmm0, [esi + 0 * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET + 0]
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
-		movaps		[edi+eax+1*16], xmm0
+		movaps		[edi + eax + 1 * 16], xmm0
 		orps		xmm0, xmm4
-		movaps		[edi+eax+0*16], xmm0
+		movaps		[edi + eax + 0 * 16], xmm0
 
 		add			esi, DRAWVERT_SIZE
-		add			eax, 8*4
+		add			eax, 8 * 4
 		jl			loop1
 
-	done1:
+		done1:
 	}
 	return numVerts * 2;
 
@@ -16904,15 +17064,15 @@ int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, cons
 
 	for ( int i = 0; i < numVerts; i++ ) {
 		const float *v = verts[i].xyz.ToFloatPtr();
-		vertexCache[i*2+0][0] = v[0];
-		vertexCache[i*2+0][1] = v[1];
-		vertexCache[i*2+0][2] = v[2];
-		vertexCache[i*2+0][3] = 1.0f;
-
-		vertexCache[i*2+1][0] = v[0];
-		vertexCache[i*2+1][1] = v[1];
-		vertexCache[i*2+1][2] = v[2];
-		vertexCache[i*2+1][3] = 0.0f;
+		vertexCache[i * 2 + 0][0] = v[0];
+		vertexCache[i * 2 + 0][1] = v[1];
+		vertexCache[i * 2 + 0][2] = v[2];
+		vertexCache[i * 2 + 0][3] = 1.0f;
+
+		vertexCache[i * 2 + 1][0] = v[0];
+		vertexCache[i * 2 + 1][1] = v[1];
+		vertexCache[i * 2 + 1][2] = v[2];
+		vertexCache[i * 2 + 1][3] = 0.0f;
 	}
 	return numVerts * 2;
 
@@ -16937,25 +17097,25 @@ static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, cons
 		neg			eax
 
 		align		16
-	loop2:
+		loop2:
 		add			edi, 2*4*4
 
 		movsx		ecx, word ptr [esi+eax+0]
 		cvtsi2ss	xmm0, ecx
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movlps		[edi-2*4*4+0], xmm0
-		movhps		[edi-2*4*4+8], xmm0
+		movlps		[edi - 2 * 4 * 4 + 0], xmm0
+		movhps		[edi - 2 * 4 * 4 + 8], xmm0
 
-		movsx		edx, word ptr [esi+eax+2]
+		movsx		edx, word ptr [esi + eax + 2]
 		cvtsi2ss	xmm1, edx
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movlps		[edi-1*4*4+0], xmm1
-		movhps		[edi-1*4*4+8], xmm1
+		movlps		[edi - 1 * 4 * 4 + 0], xmm1
+		movhps		[edi - 1 * 4 * 4 + 8], xmm1
 
-		add			eax, 2*2
+		add			eax, 2 * 2
 		jl			loop2
 
-	done2:
+		done2:
 		mov			eax, numSamples
 		and			eax, 1
 		jz			done
@@ -16963,10 +17123,10 @@ static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, cons
 		movsx		ecx, word ptr [esi]
 		cvtsi2ss	xmm0, ecx
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movlps		[edi+0], xmm0
-		movhps		[edi+8], xmm0
+		movlps		[edi + 0], xmm0
+		movhps		[edi + 8], xmm0
 
-	done:
+		done:
 	}
 }
 
@@ -16988,7 +17148,7 @@ static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, co
 		neg			eax
 
 		align		16
-	loop2:
+		loop2:
 		add			edi, 8*4
 
 		movsx		ecx, word ptr [esi+eax+0]
@@ -17007,7 +17167,7 @@ static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, co
 		add			eax, 2*2
 		jl			loop2
 
-	done2:
+		done2:
 	}
 }
 
@@ -17029,7 +17189,7 @@ static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, cons
 		neg			eax
 
 		align		16
-	loop2:
+		loop2:
 		add			edi, 4*4
 
 		movsx		ecx, word ptr [esi+eax+0]
@@ -17039,13 +17199,13 @@ static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, cons
 		cvtsi2ss	xmm1, edx
 
 		shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movlps		[edi-4*4+0], xmm0
-		movhps		[edi-4*4+8], xmm0
+		movlps		[edi - 4 * 4 + 0], xmm0
+		movhps		[edi - 4 * 4 + 8], xmm0
 
-		add			eax, 2*2
+		add			eax, 2 * 2
 		jl			loop2
 
-	done2:
+		done2:
 		mov			eax, numSamples
 		and			eax, 1
 		jz			done
@@ -17055,7 +17215,7 @@ static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, cons
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
 		movlps		[edi], xmm0
 
-	done:
+		done:
 	}
 }
 
@@ -17077,7 +17237,7 @@ static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, co
 		neg			eax
 
 		align		16
-	loop2:
+		loop2:
 		add			edi, 4*4
 
 		movsx		ecx, word ptr [esi+eax+0]
@@ -17093,7 +17253,7 @@ static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, co
 		add			eax, 2*2
 		jl			loop2
 
-	done2:
+		done2:
 	}
 }
 
@@ -17115,7 +17275,7 @@ static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, cons
 		neg			eax
 
 		align		16
-	loop2:
+		loop2:
 		add			edi, 2*4
 
 		movsx		ecx, word ptr [esi+eax+0]
@@ -17129,7 +17289,7 @@ static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, cons
 		add			eax, 2*2
 		jl			loop2
 
-	done2:
+		done2:
 		mov			eax, numSamples
 		and			eax, 1
 		jz			done
@@ -17138,7 +17298,7 @@ static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, cons
 		cvtsi2ss	xmm0, ecx
 		movss		[edi], xmm0
 
-	done:
+		done:
 	}
 }
 
@@ -17194,25 +17354,25 @@ static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, cons
 		neg			eax
 
 		align		16
-	loop2:
-		add			edi, 2*16
+		loop2:
+		add			edi, 2 * 16
 
-		movss		xmm0, [esi+eax+0]
+		movss		xmm0, [esi + eax + 0]
 		mulss		xmm0, xmm7
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movlps		[edi-32], xmm0
-		movlps		[edi-24], xmm0
+		movlps		[edi - 32], xmm0
+		movlps		[edi - 24], xmm0
 
-		movss		xmm1, [esi+eax+4]
+		movss		xmm1, [esi + eax + 4]
 		mulss		xmm1, xmm7
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movlps		[edi-16], xmm1
-		movlps		[edi- 8], xmm1
+		movlps		[edi - 16], xmm1
+		movlps		[edi - 8], xmm1
 
-		add			eax, 2*4
+		add			eax, 2 * 4
 		jl			loop2
 
-	done2:
+		done2:
 		mov			eax, numSamples
 		and			eax, 1
 		jz			done
@@ -17220,10 +17380,10 @@ static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, cons
 		movss		xmm0, [esi]
 		mulss		xmm0, xmm7
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movlps		[edi+0], xmm0
-		movlps		[edi+8], xmm0
+		movlps		[edi + 0], xmm0
+		movlps		[edi + 8], xmm0
 
-	done:
+		done:
 	}
 }
 
@@ -17232,7 +17392,7 @@ static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, cons
 SSE_UpSample11kHzStereoOGGTo44kHz
 ============
 */
-static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
+static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float *const *src, const int numSamples ) {
 	float constant = 32768.0f;
 	__asm {
 		mov			esi, src
@@ -17251,26 +17411,26 @@ static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const
 		neg			eax
 
 		align		16
-	loop2:
-		add			edi, 4*16
+		loop2:
+		add			edi, 4 * 16
 
-		movlps		xmm0, [ecx+eax]
-		movlps		xmm1, [edx+eax]
+		movlps		xmm0, [ecx + eax]
+		movlps		xmm1, [edx + eax]
 		unpcklps	xmm0, xmm1
 		mulps		xmm0, xmm7
-		movlps		[edi-8*8], xmm0
-		movlps		[edi-7*8], xmm0
-		movlps		[edi-6*8], xmm0
-		movlps		[edi-5*8], xmm0
-		movhps		[edi-4*8], xmm0
-		movhps		[edi-3*8], xmm0
-		movhps		[edi-2*8], xmm0
-		movhps		[edi-1*8], xmm0
-
-		add			eax, 2*4
+		movlps		[edi - 8 * 8], xmm0
+		movlps		[edi - 7 * 8], xmm0
+		movlps		[edi - 6 * 8], xmm0
+		movlps		[edi - 5 * 8], xmm0
+		movhps		[edi - 4 * 8], xmm0
+		movhps		[edi - 3 * 8], xmm0
+		movhps		[edi - 2 * 8], xmm0
+		movhps		[edi - 1 * 8], xmm0
+
+		add			eax, 2 * 4
 		jl			loop2
 
-	done2:
+		done2:
 		mov			eax, numSamples
 		and			eax, 1
 		jz			done
@@ -17279,12 +17439,12 @@ static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const
 		movss		xmm1, [edx]
 		unpcklps	xmm0, xmm1
 		mulps		xmm0, xmm7
-		movlps		[edi+0*8], xmm0
-		movlps		[edi+1*8], xmm0
-		movlps		[edi+2*8], xmm0
-		movlps		[edi+3*8], xmm0
+		movlps		[edi + 0 * 8], xmm0
+		movlps		[edi + 1 * 8], xmm0
+		movlps		[edi + 2 * 8], xmm0
+		movlps		[edi + 3 * 8], xmm0
 
-	done:
+		done:
 	}
 }
 
@@ -17309,20 +17469,20 @@ static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, cons
 		neg			eax
 
 		align		16
-	loop2:
-		add			edi, 2*8
+		loop2:
+		add			edi, 2 * 8
 
-		movss		xmm0, [esi+eax+0]
-		movss		xmm1, [esi+eax+4]
+		movss		xmm0, [esi + eax + 0]
+		movss		xmm1, [esi + eax + 4]
 		shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm0, xmm7
-		movlps		[edi-16], xmm0
-		movhps		[edi- 8], xmm0
+		movlps		[edi - 16], xmm0
+		movhps		[edi - 8], xmm0
 
-		add			eax, 2*4
+		add			eax, 2 * 4
 		jl			loop2
 
-	done2:
+		done2:
 		mov			eax, numSamples
 		and			eax, 1
 		jz			done
@@ -17330,9 +17490,9 @@ static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, cons
 		movss		xmm0, [esi]
 		mulss		xmm0, xmm7
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
-		movlps		[edi+0], xmm0
+		movlps		[edi + 0], xmm0
 
-	done:
+		done:
 	}
 }
 
@@ -17341,7 +17501,7 @@ static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, cons
 SSE_UpSample22kHzStereoOGGTo44kHz
 ============
 */
-static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
+static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float *const *src, const int numSamples ) {
 	float constant = 32768.0f;
 	__asm {
 		mov			esi, src
@@ -17360,22 +17520,22 @@ static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const
 		neg			eax
 
 		align		16
-	loop2:
-		add			edi, 2*16
+		loop2:
+		add			edi, 2 * 16
 
-		movlps		xmm0, [ecx+eax]
-		movlps		xmm1, [edx+eax]
+		movlps		xmm0, [ecx + eax]
+		movlps		xmm1, [edx + eax]
 		unpcklps	xmm0, xmm1
 		mulps		xmm0, xmm7
-		movlps		[edi-4*8], xmm0
-		movlps		[edi-3*8], xmm0
-		movhps		[edi-2*8], xmm0
-		movhps		[edi-1*8], xmm0
+		movlps		[edi - 4 * 8], xmm0
+		movlps		[edi - 3 * 8], xmm0
+		movhps		[edi - 2 * 8], xmm0
+		movhps		[edi - 1 * 8], xmm0
 
-		add			eax, 2*4
+		add			eax, 2 * 4
 		jl			loop2
 
-	done2:
+		done2:
 		mov			eax, numSamples
 		and			eax, 1
 		jz			done
@@ -17384,10 +17544,10 @@ static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const
 		movss		xmm1, [edx]
 		unpcklps	xmm0, xmm1
 		mulps		xmm0, xmm7
-		movlps		[edi+0*8], xmm0
-		movlps		[edi+1*8], xmm0
+		movlps		[edi + 0 * 8], xmm0
+		movlps		[edi + 1 * 8], xmm0
 
-	done:
+		done:
 	}
 }
 
@@ -17406,7 +17566,7 @@ static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, cons
 SSE_UpSample44kHzStereoOGGTo44kHz
 ============
 */
-static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
+static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float *const *src, const int numSamples ) {
 	float constant = 32768.0f;
 	__asm {
 		mov			esi, src
@@ -17425,20 +17585,20 @@ static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const
 		neg			eax
 
 		align		16
-	loop2:
+		loop2:
 		add			edi, 16
 
-		movlps		xmm0, [ecx+eax]
-		movlps		xmm1, [edx+eax]
+		movlps		xmm0, [ecx + eax]
+		movlps		xmm1, [edx + eax]
 		unpcklps	xmm0, xmm1
 		mulps		xmm0, xmm7
-		movlps		[edi-2*8], xmm0
-		movhps		[edi-1*8], xmm0
+		movlps		[edi - 2 * 8], xmm0
+		movhps		[edi - 1 * 8], xmm0
 
-		add			eax, 2*4
+		add			eax, 2 * 4
 		jl			loop2
 
-	done2:
+		done2:
 		mov			eax, numSamples
 		and			eax, 1
 		jz			done
@@ -17447,9 +17607,9 @@ static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const
 		movss		xmm1, [edx]
 		unpcklps	xmm0, xmm1
 		mulps		xmm0, xmm7
-		movlps		[edi+0*8], xmm0
+		movlps		[edi + 0 * 8], xmm0
 
-	done:
+		done:
 	}
 }
 
@@ -17460,7 +17620,7 @@ idSIMD_SSE::UpSampleOGGTo44kHz
   Duplicate samples for 44kHz output.
 ============
 */
-void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
+void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float *const *ogg, const int numSamples, const int kHz, const int numChannels ) {
 	if ( kHz == 11025 ) {
 		if ( numChannels == 1 ) {
 			SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
@@ -17517,38 +17677,38 @@ void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *s
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
 		addps		xmm7, xmm7
 
-	loop16:
-		add			edi, 4*4*4
+		loop16:
+		add			edi, 4 * 4 * 4
 
-		movaps		xmm0, [esi+eax+0*4*4]
+		movaps		xmm0, [esi + eax + 0 * 4 * 4]
 		movaps		xmm1, xmm0
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
 		mulps		xmm0, xmm6
-		addps		xmm0, [edi-4*4*4]
+		addps		xmm0, [edi - 4 * 4 * 4]
 		addps		xmm6, xmm7
-		movaps		[edi-4*4*4], xmm0
+		movaps		[edi - 4 * 4 * 4], xmm0
 
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
 		mulps		xmm1, xmm6
-		addps		xmm1, [edi-3*4*4]
+		addps		xmm1, [edi - 3 * 4 * 4]
 		addps		xmm6, xmm7
-		movaps		[edi-3*4*4], xmm1
+		movaps		[edi - 3 * 4 * 4], xmm1
 
-		movaps		xmm2, [esi+eax+1*4*4]
+		movaps		xmm2, [esi + eax + 1 * 4 * 4]
 		movaps		xmm3, xmm2
 		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
 		mulps		xmm2, xmm6
-		addps		xmm2, [edi-2*4*4]
+		addps		xmm2, [edi - 2 * 4 * 4]
 		addps		xmm6, xmm7
-		movaps		[edi-2*4*4], xmm2
+		movaps		[edi - 2 * 4 * 4], xmm2
 
 		shufps		xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )
 		mulps		xmm3, xmm6
-		addps		xmm3, [edi-1*4*4]
+		addps		xmm3, [edi - 1 * 4 * 4]
 		addps		xmm6, xmm7
-		movaps		[edi-1*4*4], xmm3
+		movaps		[edi - 1 * 4 * 4], xmm3
 
-		add			eax, 2*4*4
+		add			eax, 2 * 4 * 4
 
 		jl			loop16
 	}
@@ -17574,11 +17734,11 @@ void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *s
 	incL *= 2;
 	incR *= 2;
 
-	for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
-		mixBuffer[i*2+0] += samples[i+0] * sL0;
-		mixBuffer[i*2+1] += samples[i+0] * sR0;
-		mixBuffer[i*2+2] += samples[i+1] * sL1;
-		mixBuffer[i*2+3] += samples[i+1] * sR1;
+	for ( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
+		mixBuffer[i * 2 + 0] += samples[i + 0] * sL0;
+		mixBuffer[i * 2 + 1] += samples[i + 0] * sR0;
+		mixBuffer[i * 2 + 2] += samples[i + 1] * sL1;
+		mixBuffer[i * 2 + 3] += samples[i + 1] * sR1;
 		sL0 += incL;
 		sR0 += incR;
 		sL1 += incL;
@@ -17620,34 +17780,34 @@ void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float
 		shufps		xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
 		addps		xmm7, xmm7
 
-	loop16:
-		add			edi, 4*4*4
+		loop16:
+		add			edi, 4 * 4 * 4
 
-		movaps		xmm0, [esi+eax+0*4*4]
+		movaps		xmm0, [esi + eax + 0 * 4 * 4]
 		mulps		xmm0, xmm6
-		addps		xmm0, [edi-4*4*4]
+		addps		xmm0, [edi - 4 * 4 * 4]
 		addps		xmm6, xmm7
-		movaps		[edi-4*4*4], xmm0
+		movaps		[edi - 4 * 4 * 4], xmm0
 
-		movaps		xmm2, [esi+eax+1*4*4]
+		movaps		xmm2, [esi + eax + 1 * 4 * 4]
 		mulps		xmm2, xmm6
-		addps		xmm2, [edi-3*4*4]
+		addps		xmm2, [edi - 3 * 4 * 4]
 		addps		xmm6, xmm7
-		movaps		[edi-3*4*4], xmm2
+		movaps		[edi - 3 * 4 * 4], xmm2
 
-		movaps		xmm3, [esi+eax+2*4*4]
+		movaps		xmm3, [esi + eax + 2 * 4 * 4]
 		mulps		xmm3, xmm6
-		addps		xmm3, [edi-2*4*4]
+		addps		xmm3, [edi - 2 * 4 * 4]
 		addps		xmm6, xmm7
-		movaps		[edi-2*4*4], xmm3
+		movaps		[edi - 2 * 4 * 4], xmm3
 
-		movaps		xmm4, [esi+eax+3*4*4]
+		movaps		xmm4, [esi + eax + 3 * 4 * 4]
 		mulps		xmm4, xmm6
-		addps		xmm4, [edi-1*4*4]
+		addps		xmm4, [edi - 1 * 4 * 4]
 		addps		xmm6, xmm7
-		movaps		[edi-1*4*4], xmm4
+		movaps		[edi - 1 * 4 * 4], xmm4
 
-		add			eax, 4*4*4
+		add			eax, 4 * 4 * 4
 
 		jl			loop16
 	}
@@ -17673,11 +17833,11 @@ void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float
 	incL *= 2;
 	incR *= 2;
 
-	for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
-		mixBuffer[i*2+0] += samples[i*2+0] * sL0;
-		mixBuffer[i*2+1] += samples[i*2+1] * sR0;
-		mixBuffer[i*2+2] += samples[i*2+2] * sL1;
-		mixBuffer[i*2+3] += samples[i*2+3] * sR1;
+	for ( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
+		mixBuffer[i * 2 + 0] += samples[i * 2 + 0] * sL0;
+		mixBuffer[i * 2 + 1] += samples[i * 2 + 1] * sR0;
+		mixBuffer[i * 2 + 2] += samples[i * 2 + 2] * sL1;
+		mixBuffer[i * 2 + 3] += samples[i * 2 + 3] * sR1;
 		sL0 += incL;
 		sR0 += incR;
 		sL1 += incL;
@@ -17724,8 +17884,8 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *s
 
 		xorps		xmm5, xmm5
 		movhps		xmm5, incs
-		movlps		xmm7, incs+8
-		movhps		xmm7, incs+16
+		movlps		xmm7, incs + 8
+		movhps		xmm7, incs + 16
 		addps		xmm3, xmm5
 		addps		xmm4, xmm7
 		shufps		xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
@@ -17735,53 +17895,53 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *s
 		addps		xmm6, xmm6
 		addps		xmm7, xmm7
 
-	loop24:
-		add			edi, 6*16
+		loop24:
+		add			edi, 6 * 16
 
-		movaps		xmm0, [esi+eax]
+		movaps		xmm0, [esi + eax]
 
 		movaps		xmm1, xmm0
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
 		mulps		xmm1, xmm2
-		addps		xmm1, [edi-6*16]
+		addps		xmm1, [edi - 6 * 16]
 		addps		xmm2, xmm5
-		movaps		[edi-6*16], xmm1
+		movaps		[edi - 6 * 16], xmm1
 
 		movaps		xmm1, xmm0
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
 		mulps		xmm1, xmm3
-		addps		xmm1, [edi-5*16]
+		addps		xmm1, [edi - 5 * 16]
 		addps		xmm3, xmm6
-		movaps		[edi-5*16], xmm1
+		movaps		[edi - 5 * 16], xmm1
 
 		movaps		xmm1, xmm0
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
 		mulps		xmm1, xmm4
-		addps		xmm1, [edi-4*16]
+		addps		xmm1, [edi - 4 * 16]
 		addps		xmm4, xmm7
-		movaps		[edi-4*16], xmm1
+		movaps		[edi - 4 * 16], xmm1
 
 		movaps		xmm1, xmm0
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 )
 		mulps		xmm1, xmm2
-		addps		xmm1, [edi-3*16]
+		addps		xmm1, [edi - 3 * 16]
 		addps		xmm2, xmm5
-		movaps		[edi-3*16], xmm1
+		movaps		[edi - 3 * 16], xmm1
 
 		movaps		xmm1, xmm0
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
 		mulps		xmm1, xmm3
-		addps		xmm1, [edi-2*16]
+		addps		xmm1, [edi - 2 * 16]
 		addps		xmm3, xmm6
-		movaps		[edi-2*16], xmm1
+		movaps		[edi - 2 * 16], xmm1
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 )
 		mulps		xmm0, xmm4
-		addps		xmm0, [edi-1*16]
+		addps		xmm0, [edi - 1 * 16]
 		addps		xmm4, xmm7
-		movaps		[edi-1*16], xmm0
+		movaps		[edi - 1 * 16], xmm0
 
-		add			eax, 4*4
+		add			eax, 4 * 4
 
 		jl			loop24
 	}
@@ -17822,21 +17982,21 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *s
 	incL4 *= 2;
 	incL5 *= 2;
 
-	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
-		mixBuffer[i*6+ 0] += samples[i+0] * sL0;
-		mixBuffer[i*6+ 1] += samples[i+0] * sL1;
-		mixBuffer[i*6+ 2] += samples[i+0] * sL2;
-		mixBuffer[i*6+ 3] += samples[i+0] * sL3;
+	for ( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
+		mixBuffer[i * 6 + 0] += samples[i + 0] * sL0;
+		mixBuffer[i * 6 + 1] += samples[i + 0] * sL1;
+		mixBuffer[i * 6 + 2] += samples[i + 0] * sL2;
+		mixBuffer[i * 6 + 3] += samples[i + 0] * sL3;
 
-		mixBuffer[i*6+ 4] += samples[i+0] * sL4;
-		mixBuffer[i*6+ 5] += samples[i+0] * sL5;
-		mixBuffer[i*6+ 6] += samples[i+1] * sL6;
-		mixBuffer[i*6+ 7] += samples[i+1] * sL7;
+		mixBuffer[i * 6 + 4] += samples[i + 0] * sL4;
+		mixBuffer[i * 6 + 5] += samples[i + 0] * sL5;
+		mixBuffer[i * 6 + 6] += samples[i + 1] * sL6;
+		mixBuffer[i * 6 + 7] += samples[i + 1] * sL7;
 
-		mixBuffer[i*6+ 8] += samples[i+1] * sL8;
-		mixBuffer[i*6+ 9] += samples[i+1] * sL9;
-		mixBuffer[i*6+10] += samples[i+1] * sL10;
-		mixBuffer[i*6+11] += samples[i+1] * sL11;
+		mixBuffer[i * 6 + 8] += samples[i + 1] * sL8;
+		mixBuffer[i * 6 + 9] += samples[i + 1] * sL9;
+		mixBuffer[i * 6 + 10] += samples[i + 1] * sL10;
+		mixBuffer[i * 6 + 11] += samples[i + 1] * sL11;
 
 		sL0  += incL0;
 		sL1  += incL1;
@@ -17896,8 +18056,8 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float
 
 		xorps		xmm5, xmm5
 		movhps		xmm5, incs
-		movlps		xmm7, incs+ 8
-		movhps		xmm7, incs+16
+		movlps		xmm7, incs + 8
+		movhps		xmm7, incs + 16
 		addps		xmm3, xmm5
 		addps		xmm4, xmm7
 		shufps		xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
@@ -17907,32 +18067,32 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float
 		addps		xmm6, xmm6
 		addps		xmm7, xmm7
 
-	loop12:
-		add			edi, 3*16
+		loop12:
+		add			edi, 3 * 16
 
-		movaps		xmm0, [esi+eax+0]
+		movaps		xmm0, [esi + eax + 0]
 
 		movaps		xmm1, xmm0
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 )
 		mulps		xmm1, xmm2
-		addps		xmm1, [edi-3*16]
+		addps		xmm1, [edi - 3 * 16]
 		addps		xmm2, xmm5
-		movaps		[edi-3*16], xmm1
+		movaps		[edi - 3 * 16], xmm1
 
 		movaps		xmm1, xmm0
 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 )
 		mulps		xmm1, xmm3
-		addps		xmm1, [edi-2*16]
+		addps		xmm1, [edi - 2 * 16]
 		addps		xmm3, xmm6
-		movaps		[edi-2*16], xmm1
+		movaps		[edi - 2 * 16], xmm1
 
-		add			eax, 4*4
+		add			eax, 4 * 4
 
 		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 )
 		mulps		xmm0, xmm4
-		addps		xmm0, [edi-1*16]
+		addps		xmm0, [edi - 1 * 16]
 		addps		xmm4, xmm7
-		movaps		[edi-1*16], xmm0
+		movaps		[edi - 1 * 16], xmm0
 
 		jl			loop12
 
@@ -17977,21 +18137,21 @@ void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float
 	incL4 *= 2;
 	incL5 *= 2;
 
-	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
-		mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
-		mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
-		mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
-		mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;
+	for ( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
+		mixBuffer[i * 6 + 0] += samples[i * 2 + 0 + 0] * sL0;
+		mixBuffer[i * 6 + 1] += samples[i * 2 + 0 + 1] * sL1;
+		mixBuffer[i * 6 + 2] += samples[i * 2 + 0 + 0] * sL2;
+		mixBuffer[i * 6 + 3] += samples[i * 2 + 0 + 0] * sL3;
 
-		mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
-		mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
-		mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
-		mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;
+		mixBuffer[i * 6 + 4] += samples[i * 2 + 0 + 0] * sL4;
+		mixBuffer[i * 6 + 5] += samples[i * 2 + 0 + 1] * sL5;
+		mixBuffer[i * 6 + 6] += samples[i * 2 + 2 + 0] * sL6;
+		mixBuffer[i * 6 + 7] += samples[i * 2 + 2 + 1] * sL7;
 
-		mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
-		mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
-		mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
-		mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;
+		mixBuffer[i * 6 + 8] += samples[i * 2 + 2 + 0] * sL8;
+		mixBuffer[i * 6 + 9] += samples[i * 2 + 2 + 0] * sL9;
+		mixBuffer[i * 6 + 10] += samples[i * 2 + 2 + 0] * sL10;
+		mixBuffer[i * 6 + 11] += samples[i * 2 + 2 + 1] * sL11;
 
 		sL0  += incL0;
 		sL1  += incL1;
@@ -18031,7 +18191,7 @@ void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuf
 		add			edi, eax
 		neg			eax
 
-	loop16:
+		loop16:
 
 		movaps		xmm0, [edi+eax+0*16]
 		movaps		xmm2, [edi+eax+1*16]
@@ -18084,15 +18244,13 @@ void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuf
 		} else if ( mixBuffer[i] >= 32767.0f ) {
 			samples[i] = 32767;
 		} else {
-			samples[i] = (short) mixBuffer[i];
+			samples[i] = ( short ) mixBuffer[i];
 		}
 	}
 
 #endif
 }
 
-#endif  /* _MSC_VER */
-
 /*
 ============
 idSIMD_SSE::CullByFrustum
@@ -18194,3 +18352,121 @@ void VPCALL idSIMD_SSE::CullByFrustum2( idDrawVert *verts, const int numVerts, c
 		pointCull[j] = mask_lo & mask6 | ( mask_hi & mask6 ) << 6;
 	}
 }
+
+// Revelator: intrinsics only (no inline assembly), so this code is portable across gcc, clang and msvc on x86 or x64; this branch is currently enabled for MSVC x64 only
+#elif defined(_MSC_VER) && defined(_M_X64)
+
+#include <xmmintrin.h>
+
+/*
+============
+idSIMD_SSE::GetName
+============
+*/
+const char *idSIMD_SSE::GetName( void ) const {	// name string reported for this SIMD implementation
+	return "MMX & SSE";
+}
+
+/*
+============
+idSIMD_SSE::CullByFrustum
+============
+*/
+void VPCALL idSIMD_SSE::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) {	// writes one 6-bit mask per vertex into pointCull: bit i is set when plane i evaluated at the vertex is < epsilon
+	__m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] );	// lanes 0-3 = component [0] of planes 0-3 (_mm_set_ps takes the highest lane first)
+	__m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] );	// lanes 0-1 = component [0] of planes 4-5, lanes 2-3 are zero padding
+	__m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] );	// component [1], same lane layout
+	__m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] );
+	__m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );	// component [2]
+	__m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] );
+	__m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );	// component [3], added as the constant term below
+	__m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] );
+
+	for ( int j = 0; j < numVerts; j++ ) {
+		idVec3 &vec = verts[j].xyz;
+		__m128 vX = _mm_set1_ps( vec.x );	// vertex x broadcast to all four lanes
+		__m128 vY = _mm_set1_ps( vec.y );
+		__m128 vZ = _mm_set1_ps( vec.z );
+		__m128 d14 = _mm_add_ps(	// per lane: A*x + B*y + C*z + D for planes 0-3
+		                 _mm_add_ps(
+		                     _mm_mul_ps( fA14, vX ),
+		                     _mm_mul_ps( fB14, vY )
+		                 ),
+		                 _mm_add_ps(
+		                     _mm_mul_ps( fC14, vZ ),
+		                     fD14
+		                 )
+		             );
+		__m128 d56 = _mm_add_ps(	// same expression for planes 4-5 in lanes 0-1
+		                 _mm_add_ps(
+		                     _mm_mul_ps( fA56, vX ),
+		                     _mm_mul_ps( fB56, vY )
+		                 ),
+		                 _mm_add_ps(
+		                     _mm_mul_ps( fC56, vZ ),
+		                     fD56
+		                 )
+		             );
+		const short mask6 = ( 1 << 6 ) - 1;	// 0x3F: keeps only the six plane bits
+		__m128 eps = _mm_set1_ps( epsilon );
+		int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) );	// bit i set when lane i of d14 < epsilon
+		int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) );
+		int mask_lo = mask_lo14 | mask_lo56 << 4;	// '<<' binds tighter than '|': planes 4-5 land in bits 4-5
+		pointCull[j] = mask_lo & mask6;	// mask6 discards bits 6-7, which come from the zero-padded lanes of d56
+	}
+}
+
+/*
+============
+idSIMD_SSE::CullByFrustum2
+============
+*/
+void VPCALL idSIMD_SSE::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) {	// writes a 12-bit mask per vertex: bits 0-5 set when plane value < epsilon, bits 6-11 set when plane value > -epsilon
+	__m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] );	// lanes 0-3 = component [0] of planes 0-3 (_mm_set_ps takes the highest lane first)
+	__m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] );	// lanes 0-1 = component [0] of planes 4-5, lanes 2-3 are zero padding
+	__m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] );	// component [1], same lane layout
+	__m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] );
+	__m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );	// component [2]
+	__m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] );
+	__m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );	// component [3], added as the constant term below
+	__m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] );
+
+	for ( int j = 0; j < numVerts; j++ ) {
+		idVec3 &vec = verts[j].xyz;
+		__m128 vX = _mm_set1_ps( vec.x );	// vertex x broadcast to all four lanes
+		__m128 vY = _mm_set1_ps( vec.y );
+		__m128 vZ = _mm_set1_ps( vec.z );
+		__m128 d14 = _mm_add_ps(	// per lane: A*x + B*y + C*z + D for planes 0-3
+		                 _mm_add_ps(
+		                     _mm_mul_ps( fA14, vX ),
+		                     _mm_mul_ps( fB14, vY )
+		                 ),
+		                 _mm_add_ps(
+		                     _mm_mul_ps( fC14, vZ ),
+		                     fD14
+		                 )
+		             );
+		__m128 d56 = _mm_add_ps(	// same expression for planes 4-5 in lanes 0-1
+		                 _mm_add_ps(
+		                     _mm_mul_ps( fA56, vX ),
+		                     _mm_mul_ps( fB56, vY )
+		                 ),
+		                 _mm_add_ps(
+		                     _mm_mul_ps( fC56, vZ ),
+		                     fD56
+		                 )
+		             );
+		const short mask6 = ( 1 << 6 ) - 1;	// 0x3F: keeps only the six plane bits
+		__m128 eps = _mm_set1_ps( epsilon );
+		int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) );	// bit i set when lane i of d14 < epsilon
+		int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) );
+		eps = _mm_set1_ps( -epsilon );	// reuse eps for the opposite-side test
+		int mask_hi14 = _mm_movemask_ps( _mm_cmpgt_ps( d14, eps ) );	// bit i set when lane i of d14 > -epsilon
+		int mask_hi56 = _mm_movemask_ps( _mm_cmpgt_ps( d56, eps ) );
+		int mask_lo = mask_lo14 | mask_lo56 << 4;	// '<<' binds tighter than '|': planes 4-5 land in bits 4-5
+		int mask_hi = mask_hi14 | mask_hi56 << 4;
+		pointCull[j] = mask_lo & mask6 | ( mask_hi & mask6 ) << 6;	// parses as ( mask_lo & mask6 ) | ( ( mask_hi & mask6 ) << 6 ); mask6 drops the zero-padded lanes
+	}
+}
+
+#endif  /* _MSC_VER */
diff --git a/neo/idlib/math/Simd_SSE.h b/neo/idlib/math/Simd_SSE.h
index 859d8ce07..2391fb106 100644
--- a/neo/idlib/math/Simd_SSE.h
+++ b/neo/idlib/math/Simd_SSE.h
@@ -46,10 +46,14 @@ class idSIMD_SSE : public idSIMD_MMX {
 	using idSIMD_MMX::MinMax;
 
 	virtual const char *VPCALL GetName( void ) const;
+
 	virtual void VPCALL Dot( float *dst,			const idPlane &constant, const idDrawVert *src,	const int count );
 	virtual	void VPCALL MinMax( idVec3 &min,		idVec3 &max,			const idDrawVert *src,	const int *indexes,		const int count );
 	virtual void VPCALL Dot( float *dst,			const idVec3 &constant,	const idPlane *src,		const int count );
 
+	virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
+	virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
+
 #elif defined(_MSC_VER) && defined(_M_IX86)
 	virtual const char *VPCALL GetName( void ) const;
 
@@ -143,12 +147,18 @@ class idSIMD_SSE : public idSIMD_MMX {
 	virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] );
 	virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples );
 
-#endif
+	virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
+	virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
+
+// Revelator: intrinsics only (no inline assembly), so these are portable across gcc, clang and msvc on x86 or x64; this branch is currently enabled for MSVC x64 only
+#elif defined(_MSC_VER) && defined(_M_X64)
+
+	virtual const char *VPCALL GetName( void ) const;
 
-	// Revelator: these work whether in gcc clang or msvc x86 or x64 (no inline assembly used)
 	virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
 	virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
 
+#endif  /* _MSC_VER */
 };
 
 #endif /* !__MATH_SIMD_SSE_H__ */