From 8b51726ccbb5f0d7820cba0e08d9e093b5376580 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Sat, 20 Jun 2020 13:29:11 +0200 Subject: [PATCH] Use "SIMD Everywhere" to enable wider portability --- .gitmodules | 3 + examl/Makefile.AVX.gcc | 6 +- examl/Makefile.KNL.icc | 4 +- examl/Makefile.MIC.icc | 4 +- examl/Makefile.OMP.AVX.gcc | 6 +- examl/Makefile.OMP.SSE3.gcc | 14 +- examl/Makefile.SSE3.gcc | 17 +- examl/avxLikelihood.c | 5 +- examl/axml.c | 7 +- examl/axml.h | 8 +- examl/evaluateGenericSpecial.c | 232 +- examl/evaluatePartialGenericSpecial.c | 206 +- examl/makenewzGenericSpecial.c | 451 +-- examl/mic_native_aa.c | 3 +- examl/newviewGenericSpecial.c | 4946 +------------------------ parser/Makefile.SSE3.gcc | 2 +- parser/axml.c | 7 +- parser/axml.h | 4 - simde | 1 + 19 files changed, 85 insertions(+), 5841 deletions(-) create mode 100644 .gitmodules create mode 160000 simde diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..9c36977 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "simde"] + path = simde + url = https://github.com/nemequ/simde-no-tests.git diff --git a/examl/Makefile.AVX.gcc b/examl/Makefile.AVX.gcc index 31f7b1c..8ceb254 100644 --- a/examl/Makefile.AVX.gcc +++ b/examl/Makefile.AVX.gcc @@ -3,7 +3,7 @@ CC = mpicc -COMMON_FLAGS = -D__SIM_SSE3 -D__AVX -D_OPTIMIZED_FUNCTIONS -msse3 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE #-Wall -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter +COMMON_FLAGS = -I.. 
-msse3 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE #-Wall -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter OPT_FLAG_1 = -O1 OPT_FLAG_2 = -O2 @@ -21,10 +21,10 @@ all : clean examl-AVX GLOBAL_DEPS = axml.h globalVariables.h ../versionHeader/version.h examl-AVX : $(objs) - $(CC) -o examl-AVX $(objs) $(LIBRARIES) $(LDFLAGS) + $(CC) $(CFLAGS) -o $@ $(objs) $(LIBRARIES) $(LDFLAGS) avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS) - $(CC) $(CFLAGS) $(CPPFLAGS) -mavx -c -o avxLikelihood.o avxLikelihood.c + $(CC) $(CFLAGS) $(CPPFLAGS) -mavx -c -o $@ avxLikelihood.c models.o : models.c $(GLOBAL_DEPS) $(CC) $(CFLAGS) $(COMMON_FLAGS) $(OPT_FLAG_1) $(CPPFLAGS) -c -o models.o models.c diff --git a/examl/Makefile.KNL.icc b/examl/Makefile.KNL.icc index 77e496f..f5cd613 100644 --- a/examl/Makefile.KNL.icc +++ b/examl/Makefile.KNL.icc @@ -7,7 +7,7 @@ CC = mpicc KNLFLAGS=-xMIC-AVX512 -fma -align -finline-functions -D__MIC_NATIVE -qopenmp -D_USE_OMP -COMMON_FLAGS = -std=c99 -D__SIM_SSE3 -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE $(KNLFLAGS) # -Wall -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter +COMMON_FLAGS = -I.. 
-std=c99 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE $(KNLFLAGS) # -Wall -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter OPT_FLAG_1 = -O1 OPT_FLAG_2 = -O2 @@ -25,7 +25,7 @@ all : clean examl-KNL GLOBAL_DEPS = axml.h globalVariables.h examl-KNL : $(objs) - $(CC) -o examl-KNL $(objs) $(LIBRARIES) + $(CC) $(CFLAGS) -o examl-KNL $(objs) $(LIBRARIES) models.o : models.c $(GLOBAL_DEPS) $(CC) $(COMMON_FLAGS) $(OPT_FLAG_1) -c -o models.o models.c diff --git a/examl/Makefile.MIC.icc b/examl/Makefile.MIC.icc index db2d751..f9aba68 100644 --- a/examl/Makefile.MIC.icc +++ b/examl/Makefile.MIC.icc @@ -4,7 +4,7 @@ CC = mpicc MICFLAGS = -D__MIC_NATIVE -mmic -qopt-streaming-cache-evict=0 -qopenmp -D_USE_OMP #-D_PROFILE_MPI -COMMON_FLAGS = -std=c99 -D__SIM_SSE3 -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE $(MICFLAGS) # -Wall -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter +COMMON_FLAGS = -I.. 
-std=c99 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE $(MICFLAGS) # -Wall -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter OPT_FLAG_1 = -O1 OPT_FLAG_2 = -O2 @@ -22,7 +22,7 @@ all : clean examl-MIC GLOBAL_DEPS = axml.h globalVariables.h examl-MIC : $(objs) - $(CC) -o examl-MIC $(objs) $(LIBRARIES) + $(CC) $(CFLAGS) -o examl-MIC $(objs) $(LIBRARIES) models.o : models.c $(GLOBAL_DEPS) $(CC) $(COMMON_FLAGS) $(OPT_FLAG_1) -c -o models.o models.c diff --git a/examl/Makefile.OMP.AVX.gcc b/examl/Makefile.OMP.AVX.gcc index 1af181b..15be8d4 100644 --- a/examl/Makefile.OMP.AVX.gcc +++ b/examl/Makefile.OMP.AVX.gcc @@ -3,7 +3,7 @@ CC = mpicc -COMMON_FLAGS = -D__SIM_SSE3 -D__AVX -D_USE_OMP -fopenmp -D_OPTIMIZED_FUNCTIONS -msse3 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE -Wall # -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter +COMMON_FLAGS = -I.. 
-D_USE_OMP -fopenmp -msse3 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE -Wall # -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter OPT_FLAG_1 = -O1 OPT_FLAG_2 = -O2 @@ -21,10 +21,10 @@ all : clean examl-OMP-AVX GLOBAL_DEPS = axml.h globalVariables.h ../versionHeader/version.h examl-OMP-AVX : $(objs) - $(CC) -o examl-OMP-AVX $(objs) $(LIBRARIES) $(LDFLAGS) + $(CC) $(CFLAGS) -o $@ $(objs) $(LIBRARIES) $(LDFLAGS) avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS) - $(CC) $(CFLAGS) -mavx -c -o avxLikelihood.o avxLikelihood.c + $(CC) $(CFLAGS) -mavx -c -o $@ avxLikelihood.c models.o : models.c $(GLOBAL_DEPS) $(CC) $(COMMON_FLAGS) $(OPT_FLAG_1) -c -o models.o models.c diff --git a/examl/Makefile.OMP.SSE3.gcc b/examl/Makefile.OMP.SSE3.gcc index 385aaaf..388341b 100644 --- a/examl/Makefile.OMP.SSE3.gcc +++ b/examl/Makefile.OMP.SSE3.gcc @@ -3,9 +3,9 @@ CC = mpicc -COMMON_FLAGS = -D_USE_OMP -fopenmp -D_GNU_SOURCE -D__SIM_SSE3 -msse3 -fomit-frame-pointer -funroll-loops -D_OPTIMIZED_FUNCTIONS -D_USE_ALLREDUCE -Wall #-Wunused-parameter -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter +COMMON_FLAGS = -I.. 
-D_USE_OMP -fopenmp -msse3 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE -Wall #-Wunused-parameter -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter -OPT_FLAG_1 = -O1 +OPT_FLAG_1 = -O1 OPT_FLAG_2 = -O2 CFLAGS += $(COMMON_FLAGS) $(OPT_FLAG_2) @@ -14,14 +14,17 @@ LIBRARIES = -lm -fopenmp RM = rm -f -objs = axml.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o byteFile.o partitionAssignment.o communication.o quartets.o +objs = axml.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o avxLikelihood.o byteFile.o partitionAssignment.o communication.o quartets.o all : clean examl-OMP GLOBAL_DEPS = axml.h globalVariables.h ../versionHeader/version.h examl-OMP : $(objs) - $(CC) -o examl-OMP $(objs) $(LIBRARIES) $(LDFLAGS) + $(CC) $(CFLAGS) -o $@ $(objs) $(LIBRARIES) $(LDFLAGS) + +avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS) + $(CC) $(CFLAGS) -c -o $@ avxLikelihood.c models.o : models.c $(GLOBAL_DEPS) $(CC) $(COMMON_FLAGS) $(OPT_FLAG_1) -c -o models.o models.c @@ -34,6 +37,7 @@ axml.o : axml.c $(GLOBAL_DEPS) searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS) topologies.o : topologies.c $(GLOBAL_DEPS) treeIO.o : treeIO.c $(GLOBAL_DEPS) +quartets.o : quartets.c $(GLOBAL_DEPS) models.o : models.c $(GLOBAL_DEPS) evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS) evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS) @@ -43,7 +47,7 @@ restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS) byteFile.o : byteFile.c partitionAssignment.o : partitionAssignment.c $(GLOBAL_DEPS) communication.o : communication.c $(GLOBAL_DEPS) -quartets.o : quartets.c $(GLOBAL_DEPS) + clean : $(RM) *.o examl-OMP diff --git a/examl/Makefile.SSE3.gcc b/examl/Makefile.SSE3.gcc index 7cdedc4..aeb6c81 100644 --- a/examl/Makefile.SSE3.gcc +++ b/examl/Makefile.SSE3.gcc @@ -3,10 +3,9 @@ CC = mpicc +COMMON_FLAGS = -I.. 
-D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE #-Wall -Wunused-parameter -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter -COMMON_FLAGS = -D_GNU_SOURCE -D__SIM_SSE3 -msse3 -fomit-frame-pointer -funroll-loops -D_OPTIMIZED_FUNCTIONS -D_USE_ALLREDUCE #-Wall -Wunused-parameter -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter - -OPT_FLAG_1 = -O1 +OPT_FLAG_1 = -O1 OPT_FLAG_2 = -O2 CFLAGS += $(COMMON_FLAGS) $(OPT_FLAG_2) @@ -15,17 +14,20 @@ LIBRARIES = -lm RM = rm -f -objs = axml.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o byteFile.o partitionAssignment.o communication.o quartets.o +objs = axml.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o avxLikelihood.o byteFile.o partitionAssignment.o communication.o quartets.o all : clean examl GLOBAL_DEPS = axml.h globalVariables.h ../versionHeader/version.h examl : $(objs) - $(CC) -o examl $(objs) $(LIBRARIES) $(LDFLAGS) + $(CC) $(CFLAGS) -o $@ $(objs) $(LIBRARIES) $(LDFLAGS) + +avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS) + $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ avxLikelihood.c models.o : models.c $(GLOBAL_DEPS) - $(CC) $(COMMON_FLAGS) $(OPT_FLAG_1) -c -o models.o models.c + $(CC) $(COMMON_FLAGS) $(OPT_FLAG_1) $(CPPFLAGS) -c -o models.o models.c bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS) evaluatePartialSpecialGeneric.o : evaluatePartialSpecialGeneric.c $(GLOBAL_DEPS) @@ -35,6 +37,7 @@ axml.o : axml.c $(GLOBAL_DEPS) searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS) topologies.o : topologies.c $(GLOBAL_DEPS) treeIO.o : treeIO.c $(GLOBAL_DEPS) +quartets.o : quartets.c $(GLOBAL_DEPS) models.o : models.c $(GLOBAL_DEPS) evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS) evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS) @@ -44,7 +47,7 @@ restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS) byteFile.o : byteFile.c partitionAssignment.o : partitionAssignment.c $(GLOBAL_DEPS) communication.o : communication.c $(GLOBAL_DEPS) -quartets.o : quartets.c $(GLOBAL_DEPS) + clean : $(RM) *.o examl diff --git a/examl/avxLikelihood.c b/examl/avxLikelihood.c index f4438f3..09ea869 100644 --- a/examl/avxLikelihood.c +++ b/examl/avxLikelihood.c @@ -10,9 
+10,8 @@ #include #include "axml.h" #include -#include -#include -#include +#define SIMDE_ENABLE_NATIVE_ALIASES +#include "simde/x86/avx2.h" #ifdef _FMA #include diff --git a/examl/axml.c b/examl/axml.c index eab5dc3..d10b373 100644 --- a/examl/axml.c +++ b/examl/axml.c @@ -52,7 +52,9 @@ #include -#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC)) +#define SIMDE_ENABLE_NATIVE_ALIASES +#include "simde/x86/sse.h" +#if defined(SIMDE_SSE_NATIVE) #include /* special bug fix, enforces denormalized numbers to be flushed to zero, @@ -2591,7 +2593,8 @@ int main (int argc, char *argv[]) substantial run-time differences for vectors of equal length. */ -#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC)) +#if defined(SIMDE_SSE_NATIVE) +# include _mm_setcsr( _mm_getcsr() | _MM_FLUSH_ZERO_ON); #endif diff --git a/examl/axml.h b/examl/axml.h index 95e90c1..753ca45 100644 --- a/examl/axml.h +++ b/examl/axml.h @@ -42,11 +42,8 @@ #ifdef __MIC_NATIVE #define BYTE_ALIGNMENT 64 #define VECTOR_PADDING 8 -#elif defined __AVX -#define BYTE_ALIGNMENT 32 -#define VECTOR_PADDING 1 #else -#define BYTE_ALIGNMENT 16 +#define BYTE_ALIGNMENT 32 #define VECTOR_PADDING 1 #endif @@ -1337,8 +1334,6 @@ extern void scaleLG4X_EIGN(tree *tr, int model); extern void myBinFwrite(void *ptr, size_t size, size_t nmemb, FILE *byteFile); extern void myBinFread(void *ptr, size_t size, size_t nmemb, FILE *byteFile); -#ifdef __AVX - extern void newviewGTRGAMMAPROT_AVX_LG4(int tipCase, double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4], int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, @@ -1401,7 +1396,6 @@ void newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(int tipCase, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling, unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn); -#endif diff --git a/examl/evaluateGenericSpecial.c b/examl/evaluateGenericSpecial.c index 27388f5..7971f4b 100644 --- a/examl/evaluateGenericSpecial.c +++ b/examl/evaluateGenericSpecial.c @@ -43,11 +43,8 @@ /* includes for using SSE3 intrinsics */ -#ifdef __SIM_SSE3 -#include -#include -/*#include */ -#endif +#define SIMDE_ENABLE_NATIVE_ALIASES +#include "simde/x86/sse3.h" #ifdef __MIC_NATIVE #include "mic_native.h" @@ -147,168 +144,6 @@ static void calcDiagptableFlex_LG4(double z, int numberOfCategories, double *rpt -#ifndef _OPTIMIZED_FUNCTIONS - -/* below a a slow generic implementation of the likelihood computation at the root under the GAMMA model */ - -static double evaluateGAMMA_FLEX(int *wptr, - double *x1_start, double *x2_start, - double *tipVector, - unsigned char *tipX1, const int n, double *diagptable, const int states) -{ - double - sum = 0.0, - term, - *x1, - *x2; - - int - i, - j, - k; - - /* span is the offset within the likelihood array at an inner node that gets us from the values - of site i to the values of site i + 1 */ - - const int - span = states * 4; - - /* we distingusih between two cases here: one node of the two nodes defining the branch at which we put the virtual root is - a tip. Both nodes can not be tips because we do not allow for two-taxon trees ;-) - Nota that, if a node is a tip, this will always be tipX1. This is done for code simplicity and the flipping of the nodes - is done before when we compute the traversal descriptor. 
- */ - - /* the left node is a tip */ - if(tipX1) - { - /* loop over the sites of this partition */ - for (i = 0; i < n; i++) - { - /* access pre-computed tip vector values via a lookup table */ - x1 = &(tipVector[states * tipX1[i]]); - /* access the other(inner) node at the other end of the branch */ - x2 = &(x2_start[span * i]); - - /* loop over GAMMA rate categories, hard-coded as 4 in RAxML */ - for(j = 0, term = 0.0; j < 4; j++) - /* loop over states and multiply them with the P matrix */ - for(k = 0; k < states; k++) - term += x1[k] * x2[j * states + k] * diagptable[j * states + k]; - - /* take the log of the likelihood and multiply the per-gamma rate likelihood by 1/4. - Under the GAMMA model the 4 discrete GAMMA rates all have the same probability - of 0.25 */ - - term = LOG(0.25 * FABS(term)); - - sum += wptr[i] * term; - } - } - else - { - for (i = 0; i < n; i++) - { - /* same as before, only that now we access two inner likelihood vectors x1 and x2 */ - - x1 = &(x1_start[span * i]); - x2 = &(x2_start[span * i]); - - for(j = 0, term = 0.0; j < 4; j++) - for(k = 0; k < states; k++) - term += x1[j * states + k] * x2[j * states + k] * diagptable[j * states + k]; - - term = LOG(0.25 * FABS(term)); - - sum += wptr[i] * term; - } - } - - return sum; -} - - -/* a generic and slow implementation of the CAT model of rate heterogeneity */ - -static double evaluateCAT_FLEX (int *cptr, int *wptr, - double *x1, double *x2, double *tipVector, - unsigned char *tipX1, int n, double *diagptable_start, const int states) -{ - double - sum = 0.0, - term, - *diagptable, - *left, - *right; - - int - i, - l; - - /* chosing between tip vectors and non tip vectors is identical in all flavors of this function ,regardless - of whether we are using CAT, GAMMA, DNA or protein data etc */ - - if(tipX1) - { - for (i = 0; i < n; i++) - { - /* same as in the GAMMA implementation */ - left = &(tipVector[states * tipX1[i]]); - right = &(x2[states * i]); - - /* important difference here, we do not have, as for GAMMA - 4 P matrices assigned to each site, but just one. However those - P-Matrices can be different for the sites. - Hence we index into the precalculated P-matrices for individual sites - via the category pointer cptr[i] - */ - diagptable = &diagptable_start[states * cptr[i]]; - - /* similar to gamma, with the only difference that we do not integrate (sum) - over the discrete gamma rates, but simply compute the likelihood of the - site and the given P-matrix */ - - for(l = 0, term = 0.0; l < states; l++) - term += left[l] * right[l] * diagptable[l]; - - /* take the log */ - - term = LOG(FABS(term)); - - /* - multiply the log with the pattern weight of this site. - The site pattern for which we just computed the likelihood may - represent several alignment columns sites that have been compressed - into one site pattern if they are exactly identical AND evolve under the same model, - i.e., form part of the same partition. 
- */ - - sum += wptr[i] * term; - } - } - else - { - for (i = 0; i < n; i++) - { - /* as before we now access the likelihood arrayes of two inner nodes */ - left = &x1[states * i]; - right = &x2[states * i]; - - diagptable = &diagptable_start[states * cptr[i]]; - - for(l = 0, term = 0.0; l < states; l++) - term += left[l] * right[l] * diagptable[l]; - - term = LOG(FABS(term)); - - sum += wptr[i] * term; - } - } - - return sum; -} - -#endif /* below are the function headers for unreadeble highly optimized versions of the above functions for DNA and protein data that also use SSE3 intrinsics and implement some memory saving tricks. @@ -326,7 +161,6 @@ static double evaluateCAT_FLEX (int *cptr, int *wptr, */ -#ifdef _OPTIMIZED_FUNCTIONS static double evaluateGTRGAMMA_BINARY(int *ex1, int *ex2, int *wptr, double *x1_start, double *x2_start, double *tipVector, @@ -395,7 +229,6 @@ static double evaluateGTRCAT (int *cptr, int *wptr, unsigned char *tipX1, int n, double *diagptable_start); -#endif /* This is the core function for computing the log likelihood at a branch */ @@ -668,23 +501,6 @@ void evaluateIterative(tree *tr) } -#ifndef _OPTIMIZED_FUNCTIONS - - /* generic slow functions, memory saving option is not implemented for these */ - - assert(!tr->saveMemory); - - /* decide wheter CAT or GAMMA is used and compute log like */ - - if(tr->rateHetModel == CAT) - partitionLikelihood = evaluateCAT_FLEX(tr->partitionData[model].rateCategory, wgt, - x1_start, x2_start, tr->partitionData[model].tipVector, - tip, width, diagptable, states); - else - partitionLikelihood = evaluateGAMMA_FLEX(wgt, - x1_start, x2_start, tr->partitionData[model].tipVector, - tip, width, diagptable, states); -#else /* for the optimized functions we have a dedicated, optimized function implementation for each rate heterogeneity and data type combination, we switch over the number of states @@ -814,7 +630,6 @@ void evaluateIterative(tree *tr) default: assert(0); } -#endif /* now here is a nasty part, for each partition and each node we maintain an integer counter to count how often how many entries per node were scaled by a constant factor. 
Here we use this information generated during Felsenstein's @@ -1008,8 +823,6 @@ void evaluateGeneric (tree *tr, nodeptr p, boolean fullTraversal) /* below are the optimized function versions with geeky intrinsics */ -#ifdef _OPTIMIZED_FUNCTIONS - /* binary data */ static double evaluateGTRCAT_BINARY (int *ex1, int *ex2, int *cptr, int *wptr, @@ -1174,7 +987,6 @@ static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr, { for (i = 0; i < n; i++) { -#ifdef __SIM_SSE3 __m128d tv = _mm_setzero_pd(); @@ -1203,21 +1015,6 @@ static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr, tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); -#else - for(j = 0, term = 0.0; j < 4; j++) - { - double - t = 0.0; - - left = &(tipVector[j][20 * tipX1[i]]); - right = &(x2[80 * i + 20 * j]); - for(l = 0; l < 20; l++) - t += left[l] * right[l] * diagptable[j * 20 + l]; - - term += weights[j] * t; - } -#endif - if(fastScaling) term = LOG(FABS(term)); else @@ -1230,7 +1027,6 @@ static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr, { for (i = 0; i < n; i++) { -#ifdef __SIM_SSE3 __m128d tv = _mm_setzero_pd(); @@ -1258,22 +1054,6 @@ static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr, tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); -#else - for(j = 0, term = 0.0; j < 4; j++) - { - double - t = 0.0; - - left = &(x1[80 * i + 20 * j]); - right = &(x2[80 * i + 20 * j]); - - for(l = 0; l < 20; l++) - t += left[l] * right[l] * diagptable[j * 20 + l]; - - term += weights[j] * t; - } -#endif - if(fastScaling) term = LOG(FABS(term)); else @@ -2073,11 +1853,3 @@ static double evaluateGTRCAT (int *cptr, int *wptr, return sum; } - - - - - -#endif - - diff --git a/examl/evaluatePartialGenericSpecial.c b/examl/evaluatePartialGenericSpecial.c index 8ea1e1f..379a013 100644 --- a/examl/evaluatePartialGenericSpecial.c +++ b/examl/evaluatePartialGenericSpecial.c @@ -40,14 +40,10 @@ #include #include "axml.h" -#ifdef __SIM_SSE3 -#include -#include -#endif - +#define SIMDE_ENABLE_NATIVE_ALIASES +#include "simde/x86/sse3.h" - -#if defined(_OPTIMIZED_FUNCTIONS) && !defined(__MIC_NATIVE) +#if !defined(__MIC_NATIVE) static inline void computeVectorGTRCATPROT(double *lVector, int *eVector, double ki, int i, double qz, double rz, traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, unsigned char **yVector, int mxtips); @@ -85,175 +81,6 @@ static double evaluatePartialGTRCAT_BINARY(int i, double ki, int counter, trave int w, double *EIGN, double *EI, double *EV, double *tipVector, unsigned char **yVector, int branchReference, int mxtips); - - -#else - -static inline void computeVectorCAT_FLEX(double *lVector, int *eVector, double ki, int i, double qz, double rz, - traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, - unsigned char **yVector, int mxtips, const int states) -{ - double - *d1 = (double *)malloc(sizeof(double) * states), - *d2 = (double *)malloc(sizeof(double) * states), - *x1px2 = (double *)malloc(sizeof(double) * states), - ump_x1, - ump_x2, - lz1, - lz2, - *x1, - *x2, - *x3; - - int - scale, - j, - k, - pNumber = ti->pNumber, - rNumber = ti->rNumber, - qNumber = ti->qNumber; - - x3 = &lVector[states * (pNumber - mxtips)]; - - switch(ti->tipCase) - { - case TIP_TIP: - x1 = &(tipVector[states * yVector[qNumber][i]]); - x2 = &(tipVector[states * yVector[rNumber][i]]); - break; - case TIP_INNER: - x1 = &(tipVector[states * yVector[qNumber][i]]); - x2 = &(lVector[states * (rNumber - mxtips)]); - break; - case INNER_INNER: - x1 = 
&(lVector[states * (qNumber - mxtips)]); - x2 = &(lVector[states * (rNumber - mxtips)]); - break; - default: - assert(0); - } - - lz1 = qz * ki; - lz2 = rz * ki; - - d1[0] = x1[0]; - d2[0] = x2[0]; - - - for(j = 1; j < states; j++) - { - d1[j] = x1[j] * EXP(EIGN[j] * lz1); - d2[j] = x2[j] * EXP(EIGN[j] * lz2); - } - - - for(j = 0; j < states; j++) - { - ump_x1 = 0.0; - ump_x2 = 0.0; - - for(k = 0; k < states; k++) - { - ump_x1 += d1[k] * EI[j * states + k]; - ump_x2 += d2[k] * EI[j * states + k]; - } - - x1px2[j] = ump_x1 * ump_x2; - } - - for(j = 0; j < states; j++) - x3[j] = 0.0; - - for(j = 0; j < states; j++) - for(k = 0; k < states; k++) - x3[k] += x1px2[j] * EV[states * j + k]; - - scale = 1; - for(j = 0; scale && (j < states); j++) - scale = ((x3[j] < minlikelihood) && (x3[j] > minusminlikelihood)); - - if(scale) - { - for(j = 0; j < states; j++) - x3[j] *= twotothe256; - *eVector = *eVector + 1; - } - - free(d1); - free(d2); - free(x1px2); - - return; -} - - -static double evaluatePartialCAT_FLEX(int i, double ki, int counter, traversalInfo *ti, double qz, - int w, double *EIGN, double *EI, double *EV, - double *tipVector, unsigned char **yVector, - int branchReference, int mxtips, const int states) -{ - int - scale = 0, - k; - - double - *lVector = (double *)malloc_aligned(sizeof(double) * states * mxtips), - *d = (double *)malloc_aligned(sizeof(double) * states), - lz, - term, - *x1, - *x2; - - traversalInfo - *trav = &ti[0]; - - assert(isTip(trav->pNumber, mxtips)); - - x1 = &(tipVector[states * yVector[trav->pNumber][i]]); - - for(k = 1; k < counter; k++) - { - double - qz = ti[k].qz[branchReference], - rz = ti[k].rz[branchReference]; - - qz = (qz > zmin) ? log(qz) : log(zmin); - rz = (rz > zmin) ? log(rz) : log(zmin); - - computeVectorCAT_FLEX(lVector, &scale, ki, i, qz, rz, &ti[k], - EIGN, EI, EV, - tipVector, yVector, mxtips, states); - } - - x2 = &lVector[states * (trav->qNumber - mxtips)]; - - assert(0 <= (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips); - - if(qz < zmin) - lz = zmin; - lz = log(qz); - lz *= ki; - - d[0] = 1.0; - - for(k = 1; k < states; k++) - d[k] = EXP (EIGN[k] * lz); - - term = 0.0; - - for(k = 0; k < states; k++) - term += x1[k] * x2[k] * d[k]; - - term = LOG(FABS(term)) + (scale * LOG(minlikelihood)); - - term = term * w; - - free(lVector); - free(d); - - return term; -} - #endif double evaluatePartialGeneric (tree *tr, int i, double ki, int _model) @@ -274,25 +101,8 @@ double evaluatePartialGeneric (tree *tr, int i, double ki, int _model) else branchReference = 0; -#ifndef _OPTIMIZED_FUNCTIONS - if(tr->rateHetModel == CAT) - result = evaluatePartialCAT_FLEX(index, ki, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference], - tr->partitionData[_model].wgt[index], - tr->partitionData[_model].EIGN, - tr->partitionData[_model].EI, - tr->partitionData[_model].EV, - tr->partitionData[_model].tipVector, - tr->partitionData[_model].yVector, branchReference, tr->mxtips, states); - else - /* - the per-site site likelihood function should only be called for the CAT model - under the GAMMA model this is required only for estimating per-site protein models - which has however been removed in this version of the code - */ - assert(0); - - -#elif defined(__MIC_NATIVE) + +#if defined(__MIC_NATIVE) if (tr->rateHetModel == CAT) result = evaluatePartialCAT_FLEX(index, ki, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference], tr->partitionData[_model].wgt[index], @@ -362,8 +172,6 @@ else return result; } -#ifdef 
_OPTIMIZED_FUNCTIONS - static inline void computeVectorGTRCAT_BINARY(double *lVector, int *eVector, double ki, int i, double qz, double rz, traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, @@ -1052,7 +860,3 @@ static double evaluatePartialGTRCAT(int i, double ki, int counter, traversalInf return term; } - - - -#endif diff --git a/examl/makenewzGenericSpecial.c b/examl/makenewzGenericSpecial.c index 6d80c67..fa809b9 100644 --- a/examl/makenewzGenericSpecial.c +++ b/examl/makenewzGenericSpecial.c @@ -43,11 +43,8 @@ #include #include "axml.h" -#ifdef __SIM_SSE3 -#include -#include -/*#include */ -#endif +#define SIMDE_ENABLE_NATIVE_ALIASES +#include "simde/x86/sse3.h" /* includes MIC-optimized functions */ @@ -164,167 +161,9 @@ static void getVects(tree *tr, unsigned char **tipX1, unsigned char **tipX2, dou So if we want to do a Newton-Rpahson optimization we only execute this function once in the beginning for each new branch we are considering ! */ -#ifndef _OPTIMIZED_FUNCTIONS - -static void sumCAT_FLEX(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, int n, const int states) -{ - int - i, - l; - - double - *sum, - *left, - *right; - - switch(tipCase) - { - - /* switch over possible configurations of the nodes p and q defining the branch */ - - case TIP_TIP: - for (i = 0; i < n; i++) - { - left = &(tipVector[states * tipX1[i]]); - right = &(tipVector[states * tipX2[i]]); - sum = &sumtable[states * i]; - - /* just multiply the values with each other for each site, note the similarity with evaluate() - we precompute the product which will remain constant and then just multiply this pre-computed - product with the changing P matrix exponentaions that depend on the branch lengths */ - - for(l = 0; l < states; l++) - sum[l] = left[l] * right[l]; - } - break; - case TIP_INNER: - - /* same as for TIP_TIP only that - we now access on tip vector and one - inner vector. - - You may also observe that we do not consider using scaling vectors anywhere here. - - This is because we are interested in the first and second derivatives of the likelihood and - hence the addition of the log() of the scaling factor times the number of scaling events - becomes obsolete through the derivative */ - - for (i = 0; i < n; i++) - { - left = &(tipVector[states * tipX1[i]]); - right = &x2[states * i]; - sum = &sumtable[states * i]; - - for(l = 0; l < states; l++) - sum[l] = left[l] * right[l]; - } - break; - case INNER_INNER: - for (i = 0; i < n; i++) - { - left = &x1[states * i]; - right = &x2[states * i]; - sum = &sumtable[states * i]; - - for(l = 0; l < states; l++) - sum[l] = left[l] * right[l]; - } - break; - default: - assert(0); - } -} - - -/* same thing for GAMMA models. The only noteworthy thing here is that we have an additional inner loop over the - number of discrete gamma rates. 
The data access pattern is also different since for tip vector accesses through our - lookup table, we do not distnguish between rates - - Note the different access pattern in TIP_INNER: - - left = &(tipVector[states * tipX1[i]]); - right = &(x2[span * i + l * states]); - -*/ - -static void sumGAMMA_FLEX(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, int n, const int states) -{ - int - i, - l, - k; - - const int - span = 4 * states; - - double - *left, - *right, - *sum; - - switch(tipCase) - { - case TIP_TIP: - for(i = 0; i < n; i++) - { - left = &(tipVector[states * tipX1[i]]); - right = &(tipVector[states * tipX2[i]]); - - for(l = 0; l < 4; l++) - { - sum = &sumtable[i * span + l * states]; - - for(k = 0; k < states; k++) - sum[k] = left[k] * right[k]; - - } - } - break; - case TIP_INNER: - for(i = 0; i < n; i++) - { - left = &(tipVector[states * tipX1[i]]); - - for(l = 0; l < 4; l++) - { - right = &(x2[span * i + l * states]); - sum = &sumtable[i * span + l * states]; - - for(k = 0; k < states; k++) - sum[k] = left[k] * right[k]; - - } - } - break; - case INNER_INNER: - for(i = 0; i < n; i++) - { - for(l = 0; l < 4; l++) - { - left = &(x1[span * i + l * states]); - right = &(x2[span * i + l * states]); - sum = &(sumtable[i * span + l * states]); - - - for(k = 0; k < states; k++) - sum[k] = left[k] * right[k]; - } - } - break; - default: - assert(0); - } -} - -#endif - /* optimized functions for branch length optimization */ -#ifdef _OPTIMIZED_FUNCTIONS - static void sumGAMMA_BINARY(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector, unsigned char *tipX1, unsigned char *tipX2, int n); static void coreGTRGAMMA_BINARY(const int upper, double *sumtable, @@ -382,244 +221,6 @@ static void coreGTRGAMMAPROT(double *gammaRates, double *EIGN, double *sumtable, static void coreGTRCATPROT(double *EIGN, double lz, int numberOfCategories, double *rptr, int *cptr, int upper, int *wgt, volatile double *ext_dlnLdlz, volatile double *ext_d2lnLdlz2, double *sumtable); -#endif - - -#ifndef _OPTIMIZED_FUNCTIONS - -/* now this is the core function of the newton-Raphson based branch length optimization that actually computes - the first and second derivative of the likelihood given a new proposed branch length lz */ - - -static void coreCAT_FLEX(int upper, int numberOfCategories, double *sum, - volatile double *d1, volatile double *d2, int *wgt, - double *rptr, double *EIGN, int *cptr, double lz, const int states) -{ - int - i, - l; - - double - *d, - - /* arrays to store stuff we can pre-compute */ - - *d_start = (double *)malloc_aligned(numberOfCategories * states * sizeof(double)), - *e =(double *)malloc_aligned(states * sizeof(double)), - *s = (double *)malloc_aligned(states * sizeof(double)), - *dd = (double *)malloc_aligned(states * sizeof(double)), - inv_Li, - dlnLidlz, - d2lnLidlz2, - dlnLdlz = 0.0, - d2lnLdlz2 = 0.0; - - d = d_start; - - e[0] = 0.0; - s[0] = 0.0; - dd[0] = 0.0; - - - /* we are pre-computing values for computing the first and second derivative of P(lz) - since this requires an exponetial that the only thing we really have to derive here */ - - for(l = 1; l < states; l++) - { - s[l] = EIGN[l]; - e[l] = EIGN[l] * EIGN[l]; - dd[l] = s[l] * lz; - } - - /* compute the P matrices and their derivatives for - all per-site rate categories */ - - for(i = 0; i < numberOfCategories; i++) - { - d[states * i] = 1.0; - for(l = 1; l < states; l++) - d[states * i + l] = EXP(dd[l] * rptr[i]); - } 
- - - /* now loop over the sites in this partition to obtain the per-site 1st and 2nd derivatives */ - - for (i = 0; i < upper; i++) - { - double - r = rptr[cptr[i]], - wr1 = r * wgt[i], - wr2 = r * r * wgt[i]; - - /* get the correct p matrix for the rate at the current site i */ - - d = &d_start[states * cptr[i]]; - - /* this is the likelihood at site i, NOT the log likelihood, we don't need the log - likelihood to compute derivatives ! */ - - inv_Li = sum[states * i]; - - /* those are for storing the first and second derivative of the Likelihood at site i */ - - dlnLidlz = 0.0; - d2lnLidlz2 = 0.0; - - /* now multiply the likelihood and the first and second derivative with the - appropriate derivatives of P(lz) */ - - for(l = 1; l < states; l++) - { - double - tmpv = d[l] * sum[states * i + l]; - - inv_Li += tmpv; - dlnLidlz += tmpv * s[l]; - d2lnLidlz2 += tmpv * e[l]; - } - - /* below we are implementing the other mathematical operations that are required - to obtain the deirivatives */ - - inv_Li = 1.0/ FABS(inv_Li); - - dlnLidlz *= inv_Li; - d2lnLidlz2 *= inv_Li; - - /* under the CAT model, wrptr[] and wr2ptr[] are pre-computed extension sof the weight pointer: - wrptr[i] = wgt[i] * rptr[cptr[i]]. - and - wr2ptr[i] = wgt[i] * rptr[cptr[i]] * rptr[cptr[i]] - - this is also something that is required for the derivatives because when computing the - derivative of the exponential() the rate must be multiplied with the - exponential - - wgt is just the pattern site wieght - */ - - /* compute the accumulated first and second derivatives of this site */ - - dlnLdlz += wr1 * dlnLidlz; - d2lnLdlz2 += wr2 * (d2lnLidlz2 - dlnLidlz * dlnLidlz); - } - - /* - set the result values, i.e., the sum of the per-site first and second derivatives of the likelihood function - for this partition. 
- */ - - *d1 = dlnLdlz; - *d2 = d2lnLdlz2; - - /* free the temporary arrays */ - - free(d_start); - free(e); - free(s); - free(dd); -} - -static void coreGAMMA_FLEX(int upper, double *sumtable, volatile double *ext_dlnLdlz, volatile double *ext_d2lnLdlz2, - double *EIGN, double *gammaRates, double lz, int *wgt, const int states) -{ - double - *sum, - diagptable[1024], /* TODO make this dynamic */ - dlnLdlz = 0.0, - d2lnLdlz2 = 0.0, - ki, - kisqr, - tmp, - inv_Li, - dlnLidlz, - d2lnLidlz2; - - int - i, - j, - l; - - const int - gammaStates = 4 * states; - - /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */ - - for(i = 0; i < 4; i++) - { - ki = gammaRates[i]; - kisqr = ki * ki; - - for(l = 1; l < states; l++) - { - diagptable[i * gammaStates + l * 4] = EXP(EIGN[l] * ki * lz); - diagptable[i * gammaStates + l * 4 + 1] = EIGN[l] * ki; - diagptable[i * gammaStates + l * 4 + 2] = EIGN[l] * EIGN[l] * kisqr; - } - } - - /* loop over sites in this partition */ - - for (i = 0; i < upper; i++) - { - double - r = rptr[cptr[i]], - wr1 = r * wgt[i], - wr2 = r * r * wgt[i]; - - /* access the array with pre-computed values */ - sum = &sumtable[i * gammaStates]; - - /* initial per-site likelihood and 1st and 2nd derivatives */ - - inv_Li = 0.0; - dlnLidlz = 0.0; - d2lnLidlz2 = 0.0; - - /* loop over discrete GAMMA rates */ - - for(j = 0; j < 4; j++) - { - inv_Li += sum[j * states]; - - for(l = 1; l < states; l++) - { - inv_Li += (tmp = diagptable[j * gammaStates + l * 4] * sum[j * states + l]); - dlnLidlz += tmp * diagptable[j * gammaStates + l * 4 + 1]; - d2lnLidlz2 += tmp * diagptable[j * gammaStates + l * 4 + 2]; - } - } - - /* finalize derivative computation */ - /* note that wrptr[] here unlike in CAT above is the - integer weight vector of the current site - - The operations: - - EIGN[l] * ki; - EIGN[l] * EIGN[l] * kisqr; - - that are hidden in CAT in wrptr (at least the * ki and * ki *ki part of them - are done explicitely here - - */ - - inv_Li = 1.0 / FABS(inv_Li); - - dlnLidlz *= inv_Li; - d2lnLidlz2 *= inv_Li; - - dlnLdlz += wr1 * dlnLidlz; - d2lnLdlz2 += wr2 * (d2lnLidlz2 - dlnLidlz * dlnLidlz); - } - - *ext_dlnLdlz = dlnLdlz; - *ext_d2lnLdlz2 = d2lnLdlz2; - -} - -#endif - /* the function below is called only once at the very beginning of each Newton-Raphson procedure for optimizing barnch lengths. 
It initially invokes an iterative newview call to get a consistent pair of vectors at the left and the right end of the branch and thereafter invokes the one-time only precomputation of values (sumtable) that can be re-used in each Newton-Raphson @@ -721,15 +322,6 @@ void makenewzIterative(tree *tr) double *sumBuffer = tr->partitionData[model].sumBuffer + x_offset; -#ifndef _OPTIMIZED_FUNCTIONS - assert(!tr->saveMemory); - if(tr->rateHetModel == CAT) - sumCAT_FLEX(tipCase, sumBuffer, x1_start, x2_start, tr->partitionData[model].tipVector, tipX1, tipX2, - width, states); - else - sumGAMMA_FLEX(tipCase, sumBuffer, x1_start, x2_start, tr->partitionData[model].tipVector, tipX1, tipX2, - width, states); -#else switch(states) { case 2: @@ -834,7 +426,6 @@ void makenewzIterative(tree *tr) default: assert(0); } -#endif } } // for model } // omp parallel region @@ -977,19 +568,6 @@ void execCore(tree *tr, volatile double *_dlnLdlz, volatile double *_d2lnLdlz2) dlnLdlz = 0.0, d2lnLdlz2 = 0.0; - #ifndef _OPTIMIZED_FUNCTIONS - - /* compute first and second derivatives with the slow generic functions */ - - if(tr->rateHetModel == CAT) - coreCAT_FLEX(width, tr->partitionData[model].numberOfCategories, sumBuffer, - &dlnLdlz, &d2lnLdlz2, wgt, - tr->partitionData[model].perSiteRates, tr->partitionData[model].EIGN, rateCategory, lz, states); - else - coreGAMMA_FLEX(width, sumBuffer, - &dlnLdlz, &d2lnLdlz2, tr->partitionData[model].EIGN, tr->partitionData[model].gammaRates, lz, - wgt, states); - #else switch(states) { case 2: @@ -1068,7 +646,6 @@ void execCore(tree *tr, volatile double *_dlnLdlz, volatile double *_d2lnLdlz2) default: assert(0); } - #endif /* store first and second derivative */ @@ -1402,8 +979,6 @@ void makenewzGeneric(tree *tr, nodeptr p, nodeptr q, double *z0, int maxiter, do /* below are, once again the optimized functions */ -#ifdef _OPTIMIZED_FUNCTIONS - /**** binary ***/ static void coreGTRCAT_BINARY(int upper, int numberOfCategories, double *sum, volatile double *d1, volatile double *d2, @@ -2013,17 +1588,12 @@ static void sumGAMMAPROT_LG4(int tipCase, double *sumtable, double *x1, double * right = &(tipVector[l][20 * tipX2[i]]); sum = &sumtable[i * 80 + l * 20]; -#ifdef __SIM_SSE3 for(k = 0; k < 20; k+=2) { __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k])); _mm_store_pd(&sum[k], sumv); } -#else - for(k = 0; k < 20; k++) - sum[k] = left[k] * right[k]; -#endif } } break; @@ -2037,17 +1607,12 @@ static void sumGAMMAPROT_LG4(int tipCase, double *sumtable, double *x1, double * left = &(tipVector[l][20 * tipX1[i]]); right = &(x2[80 * i + l * 20]); sum = &sumtable[i * 80 + l * 20]; -#ifdef __SIM_SSE3 for(k = 0; k < 20; k+=2) { __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k])); _mm_store_pd(&sum[k], sumv); } -#else - for(k = 0; k < 20; k++) - sum[k] = left[k] * right[k]; -#endif } } break; @@ -2060,17 +1625,12 @@ static void sumGAMMAPROT_LG4(int tipCase, double *sumtable, double *x1, double * right = &(x2[80 * i + l * 20]); sum = &(sumtable[i * 80 + l * 20]); -#ifdef __SIM_SSE3 for(k = 0; k < 20; k+=2) { __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k])); _mm_store_pd(&sum[k], sumv); } -#else - for(k = 0; k < 20; k++) - sum[k] = left[k] * right[k]; -#endif } } break; @@ -2738,10 +2298,3 @@ static void coreGTRCATPROT(double *EIGN, double lz, int numberOfCategories, doub free(d_start); } - - - -#endif - - - diff --git a/examl/mic_native_aa.c b/examl/mic_native_aa.c index 0b2d13d..b95a1b9 100644 --- a/examl/mic_native_aa.c +++ 
b/examl/mic_native_aa.c @@ -1,4 +1,5 @@ -#include +#define SIMDE_ENABLE_NATIVE_ALIASES +#include "simde/x86/avx2.h" #include #include diff --git a/examl/newviewGenericSpecial.c b/examl/newviewGenericSpecial.c index b8e6daf..a13e452 100644 --- a/examl/newviewGenericSpecial.c +++ b/examl/newviewGenericSpecial.c @@ -41,11 +41,10 @@ #include #include "axml.h" -#ifdef __SIM_SSE3 #include -#include -#include +#define SIMDE_ENABLE_NATIVE_ALIASES +#include "simde/x86/avx2.h" /* required to compute the absoliute values of double precision numbers with SSE3 */ @@ -57,7 +56,6 @@ const union __attribute__ ((aligned (BYTE_ALIGNMENT))) -#endif /* includes MIC-optimized functions */ @@ -212,476 +210,6 @@ static void makeP_FlexLG4(double z1, double z2, double *rptr, double *EI[4], do conditional likelihood arrays at p, given child nodes q and r. Once again we need two generic function implementations, one for CAT and one for GAMMA */ -#ifndef _OPTIMIZED_FUNCTIONS - -static void newviewCAT_FLEX(int tipCase, double *extEV, - int *cptr, - double *x1, double *x2, double *x3, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, const int states) -{ - double - *le, - *ri, - *v, - *vl, - *vr, - ump_x1, - ump_x2, - x1px2; - - int - i, - l, - j, - scale, - addScale = 0; - - const int - statesSquare = states * states; - - - /* here we switch over the different cases for efficiency, but also because - each case accesses different data types. - - We consider three cases: either q and r are both tips, q or r are tips, and q and r are inner - nodes. - */ - - - switch(tipCase) - { - - /* both child nodes of p weher we want to update the conditional likelihood are tips */ - case TIP_TIP: - /* loop over sites */ - for (i = 0; i < n; i++) - { - /* set a pointer to the P-Matrices for the rate category of this site */ - le = &left[cptr[i] * statesSquare]; - ri = &right[cptr[i] * statesSquare]; - - /* pointers to the likelihood entries of the tips q (vl) and r (vr) - We will do reading accesses to these values only. - */ - vl = &(tipVector[states * tipX1[i]]); - vr = &(tipVector[states * tipX2[i]]); - - /* address of the conditional likelihood array entres at site i. This is - a writing access to v */ - v = &x3[states * i]; - - /* initialize v */ - for(l = 0; l < states; l++) - v[l] = 0.0; - - /* loop over states to compute the cond likelihoods at p (v) */ - - for(l = 0; l < states; l++) - { - ump_x1 = 0.0; - ump_x2 = 0.0; - - /* le and ri are the P-matrices */ - - for(j = 0; j < states; j++) - { - ump_x1 += vl[j] * le[l * states + j]; - ump_x2 += vr[j] * ri[l * states + j]; - } - - x1px2 = ump_x1 * ump_x2; - - /* multiply with matrix of eigenvectors extEV */ - - for(j = 0; j < states; j++) - v[j] += x1px2 * extEV[l * states + j]; - } - } - break; - case TIP_INNER: - - /* same as above, only that now vl is a tip and vr is the conditional probability vector - at an inner node. Note that, if we have the case that either q or r is a tip, the - nodes will be flipped to ensure that tipX1 always points to the sequence at the tip. 
- */ - - for (i = 0; i < n; i++) - { - le = &left[cptr[i] * statesSquare]; - ri = &right[cptr[i] * statesSquare]; - - /* access tip vector lookup table */ - vl = &(tipVector[states * tipX1[i]]); - - /* access conditional likelihoo arrays */ - /* again, vl and vr are reading accesses, while v is a writing access */ - vr = &x2[states * i]; - v = &x3[states * i]; - - /* same as in the loop above */ - - for(l = 0; l < states; l++) - v[l] = 0.0; - - for(l = 0; l < states; l++) - { - ump_x1 = 0.0; - ump_x2 = 0.0; - - for(j = 0; j < states; j++) - { - ump_x1 += vl[j] * le[l * states + j]; - ump_x2 += vr[j] * ri[l * states + j]; - } - - x1px2 = ump_x1 * ump_x2; - - for(j = 0; j < states; j++) - v[j] += x1px2 * extEV[l * states + j]; - } - - /* now let's check for numerical scaling. - The maths in RAxML are a bit non-standard to avoid/economize on arithmetic operations - at the virtual root and for branch length optimization and hence values stored - in the conditional likelihood vectors can become negative. - Below we check if all absolute values stored at position i of v are smaller - than a pre-defined value in axml.h. If they are all smaller we can then safely - multiply them by a large, constant number twotothe256 (without numerical overflow) - that is also speced in axml.h */ - - scale = 1; - for(l = 0; scale && (l < states); l++) - scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); - - if(scale) - { - for(l = 0; l < states; l++) - v[l] *= twotothe256; - - /* if we have scaled the entries to prevent underflow, we need to keep track of how many scaling - multiplications we did per node such as to undo them at the virtual root, e.g., in - evaluateGeneric() - Note here, that, if we scaled the site we need to increment the scaling counter by the wieght, i.e., - the number of sites this potentially compressed pattern represents ! */ - - addScale += wgt[i]; - } - } - break; - case INNER_INNER: - - /* same as above, only that the two child nodes q and r are now inner nodes */ - - for(i = 0; i < n; i++) - { - le = &left[cptr[i] * statesSquare]; - ri = &right[cptr[i] * statesSquare]; - - /* index conditional likelihood vectors of inner nodes */ - - vl = &x1[states * i]; - vr = &x2[states * i]; - v = &x3[states * i]; - - for(l = 0; l < states; l++) - v[l] = 0.0; - - for(l = 0; l < states; l++) - { - ump_x1 = 0.0; - ump_x2 = 0.0; - - for(j = 0; j < states; j++) - { - ump_x1 += vl[j] * le[l * states + j]; - ump_x2 += vr[j] * ri[l * states + j]; - } - - x1px2 = ump_x1 * ump_x2; - - for(j = 0; j < states; j++) - v[j] += x1px2 * extEV[l * states + j]; - } - - scale = 1; - for(l = 0; scale && (l < states); l++) - scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); - - if(scale) - { - for(l = 0; l < states; l++) - v[l] *= twotothe256; - - addScale += wgt[i]; - } - } - break; - default: - assert(0); - } - - /* increment the scaling counter by the additional scalings done at node p */ - - *scalerIncrement = addScale; -} - - -static void newviewGAMMA_FLEX(int tipCase, - double *x1, double *x2, double *x3, double *extEV, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, const int states, const int maxStateValue) -{ - double - *uX1, - *uX2, - *v, - x1px2, - *vl, - *vr, - al, - ar; - - int - i, - j, - l, - k, - scale, - addScale = 0; - - const int - statesSquare = states * states, - span = states * 4, - /* this is required for doing some pre-computations that help to save - numerical operations. 
What we are actually computing here are additional lookup tables - for each possible state a certain data-type can assume. - for DNA with ambuguity coding this is 15, for proteins this is 22 or 23, since there - also exist one or two amibguity codes for protein data. - Essentially this is very similar to the tip vectors which we also use as lookup tables */ - precomputeLength = maxStateValue * span; - - switch(tipCase) - { - case TIP_TIP: - { - /* allocate pre-compute memory space */ - - double - *umpX1 = (double*)malloc(sizeof(double) * precomputeLength), - *umpX2 = (double*)malloc(sizeof(double) * precomputeLength); - - /* multiply all possible tip state vectors with the respective P-matrices - */ - - for(i = 0; i < maxStateValue; i++) - { - v = &(tipVector[states * i]); - - for(k = 0; k < span; k++) - { - - umpX1[span * i + k] = 0.0; - umpX2[span * i + k] = 0.0; - - for(l = 0; l < states; l++) - { - umpX1[span * i + k] += v[l] * left[k * states + l]; - umpX2[span * i + k] += v[l] * right[k * states + l]; - } - - } - } - - for(i = 0; i < n; i++) - { - /* access the precomputed arrays (pre-computed multiplication of conditional with the tip state) - */ - - uX1 = &umpX1[span * tipX1[i]]; - uX2 = &umpX2[span * tipX2[i]]; - - /* loop over discrete GAMMA rates */ - - for(j = 0; j < 4; j++) - { - /* the rest is the same as for CAT */ - v = &x3[i * span + j * states]; - - for(k = 0; k < states; k++) - v[k] = 0.0; - - for(k = 0; k < states; k++) - { - x1px2 = uX1[j * states + k] * uX2[j * states + k]; - - for(l = 0; l < states; l++) - v[l] += x1px2 * extEV[states * k + l]; - } - - } - } - - /* free precomputed vectors */ - - free(umpX1); - free(umpX2); - } - break; - case TIP_INNER: - { - /* we do analogous pre-computations as above, with the only difference that we now do them - only for one tip vector */ - - double - *umpX1 = (double*)malloc(sizeof(double) * precomputeLength), - *ump_x2 = (double*)malloc(sizeof(double) * states); - - /* precompute P and left tip vector product */ - - for(i = 0; i < maxStateValue; i++) - { - v = &(tipVector[states * i]); - - for(k = 0; k < span; k++) - { - - umpX1[span * i + k] = 0.0; - - for(l = 0; l < states; l++) - umpX1[span * i + k] += v[l] * left[k * states + l]; - - - } - } - - for (i = 0; i < n; i++) - { - /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */ - - uX1 = &umpX1[span * tipX1[i]]; - - /* loop over discrete GAMMA rates */ - - for(k = 0; k < 4; k++) - { - v = &(x2[span * i + k * states]); - - for(l = 0; l < states; l++) - { - ump_x2[l] = 0.0; - - for(j = 0; j < states; j++) - ump_x2[l] += v[j] * right[k * statesSquare + l * states + j]; - } - - v = &(x3[span * i + states * k]); - - for(l = 0; l < states; l++) - v[l] = 0; - - for(l = 0; l < states; l++) - { - x1px2 = uX1[k * states + l] * ump_x2[l]; - for(j = 0; j < states; j++) - v[j] += x1px2 * extEV[l * states + j]; - } - } - - /* also do numerical scaling as above. Note that here we need to scale - 4 * 4 values for DNA or 4 * 20 values for protein data. - If they are ALL smaller than our threshold, we scale. Note that, - this can cause numerical problems with GAMMA, if the values generated - by the four discrete GAMMA rates are too different. - - For details, see: - - F. Izquierdo-Carrasco, S.A. Smith, A. 
Stamatakis: "Algorithms, Data Structures, and Numerics for Likelihood-based Phylogenetic Inference of Huge Trees" - - */ - - - v = &x3[span * i]; - scale = 1; - for(l = 0; scale && (l < span); l++) - scale = (ABS(v[l]) < minlikelihood); - - - if (scale) - { - for(l = 0; l < span; l++) - v[l] *= twotothe256; - - addScale += wgt[i]; - } - } - - free(umpX1); - free(ump_x2); - } - break; - case INNER_INNER: - - /* same as above, without pre-computations */ - - for (i = 0; i < n; i++) - { - for(k = 0; k < 4; k++) - { - vl = &(x1[span * i + states * k]); - vr = &(x2[span * i + states * k]); - v = &(x3[span * i + states * k]); - - - for(l = 0; l < states; l++) - v[l] = 0; - - - for(l = 0; l < states; l++) - { - - al = 0.0; - ar = 0.0; - - for(j = 0; j < states; j++) - { - al += vl[j] * left[k * statesSquare + l * states + j]; - ar += vr[j] * right[k * statesSquare + l * states + j]; - } - - x1px2 = al * ar; - - for(j = 0; j < states; j++) - v[j] += x1px2 * extEV[states * l + j]; - - } - } - - v = &(x3[span * i]); - scale = 1; - for(l = 0; scale && (l < span); l++) - scale = ((ABS(v[l]) < minlikelihood)); - - if(scale) - { - for(l = 0; l < span; l++) - v[l] *= twotothe256; - - addScale += wgt[i]; - } - } - break; - default: - assert(0); - } - - /* as above, increment the global counter that counts scaling multiplications by the scaling multiplications - carried out for computing the likelihood array at node p */ - - *scalerIncrement = addScale; -} - -#endif @@ -817,71 +345,6 @@ void computeTraversalInfo(nodeptr p, traversalInfo *ti, int *counter, int maxTip file. */ -#if (defined(_OPTIMIZED_FUNCTIONS) && !defined(__AVX)) - -static void newviewGTRGAMMAPROT_LG4(int tipCase, - double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4], - int *ex3, unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling); - -static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase, - double *x1_start, double *x2_start, double *x3_start, - double *EV, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - const int n, double *left, double *right, int *wgt, int *scalerIncrement, - unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, - double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn); - -static void newviewGTRGAMMA(int tipCase, - double *x1_start, double *x2_start, double *x3_start, - double *EV, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - const int n, double *left, double *right, int *wgt, int *scalerIncrement - ); - -static void newviewGTRCAT( int tipCase, double *EV, int *cptr, - double *x1_start, double *x2_start, double *x3_start, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement); - - -static void newviewGTRCAT_SAVE( int tipCase, double *EV, int *cptr, - double *x1_start, double *x2_start, double *x3_start, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, - unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, - double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats); - -static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase, - double *x1, double *x2, double *x3, double *extEV, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, - unsigned int *x1_gap, unsigned int 
*x2_gap, unsigned int *x3_gap, - double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn - ); - -static void newviewGTRGAMMAPROT(int tipCase, - double *x1, double *x2, double *x3, double *extEV, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement); -static void newviewGTRCATPROT(int tipCase, double *extEV, - int *cptr, - double *x1, double *x2, double *x3, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement ); - -static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV, - int *cptr, - double *x1, double *x2, double *x3, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, - unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, - double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats); - -#endif - -#ifdef _OPTIMIZED_FUNCTIONS - static void newviewGTRCAT_BINARY( int tipCase, double *EV, int *cptr, double *x1_start, double *x2_start, double *x3_start, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, @@ -894,8 +357,6 @@ static void newviewGTRGAMMA_BINARY(int tipCase, const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling ); -#endif - boolean isGap(unsigned int *x, int pos) { return (x[pos / 32] & mask32[pos % 32]); @@ -1260,26 +721,6 @@ void newviewIterative (tree *tr, int startIndex) assert(0); } -#ifndef _OPTIMIZED_FUNCTIONS - - /* memory saving not implemented */ - - assert(!tr->saveMemory); - - /* figure out if we need to compute the CAT or GAMMA model of rate heterogeneity */ - - if(tr->rateHetModel == CAT) - newviewCAT_FLEX(tInfo->tipCase, tr->partitionData[model].EV, rateCategory, - x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, - tipX1, tipX2, - width, left, right, wgt, &scalerIncrement, states); - else - newviewGAMMA_FLEX(tInfo->tipCase, - x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, - tipX1, tipX2, - width, left, right, wgt, &scalerIncrement, states, getUndetermined(tr->partitionData[model].dataType) + 1); - -#else /* dedicated highly optimized functions. 
Analogously to the functions in evaluateGeneric() we also siwtch over the state number */ @@ -1309,32 +750,21 @@ void newviewIterative (tree *tr, int startIndex) if(tr->saveMemory) #ifdef __MIC_NATIVE assert(0 && "Neither CAT model of rate heterogeneity nor memory saving are implemented on Intel MIC"); -#elif __AVX +#else newviewGTRCAT_AVX_GAPPED_SAVE(tInfo->tipCase, tr->partitionData[model].EV, rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, (int*)NULL, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, TRUE, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories); -#else - newviewGTRCAT_SAVE(tInfo->tipCase, tr->partitionData[model].EV, rateCategory, - x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, - tipX1, tipX2, - width, left, right, wgt, &scalerIncrement, x1_gap, x2_gap, x3_gap, - x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories); #endif else #ifdef __MIC_NATIVE assert(0 && "CAT model of rate heterogeneity is not implemented on Intel MIC"); -#elif __AVX +#else newviewGTRCAT_AVX(tInfo->tipCase, tr->partitionData[model].EV, rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, tipX1, tipX2, width, left, right, wgt, &scalerIncrement); -#else - newviewGTRCAT(tInfo->tipCase, tr->partitionData[model].EV, rateCategory, - x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, - tipX1, tipX2, - width, left, right, wgt, &scalerIncrement); #endif } else @@ -1344,20 +774,13 @@ void newviewIterative (tree *tr, int startIndex) if(tr->saveMemory) #ifdef __MIC_NATIVE assert(0 && "Memory saving is not implemented on Intel MIC"); -#elif __AVX +#else newviewGTRGAMMA_AVX_GAPPED_SAVE(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, (int*)NULL, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, TRUE, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn); -#else - newviewGTRGAMMA_GAPPED_SAVE(tInfo->tipCase, - x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, - tipX1, tipX2, - width, left, right, wgt, &scalerIncrement, - x1_gap, x2_gap, x3_gap, - x1_gapColumn, x2_gapColumn, x3_gapColumn); #endif else #ifdef __MIC_NATIVE @@ -1366,16 +789,11 @@ void newviewIterative (tree *tr, int startIndex) tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->partitionData[model].mic_umpLeft, tr->partitionData[model].mic_umpRight); -#elif __AVX +#else newviewGTRGAMMA_AVX(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, tipX1, tipX2, width, left, right, wgt, &scalerIncrement); -#else - newviewGTRGAMMA(tInfo->tipCase, - x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, - tipX1, tipX2, - width, left, right, wgt, &scalerIncrement); #endif } @@ -1388,30 +806,21 @@ void newviewIterative (tree *tr, int startIndex) { #ifdef __MIC_NATIVE assert(0 && "Neither CAT model of rate heterogeneity nor memory saving are implemented on Intel MIC"); -#elif __AVX +#else newviewGTRCATPROT_AVX_GAPPED_SAVE(tInfo->tipCase, tr->partitionData[model].EV, rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, (int*)NULL, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, TRUE, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories); -#else - newviewGTRCATPROT_SAVE(tInfo->tipCase, tr->partitionData[model].EV, rateCategory, - 
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, - tipX1, tipX2, width, left, right, wgt, &scalerIncrement, x1_gap, x2_gap, x3_gap, - x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories); #endif } else { #ifdef __MIC_NATIVE assert(0 && "CAT model of rate heterogeneity is not implemented on Intel MIC"); -#elif __AVX +#else newviewGTRCATPROT_AVX(tInfo->tipCase, tr->partitionData[model].EV, rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, tipX1, tipX2, width, left, right, wgt, &scalerIncrement); -#else - newviewGTRCATPROT(tInfo->tipCase, tr->partitionData[model].EV, rateCategory, - x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, - tipX1, tipX2, width, left, right, wgt, &scalerIncrement); #endif } } @@ -1421,7 +830,7 @@ void newviewIterative (tree *tr, int startIndex) { #ifdef __MIC_NATIVE assert(0 && "Memory saving is not implemented on Intel MIC"); -#elif __AVX +#else newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, @@ -1430,15 +839,6 @@ void newviewIterative (tree *tr, int startIndex) width, left, right, wgt, &scalerIncrement, TRUE, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn); -#else - newviewGTRGAMMAPROT_GAPPED_SAVE(tInfo->tipCase, - x1_start, x2_start, x3_start, - tr->partitionData[model].EV, - tr->partitionData[model].tipVector, - tipX1, tipX2, - width, left, right, wgt, &scalerIncrement, - x1_gap, x2_gap, x3_gap, - x1_gapColumn, x2_gapColumn, x3_gapColumn); #endif } else @@ -1451,21 +851,13 @@ void newviewIterative (tree *tr, int startIndex) tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->partitionData[model].mic_umpLeft, tr->partitionData[model].mic_umpRight); -#elif __AVX +#else newviewGTRGAMMAPROT_AVX_LG4(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV_LG4, tr->partitionData[model].tipVector_LG4, (int*)NULL, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, TRUE); -#else - newviewGTRGAMMAPROT_LG4(tInfo->tipCase, - x1_start, x2_start, x3_start, - tr->partitionData[model].EV_LG4, - tr->partitionData[model].tipVector_LG4, - (int*)NULL, tipX1, tipX2, - width, left, right, - wgt, &scalerIncrement, TRUE); #endif } else @@ -1476,16 +868,11 @@ void newviewIterative (tree *tr, int startIndex) tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->partitionData[model].mic_umpLeft, tr->partitionData[model].mic_umpRight); -#elif __AVX +#else newviewGTRGAMMAPROT_AVX(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, tipX1, tipX2, width, left, right, wgt, &scalerIncrement); -#else - newviewGTRGAMMAPROT(tInfo->tipCase, - x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, - tipX1, tipX2, - width, left, right, wgt, &scalerIncrement); #endif } } @@ -1494,7 +881,6 @@ void newviewIterative (tree *tr, int startIndex) default: assert(0); } -#endif /* important step, here we essentiallt recursively compute the number of scaling multiplications at node p: it's the sum of the number of scaling multiplications already conducted @@ -1595,4302 +981,32 @@ void newviewGeneric (tree *tr, nodeptr p, boolean masked) /* optimized function implementations */ -#if (defined(_OPTIMIZED_FUNCTIONS) && !defined(__AVX)) -static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase, - double *x1_start, double *x2_start, double *x3_start, - double *EV, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - const int 
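The run of hunks above replaces each "#elif __AVX" guard with a plain "#else", so outside of __MIC_NATIVE builds the *_AVX kernels become the only code path, and the plain-SSE fallbacks they used to guard (newviewGTRCAT, newviewGTRGAMMA, the *_GAPPED_SAVE and protein variants, plus the #ifndef _OPTIMIZED_FUNCTIONS generic path) are deleted outright in the large hunk that follows. That is only safe if the _mm_* intrinsics those kernels call are available on every target. Below is a minimal sketch of the SIMDe ("SIMD Everywhere") include pattern this relies on; the header paths, the native-alias macro, and the helper function are illustrative assumptions, not lines quoted from this patch:

    #define SIMDE_ENABLE_NATIVE_ALIASES      /* expose the _mm_* names and __m128d   */
    #include "simde/x86/sse3.h"              /* _mm_load_pd, _mm_mul_pd, _mm_hadd_pd */
    #include "simde/x86/avx.h"               /* _mm256_* used by the *_AVX kernels   */

    /* On x86 these headers resolve to the native instructions; elsewhere the
       operations are emulated (NEON, VSX, or scalar code), so the surviving
       vectorized code path compiles and runs on any architecture. Example:   */
    static double hadd2(const double *p)     /* p must be 16-byte aligned            */
    {
      __m128d v = _mm_load_pd(p);
      v = _mm_hadd_pd(v, v);                 /* both lanes now hold p[0] + p[1]      */
      double r;
      _mm_storel_pd(&r, v);
      return r;
    }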
n, double *left, double *right, int *wgt, int *scalerIncrement, - unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, - double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn) +/*** BINARY DATA functions *****/ + +static void newviewGTRCAT_BINARY( int tipCase, double *EV, int *cptr, + double *x1_start, double *x2_start, double *x3_start, double *tipVector, + int *ex3, unsigned char *tipX1, unsigned char *tipX2, + int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { - int - i, - j, - k, - l, - addScale = 0, - scaleGap = 0; - double - *x1, - *x2, - *x3, - *x1_ptr = x1_start, - *x2_ptr = x2_start, - max, - maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))), - EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT))); - - __m128d - values[8], - EVV[8]; - - for(k = 0; k < 4; k++) - for (l=0; l < 4; l++) - EV_t[4 * l + k] = EV[4 * k + l]; - - for(k = 0; k < 8; k++) - EVV[k] = _mm_load_pd(&EV_t[k * 2]); - - + *le, + *ri, + *x1, *x2, *x3; + int i, l, scale, addScale = 0; switch(tipCase) { case TIP_TIP: { - double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))), *uX2, umpX2[256] __attribute__ ((aligned (BYTE_ALIGNMENT))); - + for(i = 0; i < n; i++) + { + x1 = &(tipVector[2 * tipX1[i]]); + x2 = &(tipVector[2 * tipX2[i]]); + x3 = &x3_start[2 * i]; - for (i = 1; i < 16; i++) - { - __m128d x1_1 = _mm_load_pd(&(tipVector[i*4])); - __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2])); - - for (j = 0; j < 4; j++) - for (k = 0; k < 4; k++) - { - __m128d left1 = _mm_load_pd(&left[j*16 + k*4]); - __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]); - - __m128d acc = _mm_setzero_pd(); - - acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); - acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); - - acc = _mm_hadd_pd(acc, acc); - _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc); - } - - for (j = 0; j < 4; j++) - for (k = 0; k < 4; k++) - { - __m128d left1 = _mm_load_pd(&right[j*16 + k*4]); - __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]); - - __m128d acc = _mm_setzero_pd(); - - acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); - acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); - - acc = _mm_hadd_pd(acc, acc); - _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc); - - } - } - - uX1 = &umpX1[240]; - uX2 = &umpX2[240]; - - for (j = 0; j < 4; j++) - { - __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] ); - __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] ); - - __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] ); - __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] ); - - __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse ); - __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse ); - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = 
_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - _mm_store_pd( &x3_gapColumn[j * 4 + 0], EV_t_l0_k0 ); - _mm_store_pd( &x3_gapColumn[j * 4 + 2], EV_t_l2_k0 ); - } - - - x3 = x3_start; - - for (i = 0; i < n; i++) - { - if(!(x3_gap[i / 32] & mask32[i % 32])) - { - uX1 = &umpX1[16 * tipX1[i]]; - uX2 = &umpX2[16 * tipX2[i]]; - - for (j = 0; j < 4; j++) - { - __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] ); - __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] ); - - - __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] ); - __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] ); - - - // - // multiply left * right - // - - __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse ); - __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse ); - - - // - // multiply with EV matrix (!?) - // - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 ); - _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 ); - } - - x3 += 16; - } - } - } - break; - case TIP_INNER: - { - double - *uX1, - umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))); - - for (i = 1; i < 16; i++) - { - __m128d x1_1 = _mm_load_pd(&(tipVector[i*4])); - __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2])); - - for (j = 0; j < 4; j++) - for (k = 0; k < 4; k++) - { - __m128d left1 = _mm_load_pd(&left[j*16 + k*4]); - __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]); - - __m128d acc = _mm_setzero_pd(); - - acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); - acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); - - acc = _mm_hadd_pd(acc, acc); - _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc); - } - } - - { - __m128d maxv =_mm_setzero_pd(); - - scaleGap = 0; - - x2 = x2_gapColumn; - x3 = x3_gapColumn; - - uX1 = &umpX1[240]; - - for (j = 0; j < 4; j++) - { - double *x2_p = &x2[j*4]; - double *right_k0_p = &right[j*16]; - double *right_k1_p = &right[j*16 + 1*4]; - double *right_k2_p = &right[j*16 + 2*4]; - double *right_k3_p = &right[j*16 + 3*4]; - __m128d x2_0 = _mm_load_pd( &x2_p[0] ); - __m128d x2_2 = _mm_load_pd( &x2_p[2] ); - - __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] ); - __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); - __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); - __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); - __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); - __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); - __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); - __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = 
_mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] ); - __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] ); - - __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 ); - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - values[j * 2] = EV_t_l0_k0; - values[j * 2 + 1] = EV_t_l2_k0; - - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m)); - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m)); - } - - - _mm_store_pd(maxima, maxv); - - max = MAX(maxima[0], maxima[1]); - - if(max < minlikelihood) - { - scaleGap = 1; - - __m128d sv = _mm_set1_pd(twotothe256); - - _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv)); - _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv)); - _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv)); - _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv)); - _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv)); - _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv)); - _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv)); - _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv)); - } - else - { - _mm_store_pd(&x3[0], values[0]); - _mm_store_pd(&x3[2], values[1]); - _mm_store_pd(&x3[4], values[2]); - _mm_store_pd(&x3[6], values[3]); - _mm_store_pd(&x3[8], values[4]); - _mm_store_pd(&x3[10], values[5]); - _mm_store_pd(&x3[12], values[6]); - _mm_store_pd(&x3[14], values[7]); - } - } - - x3 = x3_start; - - for (i = 0; i < n; i++) - { - if((x3_gap[i / 32] & mask32[i % 32])) - { - if(scaleGap) - { - addScale += wgt[i]; - } - } - else - { - __m128d maxv =_mm_setzero_pd(); - - if(x2_gap[i / 32] & mask32[i % 32]) - x2 = x2_gapColumn; - else - { - x2 = x2_ptr; - x2_ptr += 16; - } - - uX1 = &umpX1[16 * tipX1[i]]; - - - for (j = 0; j < 4; j++) - { - double *x2_p = &x2[j*4]; - double *right_k0_p = &right[j*16]; - double *right_k1_p = &right[j*16 + 1*4]; - double *right_k2_p = &right[j*16 + 2*4]; - double *right_k3_p = &right[j*16 + 3*4]; - __m128d x2_0 = _mm_load_pd( &x2_p[0] ); - __m128d x2_2 = _mm_load_pd( &x2_p[2] ); - - __m128d right_k0_0 = 
_mm_load_pd( &right_k0_p[0] ); - __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); - __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); - __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); - __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); - __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); - __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); - __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); - - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - { - // - // load left side from tip vector - // - - __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] ); - __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] ); - - - // - // multiply left * right - // - - __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 ); - - - // - // multiply with EV matrix (!?) - // - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - values[j * 2] = EV_t_l0_k0; - values[j * 2 + 1] = EV_t_l2_k0; - - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m)); - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m)); - } - } - - - _mm_store_pd(maxima, maxv); - - max = MAX(maxima[0], maxima[1]); - - if(max < minlikelihood) - { - __m128d sv = _mm_set1_pd(twotothe256); - - _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv)); - _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv)); - _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv)); - _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv)); - _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv)); - _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv)); - _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv)); - _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv)); - - - addScale += wgt[i]; - - } - else - { - _mm_store_pd(&x3[0], values[0]); - _mm_store_pd(&x3[2], values[1]); - _mm_store_pd(&x3[4], values[2]); - _mm_store_pd(&x3[6], values[3]); - _mm_store_pd(&x3[8], values[4]); - _mm_store_pd(&x3[10], values[5]); - _mm_store_pd(&x3[12], values[6]); - _mm_store_pd(&x3[14], 
values[7]); - } - - x3 += 16; - } - } - } - break; - case INNER_INNER: - { - __m128d maxv =_mm_setzero_pd(); - - scaleGap = 0; - - x1 = x1_gapColumn; - x2 = x2_gapColumn; - x3 = x3_gapColumn; - - for (j = 0; j < 4; j++) - { - - double *x1_p = &x1[j*4]; - double *left_k0_p = &left[j*16]; - double *left_k1_p = &left[j*16 + 1*4]; - double *left_k2_p = &left[j*16 + 2*4]; - double *left_k3_p = &left[j*16 + 3*4]; - - __m128d x1_0 = _mm_load_pd( &x1_p[0] ); - __m128d x1_2 = _mm_load_pd( &x1_p[2] ); - - __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] ); - __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] ); - __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] ); - __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] ); - __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] ); - __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] ); - __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] ); - __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] ); - - left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); - left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); - - left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); - left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); - - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); - left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); - - left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); - left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); - - left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); - left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); - - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); - left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); - - - double *x2_p = &x2[j*4]; - double *right_k0_p = &right[j*16]; - double *right_k1_p = &right[j*16 + 1*4]; - double *right_k2_p = &right[j*16 + 2*4]; - double *right_k3_p = &right[j*16 + 3*4]; - __m128d x2_0 = _mm_load_pd( &x2_p[0] ); - __m128d x2_2 = _mm_load_pd( &x2_p[2] ); - - __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] ); - __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); - __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); - __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); - __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); - __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); - __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); - __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, 
EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - - values[j * 2] = EV_t_l0_k0; - values[j * 2 + 1] = EV_t_l2_k0; - - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m)); - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m)); - } - - _mm_store_pd(maxima, maxv); - - max = MAX(maxima[0], maxima[1]); - - if(max < minlikelihood) - { - __m128d sv = _mm_set1_pd(twotothe256); - - scaleGap = 1; - - _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv)); - _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv)); - _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv)); - _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv)); - _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv)); - _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv)); - _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv)); - _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv)); - } - else - { - _mm_store_pd(&x3[0], values[0]); - _mm_store_pd(&x3[2], values[1]); - _mm_store_pd(&x3[4], values[2]); - _mm_store_pd(&x3[6], values[3]); - _mm_store_pd(&x3[8], values[4]); - _mm_store_pd(&x3[10], values[5]); - _mm_store_pd(&x3[12], values[6]); - _mm_store_pd(&x3[14], values[7]); - } - } - - - x3 = x3_start; - - for (i = 0; i < n; i++) - { - if(x3_gap[i / 32] & mask32[i % 32]) - { - if(scaleGap) - { - addScale += wgt[i]; - } - } - else - { - __m128d maxv =_mm_setzero_pd(); - - if(x1_gap[i / 32] & mask32[i % 32]) - x1 = x1_gapColumn; - else - { - x1 = x1_ptr; - x1_ptr += 16; - } - - if(x2_gap[i / 32] & mask32[i % 32]) - x2 = x2_gapColumn; - else - { - x2 = x2_ptr; - x2_ptr += 16; - } - - - for (j = 0; j < 4; j++) - { - - double *x1_p = &x1[j*4]; - double *left_k0_p = &left[j*16]; - double *left_k1_p = &left[j*16 + 1*4]; - double *left_k2_p = &left[j*16 + 2*4]; - double *left_k3_p = &left[j*16 + 3*4]; - - __m128d x1_0 = _mm_load_pd( &x1_p[0] ); - __m128d x1_2 = _mm_load_pd( &x1_p[2] ); - - __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] ); - __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] ); - __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] ); - __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] ); - __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] ); - __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] ); - __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] ); - __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] ); - - left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); - left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); - - left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); - left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); - - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); - left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); - - left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); - left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); - - left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); - left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); - - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); - left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); - - - // - // multiply/add right side - // - double *x2_p = &x2[j*4]; - double *right_k0_p = &right[j*16]; - double *right_k1_p = &right[j*16 + 
1*4]; - double *right_k2_p = &right[j*16 + 2*4]; - double *right_k3_p = &right[j*16 + 3*4]; - __m128d x2_0 = _mm_load_pd( &x2_p[0] ); - __m128d x2_2 = _mm_load_pd( &x2_p[2] ); - - __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] ); - __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); - __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); - __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); - __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); - __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); - __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); - __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - // - // multiply left * right - // - - __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); - - - // - // multiply with EV matrix (!?) - // - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - - values[j * 2] = EV_t_l0_k0; - values[j * 2 + 1] = EV_t_l2_k0; - - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m)); - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m)); - } - - - _mm_store_pd(maxima, maxv); - - max = MAX(maxima[0], maxima[1]); - - if(max < minlikelihood) - { - __m128d sv = _mm_set1_pd(twotothe256); - - _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv)); - _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv)); - _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv)); - _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv)); - _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv)); - _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv)); - _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv)); - _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv)); - - - addScale += wgt[i]; - - } - else - { - _mm_store_pd(&x3[0], values[0]); - _mm_store_pd(&x3[2], values[1]); - _mm_store_pd(&x3[4], values[2]); - _mm_store_pd(&x3[6], values[3]); - _mm_store_pd(&x3[8], values[4]); - _mm_store_pd(&x3[10], values[5]); - _mm_store_pd(&x3[12], 
values[6]); - _mm_store_pd(&x3[14], values[7]); - } - - - - x3 += 16; - - } - } - break; - default: - assert(0); - } - - - *scalerIncrement = addScale; -} - - -static void newviewGTRGAMMA(int tipCase, - double *x1_start, double *x2_start, double *x3_start, - double *EV, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - const int n, double *left, double *right, int *wgt, int *scalerIncrement - ) -{ - int - i, - j, - k, - l, - addScale = 0; - - double - *x1, - *x2, - *x3, - max, - maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))), - EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT))); - - __m128d - values[8], - EVV[8]; - - for(k = 0; k < 4; k++) - for (l=0; l < 4; l++) - EV_t[4 * l + k] = EV[4 * k + l]; - - for(k = 0; k < 8; k++) - EVV[k] = _mm_load_pd(&EV_t[k * 2]); - - switch(tipCase) - { - case TIP_TIP: - { - double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))), *uX2, umpX2[256] __attribute__ ((aligned (BYTE_ALIGNMENT))); - - - for (i = 1; i < 16; i++) - { - __m128d x1_1 = _mm_load_pd(&(tipVector[i*4])); - __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2])); - - for (j = 0; j < 4; j++) - for (k = 0; k < 4; k++) - { - __m128d left1 = _mm_load_pd(&left[j*16 + k*4]); - __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]); - - __m128d acc = _mm_setzero_pd(); - - acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); - acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); - - acc = _mm_hadd_pd(acc, acc); - _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc); - } - - for (j = 0; j < 4; j++) - for (k = 0; k < 4; k++) - { - __m128d left1 = _mm_load_pd(&right[j*16 + k*4]); - __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]); - - __m128d acc = _mm_setzero_pd(); - - acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); - acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); - - acc = _mm_hadd_pd(acc, acc); - _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc); - - } - } - - for (i = 0; i < n; i++) - { - x3 = &x3_start[i * 16]; - - - uX1 = &umpX1[16 * tipX1[i]]; - uX2 = &umpX2[16 * tipX2[i]]; - - for (j = 0; j < 4; j++) - { - __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] ); - __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] ); - - - __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] ); - __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] ); - - - // - // multiply left * right - // - - __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse ); - __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse ); - - - // - // multiply with EV matrix (!?) 
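newviewGTRGAMMA_GAPPED_SAVE, whose deleted body ends here, is the memory-saving variant: sites whose subtree contains only gaps do not get their own 16-double slot; per-node bit vectors (x1_gap, x2_gap, x3_gap, tested with mask32 or the isGap() helper kept earlier in this file) redirect them to the single shared *_gapColumn buffer, which is computed once per node and flagged through scaleGap. A small sketch of the per-site dispatch that the deleted code repeats inline; the helper name and signature are illustrative, not part of the patch:

    /* pick the conditional-likelihood vector for site i: the shared gap column
       for all-gap sites, otherwise the next vector in the packed per-site array */
    static double * select_site_vector(unsigned int *gap_bits, int i,
                                       double *gapColumn, double **packed_ptr)
    {
      if (isGap(gap_bits, i))        /* bit set: site i is all-gap in this subtree */
        return gapColumn;
      else
        {
          double *v = *packed_ptr;
          *packed_ptr += 16;         /* 4 states x 4 rate categories per site      */
          return v;
        }
    }

In the deleted kernels the same bit test also decides whether an all-gap site simply inherits the shared column's scaling (addScale += wgt[i] when scaleGap is set) instead of being recomputed.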
- // - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 ); - _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 ); - } - } - } - break; - case TIP_INNER: - { - double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))); - - - for (i = 1; i < 16; i++) - { - __m128d x1_1 = _mm_load_pd(&(tipVector[i*4])); - __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2])); - - for (j = 0; j < 4; j++) - for (k = 0; k < 4; k++) - { - __m128d left1 = _mm_load_pd(&left[j*16 + k*4]); - __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]); - - __m128d acc = _mm_setzero_pd(); - - acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); - acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); - - acc = _mm_hadd_pd(acc, acc); - _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc); - } - } - - for (i = 0; i < n; i++) - { - __m128d maxv =_mm_setzero_pd(); - - x2 = &x2_start[i * 16]; - x3 = &x3_start[i * 16]; - - uX1 = &umpX1[16 * tipX1[i]]; - - for (j = 0; j < 4; j++) - { - - // - // multiply/add right side - // - double *x2_p = &x2[j*4]; - double *right_k0_p = &right[j*16]; - double *right_k1_p = &right[j*16 + 1*4]; - double *right_k2_p = &right[j*16 + 2*4]; - double *right_k3_p = &right[j*16 + 3*4]; - __m128d x2_0 = _mm_load_pd( &x2_p[0] ); - __m128d x2_2 = _mm_load_pd( &x2_p[2] ); - - __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] ); - __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); - __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); - __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); - __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); - __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); - __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); - __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); - - - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - { - // - // load left side from tip vector - // - - __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] ); - __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] ); - - - // - // 
multiply left * right - // - - __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 ); - - - // - // multiply with EV matrix (!?) - // - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - values[j * 2] = EV_t_l0_k0; - values[j * 2 + 1] = EV_t_l2_k0; - - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m)); - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m)); - } - } - - - _mm_store_pd(maxima, maxv); - - max = MAX(maxima[0], maxima[1]); - - if(max < minlikelihood) - { - __m128d sv = _mm_set1_pd(twotothe256); - - _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv)); - _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv)); - _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv)); - _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv)); - _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv)); - _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv)); - _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv)); - _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv)); - - - addScale += wgt[i]; - - } - else - { - _mm_store_pd(&x3[0], values[0]); - _mm_store_pd(&x3[2], values[1]); - _mm_store_pd(&x3[4], values[2]); - _mm_store_pd(&x3[6], values[3]); - _mm_store_pd(&x3[8], values[4]); - _mm_store_pd(&x3[10], values[5]); - _mm_store_pd(&x3[12], values[6]); - _mm_store_pd(&x3[14], values[7]); - } - } - } - break; - case INNER_INNER: - for (i = 0; i < n; i++) - { - __m128d maxv =_mm_setzero_pd(); - - - x1 = &x1_start[i * 16]; - x2 = &x2_start[i * 16]; - x3 = &x3_start[i * 16]; - - for (j = 0; j < 4; j++) - { - - double *x1_p = &x1[j*4]; - double *left_k0_p = &left[j*16]; - double *left_k1_p = &left[j*16 + 1*4]; - double *left_k2_p = &left[j*16 + 2*4]; - double *left_k3_p = &left[j*16 + 3*4]; - - __m128d x1_0 = _mm_load_pd( &x1_p[0] ); - __m128d x1_2 = _mm_load_pd( &x1_p[2] ); - - __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] ); - __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] ); - __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] ); - __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] ); - __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] ); - __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] ); - __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] ); - __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] ); - - left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); - left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); - - left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); - left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); - - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); - left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); - - left_k2_0 = _mm_mul_pd(x1_0, 
left_k2_0); - left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); - - left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); - left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); - - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); - left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); - - - // - // multiply/add right side - // - double *x2_p = &x2[j*4]; - double *right_k0_p = &right[j*16]; - double *right_k1_p = &right[j*16 + 1*4]; - double *right_k2_p = &right[j*16 + 2*4]; - double *right_k3_p = &right[j*16 + 3*4]; - __m128d x2_0 = _mm_load_pd( &x2_p[0] ); - __m128d x2_2 = _mm_load_pd( &x2_p[2] ); - - __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] ); - __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); - __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); - __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); - __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); - __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); - __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); - __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - // - // multiply left * right - // - - __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); - - - // - // multiply with EV matrix (!?) 
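The long runs of _mm_mul_pd/_mm_hadd_pd above vectorize, four doubles at a time, the same per-site update that the deleted generic code at the top of this file spells out in scalar form: for every rate category the vectors from the two children are pushed through the left and right P matrices, multiplied elementwise, projected back through the eigenvector matrix EV, and finally checked against minlikelihood so that underflowing sites can be rescaled by twotothe256. A scalar reference for one DNA/GAMMA site, reusing the file's MAX/ABS macros and the minlikelihood/twotothe256 globals, is sketched below; the function name and signature are illustrative, not part of the patch:

    static void newview_site_reference(const double *x1, const double *x2, double *x3,
                                       const double *left, const double *right,
                                       const double *EV, int site_weight, int *addScale)
    {
      int j, k, l;
      double max = 0.0;

      for (l = 0; l < 16; l++)                 /* 4 states x 4 gamma categories     */
        x3[l] = 0.0;

      for (k = 0; k < 4; k++)                  /* rate categories                   */
        for (l = 0; l < 4; l++)                /* target state                      */
          {
            double al = 0.0, ar = 0.0;

            for (j = 0; j < 4; j++)            /* dot products with the P matrices  */
              {
                al += x1[k * 4 + j] * left [k * 16 + l * 4 + j];
                ar += x2[k * 4 + j] * right[k * 16 + l * 4 + j];
              }

            for (j = 0; j < 4; j++)            /* project back through eigenvectors */
              x3[k * 4 + j] += (al * ar) * EV[4 * l + j];
          }

      for (l = 0; l < 16; l++)
        max = MAX(max, ABS(x3[l]));

      if (max < minlikelihood)                 /* all entries about to underflow    */
        {
          for (l = 0; l < 16; l++)
            x3[l] *= twotothe256;              /* rescale by 2^256                  */
          *addScale += site_weight;            /* wgt[i], returned via *scalerIncrement */
        }
    }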
- // - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - - values[j * 2] = EV_t_l0_k0; - values[j * 2 + 1] = EV_t_l2_k0; - - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m)); - maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m)); - } - - - _mm_store_pd(maxima, maxv); - - max = MAX(maxima[0], maxima[1]); - - if(max < minlikelihood) - { - __m128d sv = _mm_set1_pd(twotothe256); - - _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv)); - _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv)); - _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv)); - _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv)); - _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv)); - _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv)); - _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv)); - _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv)); - - - addScale += wgt[i]; - - } - else - { - _mm_store_pd(&x3[0], values[0]); - _mm_store_pd(&x3[2], values[1]); - _mm_store_pd(&x3[4], values[2]); - _mm_store_pd(&x3[6], values[3]); - _mm_store_pd(&x3[8], values[4]); - _mm_store_pd(&x3[10], values[5]); - _mm_store_pd(&x3[12], values[6]); - _mm_store_pd(&x3[14], values[7]); - } - } - - break; - default: - assert(0); - } - - - *scalerIncrement = addScale; - -} -static void newviewGTRCAT( int tipCase, double *EV, int *cptr, - double *x1_start, double *x2_start, double *x3_start, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement) -{ - double - *le, - *ri, - *x1, - *x2, - *x3, - EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT))); - - int - i, - j, - scale, - addScale = 0; - - __m128d - minlikelihood_sse = _mm_set1_pd( minlikelihood ), - sc = _mm_set1_pd(twotothe256), - EVV[8]; - - for(i = 0; i < 4; i++) - for (j=0; j < 4; j++) - EV_t[4 * j + i] = EV[4 * i + j]; - - for(i = 0; i < 8; i++) - EVV[i] = _mm_load_pd(&EV_t[i * 2]); - - switch(tipCase) - { - case TIP_TIP: - for (i = 0; i < n; i++) - { - x1 = &(tipVector[4 * tipX1[i]]); - x2 = &(tipVector[4 * tipX2[i]]); - - x3 = &x3_start[i * 4]; - - le = &left[cptr[i] * 16]; - ri = &right[cptr[i] * 16]; - - __m128d x1_0 = _mm_load_pd( &x1[0] ); - __m128d x1_2 = _mm_load_pd( &x1[2] ); - - __m128d left_k0_0 = _mm_load_pd( &le[0] ); - __m128d left_k0_2 = _mm_load_pd( &le[2] ); - __m128d left_k1_0 = _mm_load_pd( &le[4] ); - __m128d left_k1_2 = _mm_load_pd( &le[6] ); - __m128d left_k2_0 = _mm_load_pd( &le[8] ); - __m128d left_k2_2 = _mm_load_pd( &le[10] ); - __m128d left_k3_0 = _mm_load_pd( &le[12] ); - __m128d left_k3_2 = _mm_load_pd( &le[14] ); - - left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); - 
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); - - left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); - left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); - - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); - left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); - - left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); - left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); - - left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); - left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); - - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); - left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); - - __m128d x2_0 = _mm_load_pd( &x2[0] ); - __m128d x2_2 = _mm_load_pd( &x2[2] ); - - __m128d right_k0_0 = _mm_load_pd( &ri[0] ); - __m128d right_k0_2 = _mm_load_pd( &ri[2] ); - __m128d right_k1_0 = _mm_load_pd( &ri[4] ); - __m128d right_k1_2 = _mm_load_pd( &ri[6] ); - __m128d right_k2_0 = _mm_load_pd( &ri[8] ); - __m128d right_k2_2 = _mm_load_pd( &ri[10] ); - __m128d right_k3_0 = _mm_load_pd( &ri[12] ); - __m128d right_k3_2 = _mm_load_pd( &ri[14] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - _mm_store_pd(x3, EV_t_l0_k0); - _mm_store_pd(&x3[2], EV_t_l2_k0); - } - break; - case TIP_INNER: - for (i = 0; i < n; i++) - { - x1 = &(tipVector[4 * tipX1[i]]); - x2 = &x2_start[4 * i]; - x3 = &x3_start[4 * i]; - - le = &left[cptr[i] * 16]; - ri = &right[cptr[i] * 16]; - - __m128d x1_0 = _mm_load_pd( &x1[0] ); - __m128d x1_2 = _mm_load_pd( &x1[2] ); - - __m128d left_k0_0 = _mm_load_pd( &le[0] ); - __m128d left_k0_2 = _mm_load_pd( &le[2] ); - __m128d left_k1_0 = _mm_load_pd( &le[4] ); - __m128d left_k1_2 = _mm_load_pd( &le[6] ); - __m128d left_k2_0 = _mm_load_pd( &le[8] ); - __m128d left_k2_2 = _mm_load_pd( &le[10] ); - __m128d left_k3_0 = _mm_load_pd( &le[12] ); - 
__m128d left_k3_2 = _mm_load_pd( &le[14] ); - - left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); - left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); - - left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); - left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); - - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); - left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); - - left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); - left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); - - left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); - left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); - - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); - left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); - - __m128d x2_0 = _mm_load_pd( &x2[0] ); - __m128d x2_2 = _mm_load_pd( &x2[2] ); - - __m128d right_k0_0 = _mm_load_pd( &ri[0] ); - __m128d right_k0_2 = _mm_load_pd( &ri[2] ); - __m128d right_k1_0 = _mm_load_pd( &ri[4] ); - __m128d right_k1_2 = _mm_load_pd( &ri[6] ); - __m128d right_k2_0 = _mm_load_pd( &ri[8] ); - __m128d right_k2_2 = _mm_load_pd( &ri[10] ); - __m128d right_k3_0 = _mm_load_pd( &ri[12] ); - __m128d right_k3_2 = _mm_load_pd( &ri[14] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - scale = 1; - - __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - else - { - v1 = _mm_and_pd(EV_t_l2_k0, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - - if(scale) - { - _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc)); - _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc)); - - - addScale += wgt[i]; - } - else - { - _mm_store_pd(x3, EV_t_l0_k0); - _mm_store_pd(&x3[2], EV_t_l2_k0); - } - - - } - break; - case INNER_INNER: - for (i = 
0; i < n; i++) - { - x1 = &x1_start[4 * i]; - x2 = &x2_start[4 * i]; - x3 = &x3_start[4 * i]; - - le = &left[cptr[i] * 16]; - ri = &right[cptr[i] * 16]; - - __m128d x1_0 = _mm_load_pd( &x1[0] ); - __m128d x1_2 = _mm_load_pd( &x1[2] ); - - __m128d left_k0_0 = _mm_load_pd( &le[0] ); - __m128d left_k0_2 = _mm_load_pd( &le[2] ); - __m128d left_k1_0 = _mm_load_pd( &le[4] ); - __m128d left_k1_2 = _mm_load_pd( &le[6] ); - __m128d left_k2_0 = _mm_load_pd( &le[8] ); - __m128d left_k2_2 = _mm_load_pd( &le[10] ); - __m128d left_k3_0 = _mm_load_pd( &le[12] ); - __m128d left_k3_2 = _mm_load_pd( &le[14] ); - - left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); - left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); - - left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); - left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); - - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); - left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); - - left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); - left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); - - left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); - left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); - - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); - left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); - - __m128d x2_0 = _mm_load_pd( &x2[0] ); - __m128d x2_2 = _mm_load_pd( &x2[2] ); - - __m128d right_k0_0 = _mm_load_pd( &ri[0] ); - __m128d right_k0_2 = _mm_load_pd( &ri[2] ); - __m128d right_k1_0 = _mm_load_pd( &ri[4] ); - __m128d right_k1_2 = _mm_load_pd( &ri[6] ); - __m128d right_k2_0 = _mm_load_pd( &ri[8] ); - __m128d right_k2_2 = _mm_load_pd( &ri[10] ); - __m128d right_k3_0 = _mm_load_pd( &ri[12] ); - __m128d right_k3_2 = _mm_load_pd( &ri[14] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - scale = 1; - - __m128d v1 
= _mm_and_pd(EV_t_l0_k0, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - else - { - v1 = _mm_and_pd(EV_t_l2_k0, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - - if(scale) - { - _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc)); - _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc)); - - - addScale += wgt[i]; - } - else - { - _mm_store_pd(x3, EV_t_l0_k0); - _mm_store_pd(&x3[2], EV_t_l2_k0); - } - - } - break; - default: - assert(0); - } - - - *scalerIncrement = addScale; -} - - - -static void newviewGTRCAT_SAVE( int tipCase, double *EV, int *cptr, - double *x1_start, double *x2_start, double *x3_start, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, - unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, - double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats) -{ - double - *le, - *ri, - *x1, - *x2, - *x3, - *x1_ptr = x1_start, - *x2_ptr = x2_start, - *x3_ptr = x3_start, - EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT))); - - int - i, - j, - scale, - scaleGap = 0, - addScale = 0; - - __m128d - minlikelihood_sse = _mm_set1_pd( minlikelihood ), - sc = _mm_set1_pd(twotothe256), - EVV[8]; - - for(i = 0; i < 4; i++) - for (j=0; j < 4; j++) - EV_t[4 * j + i] = EV[4 * i + j]; - - for(i = 0; i < 8; i++) - EVV[i] = _mm_load_pd(&EV_t[i * 2]); - - { - x1 = x1_gapColumn; - x2 = x2_gapColumn; - x3 = x3_gapColumn; - - le = &left[maxCats * 16]; - ri = &right[maxCats * 16]; - - __m128d x1_0 = _mm_load_pd( &x1[0] ); - __m128d x1_2 = _mm_load_pd( &x1[2] ); - - __m128d left_k0_0 = _mm_load_pd( &le[0] ); - __m128d left_k0_2 = _mm_load_pd( &le[2] ); - __m128d left_k1_0 = _mm_load_pd( &le[4] ); - __m128d left_k1_2 = _mm_load_pd( &le[6] ); - __m128d left_k2_0 = _mm_load_pd( &le[8] ); - __m128d left_k2_2 = _mm_load_pd( &le[10] ); - __m128d left_k3_0 = _mm_load_pd( &le[12] ); - __m128d left_k3_2 = _mm_load_pd( &le[14] ); - - left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); - left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); - - left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); - left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); - - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); - left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); - - left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); - left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); - - left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); - left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); - - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); - left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); - - __m128d x2_0 = _mm_load_pd( &x2[0] ); - __m128d x2_2 = _mm_load_pd( &x2[2] ); - - __m128d right_k0_0 = _mm_load_pd( &ri[0] ); - __m128d right_k0_2 = _mm_load_pd( &ri[2] ); - __m128d right_k1_0 = _mm_load_pd( &ri[4] ); - __m128d right_k1_2 = _mm_load_pd( &ri[6] ); - __m128d right_k2_0 = _mm_load_pd( &ri[8] ); - __m128d right_k2_2 = _mm_load_pd( &ri[10] ); - __m128d right_k3_0 = _mm_load_pd( &ri[12] ); - __m128d right_k3_2 = _mm_load_pd( &ri[14] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = 
_mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - if(tipCase != TIP_TIP) - { - scale = 1; - - __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - else - { - v1 = _mm_and_pd(EV_t_l2_k0, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - - if(scale) - { - _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc)); - _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc)); - - scaleGap = TRUE; - } - else - { - _mm_store_pd(x3, EV_t_l0_k0); - _mm_store_pd(&x3[2], EV_t_l2_k0); - } - } - else - { - _mm_store_pd(x3, EV_t_l0_k0); - _mm_store_pd(&x3[2], EV_t_l2_k0); - } - } - - - switch(tipCase) - { - case TIP_TIP: - for (i = 0; i < n; i++) - { - if(noGap(x3_gap, i)) - { - x1 = &(tipVector[4 * tipX1[i]]); - x2 = &(tipVector[4 * tipX2[i]]); - - x3 = x3_ptr; - - if(isGap(x1_gap, i)) - le = &left[maxCats * 16]; - else - le = &left[cptr[i] * 16]; - - if(isGap(x2_gap, i)) - ri = &right[maxCats * 16]; - else - ri = &right[cptr[i] * 16]; - - __m128d x1_0 = _mm_load_pd( &x1[0] ); - __m128d x1_2 = _mm_load_pd( &x1[2] ); - - __m128d left_k0_0 = _mm_load_pd( &le[0] ); - __m128d left_k0_2 = _mm_load_pd( &le[2] ); - __m128d left_k1_0 = _mm_load_pd( &le[4] ); - __m128d left_k1_2 = _mm_load_pd( &le[6] ); - __m128d left_k2_0 = _mm_load_pd( &le[8] ); - __m128d left_k2_2 = _mm_load_pd( &le[10] ); - __m128d left_k3_0 = _mm_load_pd( &le[12] ); - __m128d left_k3_2 = _mm_load_pd( &le[14] ); - - left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); - left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); - - left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); - left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); - - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); - left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); - - left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); - left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); - - left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); - left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); - - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); - left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); - left_k2_0 = _mm_hadd_pd( left_k2_0, 
left_k3_0); - - __m128d x2_0 = _mm_load_pd( &x2[0] ); - __m128d x2_2 = _mm_load_pd( &x2[2] ); - - __m128d right_k0_0 = _mm_load_pd( &ri[0] ); - __m128d right_k0_2 = _mm_load_pd( &ri[2] ); - __m128d right_k1_0 = _mm_load_pd( &ri[4] ); - __m128d right_k1_2 = _mm_load_pd( &ri[6] ); - __m128d right_k2_0 = _mm_load_pd( &ri[8] ); - __m128d right_k2_2 = _mm_load_pd( &ri[10] ); - __m128d right_k3_0 = _mm_load_pd( &ri[12] ); - __m128d right_k3_2 = _mm_load_pd( &ri[14] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - _mm_store_pd(x3, EV_t_l0_k0); - _mm_store_pd(&x3[2], EV_t_l2_k0); - - x3_ptr += 4; - } - } - break; - case TIP_INNER: - for (i = 0; i < n; i++) - { - if(isGap(x3_gap, i)) - { - if(scaleGap) - addScale += wgt[i]; - } - else - { - x1 = &(tipVector[4 * tipX1[i]]); - - x2 = x2_ptr; - x3 = x3_ptr; - - if(isGap(x1_gap, i)) - le = &left[maxCats * 16]; - else - le = &left[cptr[i] * 16]; - - if(isGap(x2_gap, i)) - { - ri = &right[maxCats * 16]; - x2 = x2_gapColumn; - } - else - { - ri = &right[cptr[i] * 16]; - x2 = x2_ptr; - x2_ptr += 4; - } - - __m128d x1_0 = _mm_load_pd( &x1[0] ); - __m128d x1_2 = _mm_load_pd( &x1[2] ); - - __m128d left_k0_0 = _mm_load_pd( &le[0] ); - __m128d left_k0_2 = _mm_load_pd( &le[2] ); - __m128d left_k1_0 = _mm_load_pd( &le[4] ); - __m128d left_k1_2 = _mm_load_pd( &le[6] ); - __m128d left_k2_0 = _mm_load_pd( &le[8] ); - __m128d left_k2_2 = _mm_load_pd( &le[10] ); - __m128d left_k3_0 = _mm_load_pd( &le[12] ); - __m128d left_k3_2 = _mm_load_pd( &le[14] ); - - left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); - left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); - - left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); - left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); - - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); - left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); - 
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); - - left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); - left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); - - left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); - left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); - - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); - left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); - - __m128d x2_0 = _mm_load_pd( &x2[0] ); - __m128d x2_2 = _mm_load_pd( &x2[2] ); - - __m128d right_k0_0 = _mm_load_pd( &ri[0] ); - __m128d right_k0_2 = _mm_load_pd( &ri[2] ); - __m128d right_k1_0 = _mm_load_pd( &ri[4] ); - __m128d right_k1_2 = _mm_load_pd( &ri[6] ); - __m128d right_k2_0 = _mm_load_pd( &ri[8] ); - __m128d right_k2_2 = _mm_load_pd( &ri[10] ); - __m128d right_k3_0 = _mm_load_pd( &ri[12] ); - __m128d right_k3_2 = _mm_load_pd( &ri[14] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - scale = 1; - - __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - else - { - v1 = _mm_and_pd(EV_t_l2_k0, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - - if(scale) - { - _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc)); - _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc)); - - addScale += wgt[i]; - } - else - { - _mm_store_pd(x3, EV_t_l0_k0); - _mm_store_pd(&x3[2], EV_t_l2_k0); - } - - x3_ptr += 4; - } - - } - break; - case INNER_INNER: - for (i = 0; i < n; i++) - { - if(isGap(x3_gap, i)) - { - if(scaleGap) - addScale += wgt[i]; - } - else - { - x3 = x3_ptr; - - if(isGap(x1_gap, i)) - { - x1 = x1_gapColumn; - le = &left[maxCats * 16]; - } - else - { - le = &left[cptr[i] * 16]; - x1 = x1_ptr; - x1_ptr += 4; - } - - if(isGap(x2_gap, i)) - { - x2 = 
x2_gapColumn; - ri = &right[maxCats * 16]; - } - else - { - ri = &right[cptr[i] * 16]; - x2 = x2_ptr; - x2_ptr += 4; - } - - __m128d x1_0 = _mm_load_pd( &x1[0] ); - __m128d x1_2 = _mm_load_pd( &x1[2] ); - - __m128d left_k0_0 = _mm_load_pd( &le[0] ); - __m128d left_k0_2 = _mm_load_pd( &le[2] ); - __m128d left_k1_0 = _mm_load_pd( &le[4] ); - __m128d left_k1_2 = _mm_load_pd( &le[6] ); - __m128d left_k2_0 = _mm_load_pd( &le[8] ); - __m128d left_k2_2 = _mm_load_pd( &le[10] ); - __m128d left_k3_0 = _mm_load_pd( &le[12] ); - __m128d left_k3_2 = _mm_load_pd( &le[14] ); - - left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); - left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); - - left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); - left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); - - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); - left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); - left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); - - left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); - left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); - - left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); - left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); - - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); - left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); - left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); - - __m128d x2_0 = _mm_load_pd( &x2[0] ); - __m128d x2_2 = _mm_load_pd( &x2[2] ); - - __m128d right_k0_0 = _mm_load_pd( &ri[0] ); - __m128d right_k0_2 = _mm_load_pd( &ri[2] ); - __m128d right_k1_0 = _mm_load_pd( &ri[4] ); - __m128d right_k1_2 = _mm_load_pd( &ri[6] ); - __m128d right_k2_0 = _mm_load_pd( &ri[8] ); - __m128d right_k2_2 = _mm_load_pd( &ri[10] ); - __m128d right_k3_0 = _mm_load_pd( &ri[12] ); - __m128d right_k3_2 = _mm_load_pd( &ri[14] ); - - right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); - right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); - - right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); - right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); - - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); - right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); - right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); - - right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); - right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); - - right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); - right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); - - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); - right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); - right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); - - __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); - __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); - - __m128d EV_t_l0_k0 = EVV[0]; - __m128d EV_t_l0_k2 = EVV[1]; - __m128d EV_t_l1_k0 = EVV[2]; - __m128d EV_t_l1_k2 = EVV[3]; - __m128d EV_t_l2_k0 = EVV[4]; - __m128d EV_t_l2_k2 = EVV[5]; - __m128d EV_t_l3_k0 = EVV[6]; - __m128d EV_t_l3_k2 = EVV[7]; - - - EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); - EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); - - EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); - EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); - - EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); - EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); - - EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); - EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); - - EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); - EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); - EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); - - EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); - - scale = 1; - - __m128d v1 = _mm_and_pd(EV_t_l0_k0, 
absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - else - { - v1 = _mm_and_pd(EV_t_l2_k0, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - - if(scale) - { - _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc)); - _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc)); - - addScale += wgt[i]; - } - else - { - _mm_store_pd(x3, EV_t_l0_k0); - _mm_store_pd(&x3[2], EV_t_l2_k0); - } - - x3_ptr += 4; - } - } - break; - default: - assert(0); - } - - - *scalerIncrement = addScale; -} - -static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase, - double *x1, double *x2, double *x3, double *extEV, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, - unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, - double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn - ) -{ - double *uX1, *uX2, *v; - double x1px2; - int i, j, l, k, scale, addScale = 0, - gapScaling = 0; - double - *vl, *vr, *x1v, *x2v, - *x1_ptr = x1, - *x2_ptr = x2, - *x3_ptr = x3; - - - - switch(tipCase) - { - case TIP_TIP: - { - double umpX1[1840], umpX2[1840]; - - for(i = 0; i < 23; i++) - { - v = &(tipVector[20 * i]); - - for(k = 0; k < 80; k++) - { - double *ll = &left[k * 20]; - double *rr = &right[k * 20]; - - __m128d umpX1v = _mm_setzero_pd(); - __m128d umpX2v = _mm_setzero_pd(); - - for(l = 0; l < 20; l+=2) - { - __m128d vv = _mm_load_pd(&v[l]); - umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); - umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l]))); - } - - umpX1v = _mm_hadd_pd(umpX1v, umpX1v); - umpX2v = _mm_hadd_pd(umpX2v, umpX2v); - - _mm_storel_pd(&umpX1[80 * i + k], umpX1v); - _mm_storel_pd(&umpX2[80 * i + k], umpX2v); - } - } - - { - uX1 = &umpX1[1760]; - uX2 = &umpX2[1760]; - - for(j = 0; j < 4; j++) - { - v = &x3_gapColumn[j * 20]; - - __m128d zero = _mm_setzero_pd(); - for(k = 0; k < 20; k+=2) - _mm_store_pd(&v[k], zero); - - for(k = 0; k < 20; k++) - { - double *eev = &extEV[k * 20]; - x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k]; - __m128d x1px2v = _mm_set1_pd(x1px2); - - for(l = 0; l < 20; l+=2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d ee = _mm_load_pd(&eev[l]); - - vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); - - _mm_store_pd(&v[l], vv); - } - } - } - } - - for(i = 0; i < n; i++) - { - if(!(x3_gap[i / 32] & mask32[i % 32])) - { - uX1 = &umpX1[80 * tipX1[i]]; - uX2 = &umpX2[80 * tipX2[i]]; - - for(j = 0; j < 4; j++) - { - v = &x3_ptr[j * 20]; - - - __m128d zero = _mm_setzero_pd(); - for(k = 0; k < 20; k+=2) - _mm_store_pd(&v[k], zero); - - for(k = 0; k < 20; k++) - { - double *eev = &extEV[k * 20]; - x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k]; - __m128d x1px2v = _mm_set1_pd(x1px2); - - for(l = 0; l < 20; l+=2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d ee = _mm_load_pd(&eev[l]); - - vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); - - _mm_store_pd(&v[l], vv); - } - } - } - x3_ptr += 80; - } - } - } - break; - case TIP_INNER: - { - double umpX1[1840], ump_x2[20]; - - - for(i = 0; i < 23; i++) - { - v = &(tipVector[20 * i]); - - for(k = 0; k < 80; k++) - { - double *ll = &left[k * 20]; - - __m128d umpX1v = _mm_setzero_pd(); - - for(l = 0; l < 20; l+=2) - { - __m128d vv = _mm_load_pd(&v[l]); - umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); - } - - umpX1v = _mm_hadd_pd(umpX1v, umpX1v); - _mm_storel_pd(&umpX1[80 * i + k], umpX1v); - - } - } - - { - uX1 = &umpX1[1760]; - 
- for(k = 0; k < 4; k++) - { - v = &(x2_gapColumn[k * 20]); - - for(l = 0; l < 20; l++) - { - double *r = &right[k * 400 + l * 20]; - __m128d ump_x2v = _mm_setzero_pd(); - - for(j = 0; j < 20; j+= 2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d rr = _mm_load_pd(&r[j]); - ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr)); - } - - ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v); - - _mm_storel_pd(&ump_x2[l], ump_x2v); - } - - v = &(x3_gapColumn[20 * k]); - - __m128d zero = _mm_setzero_pd(); - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], zero); - - for(l = 0; l < 20; l++) - { - double *eev = &extEV[l * 20]; - x1px2 = uX1[k * 20 + l] * ump_x2[l]; - __m128d x1px2v = _mm_set1_pd(x1px2); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d ee = _mm_load_pd(&eev[j]); - - vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); - - _mm_store_pd(&v[j], vv); - } - } - - } - - { - v = x3_gapColumn; - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 80); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } - - - if (scale) - { - gapScaling = 1; - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 80; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - } - } - - for (i = 0; i < n; i++) - { - if((x3_gap[i / 32] & mask32[i % 32])) - { - if(gapScaling) - { - addScale += wgt[i]; - } - } - else - { - uX1 = &umpX1[80 * tipX1[i]]; - - if(x2_gap[i / 32] & mask32[i % 32]) - x2v = x2_gapColumn; - else - { - x2v = x2_ptr; - x2_ptr += 80; - } - - for(k = 0; k < 4; k++) - { - v = &(x2v[k * 20]); - - for(l = 0; l < 20; l++) - { - double *r = &right[k * 400 + l * 20]; - __m128d ump_x2v = _mm_setzero_pd(); - - for(j = 0; j < 20; j+= 2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d rr = _mm_load_pd(&r[j]); - ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr)); - } - - ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v); - - _mm_storel_pd(&ump_x2[l], ump_x2v); - } - - v = &x3_ptr[20 * k]; - - __m128d zero = _mm_setzero_pd(); - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], zero); - - for(l = 0; l < 20; l++) - { - double *eev = &extEV[l * 20]; - x1px2 = uX1[k * 20 + l] * ump_x2[l]; - __m128d x1px2v = _mm_set1_pd(x1px2); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d ee = _mm_load_pd(&eev[j]); - - vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); - - _mm_store_pd(&v[j], vv); - } - } - - } - - - { - v = x3_ptr; - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 80); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } - - - if (scale) - { - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 80; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - - addScale += wgt[i]; - } - - x3_ptr += 80; - } - } - } - break; - case INNER_INNER: - { - for(k = 0; k < 4; k++) - { - vl = &(x1_gapColumn[20 * k]); - vr = &(x2_gapColumn[20 * k]); - v = &(x3_gapColumn[20 * k]); - - __m128d zero = _mm_setzero_pd(); - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], zero); - - for(l = 0; l < 20; l++) - { - { - __m128d al = _mm_setzero_pd(); - __m128d ar = _mm_setzero_pd(); - - double *ll = &left[k * 400 + l * 20]; - double *rr 
= &right[k * 400 + l * 20]; - double *EVEV = &extEV[20 * l]; - - for(j = 0; j < 20; j+=2) - { - __m128d lv = _mm_load_pd(&ll[j]); - __m128d rv = _mm_load_pd(&rr[j]); - __m128d vll = _mm_load_pd(&vl[j]); - __m128d vrr = _mm_load_pd(&vr[j]); - - al = _mm_add_pd(al, _mm_mul_pd(vll, lv)); - ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv)); - } - - al = _mm_hadd_pd(al, al); - ar = _mm_hadd_pd(ar, ar); - - al = _mm_mul_pd(al, ar); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d EVV = _mm_load_pd(&EVEV[j]); - - vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); - - _mm_store_pd(&v[j], vv); - } - } - - } - } - - - { - v = x3_gapColumn; - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 80); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } - - if (scale) - { - gapScaling = 1; - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 80; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - - - } - } - - for (i = 0; i < n; i++) - { - if(x3_gap[i / 32] & mask32[i % 32]) - { - if(gapScaling) - { - addScale += wgt[i]; - } - } - else - { - if(x1_gap[i / 32] & mask32[i % 32]) - x1v = x1_gapColumn; - else - { - x1v = x1_ptr; - x1_ptr += 80; - } - - if(x2_gap[i / 32] & mask32[i % 32]) - x2v = x2_gapColumn; - else - { - x2v = x2_ptr; - x2_ptr += 80; - } - - for(k = 0; k < 4; k++) - { - vl = &(x1v[20 * k]); - vr = &(x2v[20 * k]); - v = &x3_ptr[20 * k]; - - __m128d zero = _mm_setzero_pd(); - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], zero); - - for(l = 0; l < 20; l++) - { - { - __m128d al = _mm_setzero_pd(); - __m128d ar = _mm_setzero_pd(); - - double *ll = &left[k * 400 + l * 20]; - double *rr = &right[k * 400 + l * 20]; - double *EVEV = &extEV[20 * l]; - - for(j = 0; j < 20; j+=2) - { - __m128d lv = _mm_load_pd(&ll[j]); - __m128d rv = _mm_load_pd(&rr[j]); - __m128d vll = _mm_load_pd(&vl[j]); - __m128d vrr = _mm_load_pd(&vr[j]); - - al = _mm_add_pd(al, _mm_mul_pd(vll, lv)); - ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv)); - } - - al = _mm_hadd_pd(al, al); - ar = _mm_hadd_pd(ar, ar); - - al = _mm_mul_pd(al, ar); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d EVV = _mm_load_pd(&EVEV[j]); - - vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); - - _mm_store_pd(&v[j], vv); - } - } - - } - } - - - - { - v = x3_ptr; - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 80); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } - - - if (scale) - { - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 80; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - - addScale += wgt[i]; - } - x3_ptr += 80; - } - } - break; - default: - assert(0); - } - - - *scalerIncrement = addScale; -} - - - -static void newviewGTRGAMMAPROT(int tipCase, - double *x1, double *x2, double *x3, double *extEV, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement) -{ - double *uX1, *uX2, *v; - double x1px2; - int i, j, l, k, scale, addScale = 0; - double *vl, *vr; - - - - switch(tipCase) - { - case TIP_TIP: - { - double umpX1[1840], 
umpX2[1840]; - - for(i = 0; i < 23; i++) - { - v = &(tipVector[20 * i]); - - for(k = 0; k < 80; k++) - { - double *ll = &left[k * 20]; - double *rr = &right[k * 20]; - - __m128d umpX1v = _mm_setzero_pd(); - __m128d umpX2v = _mm_setzero_pd(); - - for(l = 0; l < 20; l+=2) - { - __m128d vv = _mm_load_pd(&v[l]); - umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); - umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l]))); - } - - umpX1v = _mm_hadd_pd(umpX1v, umpX1v); - umpX2v = _mm_hadd_pd(umpX2v, umpX2v); - - _mm_storel_pd(&umpX1[80 * i + k], umpX1v); - _mm_storel_pd(&umpX2[80 * i + k], umpX2v); - - } - } - - for(i = 0; i < n; i++) - { - uX1 = &umpX1[80 * tipX1[i]]; - uX2 = &umpX2[80 * tipX2[i]]; - - for(j = 0; j < 4; j++) - { - v = &x3[i * 80 + j * 20]; - - - __m128d zero = _mm_setzero_pd(); - for(k = 0; k < 20; k+=2) - _mm_store_pd(&v[k], zero); - - for(k = 0; k < 20; k++) - { - double *eev = &extEV[k * 20]; - x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k]; - __m128d x1px2v = _mm_set1_pd(x1px2); - - for(l = 0; l < 20; l+=2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d ee = _mm_load_pd(&eev[l]); - - vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); - - _mm_store_pd(&v[l], vv); - } - } - - - } - } - } - break; - case TIP_INNER: - { - double umpX1[1840], ump_x2[20]; - - - for(i = 0; i < 23; i++) - { - v = &(tipVector[20 * i]); - - for(k = 0; k < 80; k++) - { - double *ll = &left[k * 20]; - - __m128d umpX1v = _mm_setzero_pd(); - - for(l = 0; l < 20; l+=2) - { - __m128d vv = _mm_load_pd(&v[l]); - umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); - } - - umpX1v = _mm_hadd_pd(umpX1v, umpX1v); - _mm_storel_pd(&umpX1[80 * i + k], umpX1v); - - - } - } - - for (i = 0; i < n; i++) - { - uX1 = &umpX1[80 * tipX1[i]]; - - for(k = 0; k < 4; k++) - { - v = &(x2[80 * i + k * 20]); - - for(l = 0; l < 20; l++) - { - double *r = &right[k * 400 + l * 20]; - __m128d ump_x2v = _mm_setzero_pd(); - - for(j = 0; j < 20; j+= 2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d rr = _mm_load_pd(&r[j]); - ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr)); - } - - ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v); - - _mm_storel_pd(&ump_x2[l], ump_x2v); - } - - v = &(x3[80 * i + 20 * k]); - - __m128d zero = _mm_setzero_pd(); - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], zero); - - for(l = 0; l < 20; l++) - { - double *eev = &extEV[l * 20]; - x1px2 = uX1[k * 20 + l] * ump_x2[l]; - __m128d x1px2v = _mm_set1_pd(x1px2); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d ee = _mm_load_pd(&eev[j]); - - vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); - - _mm_store_pd(&v[j], vv); - } - } - - } - - - { - v = &(x3[80 * i]); - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 80); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } - - - if (scale) - { - - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 80; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - - - - addScale += wgt[i]; - - } - } - } - break; - case INNER_INNER: - for (i = 0; i < n; i++) - { - for(k = 0; k < 4; k++) - { - vl = &(x1[80 * i + 20 * k]); - vr = &(x2[80 * i + 20 * k]); - v = &(x3[80 * i + 20 * k]); - - - __m128d zero = _mm_setzero_pd(); - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], zero); - - - for(l = 0; l < 20; l++) - { - - { - __m128d al = 
_mm_setzero_pd(); - __m128d ar = _mm_setzero_pd(); - - double *ll = &left[k * 400 + l * 20]; - double *rr = &right[k * 400 + l * 20]; - double *EVEV = &extEV[20 * l]; - - for(j = 0; j < 20; j+=2) - { - __m128d lv = _mm_load_pd(&ll[j]); - __m128d rv = _mm_load_pd(&rr[j]); - __m128d vll = _mm_load_pd(&vl[j]); - __m128d vrr = _mm_load_pd(&vr[j]); - - al = _mm_add_pd(al, _mm_mul_pd(vll, lv)); - ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv)); - } - - al = _mm_hadd_pd(al, al); - ar = _mm_hadd_pd(ar, ar); - - al = _mm_mul_pd(al, ar); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d EVV = _mm_load_pd(&EVEV[j]); - - vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); - - _mm_store_pd(&v[j], vv); - } - } - - } - } - - - - { - v = &(x3[80 * i]); - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 80); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } - - - if (scale) - { - - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 80; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - - - - addScale += wgt[i]; - - } - } - break; - default: - assert(0); - } - - - *scalerIncrement = addScale; - -} - - - -static void newviewGTRCATPROT(int tipCase, double *extEV, - int *cptr, - double *x1, double *x2, double *x3, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement ) -{ - double - *le, *ri, *v, *vl, *vr; - - int i, l, j, scale, addScale = 0; - - switch(tipCase) - { - case TIP_TIP: - { - for (i = 0; i < n; i++) - { - le = &left[cptr[i] * 400]; - ri = &right[cptr[i] * 400]; - - vl = &(tipVector[20 * tipX1[i]]); - vr = &(tipVector[20 * tipX2[i]]); - v = &x3[20 * i]; - - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], _mm_setzero_pd()); - - - for(l = 0; l < 20; l++) - { - __m128d x1v = _mm_setzero_pd(); - __m128d x2v = _mm_setzero_pd(); - double - *ev = &extEV[l * 20], - *lv = &le[l * 20], - *rv = &ri[l * 20]; - - for(j = 0; j < 20; j+=2) - { - x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); - x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); - } - - x1v = _mm_hadd_pd(x1v, x1v); - x2v = _mm_hadd_pd(x2v, x2v); - - x1v = _mm_mul_pd(x1v, x2v); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); - _mm_store_pd(&v[j], vv); - } - - } - } - } - break; - case TIP_INNER: - { - for (i = 0; i < n; i++) - { - le = &left[cptr[i] * 400]; - ri = &right[cptr[i] * 400]; - - vl = &(tipVector[20 * tipX1[i]]); - vr = &x2[20 * i]; - v = &x3[20 * i]; - - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], _mm_setzero_pd()); - - - - for(l = 0; l < 20; l++) - { - - __m128d x1v = _mm_setzero_pd(); - __m128d x2v = _mm_setzero_pd(); - double - *ev = &extEV[l * 20], - *lv = &le[l * 20], - *rv = &ri[l * 20]; - - for(j = 0; j < 20; j+=2) - { - x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); - x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); - } - - x1v = _mm_hadd_pd(x1v, x1v); - x2v = _mm_hadd_pd(x2v, x2v); - - x1v = _mm_mul_pd(x1v, x2v); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); - _mm_store_pd(&v[j], vv); - } - - } - - { - __m128d 
minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 20); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } - - - if(scale) - { - - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 20; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - - addScale += wgt[i]; - } - } - } - break; - case INNER_INNER: - for(i = 0; i < n; i++) - { - le = &left[cptr[i] * 400]; - ri = &right[cptr[i] * 400]; - - vl = &x1[20 * i]; - vr = &x2[20 * i]; - v = &x3[20 * i]; - - - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], _mm_setzero_pd()); - - - for(l = 0; l < 20; l++) - { - - __m128d x1v = _mm_setzero_pd(); - __m128d x2v = _mm_setzero_pd(); - double - *ev = &extEV[l * 20], - *lv = &le[l * 20], - *rv = &ri[l * 20]; - - - for(j = 0; j < 20; j+=2) - { - x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); - x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); - } - - x1v = _mm_hadd_pd(x1v, x1v); - x2v = _mm_hadd_pd(x2v, x2v); - - x1v = _mm_mul_pd(x1v, x2v); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); - _mm_store_pd(&v[j], vv); - } - - } - - { - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 20); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } - - - if(scale) - { - - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 20; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - - - - addScale += wgt[i]; - } - } - break; - default: - assert(0); - } - - - *scalerIncrement = addScale; - -} - -static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV, - int *cptr, - double *x1, double *x2, double *x3, double *tipVector, - unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, - unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, - double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats) -{ - double - *le, - *ri, - *v, - *vl, - *vr, - *x1_ptr = x1, - *x2_ptr = x2, - *x3_ptr = x3; - - int - i, - l, - j, - scale, - scaleGap = 0, - addScale = 0; - - { - vl = x1_gapColumn; - vr = x2_gapColumn; - v = x3_gapColumn; - - le = &left[maxCats * 400]; - ri = &right[maxCats * 400]; - - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], _mm_setzero_pd()); - - for(l = 0; l < 20; l++) - { - __m128d x1v = _mm_setzero_pd(); - __m128d x2v = _mm_setzero_pd(); - double - *ev = &extEV[l * 20], - *lv = &le[l * 20], - *rv = &ri[l * 20]; - - - for(j = 0; j < 20; j+=2) - { - x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); - x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); - } - - x1v = _mm_hadd_pd(x1v, x1v); - x2v = _mm_hadd_pd(x2v, x2v); - - x1v = _mm_mul_pd(x1v, x2v); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); - _mm_store_pd(&v[j], vv); - } - } - - if(tipCase != TIP_TIP) - { - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 20); l 
+= 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - - if(scale) - { - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 20; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - - scaleGap = TRUE; - } - } - } - - switch(tipCase) - { - case TIP_TIP: - { - for (i = 0; i < n; i++) - { - if(noGap(x3_gap, i)) - { - vl = &(tipVector[20 * tipX1[i]]); - vr = &(tipVector[20 * tipX2[i]]); - v = x3_ptr; - - if(isGap(x1_gap, i)) - le = &left[maxCats * 400]; - else - le = &left[cptr[i] * 400]; - - if(isGap(x2_gap, i)) - ri = &right[maxCats * 400]; - else - ri = &right[cptr[i] * 400]; - - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], _mm_setzero_pd()); - - for(l = 0; l < 20; l++) - { - __m128d x1v = _mm_setzero_pd(); - __m128d x2v = _mm_setzero_pd(); - double - *ev = &extEV[l * 20], - *lv = &le[l * 20], - *rv = &ri[l * 20]; - - for(j = 0; j < 20; j+=2) - { - x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); - x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); - } - - x1v = _mm_hadd_pd(x1v, x1v); - x2v = _mm_hadd_pd(x2v, x2v); - - x1v = _mm_mul_pd(x1v, x2v); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); - _mm_store_pd(&v[j], vv); - } - } - - x3_ptr += 20; - - } - } - } - break; - case TIP_INNER: - { - for (i = 0; i < n; i++) - { - if(isGap(x3_gap, i)) - { - if(scaleGap) - addScale += wgt[i]; - } - else - { - vl = &(tipVector[20 * tipX1[i]]); - - vr = x2_ptr; - v = x3_ptr; - - if(isGap(x1_gap, i)) - le = &left[maxCats * 400]; - else - le = &left[cptr[i] * 400]; - - if(isGap(x2_gap, i)) - { - ri = &right[maxCats * 400]; - vr = x2_gapColumn; - } - else - { - ri = &right[cptr[i] * 400]; - vr = x2_ptr; - x2_ptr += 20; - } - - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], _mm_setzero_pd()); - - for(l = 0; l < 20; l++) - { - __m128d x1v = _mm_setzero_pd(); - __m128d x2v = _mm_setzero_pd(); - double - *ev = &extEV[l * 20], - *lv = &le[l * 20], - *rv = &ri[l * 20]; - - for(j = 0; j < 20; j+=2) - { - x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); - x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); - } - - x1v = _mm_hadd_pd(x1v, x1v); - x2v = _mm_hadd_pd(x2v, x2v); - - x1v = _mm_mul_pd(x1v, x2v); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); - _mm_store_pd(&v[j], vv); - } - } - - { - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 20); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } - - - if(scale) - { - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 20; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - - addScale += wgt[i]; - } - x3_ptr += 20; - } - } - } - break; - case INNER_INNER: - for(i = 0; i < n; i++) - { - if(isGap(x3_gap, i)) - { - if(scaleGap) - addScale += wgt[i]; - } - else - { - v = x3_ptr; - - if(isGap(x1_gap, i)) - { - vl = x1_gapColumn; - le = &left[maxCats * 400]; - } - else - { - le = &left[cptr[i] * 400]; - vl = x1_ptr; - x1_ptr += 20; - } - - if(isGap(x2_gap, i)) 
- { - vr = x2_gapColumn; - ri = &right[maxCats * 400]; - } - else - { - ri = &right[cptr[i] * 400]; - vr = x2_ptr; - x2_ptr += 20; - } - - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], _mm_setzero_pd()); - - for(l = 0; l < 20; l++) - { - __m128d x1v = _mm_setzero_pd(); - __m128d x2v = _mm_setzero_pd(); - double - *ev = &extEV[l * 20], - *lv = &le[l * 20], - *rv = &ri[l * 20]; - - for(j = 0; j < 20; j+=2) - { - x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); - x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); - } - - x1v = _mm_hadd_pd(x1v, x1v); - x2v = _mm_hadd_pd(x2v, x2v); - - x1v = _mm_mul_pd(x1v, x2v); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); - _mm_store_pd(&v[j], vv); - } - - } - - { - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 20); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } - - if(scale) - { - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 20; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } - - addScale += wgt[i]; - } - x3_ptr += 20; - } - } - break; - default: - assert(0); - } - - - *scalerIncrement = addScale; - -} - -static void newviewGTRGAMMAPROT_LG4(int tipCase, - double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4], - int *ex3, unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) -{ - double *uX1, *uX2, *v; - double x1px2; - int i, j, l, k, scale, addScale = 0; - double *vl, *vr; -#ifndef __SIM_SSE3 - double al, ar; -#endif - - - - switch(tipCase) - { - case TIP_TIP: - { - double umpX1[1840], umpX2[1840]; - - for(i = 0; i < 23; i++) - { - - - for(k = 0; k < 80; k++) - { - - v = &(tipVector[k / 20][20 * i]); -#ifdef __SIM_SSE3 - double *ll = &left[k * 20]; - double *rr = &right[k * 20]; - - __m128d umpX1v = _mm_setzero_pd(); - __m128d umpX2v = _mm_setzero_pd(); - - for(l = 0; l < 20; l+=2) - { - __m128d vv = _mm_load_pd(&v[l]); - umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); - umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l]))); - } - - umpX1v = _mm_hadd_pd(umpX1v, umpX1v); - umpX2v = _mm_hadd_pd(umpX2v, umpX2v); - - _mm_storel_pd(&umpX1[80 * i + k], umpX1v); - _mm_storel_pd(&umpX2[80 * i + k], umpX2v); -#else - umpX1[80 * i + k] = 0.0; - umpX2[80 * i + k] = 0.0; - - for(l = 0; l < 20; l++) - { - umpX1[80 * i + k] += v[l] * left[k * 20 + l]; - umpX2[80 * i + k] += v[l] * right[k * 20 + l]; - } -#endif - } - } - - for(i = 0; i < n; i++) - { - uX1 = &umpX1[80 * tipX1[i]]; - uX2 = &umpX2[80 * tipX2[i]]; - - for(j = 0; j < 4; j++) - { - v = &x3[i * 80 + j * 20]; - -#ifdef __SIM_SSE3 - __m128d zero = _mm_setzero_pd(); - for(k = 0; k < 20; k+=2) - _mm_store_pd(&v[k], zero); - - for(k = 0; k < 20; k++) - { - double *eev = &extEV[j][k * 20]; - x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k]; - __m128d x1px2v = _mm_set1_pd(x1px2); - - for(l = 0; l < 20; l+=2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d ee = _mm_load_pd(&eev[l]); - - vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); - - _mm_store_pd(&v[l], vv); - } - } - -#else - - for(k = 0; k < 20; k++) - v[k] = 0.0; - - for(k = 0; k < 20; k++) - { - x1px2 = uX1[j * 20 + k] * uX2[j * 
20 + k]; - - for(l = 0; l < 20; l++) - v[l] += x1px2 * extEV[j][20 * k + l]; - } -#endif - } - } - } - break; - case TIP_INNER: - { - double umpX1[1840], ump_x2[20]; - - - for(i = 0; i < 23; i++) - { - - - for(k = 0; k < 80; k++) - { - v = &(tipVector[k / 20][20 * i]); -#ifdef __SIM_SSE3 - double *ll = &left[k * 20]; - - __m128d umpX1v = _mm_setzero_pd(); - - for(l = 0; l < 20; l+=2) - { - __m128d vv = _mm_load_pd(&v[l]); - umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); - } - - umpX1v = _mm_hadd_pd(umpX1v, umpX1v); - _mm_storel_pd(&umpX1[80 * i + k], umpX1v); -#else - umpX1[80 * i + k] = 0.0; - - for(l = 0; l < 20; l++) - umpX1[80 * i + k] += v[l] * left[k * 20 + l]; -#endif - - } - } - - for (i = 0; i < n; i++) - { - uX1 = &umpX1[80 * tipX1[i]]; - - for(k = 0; k < 4; k++) - { - v = &(x2[80 * i + k * 20]); -#ifdef __SIM_SSE3 - for(l = 0; l < 20; l++) - { - double *r = &right[k * 400 + l * 20]; - __m128d ump_x2v = _mm_setzero_pd(); - - for(j = 0; j < 20; j+= 2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d rr = _mm_load_pd(&r[j]); - ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr)); - } - - ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v); - - _mm_storel_pd(&ump_x2[l], ump_x2v); - } - - v = &(x3[80 * i + 20 * k]); - - __m128d zero = _mm_setzero_pd(); - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], zero); - - for(l = 0; l < 20; l++) - { - double *eev = &extEV[k][l * 20]; - x1px2 = uX1[k * 20 + l] * ump_x2[l]; - __m128d x1px2v = _mm_set1_pd(x1px2); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d ee = _mm_load_pd(&eev[j]); - - vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); - - _mm_store_pd(&v[j], vv); - } - } -#else - for(l = 0; l < 20; l++) - { - ump_x2[l] = 0.0; - - for(j = 0; j < 20; j++) - ump_x2[l] += v[j] * right[k * 400 + l * 20 + j]; - } - - v = &(x3[80 * i + 20 * k]); - - for(l = 0; l < 20; l++) - v[l] = 0; - - for(l = 0; l < 20; l++) - { - x1px2 = uX1[k * 20 + l] * ump_x2[l]; - for(j = 0; j < 20; j++) - v[j] += x1px2 * extEV[k][l * 20 + j]; - } -#endif - } - -#ifdef __SIM_SSE3 - { - v = &(x3[80 * i]); - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 80); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } -#else - v = &x3[80 * i]; - scale = 1; - for(l = 0; scale && (l < 80); l++) - scale = (ABS(v[l]) < minlikelihood); -#endif - - if (scale) - { -#ifdef __SIM_SSE3 - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 80; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } -#else - for(l = 0; l < 80; l++) - v[l] *= twotothe256; -#endif - - if(useFastScaling) - addScale += wgt[i]; - else - ex3[i] += 1; - } - } - } - break; - case INNER_INNER: - for (i = 0; i < n; i++) - { - for(k = 0; k < 4; k++) - { - vl = &(x1[80 * i + 20 * k]); - vr = &(x2[80 * i + 20 * k]); - v = &(x3[80 * i + 20 * k]); - -#ifdef __SIM_SSE3 - __m128d zero = _mm_setzero_pd(); - for(l = 0; l < 20; l+=2) - _mm_store_pd(&v[l], zero); -#else - for(l = 0; l < 20; l++) - v[l] = 0; -#endif - - for(l = 0; l < 20; l++) - { -#ifdef __SIM_SSE3 - { - __m128d al = _mm_setzero_pd(); - __m128d ar = _mm_setzero_pd(); - - double *ll = &left[k * 400 + l * 20]; - double *rr = &right[k * 400 + l * 20]; - double *EVEV = &extEV[k][20 * l]; - - for(j = 0; j < 20; j+=2) - { - __m128d lv = _mm_load_pd(&ll[j]); - __m128d rv = _mm_load_pd(&rr[j]); - 
__m128d vll = _mm_load_pd(&vl[j]); - __m128d vrr = _mm_load_pd(&vr[j]); - - al = _mm_add_pd(al, _mm_mul_pd(vll, lv)); - ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv)); - } - - al = _mm_hadd_pd(al, al); - ar = _mm_hadd_pd(ar, ar); - - al = _mm_mul_pd(al, ar); - - for(j = 0; j < 20; j+=2) - { - __m128d vv = _mm_load_pd(&v[j]); - __m128d EVV = _mm_load_pd(&EVEV[j]); - - vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); - - _mm_store_pd(&v[j], vv); - } - } -#else - al = 0.0; - ar = 0.0; - - for(j = 0; j < 20; j++) - { - al += vl[j] * left[k * 400 + l * 20 + j]; - ar += vr[j] * right[k * 400 + l * 20 + j]; - } - - x1px2 = al * ar; - - for(j = 0; j < 20; j++) - v[j] += x1px2 * extEV[k][20 * l + j]; -#endif - } - } - - -#ifdef __SIM_SSE3 - { - v = &(x3[80 * i]); - __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); - - scale = 1; - for(l = 0; scale && (l < 80); l += 2) - { - __m128d vv = _mm_load_pd(&v[l]); - __m128d v1 = _mm_and_pd(vv, absMask.m); - v1 = _mm_cmplt_pd(v1, minlikelihood_sse); - if(_mm_movemask_pd( v1 ) != 3) - scale = 0; - } - } -#else - v = &(x3[80 * i]); - scale = 1; - for(l = 0; scale && (l < 80); l++) - scale = ((ABS(v[l]) < minlikelihood)); -#endif - - if (scale) - { -#ifdef __SIM_SSE3 - __m128d twoto = _mm_set_pd(twotothe256, twotothe256); - - for(l = 0; l < 80; l+=2) - { - __m128d ex3v = _mm_load_pd(&v[l]); - _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); - } -#else - for(l = 0; l < 80; l++) - v[l] *= twotothe256; -#endif - - if(useFastScaling) - addScale += wgt[i]; - else - ex3[i] += 1; - } - } - break; - default: - assert(0); - } - - if(useFastScaling) - *scalerIncrement = addScale; - -} - -#endif - -#ifdef _OPTIMIZED_FUNCTIONS - -/*** BINARY DATA functions *****/ - -static void newviewGTRCAT_BINARY( int tipCase, double *EV, int *cptr, - double *x1_start, double *x2_start, double *x3_start, double *tipVector, - int *ex3, unsigned char *tipX1, unsigned char *tipX2, - int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) -{ - double - *le, - *ri, - *x1, *x2, *x3; - int i, l, scale, addScale = 0; - - switch(tipCase) - { - case TIP_TIP: - { - for(i = 0; i < n; i++) - { - x1 = &(tipVector[2 * tipX1[i]]); - x2 = &(tipVector[2 * tipX2[i]]); - x3 = &x3_start[2 * i]; - - le = &left[cptr[i] * 4]; - ri = &right[cptr[i] * 4]; + le = &left[cptr[i] * 4]; + ri = &right[cptr[i] * 4]; _mm_store_pd(x3, _mm_setzero_pd()); @@ -6210,9 +1326,3 @@ static void newviewGTRGAMMA_BINARY(int tipCase, /**** BINARY DATA functions end ****/ - - - -#endif - - diff --git a/parser/Makefile.SSE3.gcc b/parser/Makefile.SSE3.gcc index a58368d..9ca8464 100644 --- a/parser/Makefile.SSE3.gcc +++ b/parser/Makefile.SSE3.gcc @@ -2,7 +2,7 @@ # Makefile cleanup October 2006, Courtesy of Peter Cordes CC = gcc -CFLAGS = -fomit-frame-pointer -O2 -D_GNU_SOURCE -msse -funroll-loops #-Wall -Wunused-parameter -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast +CFLAGS = -I.. 
-fomit-frame-pointer -O2 -D_GNU_SOURCE -msse -funroll-loops #-Wall -Wunused-parameter -Wredundant-decls -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit -Wimplicit-function-declaration -Wimplicit-int -Wimport -Wunused -Wunused-function -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value -Wunused-variable -Wformat -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast
 
 LIBRARIES = -lm
diff --git a/parser/axml.c b/parser/axml.c
index 2ff2c49..9d2eb5b 100644
--- a/parser/axml.c
+++ b/parser/axml.c
@@ -60,7 +60,10 @@
 #endif
 
-#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse.h"
+
+#if defined(SIMDE_SSE_NATIVE)
 #include <xmmintrin.h>
 /*
   special bug fix, enforces denormalized numbers to be flushed to zero,
@@ -116,9 +119,7 @@ void *malloc_aligned(size_t size)
   if(ptr == (void*)NULL)
    assert(0);
 
-#ifdef __AVX
   assert(0);
-#endif
 
 #else
diff --git a/parser/axml.h b/parser/axml.h
index 7a97136..40d1622 100644
--- a/parser/axml.h
+++ b/parser/axml.h
@@ -34,11 +34,7 @@
 
 #include "../versionHeader/version.h"
 
-#ifdef __AVX
 #define BYTE_ALIGNMENT 32
-#else
-#define BYTE_ALIGNMENT 16
-#endif
diff --git a/simde b/simde
new file mode 160000
index 0000000..294f44c
--- /dev/null
+++ b/simde
@@ -0,0 +1 @@
+Subproject commit 294f44c9e948640f9e7d8a2ef7d6d196e14f26eb
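
Note on how the port works: with SIMDE_ENABLE_NATIVE_ALIASES defined before the SIMDe headers are included, the unchanged _mm_* calls and __m128d types that remain in the sources resolve either to the real x86 intrinsics (when the compiler targets SSE) or to SIMDe's portable emulation on other architectures. The sketch below is illustrative only and is not part of the patch; it assumes the simde checkout is on the include path (as the -I.. flag added to the Makefiles arranges) and uses simde/x86/sse3.h because _mm_hadd_pd is an SSE3 intrinsic.

    /* Minimal, hypothetical demo of the SIMDe native-alias pattern (not ExaML code). */
    #define SIMDE_ENABLE_NATIVE_ALIASES
    #include "simde/x86/sse3.h"      /* pulls in the SSE/SSE2 layers as well */

    #include <stdio.h>

    int main(void)
    {
      double a[2] __attribute__ ((aligned (16))) = { 1.0, 2.0 };
      double b[2] __attribute__ ((aligned (16))) = { 3.0, 4.0 };
      double r[2] __attribute__ ((aligned (16)));

      __m128d va = _mm_load_pd(a);          /* aligned load, same pattern as the likelihood kernels */
      __m128d vb = _mm_load_pd(b);

      _mm_store_pd(r, _mm_hadd_pd(va, vb)); /* horizontal add: r = { a0+a1, b0+b1 } */

      printf("%f %f\n", r[0], r[1]);        /* prints 3.000000 7.000000 */
      return 0;
    }

On x86 with -msse3 this compiles down to the native instructions; on other targets SIMDe supplies a scalar or NEON/AltiVec implementation, which is what lets the remaining _mm_* code build on non-x86 MPI clusters. Because simde is pulled in as a submodule (the gitlink at the end of the patch), a fresh checkout needs `git clone --recursive` or `git submodule update --init` before building.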