From 4affd0a16d9330b328f2723b3ed5fe0e1be0da15 Mon Sep 17 00:00:00 2001 From: Daniel Rapp Date: Thu, 30 Mar 2017 10:41:46 -0600 Subject: [PATCH] Parallelization via OpenMP see: https://github.com/lvdmaaten/bhtsne/issues/18. These changes are inspired by https://github.com/maximsch2/bhtsne. This approach was selected as it requires minimal changes to parallelize the algorithm. In particular, these changes correspond roughly to the changes in https://github.com/maximsch2/bhtsne/commit/08d8a2aa1dfe41796ce2cbea11ea2e83232ce469 --- README.md | 8 ++++++ docker/dev/test.py | 0 setup.py | 51 +++++++++++++++++----------------- tsne/bh_sne_src/CMakeLists.txt | 10 ++----- tsne/bh_sne_src/Makefile | 6 ++-- tsne/bh_sne_src/sptree.cpp | 33 ++++++++++++---------- tsne/bh_sne_src/sptree.h | 13 ++++----- tsne/bh_sne_src/tsne.cpp | 11 ++++++-- 8 files changed, 73 insertions(+), 59 deletions(-) mode change 100644 => 100755 docker/dev/test.py diff --git a/README.md b/README.md index b09e1e6..a2e2b7a 100644 --- a/README.md +++ b/README.md @@ -72,3 +72,11 @@ More Information ---------------- See *Barnes-Hut-SNE* (2013), L.J.P. van der Maaten. It is available on [arxiv](http://arxiv.org/abs/1301.3342). + +On OSX +------ +Run `brew install gcc --without-multilib` (to get OpenMP version of compiler) + +You'll need to set environment variables for CC and CXX to get setuptools to build things properly. + +`export CXX=/usr/local/bin/g++-6; export CC=/usr/local/bin/gcc-6; pip install -e .` \ No newline at end of file diff --git a/docker/dev/test.py b/docker/dev/test.py old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py index 54b2016..b91f5e4 100644 --- a/setup.py +++ b/setup.py @@ -26,37 +26,38 @@ v2 = int(parts[1]) v3 = int(parts[2]) if len(parts) == 3 else None - if v2 >= 10: - # More than 10.10 - extra_compile_args=['-I/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers'] - else: - extra_compile_args=['-I/System/Library/Frameworks/vecLib.framework/Headers'] - ext_modules = [Extension(name='tsne.bh_sne', - sources=['tsne/bh_sne_src/sptree.cpp', 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne.pyx'], - include_dirs=[numpy.get_include(), 'tsne/bh_sne_src/'], - extra_compile_args=extra_compile_args + ['-ffast-math', '-O3'], - extra_link_args=['-Wl,-framework', '-Wl,Accelerate', '-lcblas'], - language='c++')] + sources=['tsne/bh_sne_src/sptree.cpp', 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne.pyx'], + include_dirs=[numpy.get_include(), 'tsne/bh_sne_src/'], + extra_compile_args=['-ffast-math', '-fopenmp', '-flto'], + extra_link_args=['-Wl,-framework', '-Wl,Accelerate', '-lgomp'], + language='c++'), + + Extension(name='tsne.bh_sne_3d', + sources=['tsne/bh_sne_src/sptree.cpp', 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne_3d.pyx'], + include_dirs=[numpy.get_include(), 'tsne/bh_sne_src/'], + extra_compile_args=['-ffast-math', '-DTSNE3D', '-fopenmp', '-flto'], + extra_link_args=['-Wl,-framework', '-Wl,Accelerate', '-lgomp'], + language='c++')] else: # LINUX - ext_modules = [Extension(name='tsne.bh_sne', - sources=['tsne/bh_sne_src/sptree.cpp', 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne.pyx'], - include_dirs=[numpy.get_include(), '/usr/local/include', 'tsne/bh_sne_src/'], - library_dirs=['/usr/local/lib', '/usr/lib64/atlas'], - extra_compile_args=['-msse2', '-O3', '-fPIC', '-w', '-ffast-math'], - extra_link_args=['-Wl,-Bstatic', '-lcblas', '-Wl,-Bdynamic'], - language='c++'), + sources=['tsne/bh_sne_src/sptree.cpp', 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne.pyx'], + include_dirs=[numpy.get_include(), '/usr/local/include', 'tsne/bh_sne_src/'], + library_dirs=['/usr/local/lib', '/usr/lib64/atlas'], + extra_compile_args=['-msse2', '-fPIC', '-w', '-ffast-math', '-O2', '-fopenmp', '-flto'], + extra_link_args=['-lgomp'], + language='c++'), Extension(name='tsne.bh_sne_3d', - sources=['tsne/bh_sne_src/sptree.cpp', 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne_3d.pyx'], - include_dirs=[numpy.get_include(), '/usr/local/include', 'tsne/bh_sne_src/'], - library_dirs=['/usr/local/lib', '/usr/lib64/atlas'], - extra_compile_args=['-msse2', '-O3', '-fPIC', '-w', '-ffast-math', '-DTSNE3D'], - extra_link_args=['-Wl,-Bstatic', '-lcblas', '-Wl,-Bdynamic'], - language='c++')] + sources=['tsne/bh_sne_src/sptree.cpp', 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne_3d.pyx'], + include_dirs=[numpy.get_include(), '/usr/local/include', 'tsne/bh_sne_src/'], + library_dirs=['/usr/local/lib', '/usr/lib64/atlas'], + extra_compile_args=['-msse2', '-fPIC', '-w', '-ffast-math', '-DTSNE3D', '-O2', '-fopenmp', + '-flto'], + extra_link_args=['-lgomp'], + language='c++')] ext_modules = cythonize(ext_modules) @@ -77,4 +78,4 @@ packages=find_packages(), ext_modules=ext_modules, install_requires=required -) + ) diff --git a/tsne/bh_sne_src/CMakeLists.txt b/tsne/bh_sne_src/CMakeLists.txt index d70d11b..41059f8 100644 --- a/tsne/bh_sne_src/CMakeLists.txt +++ b/tsne/bh_sne_src/CMakeLists.txt @@ -1,15 +1,11 @@ CMAKE_MINIMUM_REQUIRED(VERSION 3.0 FATAL_ERROR) -PROJECT(bh_tsne CXX) -ENABLE_LANGUAGE(CXX) -set_target_properties(${TARGET} PROPERTIES LINKER_LANGUAGE CXX) SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}") -SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -ffast-math -funroll-loops -msse2") -SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-framework -Wl,Accelerate -lcblas") +FIND_PACKAGE(OpenMP REQUIRED) -SET(EXECUTABLE_OUTPUT_PATH ${CMAKE_CURRENT_SOURCE_DIR}) +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -flto -ffast-math ${OpenMP_CXX_FLAGS}") +SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") file(GLOB SOURCES *.cpp) add_executable(bh_tsne ${SOURCES}) diff --git a/tsne/bh_sne_src/Makefile b/tsne/bh_sne_src/Makefile index 8c8a1c8..16102ba 100644 --- a/tsne/bh_sne_src/Makefile +++ b/tsne/bh_sne_src/Makefile @@ -2,8 +2,8 @@ #CFLAGS = -march=haswell -ffast-math -O3 -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize #CFLAGS = -march=haswell -ffast-math -O3 -CXX = g++ -CFLAGS = -ffast-math -O3 +CXX = /usr/local/bin/g++-6 +CFLAGS = -O2 -flto -ffast-math -fopenmp all: bh_tsne bh_tsne_3d @@ -20,7 +20,7 @@ tsne.o: tsne.cpp tsne.h sptree.h vptree.h $(CXX) $(CFLAGS) -c tsne.cpp tsne_3d.o: tsne.cpp tsne.h sptree.h vptree.h - $(CXX) $(CFLAGS) -DTSNE3D -c tsne.cpp + $(CXX) $(CFLAGS) -DTSNE3D -c tsne.cpp -o tsne_3d.o clean: rm -Rf *.o bh_tsne bh_tsne_3d diff --git a/tsne/bh_sne_src/sptree.cpp b/tsne/bh_sne_src/sptree.cpp index 2d187c6..ec6378b 100644 --- a/tsne/bh_sne_src/sptree.cpp +++ b/tsne/bh_sne_src/sptree.cpp @@ -56,12 +56,12 @@ Cell::~Cell() { } template -double Cell::getCorner(unsigned int d) { +double Cell::getCorner(unsigned int d) const { return corner[d]; } template -double Cell::getWidth(unsigned int d) { +double Cell::getWidth(unsigned int d) const { return width[d]; } @@ -77,7 +77,7 @@ void Cell::setWidth(unsigned int d, double val) { // Checks whether a point lies in a cell template -bool Cell::containsPoint(double point[]) +bool Cell::containsPoint(double point[]) const { for(int d = 0; d < NDims; d++) { if(corner[d] - width[d] > point[d]) return false; @@ -348,11 +348,13 @@ unsigned int SPTree::getDepth() { // Compute non-edge forces using Barnes-Hut algorithm template -void SPTree::computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[], double* sum_Q) +double SPTree::computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[]) const { + double resultSum = 0; + double buff[NDims]; // make buff local for parallelization // Make sure that we spend no time on empty nodes or self-interactions - if(cum_size == 0 || (is_leaf && size == 1 && index[0] == point_index)) return; + if(cum_size == 0 || (is_leaf && size == 1 && index[0] == point_index)) return resultSum; // Compute distance between point and center-of-mass double sqdist = .0; @@ -375,33 +377,37 @@ void SPTree::computeNonEdgeForces(unsigned int point_index, double theta, // Compute and add t-SNE force between point and current node sqdist = 1.0 / (1.0 + sqdist); double mult = cum_size * sqdist; - *sum_Q += mult; + resultSum += mult; mult *= sqdist; for(unsigned int d = 0; d < NDims; d++) neg_f[d] += mult * buff[d]; } else { // Recursively apply Barnes-Hut to children - for(unsigned int i = 0; i < no_children; i++) children[i]->computeNonEdgeForces(point_index, theta, neg_f, sum_Q); + for(unsigned int i = 0; i < no_children; i++){ + resultSum += children[i]->computeNonEdgeForces(point_index, theta, neg_f); + } } + return resultSum; } // Computes edge forces template -void SPTree::computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, int N, double* pos_f) +void SPTree::computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, int N, double* pos_f) const { // Loop over all edges in the graph - unsigned int ind1 = 0; - unsigned int ind2 = 0; - double sqdist; + #pragma omp parallel for schedule(static) for(unsigned int n = 0; n < N; n++) { + unsigned int ind1 = n * NDims; for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { + double buff[NDims]; // make buff local for parallelization + // Compute pairwise distance and Q-value - sqdist = 1.0; - ind2 = col_P[i] * NDims; + double sqdist = 1.0; + unsigned int ind2 = col_P[i] * NDims; for(unsigned int d = 0; d < NDims; d++) { buff[d] = data[ind1 + d] - data[ind2 + d]; @@ -413,7 +419,6 @@ void SPTree::computeEdgeForces(unsigned int* row_P, unsigned int* col_P, // Sum positive force for(unsigned int d = 0; d < NDims; d++) pos_f[ind1 + d] += sqdist * buff[d]; } - ind1 += NDims; } } diff --git a/tsne/bh_sne_src/sptree.h b/tsne/bh_sne_src/sptree.h index be42686..a783c68 100644 --- a/tsne/bh_sne_src/sptree.h +++ b/tsne/bh_sne_src/sptree.h @@ -48,11 +48,11 @@ class Cell { Cell(double* inp_corner, double* inp_width); ~Cell(); - double getCorner(unsigned int d); - double getWidth(unsigned int d); + double getCorner(unsigned int d) const; + double getWidth(unsigned int d) const; void setCorner(unsigned int d, double val); void setWidth(unsigned int d, double val); - bool containsPoint(double point[]); + bool containsPoint(double point[]) const; }; template @@ -65,9 +65,6 @@ class SPTree // Fixed constants static const unsigned int QT_NODE_CAPACITY = 1; - // A buffer we use when doing force computations - double buff[NDims]; - // Properties of this node in the tree SPTree* parent; unsigned int dimension; @@ -102,8 +99,8 @@ class SPTree void rebuildTree(); void getAllIndices(unsigned int* indices); unsigned int getDepth(); - void computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[], double* sum_Q); - void computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, int N, double* pos_f); + double computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[]) const; + void computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, int N, double* pos_f) const; void print(); private: diff --git a/tsne/bh_sne_src/tsne.cpp b/tsne/bh_sne_src/tsne.cpp index 5fac463..b6ca910 100644 --- a/tsne/bh_sne_src/tsne.cpp +++ b/tsne/bh_sne_src/tsne.cpp @@ -228,8 +228,13 @@ void TSNE::computeGradient(double* P, unsigned int* inp_row_P, unsigned int* inp double* pos_f = (double*) calloc(N * D, sizeof(double)); double* neg_f = (double*) calloc(N * D, sizeof(double)); if(pos_f == NULL || neg_f == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } + tree->computeEdgeForces(inp_row_P, inp_col_P, inp_val_P, N, pos_f); - for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, neg_f + n * D, &sum_Q); + + #pragma omp parallel for schedule(guided) reduction(+:sum_Q) + for(int n = 0; n < N; n++){ + sum_Q += tree->computeNonEdgeForces(n, theta, neg_f + n * D); + } // Compute final t-SNE gradient for(int i = 0; i < N * D; i++) { @@ -334,7 +339,9 @@ double TSNE::evaluateError(unsigned int* row_P, unsigned int* col_P, double* val SPTree* tree = new SPTree(Y, N); double* buff = (double*) calloc(D, sizeof(double)); double sum_Q = .0; - for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, buff, &sum_Q); + for(int n = 0; n < N; n++){ + sum_Q += tree->computeNonEdgeForces(n, theta, buff); + } // Loop over all edges to compute t-SNE error int ind1, ind2;