diff --git a/examples/exec_example_cpu.sh b/examples/exec_example_cpu.sh deleted file mode 100755 index 6b0066b4..00000000 --- a/examples/exec_example_cpu.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -python3 make_points.py sources sources_1E4.bin 10000 -python3 make_points.py targets targets_1E4.bin 10000 -direct-cpu sources_1E4.bin targets_1E4.bin direct_benchmark.bin direct_summary.csv 10000 10000 0.0 0 1 -tree-cpu sources_1E4.bin targets_1E4.bin direct_benchmark.bin tree_summary.csv 10000 10000 0.5 5 500 5 0 0.0 1 diff --git a/examples/exec_example_gpu.sh b/examples/exec_example_gpu.sh deleted file mode 100755 index 427a0c24..00000000 --- a/examples/exec_example_gpu.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -python3 make_points.py sources sources_1E4.bin 10000 -python3 make_points.py targets targets_1E4.bin 10000 -direct-gpu sources_1E4.bin targets_1E4.bin direct_benchmark.bin direct_summary.csv 10000 10000 0.0 0 1 -tree-gpu sources_1E4.bin targets_1E4.bin direct_benchmark.bin tree_summary.csv 10000 10000 0.5 5 500 5 0 0.0 1 diff --git a/examples/library_example.c b/examples/library_example.c deleted file mode 100644 index 2d3a589f..00000000 --- a/examples/library_example.c +++ /dev/null @@ -1,69 +0,0 @@ -#include -#include -#include - -#include "../src/treedriverWrapper.h" - -int main(int argc, char **argv) -{ - int N = 100000; - int M = 100000; - - int pot_type = 0; //Coulomb, Lagrange interpolation - double kappa = 0.; - - int order = 5; - double theta = 0.5; - - int max_per_leaf = 500; - int max_per_batch = 5; - - int number_of_threads = 1; - - double *xS = malloc(N*sizeof(double)); - double *yS = malloc(N*sizeof(double)); - double *zS = malloc(N*sizeof(double)); - double *qS = malloc(N*sizeof(double)); - double *wS = malloc(N*sizeof(double)); - - double *xT = malloc(M*sizeof(double)); - double *yT = malloc(M*sizeof(double)); - double *zT = malloc(M*sizeof(double)); - double *qT = malloc(M*sizeof(double)); - - double *potential = malloc(M*sizeof(double)); - - for (int i = 0; i < N; ++i) { - xS[i] = ((double)rand()/(double)(RAND_MAX)) * 2. - 1.; - yS[i] = ((double)rand()/(double)(RAND_MAX)) * 2. - 1.; - zS[i] = ((double)rand()/(double)(RAND_MAX)) * 2. - 1.; - qS[i] = ((double)rand()/(double)(RAND_MAX)) * 2. - 1.; - wS[i] = 1.; - } - - for (int i = 0; i < M; ++i) { - xT[i] = ((double)rand()/(double)(RAND_MAX)) * 2. - 1.; - yT[i] = ((double)rand()/(double)(RAND_MAX)) * 2. - 1.; - zT[i] = ((double)rand()/(double)(RAND_MAX)) * 2. - 1.; - qT[i] = 1.; - } - - treedriverWrapper(M, N, xT, yT, zT, qT, xS, yS, zS, qS, wS, potential, - pot_type, kappa, order, theta, max_per_leaf, max_per_batch, - number_of_threads); - - printf("Treedriver has finished.\n"); - - free(xS); - free(yS); - free(zS); - free(qS); - free(wS); - free(xT); - free(yT); - free(zT); - free(qT); - free(potential); - - return 0; -} diff --git a/examples/make_points.py b/examples/make_points.py deleted file mode 100644 index efaecdd1..00000000 --- a/examples/make_points.py +++ /dev/null @@ -1,22 +0,0 @@ -import numpy as np -import sys - -if (sys.argv[1]).lower() == 'sources': - outfile = sys.argv[2] - N = int(sys.argv[3]) - - sources = 2*np.random.rand(N,5)-1.0 - sources[:,-1] = np.ones(N) # set quadrature weights to 1 - - sources.tofile(outfile) - -elif (sys.argv[1]).lower() == 'targets': - outfile = sys.argv[2] - N = int(sys.argv[3]) - - targets = 2*np.random.rand(N,4)-1.0 - targets[:,-1] = np.ones(N) # set target weights to 1 - - targets.tofile(outfile) -else: - print("Error! Neither sources nor target output specified. Exiting.") diff --git a/interfaces/python/BaryTreeInterface.py b/interfaces/python/BaryTreeInterface.py index ea61cadf..5279c9b2 100644 --- a/interfaces/python/BaryTreeInterface.py +++ b/interfaces/python/BaryTreeInterface.py @@ -14,7 +14,7 @@ _cpu_treecodeRoutines = ctypes.CDLL('libBaryTree_cpu.dylib') try: - _gpu_treecodeRoutines = ctypes.CDLL('libTaryTree_gpu.so') + _gpu_treecodeRoutines = ctypes.CDLL('libBaryTree_gpu.so') except OSError: try: _gpu_treecodeRoutines = ctypes.CDLL('libBaryTree_gpu.dylib') diff --git a/src/tree-pc-coulomb-SS.c b/src/tree-pc-coulomb-SS.c deleted file mode 100644 index 7a85ab46..00000000 --- a/src/tree-pc-coulomb-SS.c +++ /dev/null @@ -1,669 +0,0 @@ -/* - *Procedures for Particle-Cluster Treecode - */ -#include -#include -#include -#include -#include - -#include "array.h" -#include "globvars.h" -#include "tnode.h" -#include "tools.h" - -#include "partition.h" -#include "tree.h" - -void fill_in_cluster_data_SS(struct particles *clusters, struct particles *sources, struct tnode *troot, int order){ - - int pointsPerCluster = (order+1)*(order+1)*(order+1); - int numInterpPoints = numnodes * pointsPerCluster; - make_vector(clusters->x, numInterpPoints); - make_vector(clusters->y, numInterpPoints); - make_vector(clusters->z, numInterpPoints); - make_vector(clusters->q, numInterpPoints); - make_vector(clusters->w, numInterpPoints); // will be used in singularity subtraction - clusters->num=numInterpPoints; - - for (int i=0; i< numInterpPoints; i++){ - clusters->q[i]=0.0; - clusters->w[i]=0.0; - } - -#pragma acc data copyin(tt[0:torderlim], \ - sources->x[0:sources->num], sources->y[0:sources->num], sources->z[0:sources->num], sources->q[0:sources->num], sources->w[0:sources->num]) \ - copy(clusters->x[0:clusters->num], clusters->y[0:clusters->num], clusters->z[0:clusters->num], clusters->q[0:clusters->num], clusters->w[0:clusters->num]) - - - - addNodeToArray_SS(troot, sources, clusters, order, numInterpPoints, pointsPerCluster); - - return; -} - -void addNodeToArray_SS(struct tnode *p, struct particles *sources, struct particles *clusters, int order, int numInterpPoints, int pointsPerCluster) -{ - int torderlim = order+1; - int startingIndex = p->node_index * pointsPerCluster; - int i; - - - pc_comp_ms_modifiedF_SS(p, sources->x, sources->y, sources->z, sources->q, sources->w, \ - clusters->x,clusters->y,clusters->z,clusters->q,clusters->w); - - - for (i = 0; i < p->num_children; i++) { - addNodeToArray_SS(p->child[i],sources,clusters,order,numInterpPoints,pointsPerCluster); - } - - return; -} - - - - -void pc_comp_ms_modifiedF_SS(struct tnode *p, double *xS, double *yS, double *zS, double *qS, double *wS, - double *clusterX, double *clusterY, double *clusterZ, double *clusterQ, double *clusterW){ - - int i,j,k; - int pointsPerCluster = torderlim*torderlim*torderlim; - int pointsInNode = p->numpar; - int startingIndexInClusters = p->node_index * pointsPerCluster; - int startingIndexInSources = p->ibeg-1; - - double x0, x1, y0, y1, z0, z1; // bounding box - - double weights[torderlim]; - double dj[torderlim]; - double *modifiedF, *modifiedF2; - make_vector(modifiedF,pointsInNode); - make_vector(modifiedF2,pointsInNode); - - double nodeX[torderlim], nodeY[torderlim], nodeZ[torderlim]; - - int *exactIndX, *exactIndY, *exactIndZ; - make_vector(exactIndX, pointsInNode); - make_vector(exactIndY, pointsInNode); - make_vector(exactIndZ, pointsInNode); - - - // Set the bounding box. -// x0 = p->x_min-1e-13*(p->x_max-p->x_min); -// x1 = p->x_max+2e-13*(p->x_max-p->x_min); -// y0 = p->y_min-1e-13*(p->y_max-p->y_min); -// y1 = p->y_max+2e-13*(p->y_max-p->y_min); -// z0 = p->z_min-1e-13*(p->z_max-p->z_min); -// z1 = p->z_max+2e-13*(p->z_max-p->z_min); - - x0 = p->x_min; - x1 = p->x_max; - y0 = p->y_min; - y1 = p->y_max; - z0 = p->z_min; - z1 = p->z_max; - - // Make and zero-out arrays to store denominator sums - double sumX, sumY, sumZ; - - -#pragma acc kernels present(xS, yS, zS, qS, wS, clusterX, clusterY, clusterZ, clusterQ, clusterW, tt) \ - create(modifiedF[0:pointsInNode],exactIndX[0:pointsInNode],exactIndY[0:pointsInNode],exactIndZ[0:pointsInNode], \ - modifiedF2[0:pointsInNode],nodeX[0:torderlim],nodeY[0:torderlim],nodeZ[0:torderlim],weights[0:torderlim],dj[0:torderlim]) - { - - #pragma acc loop independent - for (j=0;jnum; i++) - EnP[i] = 2.0 * M_PI * kappaSq * targets->q[i]; // change this to whatever it should be - - #pragma omp parallel num_threads(numThreads) - { - -#ifdef OPENACC_ENABLED - if (omp_get_thread_num() < numDevices) - acc_set_device_num(omp_get_thread_num(), acc_get_device_type()); -#endif - - int this_thread = omp_get_thread_num(), num_threads = omp_get_num_threads(); - if (this_thread==0){printf("numDevices: %i\n", numDevices);} - if (this_thread==0){printf("num_threads: %i\n", num_threads);} - - double *EnP2; - make_vector(EnP2,targets->num); - for (i = 0; i < targets->num; i++) - EnP2[i] = 0.0; - -#pragma acc data copyin(sources->x[0:sources->num], sources->y[0:sources->num], sources->z[0:sources->num], sources->q[0:sources->num], sources->w[0:sources->num], \ - targets->x[0:targets->num], targets->y[0:targets->num], targets->z[0:targets->num], targets->q[0:targets->num], \ - clusters->x[0:clusters->num], clusters->y[0:clusters->num], clusters->z[0:clusters->num], clusters->q[0:clusters->num], clusters->w[0:clusters->num]) \ - copy(EnP2[0:targets->num]) - { - - #pragma omp for private(j) schedule(guided) - for (i = 0; i < batches->num; i++) { - for (j = 0; j < p->num_children; j++) { - compute_pc_coulomb_SS(p->child[j], - batches->index[i], batches->center[i], batches->radius[i], - sources->x, sources->y, sources->z, sources->q, sources->w, - targets->x, targets->y, targets->z, targets->q, kappaSq, EnP2, - clusters->x, clusters->y, clusters->z, clusters->q, clusters->w); - } - } - - #pragma acc wait - } // end acc data region - - for (int k = 0; k < targets->num; k++){ - if (EnP2[k] != 0.0) - EnP[k] += EnP2[k]; - } - free_vector(EnP2); - } // end omp parallel region - - *tpeng = sum(EnP, targets->num); - - return; - -} /* END of function pc_treecode */ - - - - -void compute_pc_coulomb_SS(struct tnode *p, - int *batch_ind, double *batch_mid, double batch_rad, - double *xS, double *yS, double *zS, double *qS, double *wS, - double *xT, double *yT, double *zT, double *qT, double kappaSq, double *EnP, - double * clusterX, double * clusterY, double * clusterZ, double * clusterM, double * clusterM2 ) -{ - double dist; - double tx, ty, tz; - int i, j, k, ii, kk; - double dxt,dyt,dzt,tempPotential; - double temp_i[torderlim], temp_j[torderlim], temp_k[torderlim]; - - /* determine DIST for MAC test */ - tx = batch_mid[0] - p->x_mid; - ty = batch_mid[1] - p->y_mid; - tz = batch_mid[2] - p->z_mid; - dist = sqrt(tx*tx + ty*ty + tz*tz); - - - int smallEnoughLeaf=0; - if (p->numpar < torderlim*torderlim*torderlim){ - smallEnoughLeaf=1; - }else{ - smallEnoughLeaf=0; - } - if (((p->radius + batch_rad) < dist * sqrt(thetasq)) && (p->sqradius != 0.00) && (smallEnoughLeaf==0) ) { - - int numberOfTargets = batch_ind[1] - batch_ind[0] + 1; - int numberOfInterpolationPoints = torderlim*torderlim*torderlim; - int clusterStart = numberOfInterpolationPoints*p->node_index; - - - - double xi,yi,zi,qi,r; - int batchStart = batch_ind[0] - 1; - - int streamID = rand() % 4; - # pragma acc kernels async(streamID) present(xT,yT,zT,qT,EnP, clusterX, clusterY, clusterZ, clusterM, clusterM2) - { - #pragma acc loop independent - for (i = 0; i < numberOfTargets; i++){ - tempPotential = 0.0; - xi = xT[ batchStart + i]; - yi = yT[ batchStart + i]; - zi = zT[ batchStart + i]; - qi = qT[ batchStart + i]; - #pragma acc loop independent - for (j = 0; j < numberOfInterpolationPoints; j++){ - - // Compute x, y, and z distances between target i and interpolation point j - dxt = xi - clusterX[clusterStart + j]; - dyt = yi - clusterY[clusterStart + j]; - dzt = zi - clusterZ[clusterStart + j]; - - r = sqrt(dxt*dxt + dyt*dyt + dzt*dzt); - - tempPotential += (clusterM[clusterStart + j]-qi*clusterM2[clusterStart + j]* exp(-r*r/kappaSq) ) / r; - - } - #pragma acc atomic - EnP[batchStart + i] += tempPotential; - } - } - - - -// EnP[batch_ind[0] - 1 + i] += ( Weights1[j] - Weights2[j]*qT[ batch_ind[0] - 1 + i]*exp(-r*r/kappaSq) ) / r - } else { - /* - * If MAC fails check to see if there are children. If not, perform direct - * calculation. If there are children, call routine recursively for each. - */ - - - if ( (p->num_children == 0)|(smallEnoughLeaf==1) ) { -// printf("MAC rejected, and node has no children. Calling pc_comp_dierct()...\n"); - pc_comp_direct_coulomb_SS(p->ibeg, p->iend, batch_ind[0], batch_ind[1], - xS, yS, zS, qS, wS, xT, yT, zT, qT, kappaSq, EnP); - } else { -// printf("MAC rejected, recursing over children...\n"); - for (i = 0; i < p->num_children; i++) { - compute_pc_coulomb_SS(p->child[i], batch_ind, batch_mid, batch_rad, - xS, yS, zS, qS, wS, xT, yT, zT, qT, kappaSq, EnP, - clusterX, clusterY, clusterZ, clusterM, clusterM2); - } - } - } - - return; - -} /* END of function compute_pc */ - - - - -/* - * comp_direct directly computes the potential on the targets in the current - * cluster due to the current source, determined by the global variable TARPOS - */ -void pc_comp_direct_coulomb_SS(int ibeg, int iend, int batch_ibeg, int batch_iend, - double *xS, double *yS, double *zS, double *qS, double *wS, - double *xT, double *yT, double *zT, double *qT, double kappaSq, double *EnP) -{ - /* local variables */ - int i, ii; - double tx, ty, tz; - double d_peng, r; - - int streamID = rand() % 4; - # pragma acc kernels async(streamID) present(xS,yS,zS,qS,wS,xT,yT,zT,qT,EnP) - { - #pragma acc loop independent - for (ii = batch_ibeg - 1; ii < batch_iend; ii++) { - d_peng = 0.0; - #pragma acc loop independent - for (i = ibeg - 1; i < iend; i++) { - tx = xS[i] - xT[ii]; - ty = yS[i] - yT[ii]; - tz = zS[i] - zT[ii]; - r = sqrt(tx*tx + ty*ty + tz*tz); - if (r > DBL_MIN) { - d_peng += wS[i]* ( qS[i] - qT[ii]* exp(-r*r/kappaSq) ) / r; - } - } - #pragma acc atomic - EnP[ii] += d_peng; - } - } - - return; - -} /* END function pc_comp_direct_yuk */ - - -//d_peng += wS[i]* ( qS[i] - qT[ii]* exp(-r*r/kappaSq) ) / r; - - -void pc_interaction_list_treecode_Coulomb_SS(struct tnode_array *tree_array, struct particles *clusters, struct batch *batches, - int *tree_inter_list, int *direct_inter_list, - struct particles *sources, struct particles *targets, - double *tpeng,double kappaSq, double *EnP, int numDevices, int numThreads) -{ - int i, j; - - for (i = 0; i < targets->num; i++) - EnP[i] = 2.0 * M_PI * kappaSq * targets->q[i]; - - #pragma omp parallel num_threads(numThreads) - { - -#ifdef OPENACC_ENABLED - if (omp_get_thread_num() < numDevices) - acc_set_device_num(omp_get_thread_num(), acc_get_device_type()); -#endif - - int this_thread = omp_get_thread_num(), num_threads = omp_get_num_threads(); - if (this_thread==0){printf("numDevices: %i\n", numDevices);} - if (this_thread==0){printf("num_threads: %i\n", num_threads);} - printf("this_thread: %i\n", this_thread); - - double *EnP2, *EnP3; - make_vector(EnP2,targets->num); - make_vector(EnP3,targets->num); - for (i = 0; i < targets->num; i++) - EnP3[i] = 0.0; - EnP2[i] = 0.0; - - - double *xS = sources->x; - double *yS = sources->y; - double *zS = sources->z; - double *qS = sources->q; - double *wS = sources->w; - - double *xT = targets->x; - double *yT = targets->y; - double *zT = targets->z; - double *qT = targets->q; - - double *xC = clusters->x; - double *yC = clusters->y; - double *zC = clusters->z; - double *qC = clusters->q; - double *wC = clusters->w; - -// printf("\n\nInside compute region, clusters->q[0] = %f\n\n",clusters->q[0]); -// printf("\n\nInside compute region, clusters->q[213599] = %f\n\n",clusters->q[213599]); - - int * ibegs = tree_array->ibeg; - int * iends = tree_array->iend; - - #pragma acc data copyin(xS[0:sources->num], yS[0:sources->num], zS[0:sources->num], qS[0:sources->num], wS[0:sources->num], \ - xT[0:targets->num], yT[0:targets->num], zT[0:targets->num], qT[0:targets->num], \ - xC[0:clusters->num], yC[0:clusters->num], zC[0:clusters->num], qC[0:clusters->num], wC[0:clusters->num], tree_inter_list[0:numnodes*batches->num], \ - direct_inter_list[0:batches->num * numleaves], ibegs[0:numnodes], iends[0:numnodes]) \ - copy(EnP3[0:targets->num], EnP2[0:targets->num]) - { - - - int batch_ibeg, batch_iend, node_index; - double dist; - double tx, ty, tz; - int i, j, k, ii, jj; - double dxt,dyt,dzt,tempPotential; - double temp_i[torderlim], temp_j[torderlim], temp_k[torderlim]; - - int source_start; - int source_end; - - double d_peng, r; - double xi,yi,zi,qi; - - int numberOfTargets; - int numberOfInterpolationPoints = torderlim*torderlim*torderlim; - int clusterStart, batchStart; - - int numberOfClusterApproximations, numberOfDirectSums; - - int streamID; - #pragma omp for private(j,ii,jj,batch_ibeg,batch_iend,numberOfClusterApproximations,numberOfDirectSums,numberOfTargets,batchStart,node_index,clusterStart,streamID) schedule(guided) - for (i = 0; i < batches->num; i++) { - batch_ibeg = batches->index[i][0]; - batch_iend = batches->index[i][1]; - numberOfClusterApproximations = batches->index[i][2]; - numberOfDirectSums = batches->index[i][3]; - - - numberOfTargets = batch_iend - batch_ibeg + 1; - batchStart = batch_ibeg - 1; - - for (j = 0; j < numberOfClusterApproximations; j++) { - node_index = tree_inter_list[i * numnodes + j]; - clusterStart = numberOfInterpolationPoints*node_index; - - - - streamID = j%3; - #pragma acc kernels async(streamID) //present(xT,yT,zT,qT,EnP, clusterX, clusterY, clusterZ, clusterM) - { - #pragma acc loop independent - for (ii = 0; ii < numberOfTargets; ii++){ - tempPotential = 0.0; - xi = xT[ batchStart + ii]; - yi = yT[ batchStart + ii]; - zi = zT[ batchStart + ii]; - qi = qT[ batchStart + ii]; - - for (jj = 0; jj < numberOfInterpolationPoints; jj++){ - - // Compute x, y, and z distances between target i and interpolation point j - dxt = xi - xC[clusterStart + jj]; - dyt = yi - yC[clusterStart + jj]; - dzt = zi - zC[clusterStart + jj]; - r = sqrt(dxt*dxt + dyt*dyt + dzt*dzt); -// tempPotential += qC[clusterStart + jj] *exp(-kappa*r)/ r; - tempPotential += (qC[clusterStart + jj]-qi*wC[clusterStart + jj]* exp(-r*r/kappaSq) ) / r; -// tempPotential += (clusterM[clusterStart + j]-qi*clusterM2[clusterStart + j]* exp(-r*r/kappaSq) ) / r; - } - #pragma acc atomic - EnP3[batchStart + ii] += tempPotential; - } - } // end kernel - } // end for loop over cluster approximations - - for (j = 0; j < numberOfDirectSums; j++) { - - node_index = direct_inter_list[i * numleaves + j]; - - source_start=ibegs[node_index]-1; - source_end=iends[node_index]; - - - streamID = j%3; - # pragma acc kernels async(streamID) - { - #pragma acc loop independent - for (ii = batchStart; ii < batchStart+numberOfTargets; ii++) { - d_peng = 0.0; - for (jj = source_start; jj < source_end; jj++) { - tx = xS[jj] - xT[ii]; - ty = yS[jj] - yT[ii]; - tz = zS[jj] - zT[ii]; - r = sqrt(tx*tx + ty*ty + tz*tz); - if (r > DBL_MIN) { -// d_peng += qS[jj] * wS[jj] *exp(-kappa*r)/ r; - d_peng += wS[jj]*( qS[jj] - qT[ii]* exp(-r*r/kappaSq) )/r; -// d_peng += wS[i]* ( qS[i] - qT[ii]* exp(-r*r/kappaSq) ) / r; - - } - } - #pragma acc atomic - EnP2[ii] += d_peng; - } - } // end kernel - } // end loop over number of leaves -//#pragma acc wait - - } // end loop over target batches - - - #pragma acc wait - } // end acc data region - - - for (int k = 0; k < targets->num; k++){ - if (EnP2[k] != 0.0) EnP[k] += EnP2[k]; - if (EnP2[k] != 0.0)EnP[k] += EnP3[k]; - } - - free_vector(EnP2); - free_vector(EnP3); - } // end omp parallel region - - printf("Exited the main comp_pc call.\n"); - *tpeng = sum(EnP, targets->num); - - return; - - } /* END of function pc_treecode */ - - diff --git a/src/tree-pc-dcf.c b/src/tree-pc-dcf.c deleted file mode 100644 index 61a2ec5a..00000000 --- a/src/tree-pc-dcf.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - *Procedures for Particle-Cluster Treecode - */ -#include -#include -#include -#include -#include - -#include "array.h" -#include "globvars.h" -#include "tnode.h" -#include "tools.h" - -#include "partition.h" -#include "tree.h" - - -void pc_interaction_list_treecode_dcf(struct tnode_array *tree_array, - struct particles *clusters, struct batch *batches, - int *tree_inter_list, int *direct_inter_list, - struct particles *sources, struct particles *targets, - double *tpeng, double eta, double *EnP, - int numDevices, int numThreads) -{ - int i, j; - - for (i = 0; i < targets->num; i++) - EnP[i] = 0.0; - - #pragma omp parallel num_threads(numThreads) - { - -#ifdef OPENACC_ENABLED - if (omp_get_thread_num() < numDevices) - acc_set_device_num(omp_get_thread_num(), acc_get_device_type()); -#endif - - int this_thread = omp_get_thread_num(), num_threads = omp_get_num_threads(); - if (this_thread==0){printf("numDevices: %i\n", numDevices);} - if (this_thread==0){printf("num_threads: %i\n", num_threads);} - printf("this_thread: %i\n", this_thread); - - double *EnP2, *EnP3; - make_vector(EnP2,targets->num); - make_vector(EnP3,targets->num); - for (i = 0; i < targets->num; i++) - EnP3[i] = 0.0; - EnP2[i] = 0.0; - - - double *xS = sources->x; - double *yS = sources->y; - double *zS = sources->z; - double *qS = sources->q; - double *wS = sources->w; - - double *xT = targets->x; - double *yT = targets->y; - double *zT = targets->z; - double *qT = targets->q; - - double *xC = clusters->x; - double *yC = clusters->y; - double *zC = clusters->z; - double *qC = clusters->q; - -// printf("\n\nInside compute region, clusters->q[0] = %f\n\n",clusters->q[0]); -// printf("\n\nInside compute region, clusters->q[213599] = %f\n\n",clusters->q[213599]); - - int * ibegs = tree_array->ibeg; - int * iends = tree_array->iend; - - #pragma acc data copyin(xS[0:sources->num], yS[0:sources->num], zS[0:sources->num], qS[0:sources->num], wS[0:sources->num], \ - xT[0:targets->num], yT[0:targets->num], zT[0:targets->num], qT[0:targets->num], \ - xC[0:clusters->num], yC[0:clusters->num], zC[0:clusters->num], qC[0:clusters->num], tree_inter_list[0:numnodes*batches->num], \ - direct_inter_list[0:batches->num * numleaves], ibegs[0:numnodes], iends[0:numnodes]) \ - copy(EnP3[0:targets->num], EnP2[0:targets->num]) - { - - - int batch_ibeg, batch_iend, node_index; - double dist; - double tx, ty, tz; - int i, j, k, ii, jj; - double dxt,dyt,dzt,tempPotential; - double temp_i[torderlim], temp_j[torderlim], temp_k[torderlim]; - - int source_start; - int source_end; - - double d_peng, r; - double xi,yi,zi; - - int numberOfTargets; - int numberOfInterpolationPoints = torderlim*torderlim*torderlim; - int clusterStart, batchStart; - - int numberOfClusterApproximations, numberOfDirectSums; - - int streamID; - #pragma omp for private(j,ii,jj,batch_ibeg,batch_iend,numberOfClusterApproximations,numberOfDirectSums,numberOfTargets,batchStart,node_index,clusterStart,streamID) - for (i = 0; i < batches->num; i++) { - batch_ibeg = batches->index[i][0]; - batch_iend = batches->index[i][1]; - numberOfClusterApproximations = batches->index[i][2]; - numberOfDirectSums = batches->index[i][3]; - - - numberOfTargets = batch_iend - batch_ibeg + 1; - batchStart = batch_ibeg - 1; - - for (j = 0; j < numberOfClusterApproximations; j++) { - node_index = tree_inter_list[i * numnodes + j]; - clusterStart = numberOfInterpolationPoints*node_index; - - - - streamID = j%3; - #pragma acc kernels async(streamID) //present(xT,yT,zT,qT,EnP, clusterX, clusterY, clusterZ, clusterM) - { - #pragma acc loop independent - for (ii = 0; ii < numberOfTargets; ii++){ - tempPotential = 0.0; - xi = xT[ batchStart + ii]; - yi = yT[ batchStart + ii]; - zi = zT[ batchStart + ii]; - - for (jj = 0; jj < numberOfInterpolationPoints; jj++){ - - // Compute x, y, and z distances between target i and interpolation point j - dxt = xi - xC[clusterStart + jj]; - dyt = yi - yC[clusterStart + jj]; - dzt = zi - zC[clusterStart + jj]; - r = sqrt(dxt*dxt + dyt*dyt + dzt*dzt); - tempPotential += qC[clusterStart + jj] * erf(r / eta) / r; - - } - #pragma acc atomic - EnP3[batchStart + ii] += tempPotential; - } - } // end kernel - } // end for loop over cluster approximations - - for (j = 0; j < numberOfDirectSums; j++) { - - node_index = direct_inter_list[i * numleaves + j]; - - source_start=ibegs[node_index]-1; - source_end=iends[node_index]; - - - streamID = j%3; - # pragma acc kernels async(streamID) - { - #pragma acc loop independent - for (ii = batchStart; ii < batchStart+numberOfTargets; ii++) { - d_peng = 0.0; - for (jj = source_start; jj < source_end; jj++) { - tx = xS[jj] - xT[ii]; - ty = yS[jj] - yT[ii]; - tz = zS[jj] - zT[ii]; - r = sqrt(tx*tx + ty*ty + tz*tz); - if (r > DBL_MIN) { - d_peng += qS[jj] * wS[jj] * erf(r / eta) / r; - } - } - #pragma acc atomic - EnP2[ii] += d_peng; - } - } // end kernel - } // end loop over number of leaves -//#pragma acc wait - - } // end loop over target batches - - - #pragma acc wait - } // end acc data region - - - for (int k = 0; k < targets->num; k++){ - if (EnP2[k] != 0.0) - #pragma omp critical - { - EnP[k] += EnP2[k]; - EnP[k] += EnP3[k]; - } // end omp critical - } - - free_vector(EnP2); - free_vector(EnP3); - } // end omp parallel region - - printf("Exited the main comp_pc call.\n"); - *tpeng = sum(EnP, targets->num); - - return; - - } /* END of function pc_treecode */ - diff --git a/src/tree-pc-hermite-dcf.c b/src/tree-pc-hermite-dcf.c deleted file mode 100644 index 9bd1ab73..00000000 --- a/src/tree-pc-hermite-dcf.c +++ /dev/null @@ -1,258 +0,0 @@ -/* - *Procedures for Particle-Cluster Treecode - */ -#include -#include -#include -#include - -#include "array.h" -#include "globvars.h" -#include "tnode.h" -#include "batch.h" -#include "particles.h" -#include "tools.h" - -#include "partition.h" -#include "tree.h" - -#include - - - - - -void pc_interaction_list_treecode_hermite_dcf(struct tnode_array *tree_array, - struct particles *clusters, struct batch *batches, - int *tree_inter_list, int *direct_inter_list, - struct particles *sources, struct particles *targets, - double *tpeng, double eta, double *EnP, - int numDevices, int numThreads) -{ - int i, j; - - for (i = 0; i < targets->num; i++) - EnP[i] = 0.0; - - #pragma omp parallel num_threads(numThreads) - { - -#ifdef OPENACC_ENABLED - if (omp_get_thread_num() < numDevices) - acc_set_device_num(omp_get_thread_num(), acc_get_device_type()); -#endif - - int this_thread = omp_get_thread_num(), num_threads = omp_get_num_threads(); - if (this_thread==0){printf("numDevices: %i\n", numDevices);} - if (this_thread==0){printf("num_threads: %i\n", num_threads);} - printf("this_thread: %i\n", this_thread); - - double *EnP2, *EnP3; - make_vector(EnP2,targets->num); - make_vector(EnP3,targets->num); - for (i = 0; i < targets->num; i++) - EnP3[i] = 0.0; - EnP2[i] = 0.0; - - - double *xS = sources->x; - double *yS = sources->y; - double *zS = sources->z; - double *qS = sources->q; - double *wS = sources->w; - - double *xT = targets->x; - double *yT = targets->y; - double *zT = targets->z; - double *qT = targets->q; - - double *xC = clusters->x; - double *yC = clusters->y; - double *zC = clusters->z; - double *qC = clusters->q; - double *qxC = clusters->qx; - double *qyC = clusters->qy; - double *qzC = clusters->qz; - double *qxyC = clusters->qxy; - double *qyzC = clusters->qyz; - double *qxzC = clusters->qxz; - double *qxyzC = clusters->qxyz; - -// printf("\n\nInside compute region, clusters->q[0] = %f\n\n",clusters->q[0]); -// printf("\n\nInside compute region, clusters->q[213599] = %f\n\n",clusters->q[213599]); - - int * ibegs = tree_array->ibeg; - int * iends = tree_array->iend; - - #pragma acc data copyin(xS[0:sources->num], yS[0:sources->num], zS[0:sources->num], qS[0:sources->num], wS[0:sources->num], \ - xT[0:targets->num], yT[0:targets->num], zT[0:targets->num], qT[0:targets->num], \ - xC[0:clusters->num], yC[0:clusters->num], zC[0:clusters->num],tree_inter_list[0:numnodes*batches->num], \ - qxC[0:clusters->num],qyC[0:clusters->num],qzC[0:clusters->num],qxyC[0:clusters->num], \ - qyzC[0:clusters->num],qxzC[0:clusters->num],qxyzC[0:clusters->num],qC[0:clusters->num], \ - direct_inter_list[0:batches->num * numleaves], ibegs[0:numnodes], iends[0:numnodes]) \ - copy(EnP3[0:targets->num], EnP2[0:targets->num]) - -//#pragma acc data copyin(targets->x[0:targets->num], targets->y[0:targets->num], targets->z[0:targets->num], targets->q[0:targets->num], \ -// sources->x[0:sources->num], sources->y[0:sources->num], sources->z[0:sources->num], sources->q[0:sources->num], sources->w[0:sources->num], \ -// clusters->x[0:clusters->num], clusters->y[0:clusters->num], clusters->z[0:clusters->num], clusters->q[0:clusters->num], \ -// clusters->qx[0:clusters->num],clusters->qy[0:clusters->num],clusters->qz[0:clusters->num],clusters->qxy[0:clusters->num],clusters->qyz[0:clusters->num], \ -// clusters->qxz[0:clusters->num], clusters->qxyz[0:clusters->num]) \ -// copy(EnP3[0:targets->num], EnP2[0:targets->num]) - { - - - - - int batch_ibeg, batch_iend, node_index; - double dist; - double tx, ty, tz; - int i, j, k, ii, jj; - double dxt,dyt,dzt,tempPotential; - double temp_i[torderlim], temp_j[torderlim], temp_k[torderlim]; - - int source_start; - int source_end; - - double d_peng, r; - double xi,yi,zi; - - double rinv, r2inv, r4inv; - double r_eta, pot, auxpot, auxpot_eta, dpot1, dpot2, dpot3; - - double twoinvEta2 = 2.0 / (eta * eta); - double teninvEta2 = 5.0 * twoinvEta2; - double fourinvEta4 = twoinvEta2 * twoinvEta2; - - int numberOfTargets; - int numberOfInterpolationPoints = torderlim*torderlim*torderlim; - int clusterStart, batchStart, sourceIdx; - - int numberOfClusterApproximations, numberOfDirectSums; - - int streamID; - #pragma omp for private(j,ii,jj,sourceIdx,batch_ibeg,batch_iend,numberOfClusterApproximations,numberOfDirectSums,numberOfTargets,batchStart,node_index,clusterStart,streamID,rinv,r2inv,r4inv,r_eta,pot,auxpot,auxpot_eta,dpot1,dpot2,dpot3) - for (i = 0; i < batches->num; i++) { - batch_ibeg = batches->index[i][0]; - batch_iend = batches->index[i][1]; - numberOfClusterApproximations = batches->index[i][2]; - numberOfDirectSums = batches->index[i][3]; - - - numberOfTargets = batch_iend - batch_ibeg + 1; - batchStart = batch_ibeg - 1; - - for (j = 0; j < numberOfClusterApproximations; j++) { - node_index = tree_inter_list[i * numnodes + j]; - clusterStart = numberOfInterpolationPoints*node_index; - - - - streamID = j%3; - #pragma acc kernels async(streamID) //present(xT,yT,zT,qT,EnP, clusterX, clusterY, clusterZ, clusterM) - { - #pragma acc loop independent - for (ii = 0; ii < numberOfTargets; ii++){ - tempPotential = 0.0; - xi = xT[ batchStart + ii]; - yi = yT[ batchStart + ii]; - zi = zT[ batchStart + ii]; - - for (jj = 0; jj < numberOfInterpolationPoints; jj++){ - sourceIdx = clusterStart + jj; - // Compute x, y, and z distances between target i and interpolation point j - dxt = xi - xC[sourceIdx]; - dyt = yi - yC[sourceIdx]; - dzt = zi - zC[sourceIdx]; -// tempPotential += qC[clusterStart + jj] / sqrt(dxt*dxt + dyt*dyt + dzt*dzt); - r = sqrt(dxt*dxt + dyt*dyt + dzt*dzt); - - rinv = 1 / r; - r2inv = rinv * rinv; - r4inv = r2inv * r2inv; - r_eta = r / eta; - - pot = erf(r_eta) * rinv; - auxpot = 2.0 / sqrt(M_PI) * exp(-r_eta * r_eta); - auxpot_eta = auxpot / eta; - - dpot1 = r2inv * (pot - auxpot); - dpot2 = r2inv * ( 3.0 * pot * r2inv - auxpot_eta - * ( 3.0 * r2inv + twoinvEta2)); - dpot3 = r2inv * (15.0 * pot * r4inv - auxpot_eta - * (15.0 * r4inv + teninvEta2 * r2inv + fourinvEta4)); - - tempPotential += pot * qC[sourceIdx] - + dpot1 * (qxC[sourceIdx] * dxt - + qyC[sourceIdx] * dyt - + qzC[sourceIdx] * dzt) - + dpot2 * (qxyC[sourceIdx] * dxt * dyt - + qyzC[sourceIdx] * dyt * dzt - + qxzC[sourceIdx] * dxt * dzt) - + dpot3 * qxyzC[sourceIdx] * dxt * dyt * dzt; - - - - } - #pragma acc atomic - EnP3[batchStart + ii] += tempPotential; - } - } // end kernel - } // end for loop over cluster approximations - - for (j = 0; j < numberOfDirectSums; j++) { - - node_index = direct_inter_list[i * numleaves + j]; - - source_start=ibegs[node_index]-1; - source_end=iends[node_index]; - - - streamID = j%3; - # pragma acc kernels async(streamID) - { - #pragma acc loop independent - for (ii = batchStart; ii < batchStart+numberOfTargets; ii++) { - d_peng = 0.0; - for (jj = source_start; jj < source_end; jj++) { - tx = xS[jj] - xT[ii]; - ty = yS[jj] - yT[ii]; - tz = zS[jj] - zT[ii]; - r = sqrt(tx*tx + ty*ty + tz*tz); - if (r > DBL_MIN) { - d_peng += erf(r / eta) * qS[jj] * wS[jj] / r; - } - } - #pragma acc atomic - EnP2[ii] += d_peng; - } - } // end kernel - } // end loop over number of leaves -//#pragma acc wait - - } // end loop over target batches - - - #pragma acc wait - } // end acc data region - - - for (int k = 0; k < targets->num; k++){ - if (EnP2[k] != 0.0) - #pragma omp critical - { - EnP[k] += EnP2[k]; - EnP[k] += EnP3[k]; - } - } - - free_vector(EnP2); - free_vector(EnP3); - } // end omp parallel region - - printf("Exited the main comp_pc call.\n"); - *tpeng = sum(EnP, targets->num); - - return; - - } /* END of function pc_treecode */ - diff --git a/src/tree-pc-tcf.c b/src/tree-pc-tcf.c deleted file mode 100644 index 0989c848..00000000 --- a/src/tree-pc-tcf.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - *Procedures for Particle-Cluster Treecode - */ -#include -#include -#include -#include -#include - -#include "array.h" -#include "globvars.h" -#include "tnode.h" -#include "tools.h" - -#include "partition.h" -#include "tree.h" - - -void pc_interaction_list_treecode_tcf(struct tnode_array *tree_array, - struct particles *clusters, struct batch *batches, - int *tree_inter_list, int *direct_inter_list, - struct particles *sources, struct particles *targets, - double *tpeng, double kappa, double eta, double *EnP, - int numDevices, int numThreads) -{ - int i, j; - - for (i = 0; i < targets->num; i++) - EnP[i] = 0.0; - - #pragma omp parallel num_threads(numThreads) - { - -#ifdef OPENACC_ENABLED - if (omp_get_thread_num() < numDevices) - acc_set_device_num(omp_get_thread_num(), acc_get_device_type()); -#endif - - int this_thread = omp_get_thread_num(), num_threads = omp_get_num_threads(); - if (this_thread==0){printf("numDevices: %i\n", numDevices);} - if (this_thread==0){printf("num_threads: %i\n", num_threads);} - printf("this_thread: %i\n", this_thread); - - double *EnP2, *EnP3; - make_vector(EnP2,targets->num); - make_vector(EnP3,targets->num); - for (i = 0; i < targets->num; i++) - EnP3[i] = 0.0; - EnP2[i] = 0.0; - - - double *xS = sources->x; - double *yS = sources->y; - double *zS = sources->z; - double *qS = sources->q; - double *wS = sources->w; - - double *xT = targets->x; - double *yT = targets->y; - double *zT = targets->z; - double *qT = targets->q; - - double *xC = clusters->x; - double *yC = clusters->y; - double *zC = clusters->z; - double *qC = clusters->q; - -// printf("\n\nInside compute region, clusters->q[0] = %f\n\n",clusters->q[0]); -// printf("\n\nInside compute region, clusters->q[213599] = %f\n\n",clusters->q[213599]); - - int * ibegs = tree_array->ibeg; - int * iends = tree_array->iend; - - #pragma acc data copyin(xS[0:sources->num], yS[0:sources->num], zS[0:sources->num], qS[0:sources->num], wS[0:sources->num], \ - xT[0:targets->num], yT[0:targets->num], zT[0:targets->num], qT[0:targets->num], \ - xC[0:clusters->num], yC[0:clusters->num], zC[0:clusters->num], qC[0:clusters->num], tree_inter_list[0:numnodes*batches->num], \ - direct_inter_list[0:batches->num * numleaves], ibegs[0:numnodes], iends[0:numnodes]) \ - copy(EnP3[0:targets->num], EnP2[0:targets->num]) - { - - - int batch_ibeg, batch_iend, node_index; - double dist; - double tx, ty, tz; - int i, j, k, ii, jj; - double dxt,dyt,dzt,tempPotential; - double temp_i[torderlim], temp_j[torderlim], temp_k[torderlim]; - - int source_start; - int source_end; - - double d_peng, r; - double xi, yi, zi; - double kap_r, r_eta; - - double kap_eta_2 = kappa * eta / 2.0; - - int numberOfTargets; - int numberOfInterpolationPoints = torderlim*torderlim*torderlim; - int clusterStart, batchStart; - - int numberOfClusterApproximations, numberOfDirectSums; - - int streamID; - #pragma omp for private(j,ii,jj,batch_ibeg,batch_iend,numberOfClusterApproximations,numberOfDirectSums,numberOfTargets,batchStart,node_index,clusterStart,streamID,kap_r,r_eta) - for (i = 0; i < batches->num; i++) { - batch_ibeg = batches->index[i][0]; - batch_iend = batches->index[i][1]; - numberOfClusterApproximations = batches->index[i][2]; - numberOfDirectSums = batches->index[i][3]; - - - numberOfTargets = batch_iend - batch_ibeg + 1; - batchStart = batch_ibeg - 1; - - for (j = 0; j < numberOfClusterApproximations; j++) { - node_index = tree_inter_list[i * numnodes + j]; - clusterStart = numberOfInterpolationPoints*node_index; - - - - streamID = j%3; - #pragma acc kernels async(streamID) //present(xT,yT,zT,qT,EnP, clusterX, clusterY, clusterZ, clusterM) - { - #pragma acc loop independent - for (ii = 0; ii < numberOfTargets; ii++){ - tempPotential = 0.0; - xi = xT[ batchStart + ii]; - yi = yT[ batchStart + ii]; - zi = zT[ batchStart + ii]; - - for (jj = 0; jj < numberOfInterpolationPoints; jj++){ - - // Compute x, y, and z distances between target i and interpolation point j - dxt = xi - xC[clusterStart + jj]; - dyt = yi - yC[clusterStart + jj]; - dzt = zi - zC[clusterStart + jj]; - r = sqrt(dxt*dxt + dyt*dyt + dzt*dzt); - kap_r = kappa * r; - r_eta = r / eta; - tempPotential += qC[clusterStart + jj] / r - * (exp(-kap_r) * erfc(kap_eta_2 - r_eta) - - exp( kap_r) * erfc(kap_eta_2 + r_eta)); - - } - #pragma acc atomic - EnP3[batchStart + ii] += tempPotential; - } - } // end kernel - } // end for loop over cluster approximations - - for (j = 0; j < numberOfDirectSums; j++) { - - node_index = direct_inter_list[i * numleaves + j]; - - source_start=ibegs[node_index]-1; - source_end=iends[node_index]; - - - streamID = j%3; - # pragma acc kernels async(streamID) - { - #pragma acc loop independent - for (ii = batchStart; ii < batchStart+numberOfTargets; ii++) { - d_peng = 0.0; - for (jj = source_start; jj < source_end; jj++) { - tx = xS[jj] - xT[ii]; - ty = yS[jj] - yT[ii]; - tz = zS[jj] - zT[ii]; - r = sqrt(tx*tx + ty*ty + tz*tz); - if (r > DBL_MIN) { - kap_r = kappa * r; - r_eta = r / eta; - d_peng += qS[jj] * wS[jj] / r - * (exp(-kap_r) * erfc(kap_eta_2 - r_eta) - - exp( kap_r) * erfc(kap_eta_2 + r_eta)); - } - } - #pragma acc atomic - EnP2[ii] += d_peng; - } - } // end kernel - } // end loop over number of leaves -//#pragma acc wait - - } // end loop over target batches - - - #pragma acc wait - } // end acc data region - - - for (int k = 0; k < targets->num; k++){ - if (EnP2[k] != 0.0) - #pragma omp critical - { - EnP[k] += EnP2[k]; - EnP[k] += EnP3[k]; - } // end omp critical - } - - free_vector(EnP2); - free_vector(EnP3); - } // end omp parallel region - - printf("Exited the main comp_pc call.\n"); - *tpeng = sum(EnP, targets->num); - - return; - - } /* END of function pc_treecode */ - diff --git a/src/tree-pc-yuk-SS.c b/src/tree-pc-yuk-SS.c deleted file mode 100644 index 7a580cd9..00000000 --- a/src/tree-pc-yuk-SS.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - *Procedures for Particle-Cluster Treecode - */ -#include -#include -#include -#include -#include - -#include "array.h" -#include "globvars.h" -#include "tnode.h" -#include "tools.h" - -#include "partition.h" -#include "tree.h" - - - -void pc_treecode_yuk_SS(struct tnode *p, struct batch *batches, - struct particles *sources, struct particles *targets, struct particles *clusters, - double kappa, double *tpeng, double *EnP, int numDevices, int numThreads) -{ - /* local variables */ - int i, j; - - for (i = 0; i < targets->num; i++) - EnP[i] = 4.0 * M_PI * targets->q[i] / kappa / kappa; // 4*pi*f_t/k**2 - - #pragma omp parallel num_threads(numThreads) - { - -#ifdef OPENACC_ENABLED - if (omp_get_thread_num() < numDevices) - acc_set_device_num(omp_get_thread_num(), acc_get_device_type()); -#endif - - int this_thread = omp_get_thread_num(), num_threads = omp_get_num_threads(); - if (this_thread==0){printf("numDevices: %i\n", numDevices);} - if (this_thread==0){printf("num_threads: %i\n", num_threads);} - - double *EnP2; - make_vector(EnP2,targets->num); - for (i = 0; i < targets->num; i++) - EnP2[i] = 0.0; - -#pragma acc data copyin(sources->x[0:sources->num], sources->y[0:sources->num], sources->z[0:sources->num], sources->q[0:sources->num], sources->w[0:sources->num], \ - targets->x[0:targets->num], targets->y[0:targets->num], targets->z[0:targets->num], targets->q[0:targets->num], \ - clusters->x[0:clusters->num], clusters->y[0:clusters->num], clusters->z[0:clusters->num], clusters->q[0:clusters->num], clusters->w[0:clusters->num]) \ - copy(EnP2[0:targets->num]) - { - #pragma omp for private(j) - for (i = 0; i < batches->num; i++) { - for (j = 0; j < p->num_children; j++) { - compute_pc_yuk_SS(p->child[j], - batches->index[i], batches->center[i], batches->radius[i], - sources->x, sources->y, sources->z, sources->q, sources->w, - targets->x, targets->y, targets->z, targets->q, kappa, EnP2, - clusters->x, clusters->y, clusters->z, clusters->q, clusters->w); - } - } - #pragma acc wait - } // end acc data region - for (int k = 0; k < targets->num; k++){ - if (EnP2[k] != 0.0){ - EnP[k] += EnP2[k]; - } - } - free_vector(EnP2); - } // end omp parallel region - - - *tpeng = sum(EnP, targets->num); - - return; - -} /* END of function pc_treecode */ - - - - -void compute_pc_yuk_SS(struct tnode *p, - int *batch_ind, double *batch_mid, double batch_rad, - double *xS, double *yS, double *zS, double *qS, double *wS, - double *xT, double *yT, double *zT, double *qT, double kappa, double *EnP, - double * clusterX, double * clusterY, double * clusterZ, double * clusterM, double * clusterM2 ) -{ - double dist; - double tx, ty, tz; - int i, j, k, ii, kk; - double dxt,dyt,dzt,tempPotential; - double temp_i[torderlim], temp_j[torderlim], temp_k[torderlim]; - - /* determine DIST for MAC test */ - tx = batch_mid[0] - p->x_mid; - ty = batch_mid[1] - p->y_mid; - tz = batch_mid[2] - p->z_mid; - dist = sqrt(tx*tx + ty*ty + tz*tz); - - int smallEnoughLeaf=0; - if (p->numpar < torderlim*torderlim*torderlim){ - smallEnoughLeaf=1; - }else{ - smallEnoughLeaf=0; - } - if (((p->radius + batch_rad) < dist * sqrt(thetasq)) && (p->sqradius != 0.00) && (smallEnoughLeaf==0) ) { - - - int numberOfTargets = batch_ind[1] - batch_ind[0] + 1; - int numberOfInterpolationPoints = torderlim*torderlim*torderlim; - int clusterStart = numberOfInterpolationPoints*p->node_index; - - - - double xi,yi,zi,qi,r; - int batchStart = batch_ind[0] - 1; - - int streamID = rand() % 2; - # pragma acc kernels async(streamID) present(xT,yT,zT,qT,EnP, clusterX, clusterY, clusterZ, clusterM, clusterM2) - { - #pragma acc loop independent - for (i = 0; i < numberOfTargets; i++){ - tempPotential = 0.0; - xi = xT[ batchStart + i]; - yi = yT[ batchStart + i]; - zi = zT[ batchStart + i]; - qi = qT[ batchStart + i]; - #pragma acc loop independent - for (j = 0; j < numberOfInterpolationPoints; j++){ - - // Compute x, y, and z distances between target i and interpolation point j - dxt = xi - clusterX[clusterStart + j]; - dyt = yi - clusterY[clusterStart + j]; - dzt = zi - clusterZ[clusterStart + j]; - - r = sqrt(dxt*dxt + dyt*dyt + dzt*dzt); - -// tempPotential += (clusterM[clusterStart + j]-qi ) * exp(-kappa*r) / r; - tempPotential += (clusterM[clusterStart + j]-qi*clusterM2[clusterStart + j] ) * exp(-kappa*r) / r; - // tempPotential += clusterM[clusterStart + j]* exp(-kappa*r) / r - qi*clusterM2[clusterStart + j] * exp(-kappa*r) / r; - - } - #pragma acc atomic - EnP[batchStart + i] += tempPotential; - } - } - - - } else { - /* - * If MAC fails check to see if there are children. If not, perform direct - * calculation. If there are children, call routine recursively for each. - */ - - - if ( (p->num_children == 0)|(smallEnoughLeaf==1) ) { -// printf("MAC rejected, and node has no children. Calling pc_comp_dierct()...\n"); - pc_comp_direct_yuk_SS(p->ibeg, p->iend, batch_ind[0], batch_ind[1], - xS, yS, zS, qS, wS, xT, yT, zT, qT, kappa, EnP); - } else { -// printf("MAC rejected, recursing over children...\n"); - for (i = 0; i < p->num_children; i++) { - compute_pc_yuk_SS(p->child[i], batch_ind, batch_mid, batch_rad, - xS, yS, zS, qS, wS, xT, yT, zT, qT, kappa, EnP, - clusterX, clusterY, clusterZ, clusterM, clusterM2); - } - } - } - - return; - -} /* END of function compute_pc */ - - - - -/* - * comp_direct directly computes the potential on the targets in the current - * cluster due to the current source, determined by the global variable TARPOS - */ -void pc_comp_direct_yuk_SS(int ibeg, int iend, int batch_ibeg, int batch_iend, - double *xS, double *yS, double *zS, double *qS, double *wS, - double *xT, double *yT, double *zT, double *qT, double kappa, double *EnP) -{ - /* local variables */ - int i, ii; - double tx, ty, tz; - double d_peng, r; - - int streamID = rand() % 2; - # pragma acc kernels async(streamID) present(xS,yS,zS,qS,wS,xT,yT,zT,qT,EnP) - { - #pragma acc loop independent - for (ii = batch_ibeg - 1; ii < batch_iend; ii++) { - d_peng = 0.0; - #pragma acc loop independent - for (i = ibeg - 1; i < iend; i++) { - tx = xS[i] - xT[ii]; - ty = yS[i] - yT[ii]; - tz = zS[i] - zT[ii]; - r = sqrt(tx*tx + ty*ty + tz*tz); - if (r > DBL_MIN) { - d_peng += (qS[i] - qT[ii]) * wS[i] * exp(-kappa*r) / r; - } - } - #pragma acc atomic - EnP[ii] += d_peng; - } - } - - return; - -} /* END function pc_comp_direct_yuk */ - - diff --git a/src/treedriverWrapper.c b/src/treedriverWrapper.c deleted file mode 100644 index 22f24244..00000000 --- a/src/treedriverWrapper.c +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include - -#include "array.h" -#include "particles.h" -#include "tools.h" -#include "tree.h" - -#include "treedriver.h" -#include "treedriverWrapper.h" - - -/* definition of primary treecode driver */ - -void treedriverWrapper(int numTargets, int numSources, - double *targetX, double *targetY, double *targetZ, double *targetValue, - double *sourceX, double *sourceY, double *sourceZ, double *sourceValue, - double *sourceWeight, double *outputArray, int pot_type, double kappa, - int order, double theta, int maxparnode, int batch_size, int numThreads) -{ - int particleOrder[numTargets]; - for (int i = 0; i < numTargets; ++i) particleOrder[i] = i; - - // Assemble the arrays of data into the particle structs. - struct particles *sources = NULL; - struct particles *targets = NULL; - sources = malloc(sizeof(struct particles)); - targets = malloc(sizeof(struct particles)); - - targets->num = numTargets; - targets->x = targetX; - targets->y = targetY; - targets->z = targetZ; - targets->q = targetValue; - targets->order=particleOrder; - - sources->num = numSources; - sources->x = sourceX; - sources->y = sourceY; - sources->z = sourceZ; - sources->q = sourceValue; - sources->w = sourceWeight; - sources->order=particleOrder; - - double time_tree[4]; - int tree_type = 1; - double tpeng = 0; - - // Initialize all GPUs -#ifdef OPENACC_ENABLED - if (numThreads > 0) { - #pragma omp parallel num_threads(numThreads) - { - acc_set_device_num(omp_get_thread_num(), acc_get_device_type()); - acc_init(acc_get_device_type()); - } - } -#endif - - // Call the treedriver - treedriver(sources, targets, - order, theta, maxparnode, batch_size, - pot_type, kappa, tree_type, - outputArray, &tpeng, time_tree, numThreads); - - free(sources); - free(targets); - - return; -}