diff --git a/DataLoader/Compare.cs b/DataLoader/Compare.cs index 1ff818e..9cac820 100644 --- a/DataLoader/Compare.cs +++ b/DataLoader/Compare.cs @@ -78,6 +78,7 @@ public static int Compare(int nrCandidates, int nrSpectra, int topN, Random r) var resultArrayEigenB = new int[spectraIdx.Length * topN]; var resultArrayCuda = new int[spectraIdx.Length * topN]; var resultArrayCudaB = new int[spectraIdx.Length * topN]; + var resultArrayCudaB2 = new int[spectraIdx.Length * topN]; var memStat = 1; try { @@ -140,6 +141,20 @@ public static int Compare(int nrCandidates, int nrSpectra, int topN, Random r) sw4.Stop(); + var sw5 = Stopwatch.StartNew(); + + IntPtr resultCudaB2 = findTopCandidatesCudaBatched2(csrRowoffsetsPtr, csrIdxPtr, + sValuesPtr, sIdxPtr, + csrRowoffsets.Length, csrIdx.Length, + spectraValues.Length, spectraIdx.Length, + topN, (float) 0.02, NORMALIZE, USE_GAUSSIAN, BATCH_SIZE, 0); + + Marshal.Copy(resultCudaB2, resultArrayCudaB2, 0, spectraIdx.Length * topN); + + memStat = releaseMemoryCuda(resultCudaB2); + + sw5.Stop(); + Console.WriteLine("Time for candidate search Eigen SpMV:"); Console.WriteLine(sw1.Elapsed.TotalSeconds.ToString()); Console.WriteLine("Time for candidate search Eigen SpMM:"); @@ -148,6 +163,8 @@ public static int Compare(int nrCandidates, int nrSpectra, int topN, Random r) Console.WriteLine(sw3.Elapsed.TotalSeconds.ToString()); Console.WriteLine("Time for candidate search Cuda SpGEMM:"); Console.WriteLine(sw4.Elapsed.TotalSeconds.ToString()); + Console.WriteLine("Time for candidate search Cuda SpMM:"); + Console.WriteLine(sw5.Elapsed.TotalSeconds.ToString()); } catch (Exception ex) { @@ -172,6 +189,7 @@ public static int Compare(int nrCandidates, int nrSpectra, int topN, Random r) Console.WriteLine(resultArrayEigenB[i]); Console.WriteLine(resultArrayCuda[i]); Console.WriteLine(resultArrayCudaB[i]); + Console.WriteLine(resultArrayCudaB2[i]); Console.WriteLine("-----"); } @@ -183,6 +201,7 @@ public static int Compare(int nrCandidates, int nrSpectra, int topN, Random r) Console.WriteLine(resultArrayEigenB[i]); Console.WriteLine(resultArrayCuda[i]); Console.WriteLine(resultArrayCudaB[i]); + Console.WriteLine(resultArrayCudaB2[i]); Console.WriteLine("-----"); } diff --git a/DataLoader/Cuda.cs b/DataLoader/Cuda.cs index b3610e0..92e3c77 100644 --- a/DataLoader/Cuda.cs +++ b/DataLoader/Cuda.cs @@ -25,10 +25,20 @@ private static extern IntPtr findTopCandidatesCudaBatched(IntPtr cR, IntPtr cI, int batchSize, int verbose); + [DllImport(dllCuda, CallingConvention = CallingConvention.Cdecl)] + private static extern IntPtr findTopCandidatesCudaBatched2(IntPtr cR, IntPtr cI, + IntPtr sV, IntPtr sI, + int cRL, int cNNZ, + int sVL, int sIL, + int n, float tolerance, + bool normalize, bool gaussianTol, + int batchSize, + int verbose); + [DllImport(dllCuda, CallingConvention = CallingConvention.Cdecl)] private static extern int releaseMemoryCuda(IntPtr result); - public static int Cuda(int nrCandidates, int nrSpectra, int topN, Random r, bool batched) + public static int Cuda(int nrCandidates, int nrSpectra, int topN, Random r, bool batched, int batchMode) { // generate candidate vectors var csrRowoffsets = new int[nrCandidates + 1]; @@ -119,23 +129,46 @@ public static int Cuda(int nrCandidates, int nrSpectra, int topN, Random r, bool } else { - IntPtr csrRowoffsetsPtr = csrRowoffsetsLoc.AddrOfPinnedObject(); - IntPtr csrIdxPtr = csrIdxLoc.AddrOfPinnedObject(); - IntPtr sValuesPtr = sValuesLoc.AddrOfPinnedObject(); - IntPtr sIdxPtr = sIdxLoc.AddrOfPinnedObject(); - - IntPtr result = findTopCandidatesCudaBatched(csrRowoffsetsPtr, csrIdxPtr, - sValuesPtr, sIdxPtr, - csrRowoffsets.Length, csrIdx.Length, - spectraValues.Length, spectraIdx.Length, - topN, (float) 0.02, - NORMALIZE, USE_GAUSSIAN, - BATCH_SIZE, - 1000); - - Marshal.Copy(result, resultArray, 0, spectraIdx.Length * topN); - - memStat = releaseMemoryCuda(result); + if (batchMode == 2) + { + IntPtr csrRowoffsetsPtr = csrRowoffsetsLoc.AddrOfPinnedObject(); + IntPtr csrIdxPtr = csrIdxLoc.AddrOfPinnedObject(); + IntPtr sValuesPtr = sValuesLoc.AddrOfPinnedObject(); + IntPtr sIdxPtr = sIdxLoc.AddrOfPinnedObject(); + + IntPtr result = findTopCandidatesCudaBatched2(csrRowoffsetsPtr, csrIdxPtr, + sValuesPtr, sIdxPtr, + csrRowoffsets.Length, csrIdx.Length, + spectraValues.Length, spectraIdx.Length, + topN, (float) 0.02, + NORMALIZE, USE_GAUSSIAN, + BATCH_SIZE, + 1000); + + Marshal.Copy(result, resultArray, 0, spectraIdx.Length * topN); + + memStat = releaseMemoryCuda(result); + } + else + { + IntPtr csrRowoffsetsPtr = csrRowoffsetsLoc.AddrOfPinnedObject(); + IntPtr csrIdxPtr = csrIdxLoc.AddrOfPinnedObject(); + IntPtr sValuesPtr = sValuesLoc.AddrOfPinnedObject(); + IntPtr sIdxPtr = sIdxLoc.AddrOfPinnedObject(); + + IntPtr result = findTopCandidatesCudaBatched(csrRowoffsetsPtr, csrIdxPtr, + sValuesPtr, sIdxPtr, + csrRowoffsets.Length, csrIdx.Length, + spectraValues.Length, spectraIdx.Length, + topN, (float) 0.02, + NORMALIZE, USE_GAUSSIAN, + BATCH_SIZE, + 1000); + + Marshal.Copy(result, resultArray, 0, spectraIdx.Length * topN); + + memStat = releaseMemoryCuda(result); + } } } catch (Exception ex) @@ -160,7 +193,8 @@ public static int Cuda(int nrCandidates, int nrSpectra, int topN, Random r, bool } Console.WriteLine($"MemStat: {memStat}"); - Console.WriteLine("Time for candidate search (SpMV):"); + var mode = batched ? batchMode == 2 ? "(SpMM)" : "(SpGEMM)" : "(SpMV)"; + Console.WriteLine($"Time for candidate search {mode}:"); Console.WriteLine(sw.Elapsed.TotalSeconds.ToString()); // diff --git a/DataLoader/DataLoader.cs b/DataLoader/DataLoader.cs index e2ceda6..68bf96f 100644 --- a/DataLoader/DataLoader.cs +++ b/DataLoader/DataLoader.cs @@ -43,12 +43,17 @@ public static void Main(string[] args) // call subroutine for specified mode if (mode == "Cuda") { - var status = Cuda(nrCandidates, nrSpectra, topN, r, false); + var status = Cuda(nrCandidates, nrSpectra, topN, r, false, 1); Console.WriteLine($"Cuda routine exited with status: {status}"); } else if (mode == "CudaB") { - var status = Cuda(nrCandidates, nrSpectra, topN, r, true); + var status = Cuda(nrCandidates, nrSpectra, topN, r, true, 1); + Console.WriteLine($"Cuda routine exited with status: {status}"); + } + else if (mode == "CudaBAlt") + { + var status = Cuda(nrCandidates, nrSpectra, topN, r, true, 2); Console.WriteLine($"Cuda routine exited with status: {status}"); } else if (mode == "Eigen") diff --git a/VectorSearchCUDA/dllmain.cpp b/VectorSearchCUDA/dllmain.cpp index a5b7839..957bef6 100644 --- a/VectorSearchCUDA/dllmain.cpp +++ b/VectorSearchCUDA/dllmain.cpp @@ -10,8 +10,8 @@ #include const int versionMajor = 1; -const int versionMinor = 3; -const int versionFix = 4; +const int versionMinor = 4; +const int versionFix = 5; #define METHOD_EXPORTS #ifdef METHOD_EXPORTS @@ -43,6 +43,15 @@ extern "C" { int, int); + EXPORT int* findTopCandidatesCudaBatched2(int*, int*, + int*, int*, + int, int, + int, int, + int, float, + bool, bool, + int, + int); + EXPORT int releaseMemoryCuda(int*); } @@ -262,7 +271,7 @@ int* findTopCandidatesCudaBatched(int* csrRowoffsets, int* csrColIdx, throw std::invalid_argument("Cannot return more hits than number of candidates!"); } - std::cout << "Running CUDA matrix search version " << versionMajor << "." << versionMinor << "." << versionFix << std::endl; + std::cout << "Running CUDA sparse matrix search version " << versionMajor << "." << versionMinor << "." << versionFix << std::endl; float t = round(tolerance * MASS_MULTIPLIER); int* result = new int[sILength * n]; @@ -440,6 +449,11 @@ int* findTopCandidatesCudaBatched(int* csrRowoffsets, int* csrColIdx, // host result max for (int s = 0; s < batchSize; ++s) { + + if (i + s >= sILength) { + break; + } + std::vector rowIdx; std::vector rowValues; for (int j = 0; j < spgemM_nnz; ++j) { @@ -449,7 +463,7 @@ int* findTopCandidatesCudaBatched(int* csrRowoffsets, int* csrColIdx, } } - // need to create an idx vector because we can sort rowIdx directly + // need to create an idx vector because we can't sort rowIdx directly //std::sort(rowIdx.data(), rowIdx.data() + rowIdx.size(), [&](int i, int j) {return rowValues[i] > rowValues[j];}); std::vector idx(rowIdx.size()); std::iota(idx.begin(), idx.end(), 0); @@ -462,6 +476,11 @@ int* findTopCandidatesCudaBatched(int* csrRowoffsets, int* csrColIdx, } } + // host memory deallocation + delete[] spgemM_csrRowoffsets; + delete[] spgemM_csrColIdx; + delete[] spgemM_csrValues; + // device destroy descriptors CHECK_CUSPARSE(cusparseDestroySpMat(Mat)); CHECK_CUSPARSE(cusparseDestroySpMat(spgemM)); @@ -496,6 +515,207 @@ int* findTopCandidatesCudaBatched(int* csrRowoffsets, int* csrColIdx, return result; } +/// +/// A function that calculates the top n candidates for each spectrum. Uses cusparseSpMM to calculate matrix product. +/// +/// Rowoffsets (int array) of the CSR sparse matrix (L: rows + 1). +/// Column indices (int array) of the CSR sparse matrix (L: NNZ). +/// An integer array of peaks from experimental spectra flattened. +/// An integer array that contains indices of where each spectrum starts in spectraValues. +/// Length (int) of csrRowoffsets (rows + 1). +/// Number of non-zero entries (int) in the CSR sparse matrix. +/// Length (int) of spectraValues. +/// Length (int) of spectraIdx. +/// How many of the best hits should be returned (int). +/// Tolerance for peak matching (float). +/// If candidate vectors should be normalized to sum(elements) = 1 (bool). +/// If spectrum peaks should be modelled as normal distributions or not (bool). +/// How many spectra (int) should be searched at once. +/// Print info every (int) processed spectra. +/// An integer array of length sILength * n containing the indexes of the top n candidates for each spectrum. +int* findTopCandidatesCudaBatched2(int* csrRowoffsets, int* csrColIdx, + int* spectraValues, int* spectraIdx, + int csrRowoffsetsLength, int csrNNZ, + int sVLength, int sILength, + int n, float tolerance, + bool normalize, bool gaussianTol, + int batchSize, + int verbose) { + + if (n >= csrRowoffsetsLength) { + throw std::invalid_argument("Cannot return more hits than number of candidates!"); + } + + std::cout << "Running CUDA dense matrix search version " << versionMajor << "." << versionMinor << "." << versionFix << std::endl; + + float t = round(tolerance * MASS_MULTIPLIER); + int* result = new int[sILength * n]; + float* csrValues = new float[csrNNZ]; + + // create csrValues + for (int i = 0; i < csrRowoffsetsLength - 1; ++i) { + int startIter = csrRowoffsets[i]; + int endIter = csrRowoffsets[i + 1]; + int nrNonZero = endIter - startIter; + float val = normalize ? 1.0 / (float) nrNonZero : 1.0; + for (int j = startIter; j < endIter; ++j) { + csrValues[j] = val; + } + } + + // cusparse spmM variables + float alpha = 1.0; + float beta = 0.0; + + // device memory management + int* dm_csrRowoffsets; + int* dm_csrColIdx; + float* dm_csrValues; + float* dM_dnValues; + float* dspmM_dnValues; + + // device buffer managment + void* dBuffer = NULL; + size_t bufferSize = 0; + + // device m memory management + CHECK_CUDA(cudaMalloc((void**) &dm_csrRowoffsets, csrRowoffsetsLength * sizeof(int))); + CHECK_CUDA(cudaMalloc((void**) &dm_csrColIdx, csrNNZ * sizeof(int))); + CHECK_CUDA(cudaMalloc((void**) &dm_csrValues, csrNNZ * sizeof(float))); + + // device m memory copy + CHECK_CUDA(cudaMemcpy(dm_csrRowoffsets, csrRowoffsets, csrRowoffsetsLength * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(dm_csrColIdx, csrColIdx, csrNNZ * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(dm_csrValues, csrValues, csrNNZ * sizeof(float), cudaMemcpyHostToDevice)); + + // device M memory management + CHECK_CUDA(cudaMalloc((void**) &dM_dnValues, ENCODING_SIZE * batchSize * sizeof(float))); + + // device spmm memory management + CHECK_CUDA(cudaMalloc((void**) &dspmM_dnValues, (csrRowoffsetsLength - 1) * batchSize * sizeof(float))); + + // device setup cusparse m + cusparseHandle_t handle = NULL; + cusparseSpMatDescr_t mat; + + CHECK_CUSPARSE(cusparseCreate(&handle)); + CHECK_CUSPARSE(cusparseCreateCsr(&mat, csrRowoffsetsLength - 1, ENCODING_SIZE, csrNNZ, + dm_csrRowoffsets, dm_csrColIdx, dm_csrValues, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + + for (int i = 0; i < sILength; i += batchSize) { + + // calculate spectrum matrix + float* M = new float[ENCODING_SIZE * batchSize] {0.0}; + + for (int s = 0; s < batchSize; ++s) { + + if (i + s < sILength) { + + int startIter = spectraIdx[i + s]; + int endIter = i + s + 1 == sILength ? sVLength : spectraIdx[i + s + 1]; + + for (int j = startIter; j < endIter; ++j) { + auto currentPeak = spectraValues[j]; + auto minPeak = currentPeak - t > 0 ? currentPeak - t : 0; + auto maxPeak = currentPeak + t < ENCODING_SIZE ? currentPeak + t : ENCODING_SIZE - 1; + + for (int k = minPeak; k <= maxPeak; ++k) { + float currentVal = M[s * ENCODING_SIZE + k]; + float newVal = gaussianTol ? normpdf((float) k, (float) currentPeak, (float) (t / 3.0)) : 1.0; + M[s * ENCODING_SIZE + k] = max(currentVal, newVal); + } + } + } + } + + // device setup cusparse M + cusparseDnMatDescr_t Mat; + + CHECK_CUDA(cudaMemcpy(dM_dnValues, M, ENCODING_SIZE * batchSize * sizeof(float), cudaMemcpyHostToDevice)); + CHECK_CUSPARSE(cusparseCreateDnMat(&Mat, ENCODING_SIZE, batchSize, ENCODING_SIZE, dM_dnValues, + CUDA_R_32F, CUSPARSE_ORDER_COL)); + + // host setup result matrix + float* spmm = new float[(csrRowoffsetsLength - 1) * batchSize] {0.0}; + + // device setup cusparse spmM result + cusparseDnMatDescr_t spmM; + + CHECK_CUDA(cudaMemcpy(dspmM_dnValues, spmm, (csrRowoffsetsLength - 1) * batchSize * sizeof(float), cudaMemcpyHostToDevice)); + CHECK_CUSPARSE(cusparseCreateDnMat(&spmM, csrRowoffsetsLength - 1, batchSize, csrRowoffsetsLength - 1, dspmM_dnValues, + CUDA_R_32F, CUSPARSE_ORDER_COL)); + + // device allocate buffer + CHECK_CUSPARSE(cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, mat, Mat, &beta, spmM, CUDA_R_32F, + CUSPARSE_SPMM_CSR_ALG1, &bufferSize)); + CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); + + // device spmm computation + CHECK_CUSPARSE(cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, mat, Mat, &beta, spmM, CUDA_R_32F, + CUSPARSE_SPMM_CSR_ALG1, dBuffer)); + + // host get spmM result + CHECK_CUDA(cudaMemcpy(spmm, dspmM_dnValues, (csrRowoffsetsLength - 1) * batchSize * sizeof(float), cudaMemcpyDeviceToHost)); + + // host result max + for (int s = 0; s < batchSize; ++s) { + + if (i + s >= sILength) { + break; + } + + int currentCol = s * (csrRowoffsetsLength - 1); + std::vector values(csrRowoffsetsLength - 1, 0.0); + for (int j = 0; j < csrRowoffsetsLength - 1; ++j) { + values[j] = spmm[currentCol + j]; + } + + std::vector idx(csrRowoffsetsLength - 1); + std::iota(idx.begin(), idx.end(), 0); + std::sort(idx.begin(), idx.end(), [&](int i, int j) {return values[i] > values[j];}); + + if (idx.size() >= n) { + for (int j = 0; j < n; ++j) { + result[(i + s) * n + j] = idx[j]; + } + } + } + + // host memory deallocation + delete[] spmm; + delete[] M; + + // device destroy descriptors + CHECK_CUSPARSE(cusparseDestroyDnMat(Mat)); + CHECK_CUSPARSE(cusparseDestroyDnMat(spmM)); + + if (verbose != 0 && (i + batchSize) % verbose == 0) { + std::cout << "Searched " << i + batchSize << " spectra in total..." << std::endl; + } + } + + // device destroy descriptors + CHECK_CUSPARSE(cusparseDestroySpMat(mat)); + CHECK_CUSPARSE(cusparseDestroy(handle)); + + // device memory deallocation + CHECK_CUDA(cudaFree(dm_csrRowoffsets)); + CHECK_CUDA(cudaFree(dm_csrColIdx)); + CHECK_CUDA(cudaFree(dm_csrValues)); + CHECK_CUDA(cudaFree(dM_dnValues)); + CHECK_CUDA(cudaFree(dspmM_dnValues)); + CHECK_CUDA(cudaFree(dBuffer)); + + // host memory deallocation + delete[] csrValues; + + return result; +} + /// /// Free memory after result has been marshalled. /// diff --git a/dll/VectorSearchCUDA.dll b/dll/VectorSearchCUDA.dll index a0b4d88..42f7ca1 100644 Binary files a/dll/VectorSearchCUDA.dll and b/dll/VectorSearchCUDA.dll differ