update goldilocks and fix ut

okx · Sep 3, 2024 · 0e9d5d5 · 0e9d5d5
1 parent aaff872
commit 0e9d5d5
Show file tree

Hide file tree

Showing 6 changed files with 38 additions and 29 deletions.
diff --git a/src/goldilocks b/src/goldilocks
diff --git a/src/prover/prover.cpp b/src/prover/prover.cpp
@@ -53,7 +53,7 @@
 #include "cuda_utils.hpp"
 #include "ntt_goldilocks.hpp"
 #include <pthread.h>
-#include "chelpers_steps_pack.cuh"
+#include "chelpers_steps_gpu.cuh"
 
 int asynctask(void* (*task)(void* args), void* arg)
 {
@@ -601,7 +601,7 @@ void Prover::genBatchProof(ProverRequest *pProverRequest)
         /*************************************/
 
 #if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
-        CHelpersStepsPackGPU cHelpersSteps;
+        CHelpersStepsGPU cHelpersSteps;
 #elif defined(__AVX512__)
         CHelpersStepsAvx512 cHelpersSteps;
 #elif defined(__PACK__)
@@ -854,7 +854,7 @@ void Prover::genAggregatedProof(ProverRequest *pProverRequest)
 
     if(USE_GENERIC_PARSER) {
 #if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
-        CHelpersStepsPackGPU cHelpersSteps;
+        CHelpersStepsGPU cHelpersSteps;
 #elif defined(__AVX512__)
         CHelpersStepsAvx512 cHelpersSteps;
 #elif defined(__PACK__) 
@@ -963,7 +963,7 @@ void Prover::genFinalProof(ProverRequest *pProverRequest)
     FRIProofC12 fproofRecursiveF((1 << polBitsRecursiveF), FIELD_EXTENSION, starksRecursiveF->starkInfo.starkStruct.steps.size(), starksRecursiveF->starkInfo.evMap.size(), starksRecursiveF->starkInfo.nPublics);
     if(USE_GENERIC_PARSER) {
         #if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
-            CHelpersStepsPackGPU cHelpersSteps;
+            CHelpersStepsGPU cHelpersSteps;
         #elif defined(__AVX512__)
             CHelpersStepsAvx512 cHelpersSteps;
         #elif defined(__PACK__) 

diff --git a/src/starkpil/chelpers_steps_pack.cu → src/starkpil/chelpers_steps_gpu.cu b/src/starkpil/chelpers_steps_pack.cu → src/starkpil/chelpers_steps_gpu.cu
@@ -5,20 +5,20 @@
 #ifdef __AVX512__
 #include "chelpers_steps_avx512.hpp"
 #endif
-#include "chelpers_steps_pack.cuh"
+#include "chelpers_steps_gpu.cuh"
 #include "goldilocks_cubic_extension.cuh"
 #include "cuda_utils.cuh"
 #include "cuda_utils.hpp"
 #include "timer.hpp"
 
 const uint64_t MAX_U64 = 0xFFFFFFFFFFFFFFFF;
 
-CHelpersStepsPackGPU *cHelpersSteps[MAX_GPUS];
+CHelpersStepsGPU *cHelpersSteps[MAX_GPUS];
 uint64_t *gpuSharedStorage[MAX_GPUS];
 uint64_t *streamExclusiveStorage[nStreams*MAX_GPUS];
 cudaStream_t streams[nStreams*MAX_GPUS];
 
-void CHelpersStepsPackGPU::prepareGPU(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams) {
+void CHelpersStepsGPU::prepareGPU(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams) {
 
     prepare(starkInfo, params, parserArgs, parserParams);
 
@@ -145,8 +145,8 @@ void CHelpersStepsPackGPU::prepareGPU(StarkInfo &starkInfo, StepsParams &params,
 
     for (int d=0;d<nDevices;d++) {
         CHECKCUDAERR(cudaSetDevice(d));
-        CHECKCUDAERR(cudaMalloc((void **)&(cHelpersSteps[d]), sizeof(CHelpersStepsPackGPU)));
-        CHECKCUDAERR(cudaMemcpy(cHelpersSteps[d], this, sizeof(CHelpersStepsPackGPU), cudaMemcpyHostToDevice));
+        CHECKCUDAERR(cudaMalloc((void **)&(cHelpersSteps[d]), sizeof(CHelpersStepsGPU)));
+        CHECKCUDAERR(cudaMemcpy(cHelpersSteps[d], this, sizeof(CHelpersStepsGPU), cudaMemcpyHostToDevice));
     }
 
     for (uint32_t s = 0; s < nStreams*nDevices; s++) {
@@ -155,7 +155,7 @@ void CHelpersStepsPackGPU::prepareGPU(StarkInfo &starkInfo, StepsParams &params,
     }
 }
 
-void CHelpersStepsPackGPU::cleanupGPU() {
+void CHelpersStepsGPU::cleanupGPU() {
     CHECKCUDAERR(cudaGetDeviceCount(&nDevices));
     for (int d=0;d<nDevices;d++) {
         cudaFree(gpuSharedStorage[d]);
@@ -172,7 +172,7 @@ void CHelpersStepsPackGPU::cleanupGPU() {
 }
 
 
-void CHelpersStepsPackGPU::calculateExpressions(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams) {
+void CHelpersStepsGPU::calculateExpressions(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams) {
 
     if (!starkInfo.reduceMemory || parserParams.stage == 2) { // in these cases, cpu version is faster
 #ifdef __AVX512__
@@ -188,21 +188,27 @@ void CHelpersStepsPackGPU::calculateExpressions(StarkInfo &starkInfo, StepsParam
     cleanupGPU();
 }
 
-void CHelpersStepsPackGPU::calculateExpressionsRowsGPU(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams,
+void CHelpersStepsGPU::calculateExpressionsRowsGPU(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams,
     uint64_t rowIni, uint64_t rowEnd){
 
     if(rowEnd < rowIni || rowEnd > domainSize || (rowEnd -rowIni) % nrowsPack != 0) {
         zklog.info("Invalid range for rowIni " + to_string(rowIni) + " and rowEnd " + to_string(rowEnd));
         exitProcess();
     }
 
+    if ((rowEnd - rowIni) < nrowsPack*nCudaThreads*nStreams*nDevices) {
+        nCudaThreads = (rowEnd - rowIni) / (nrowsPack*nStreams*nDevices);
+        subDomainSize = nrowsPack * nCudaThreads;
+        printf("nCudaThreads: %u\n", nCudaThreads);
+    }
+
     assert((rowEnd - rowIni) % (nrowsPack*nCudaThreads*nStreams*nDevices) == 0);
     uint64_t nrowPerStream = (rowEnd - rowIni) / nStreams /nDevices;
 
     for (int s=0; s<nStreams*nDevices; s++) {
         int d = s/nStreams;
         CHECKCUDAERR(cudaSetDevice(d));
-        CHelpersStepsPackGPU *cHelpersSteps_d = cHelpersSteps[d];
+        CHelpersStepsGPU *cHelpersSteps_d = cHelpersSteps[d];
         uint64_t *sharedStorage = gpuSharedStorage[d];
         uint64_t *exclusiveStorage = streamExclusiveStorage[s];
         cudaStream_t stream = streams[s];
@@ -233,7 +239,7 @@ void CHelpersStepsPackGPU::calculateExpressionsRowsGPU(StarkInfo &starkInfo, Ste
     TimerStopAndLog(WAIT_STREAM);
 }
 
-void CHelpersStepsPackGPU::loadData(StarkInfo &starkInfo, StepsParams &params, uint64_t row, uint32_t s) {
+void CHelpersStepsGPU::loadData(StarkInfo &starkInfo, StepsParams &params, uint64_t row, uint32_t s) {
 
     ConstantPolsStarks *constPols = domainExtended ? params.pConstPols2ns : params.pConstPols;
     Polinomial &x = domainExtended ? params.x_2ns : params.x_n;
@@ -272,7 +278,7 @@ void CHelpersStepsPackGPU::loadData(StarkInfo &starkInfo, StepsParams &params, u
     CHECKCUDAERR(cudaMemcpyAsync(xDivXSubXi_d + subDomainSize *FIELD_EXTENSION, params.xDivXSubXi[domainSize + row], subDomainSize *FIELD_EXTENSION * sizeof(uint64_t), cudaMemcpyHostToDevice, stream));
 }
 
-void CHelpersStepsPackGPU::storeData(StarkInfo &starkInfo, StepsParams &params, uint64_t row, uint32_t s) {
+void CHelpersStepsGPU::storeData(StarkInfo &starkInfo, StepsParams &params, uint64_t row, uint32_t s) {
     uint64_t *pols_d = streamExclusiveStorage[s] + pols_offset;
     cudaStream_t stream = streams[s];
     for (uint64_t s = 1; s < 11; s++) {
@@ -282,7 +288,7 @@ void CHelpersStepsPackGPU::storeData(StarkInfo &starkInfo, StepsParams &params,
     }
 }
 
-__global__ void loadPolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage, uint64_t nConstants, uint64_t stage) {
+__global__ void loadPolinomialsGPU(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage, uint64_t nConstants, uint64_t stage) {
 
     uint64_t nCudaThreads = cHelpersSteps->nCudaThreads;
 
@@ -357,7 +363,7 @@ __global__ void loadPolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t
     }
 }
 
-__global__ void storePolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage) {
+__global__ void storePolinomialsGPU(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage) {
     uint64_t nCudaThreads = cHelpersSteps->nCudaThreads;
 
     uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -395,7 +401,7 @@ __global__ void storePolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_
     }
 }
 
-__global__ void pack_kernel(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage)
+__global__ void pack_kernel(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage)
 {
     uint64_t nCudaThreads = cHelpersSteps->nCudaThreads;
 

diff --git a/src/starkpil/chelpers_steps_pack.cuh → src/starkpil/chelpers_steps_gpu.cuh b/src/starkpil/chelpers_steps_pack.cuh → src/starkpil/chelpers_steps_gpu.cuh
@@ -7,7 +7,7 @@
 const int nStreams = 2; // streams per device
 const int MAX_GPUS = 8;
 class gl64_t;
-class CHelpersStepsPackGPU: public CHelpersStepsPack {
+class CHelpersStepsGPU: public CHelpersStepsPack {
 public:
 
     int nDevices;
@@ -58,9 +58,9 @@ public:
     void storeData(StarkInfo &starkInfo, StepsParams &params, uint64_t row, uint32_t streamIdx);
 };
 
-__global__ void loadPolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage, uint64_t nConstants, uint64_t stage);
-__global__ void storePolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage);
-__global__ void pack_kernel(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage);
+__global__ void loadPolinomialsGPU(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage, uint64_t nConstants, uint64_t stage);
+__global__ void storePolinomialsGPU(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage);
+__global__ void pack_kernel(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage);
 
 #endif
 #endif
diff --git a/src/starkpil/starks.cpp b/src/starkpil/starks.cpp
@@ -63,7 +63,6 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil
         nBlocksStage1++;
     }
 
-    printf("cm1 offset:%lu, size:%lu\n", nttOffsetHelperStage1.first, nttOffsetHelperStage1.second);
     ntt.extendPol(reduceMemory?p_cm1_2ns_tmp:p_cm1_2ns, p_cm1_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm1_n], pBuffHelperStage1, 3, nBlocksStage1);
     TimerStopAndLog(STARK_STEP_1_LDE);
 
@@ -135,7 +134,7 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil
     while((nttOffsetHelperStage2.second * nBlocksStage2 < buffHelperElementsStage2) || (starkInfo.mapSectionsN.section[cm2_n] > 256*nBlocksStage2) ) {
         nBlocksStage2++;
     }
-    printf("cm2 offset:%lu, size:%lu\n", nttOffsetHelperStage2.first, nttOffsetHelperStage2.second);
+
     ntt.extendPol(reduceMemory?p_cm2_2ns_tmp:p_cm2_2ns, p_cm2_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm2_n], pBuffHelperStage2, 3, nBlocksStage2);
     TimerStopAndLog(STARK_STEP_2_LDE);
 
@@ -186,7 +185,7 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil
     while((nttOffsetHelperStage3.second * nBlocksStage3 < buffHelperElementsStage3) || (starkInfo.mapSectionsN.section[cm3_n] > 256*nBlocksStage3) ) {
         nBlocksStage3++;
     }
-    printf("cm3 offset:%lu, size:%lu\n", nttOffsetHelperStage3.first, nttOffsetHelperStage3.second);
+
     ntt.extendPol(p_cm3_2ns, p_cm3_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm3_n], pBuffHelperStage3, 3, nBlocksStage3);
     TimerStopAndLog(STARK_STEP_3_LDE);
 
@@ -212,7 +211,6 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil
             nBlocksStage1_++;
         }
 
-        printf("cm1 offset:%lu, size:%lu\n", nttOffsetHelperStage1_.first, nttOffsetHelperStage1_.second);
         ntt.extendPol(p_cm1_2ns, p_cm1_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm1_n], pBuffHelperStage1_, 3, nBlocksStage1_);
         TimerStopAndLog(STARK_STEP_1_RECALCULATING_LDE);
 
@@ -226,7 +224,7 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil
         while((nttOffsetHelperStage2_.second * nBlocksStage2_ < buffHelperElementsStage2_) || (starkInfo.mapSectionsN.section[cm2_n] > 256*nBlocksStage2_) ) {
             nBlocksStage2_++;
         }
-        printf("cm2 offset:%lu, size:%lu\n", nttOffsetHelperStage2_.first, nttOffsetHelperStage2_.second);
+
         ntt.extendPol(p_cm2_2ns, p_cm2_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm2_n], pBuffHelperStage2_, 3, nBlocksStage2_);
         TimerStopAndLog(STARK_STEP_2_RECALCULATING_LDE);
 

diff --git a/test/examples/main.cpp b/test/examples/main.cpp
@@ -6,10 +6,11 @@
     #include "chelpers_steps_avx512.hpp"
 #endif
 #include "chelpers_steps_pack.hpp"
-#include "chelpers_steps_gpu.hpp"
+#include "chelpers_steps_gpu.cuh"
 #include "AllSteps.hpp"
 #include "zklog.hpp"
 #include "exit_process.hpp"
+#include "cuda_utils.hpp"
 
 int main()
 {
@@ -37,6 +38,10 @@ int main()
 
     StarkInfo starkInfo(starkInfoFile, reduceMemory);
 
+#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
+    alloc_pinned_mem_per_device((1 << starkInfo.starkStruct.nBitsExt) * 32);
+#endif
+
     uint64_t polBits = starkInfo.starkStruct.steps[starkInfo.starkStruct.steps.size() - 1].nBits;
     FRIProof fproof((1 << polBits), FIELD_EXTENSION, starkInfo.starkStruct.steps.size(), starkInfo.evMap.size(), starkInfo.nPublics);
+0 −7		Makefile
+0 −33		src/gl64_t.cuh
+0 −95		src/goldilocks_cubic_extension.cuh
+0 −2		src/ntt_goldilocks.cu
+0 −2		src/poseidon_goldilocks.cu
+1 −110		tests/tests.cu