Skip to content

Commit

Permalink
update goldilocks and fix ut
Browse files Browse the repository at this point in the history
  • Loading branch information
yann-sjtu committed Sep 3, 2024
1 parent aaff872 commit 0e9d5d5
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 29 deletions.
2 changes: 1 addition & 1 deletion src/goldilocks
8 changes: 4 additions & 4 deletions src/prover/prover.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
#include "cuda_utils.hpp"
#include "ntt_goldilocks.hpp"
#include <pthread.h>
#include "chelpers_steps_pack.cuh"
#include "chelpers_steps_gpu.cuh"

int asynctask(void* (*task)(void* args), void* arg)
{
Expand Down Expand Up @@ -601,7 +601,7 @@ void Prover::genBatchProof(ProverRequest *pProverRequest)
/*************************************/

#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
CHelpersStepsPackGPU cHelpersSteps;
CHelpersStepsGPU cHelpersSteps;
#elif defined(__AVX512__)
CHelpersStepsAvx512 cHelpersSteps;
#elif defined(__PACK__)
Expand Down Expand Up @@ -854,7 +854,7 @@ void Prover::genAggregatedProof(ProverRequest *pProverRequest)

if(USE_GENERIC_PARSER) {
#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
CHelpersStepsPackGPU cHelpersSteps;
CHelpersStepsGPU cHelpersSteps;
#elif defined(__AVX512__)
CHelpersStepsAvx512 cHelpersSteps;
#elif defined(__PACK__)
Expand Down Expand Up @@ -963,7 +963,7 @@ void Prover::genFinalProof(ProverRequest *pProverRequest)
FRIProofC12 fproofRecursiveF((1 << polBitsRecursiveF), FIELD_EXTENSION, starksRecursiveF->starkInfo.starkStruct.steps.size(), starksRecursiveF->starkInfo.evMap.size(), starksRecursiveF->starkInfo.nPublics);
if(USE_GENERIC_PARSER) {
#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
CHelpersStepsPackGPU cHelpersSteps;
CHelpersStepsGPU cHelpersSteps;
#elif defined(__AVX512__)
CHelpersStepsAvx512 cHelpersSteps;
#elif defined(__PACK__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@
#ifdef __AVX512__
#include "chelpers_steps_avx512.hpp"
#endif
#include "chelpers_steps_pack.cuh"
#include "chelpers_steps_gpu.cuh"
#include "goldilocks_cubic_extension.cuh"
#include "cuda_utils.cuh"
#include "cuda_utils.hpp"
#include "timer.hpp"

const uint64_t MAX_U64 = 0xFFFFFFFFFFFFFFFF;

CHelpersStepsPackGPU *cHelpersSteps[MAX_GPUS];
CHelpersStepsGPU *cHelpersSteps[MAX_GPUS];
uint64_t *gpuSharedStorage[MAX_GPUS];
uint64_t *streamExclusiveStorage[nStreams*MAX_GPUS];
cudaStream_t streams[nStreams*MAX_GPUS];

void CHelpersStepsPackGPU::prepareGPU(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams) {
void CHelpersStepsGPU::prepareGPU(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams) {

prepare(starkInfo, params, parserArgs, parserParams);

Expand Down Expand Up @@ -145,8 +145,8 @@ void CHelpersStepsPackGPU::prepareGPU(StarkInfo &starkInfo, StepsParams &params,

for (int d=0;d<nDevices;d++) {
CHECKCUDAERR(cudaSetDevice(d));
CHECKCUDAERR(cudaMalloc((void **)&(cHelpersSteps[d]), sizeof(CHelpersStepsPackGPU)));
CHECKCUDAERR(cudaMemcpy(cHelpersSteps[d], this, sizeof(CHelpersStepsPackGPU), cudaMemcpyHostToDevice));
CHECKCUDAERR(cudaMalloc((void **)&(cHelpersSteps[d]), sizeof(CHelpersStepsGPU)));
CHECKCUDAERR(cudaMemcpy(cHelpersSteps[d], this, sizeof(CHelpersStepsGPU), cudaMemcpyHostToDevice));
}

for (uint32_t s = 0; s < nStreams*nDevices; s++) {
Expand All @@ -155,7 +155,7 @@ void CHelpersStepsPackGPU::prepareGPU(StarkInfo &starkInfo, StepsParams &params,
}
}

void CHelpersStepsPackGPU::cleanupGPU() {
void CHelpersStepsGPU::cleanupGPU() {
CHECKCUDAERR(cudaGetDeviceCount(&nDevices));
for (int d=0;d<nDevices;d++) {
cudaFree(gpuSharedStorage[d]);
Expand All @@ -172,7 +172,7 @@ void CHelpersStepsPackGPU::cleanupGPU() {
}


void CHelpersStepsPackGPU::calculateExpressions(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams) {
void CHelpersStepsGPU::calculateExpressions(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams) {

if (!starkInfo.reduceMemory || parserParams.stage == 2) { // in these cases, cpu version is faster
#ifdef __AVX512__
Expand All @@ -188,21 +188,27 @@ void CHelpersStepsPackGPU::calculateExpressions(StarkInfo &starkInfo, StepsParam
cleanupGPU();
}

void CHelpersStepsPackGPU::calculateExpressionsRowsGPU(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams,
void CHelpersStepsGPU::calculateExpressionsRowsGPU(StarkInfo &starkInfo, StepsParams &params, ParserArgs &parserArgs, ParserParams &parserParams,
uint64_t rowIni, uint64_t rowEnd){

if(rowEnd < rowIni || rowEnd > domainSize || (rowEnd -rowIni) % nrowsPack != 0) {
zklog.info("Invalid range for rowIni " + to_string(rowIni) + " and rowEnd " + to_string(rowEnd));
exitProcess();
}

if ((rowEnd - rowIni) < nrowsPack*nCudaThreads*nStreams*nDevices) {
nCudaThreads = (rowEnd - rowIni) / (nrowsPack*nStreams*nDevices);
subDomainSize = nrowsPack * nCudaThreads;
printf("nCudaThreads: %u\n", nCudaThreads);
}

assert((rowEnd - rowIni) % (nrowsPack*nCudaThreads*nStreams*nDevices) == 0);
uint64_t nrowPerStream = (rowEnd - rowIni) / nStreams /nDevices;

for (int s=0; s<nStreams*nDevices; s++) {
int d = s/nStreams;
CHECKCUDAERR(cudaSetDevice(d));
CHelpersStepsPackGPU *cHelpersSteps_d = cHelpersSteps[d];
CHelpersStepsGPU *cHelpersSteps_d = cHelpersSteps[d];
uint64_t *sharedStorage = gpuSharedStorage[d];
uint64_t *exclusiveStorage = streamExclusiveStorage[s];
cudaStream_t stream = streams[s];
Expand Down Expand Up @@ -233,7 +239,7 @@ void CHelpersStepsPackGPU::calculateExpressionsRowsGPU(StarkInfo &starkInfo, Ste
TimerStopAndLog(WAIT_STREAM);
}

void CHelpersStepsPackGPU::loadData(StarkInfo &starkInfo, StepsParams &params, uint64_t row, uint32_t s) {
void CHelpersStepsGPU::loadData(StarkInfo &starkInfo, StepsParams &params, uint64_t row, uint32_t s) {

ConstantPolsStarks *constPols = domainExtended ? params.pConstPols2ns : params.pConstPols;
Polinomial &x = domainExtended ? params.x_2ns : params.x_n;
Expand Down Expand Up @@ -272,7 +278,7 @@ void CHelpersStepsPackGPU::loadData(StarkInfo &starkInfo, StepsParams &params, u
CHECKCUDAERR(cudaMemcpyAsync(xDivXSubXi_d + subDomainSize *FIELD_EXTENSION, params.xDivXSubXi[domainSize + row], subDomainSize *FIELD_EXTENSION * sizeof(uint64_t), cudaMemcpyHostToDevice, stream));
}

void CHelpersStepsPackGPU::storeData(StarkInfo &starkInfo, StepsParams &params, uint64_t row, uint32_t s) {
void CHelpersStepsGPU::storeData(StarkInfo &starkInfo, StepsParams &params, uint64_t row, uint32_t s) {
uint64_t *pols_d = streamExclusiveStorage[s] + pols_offset;
cudaStream_t stream = streams[s];
for (uint64_t s = 1; s < 11; s++) {
Expand All @@ -282,7 +288,7 @@ void CHelpersStepsPackGPU::storeData(StarkInfo &starkInfo, StepsParams &params,
}
}

__global__ void loadPolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage, uint64_t nConstants, uint64_t stage) {
__global__ void loadPolinomialsGPU(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage, uint64_t nConstants, uint64_t stage) {

uint64_t nCudaThreads = cHelpersSteps->nCudaThreads;

Expand Down Expand Up @@ -357,7 +363,7 @@ __global__ void loadPolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t
}
}

__global__ void storePolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage) {
__global__ void storePolinomialsGPU(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage) {
uint64_t nCudaThreads = cHelpersSteps->nCudaThreads;

uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
Expand Down Expand Up @@ -395,7 +401,7 @@ __global__ void storePolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_
}
}

__global__ void pack_kernel(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage)
__global__ void pack_kernel(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage)
{
uint64_t nCudaThreads = cHelpersSteps->nCudaThreads;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
const int nStreams = 2; // streams per device
const int MAX_GPUS = 8;
class gl64_t;
class CHelpersStepsPackGPU: public CHelpersStepsPack {
class CHelpersStepsGPU: public CHelpersStepsPack {
public:

int nDevices;
Expand Down Expand Up @@ -58,9 +58,9 @@ public:
void storeData(StarkInfo &starkInfo, StepsParams &params, uint64_t row, uint32_t streamIdx);
};

__global__ void loadPolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage, uint64_t nConstants, uint64_t stage);
__global__ void storePolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage);
__global__ void pack_kernel(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage);
__global__ void loadPolinomialsGPU(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage, uint64_t nConstants, uint64_t stage);
__global__ void storePolinomialsGPU(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage);
__global__ void pack_kernel(CHelpersStepsGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage);

#endif
#endif
8 changes: 3 additions & 5 deletions src/starkpil/starks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil
nBlocksStage1++;
}

printf("cm1 offset:%lu, size:%lu\n", nttOffsetHelperStage1.first, nttOffsetHelperStage1.second);
ntt.extendPol(reduceMemory?p_cm1_2ns_tmp:p_cm1_2ns, p_cm1_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm1_n], pBuffHelperStage1, 3, nBlocksStage1);
TimerStopAndLog(STARK_STEP_1_LDE);

Expand Down Expand Up @@ -135,7 +134,7 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil
while((nttOffsetHelperStage2.second * nBlocksStage2 < buffHelperElementsStage2) || (starkInfo.mapSectionsN.section[cm2_n] > 256*nBlocksStage2) ) {
nBlocksStage2++;
}
printf("cm2 offset:%lu, size:%lu\n", nttOffsetHelperStage2.first, nttOffsetHelperStage2.second);

ntt.extendPol(reduceMemory?p_cm2_2ns_tmp:p_cm2_2ns, p_cm2_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm2_n], pBuffHelperStage2, 3, nBlocksStage2);
TimerStopAndLog(STARK_STEP_2_LDE);

Expand Down Expand Up @@ -186,7 +185,7 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil
while((nttOffsetHelperStage3.second * nBlocksStage3 < buffHelperElementsStage3) || (starkInfo.mapSectionsN.section[cm3_n] > 256*nBlocksStage3) ) {
nBlocksStage3++;
}
printf("cm3 offset:%lu, size:%lu\n", nttOffsetHelperStage3.first, nttOffsetHelperStage3.second);

ntt.extendPol(p_cm3_2ns, p_cm3_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm3_n], pBuffHelperStage3, 3, nBlocksStage3);
TimerStopAndLog(STARK_STEP_3_LDE);

Expand All @@ -212,7 +211,6 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil
nBlocksStage1_++;
}

printf("cm1 offset:%lu, size:%lu\n", nttOffsetHelperStage1_.first, nttOffsetHelperStage1_.second);
ntt.extendPol(p_cm1_2ns, p_cm1_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm1_n], pBuffHelperStage1_, 3, nBlocksStage1_);
TimerStopAndLog(STARK_STEP_1_RECALCULATING_LDE);

Expand All @@ -226,7 +224,7 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil
while((nttOffsetHelperStage2_.second * nBlocksStage2_ < buffHelperElementsStage2_) || (starkInfo.mapSectionsN.section[cm2_n] > 256*nBlocksStage2_) ) {
nBlocksStage2_++;
}
printf("cm2 offset:%lu, size:%lu\n", nttOffsetHelperStage2_.first, nttOffsetHelperStage2_.second);

ntt.extendPol(p_cm2_2ns, p_cm2_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm2_n], pBuffHelperStage2_, 3, nBlocksStage2_);
TimerStopAndLog(STARK_STEP_2_RECALCULATING_LDE);

Expand Down
7 changes: 6 additions & 1 deletion test/examples/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
#include "chelpers_steps_avx512.hpp"
#endif
#include "chelpers_steps_pack.hpp"
#include "chelpers_steps_gpu.hpp"
#include "chelpers_steps_gpu.cuh"
#include "AllSteps.hpp"
#include "zklog.hpp"
#include "exit_process.hpp"
#include "cuda_utils.hpp"

int main()
{
Expand Down Expand Up @@ -37,6 +38,10 @@ int main()

StarkInfo starkInfo(starkInfoFile, reduceMemory);

#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
alloc_pinned_mem_per_device((1 << starkInfo.starkStruct.nBitsExt) * 32);
#endif

uint64_t polBits = starkInfo.starkStruct.steps[starkInfo.starkStruct.steps.size() - 1].nBits;
FRIProof fproof((1 << polBits), FIELD_EXTENSION, starkInfo.starkStruct.steps.size(), starkInfo.evMap.size(), starkInfo.nPublics);

Expand Down

0 comments on commit 0e9d5d5

Please sign in to comment.