From bd623628b801dfce230edd42a032edb9c19dbfd9 Mon Sep 17 00:00:00 2001 From: Qianqian Fang Date: Mon, 18 Nov 2024 22:38:54 -0500 Subject: [PATCH] [bug] fix mcxlab crash with >2^31-1 voxels, fix #235, revert #164 #235 was reported by Thomas Rialan in https://groups.google.com/g/mcx-users/c/3zvP5W4OGnM revert #164 to fix high error reported by Jessica Lund in https://groups.google.com/g/mcx-users/c/hr6lqpfTYbg --- src/mcx_core.cu | 58 ++++++++++++++++++++++++++----------------------- src/mcx_utils.c | 2 +- src/mcx_utils.h | 2 +- src/mcxlab.cpp | 10 ++++----- 4 files changed, 38 insertions(+), 34 deletions(-) diff --git a/src/mcx_core.cu b/src/mcx_core.cu index 14b72630..75056ecd 100644 --- a/src/mcx_core.cu +++ b/src/mcx_core.cu @@ -1114,7 +1114,7 @@ __device__ inline int launchnewphoton(MCXpos* p, MCXdir* v, Stokes* s, MCXtime* if (fabsf(oldval) > MAX_ACCUM) { atomicadd(& field[*idx1d + tshift * gcfg->dimlen.z], ((oldval > 0.f) ? -MAX_ACCUM : MAX_ACCUM)); - atomicadd(& field[*idx1d + tshift * gcfg->dimlen.z + gcfg->dimlen.w], ((oldval > 0.f) ? MAX_ACCUM : -MAX_ACCUM)); + atomicadd(& field[*idx1d + tshift * gcfg->dimlen.z + (uint64_t)gcfg->dimlen.z * gcfg->dimlen.w], ((oldval > 0.f) ? MAX_ACCUM : -MAX_ACCUM)); } #endif @@ -1132,7 +1132,7 @@ __device__ inline int launchnewphoton(MCXpos* p, MCXdir* v, Stokes* s, MCXtime* if (fabsf(oldval) > MAX_ACCUM) { atomicadd(& field[(*idx1d + tshift * gcfg->dimlen.z)*gcfg->srcnum + i], ((oldval > 0.f) ? -MAX_ACCUM : MAX_ACCUM)); - atomicadd(& field[(*idx1d + tshift * gcfg->dimlen.z)*gcfg->srcnum + i + gcfg->dimlen.w], ((oldval > 0.f) ? MAX_ACCUM : -MAX_ACCUM)); + atomicadd(& field[(*idx1d + tshift * gcfg->dimlen.z)*gcfg->srcnum + i + (uint64_t)gcfg->dimlen.z * gcfg->dimlen.w], ((oldval > 0.f) ? MAX_ACCUM : -MAX_ACCUM)); } #endif @@ -1993,7 +1993,7 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], if (atomicadd(& field[idx1d + tshift * gcfg->dimlen.z], -oldval) < 0.f) { atomicadd(& field[idx1d + tshift * gcfg->dimlen.z], oldval); } else { - atomicadd(& field[idx1d + tshift * gcfg->dimlen.z + gcfg->dimlen.w], oldval); + atomicadd(& field[idx1d + tshift * gcfg->dimlen.z + (uint64_t)gcfg->dimlen.z * gcfg->dimlen.w], oldval); } } @@ -2142,7 +2142,7 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], if (gcfg->outputtype == otEnergy) { weight = w0 - p.w; } else if (gcfg->outputtype == otFluence || gcfg->outputtype == otFlux) { - weight = (prop.mua < 0.001f) ? (w0 * len) : __fdividef(w0 - p.w, prop.mua); /** when mua->0, take limit_{mua->0} w0*(1-exp(-mua*len))/mua yields w0*len */ + weight = (prop.mua < EPS) ? (w0 * len) : __fdividef(w0 - p.w, prop.mua); /** when mua->0, the first two terms of Taylor expansion of w0*(1-exp(-mua*len))/mua = w0*len - mua*len^2*w0/2 */ } else if (gcfg->seed == SEED_FROM_FILE) { if (gcfg->outputtype == otJacobian || gcfg->outputtype == otRF) { weight = replayweight[(idx * gcfg->threadphoton + min(idx, gcfg->oddphotons - 1) + (int)f.ndone)] * f.pathlen; @@ -2186,11 +2186,11 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], if (fabsf(oldval) > MAX_ACCUM && gcfg->outputtype != otRF) { atomicadd(& field[idx1dold + tshift * gcfg->dimlen.z], ((oldval > 0.f) ? -MAX_ACCUM : MAX_ACCUM)); - atomicadd(& field[idx1dold + tshift * gcfg->dimlen.z + gcfg->dimlen.w], ((oldval > 0.f) ? MAX_ACCUM : -MAX_ACCUM)); - GPUDEBUG(("reducing float round-off error by moving %e to [%d], oldval=%f\n", MAX_ACCUM, idx1dold + tshift * gcfg->dimlen.z + gcfg->dimlen.w, oldval)); + atomicadd(& field[idx1dold + tshift * gcfg->dimlen.z + (uint64_t)gcfg->dimlen.z * gcfg->dimlen.w], ((oldval > 0.f) ? MAX_ACCUM : -MAX_ACCUM)); + GPUDEBUG(("reducing float round-off error by moving %e to [%d], oldval=%f\n", MAX_ACCUM, idx1dold + tshift * gcfg->dimlen.z + (uint64_t)gcfg->dimlen.z * gcfg->dimlen.w, oldval)); } else if (gcfg->outputtype == otRF && gcfg->omega > 0.f) { oldval = -replayweight[(idx * gcfg->threadphoton + min(idx, gcfg->oddphotons - 1) + (int)f.ndone)] * f.pathlen * ppath[gcfg->w0offset + gcfg->srcnum + 1]; - atomicadd(& field[idx1dold + tshift * gcfg->dimlen.z + gcfg->dimlen.w], oldval); + atomicadd(& field[idx1dold + tshift * gcfg->dimlen.z + (uint64_t)gcfg->dimlen.z * gcfg->dimlen.w], oldval); } #endif @@ -2204,10 +2204,10 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], if (fabsf(oldval) > MAX_ACCUM && gcfg->outputtype != otRF) { atomicadd(& field[(idx1dold + tshift * gcfg->dimlen.z)*gcfg->srcnum + i], ((oldval > 0.f) ? -MAX_ACCUM : MAX_ACCUM)); - atomicadd(& field[(idx1dold + tshift * gcfg->dimlen.z)*gcfg->srcnum + i + gcfg->dimlen.w], ((oldval > 0.f) ? MAX_ACCUM : -MAX_ACCUM)); + atomicadd(& field[(idx1dold + tshift * gcfg->dimlen.z)*gcfg->srcnum + i + (uint64_t)gcfg->dimlen.z * gcfg->dimlen.w], ((oldval > 0.f) ? MAX_ACCUM : -MAX_ACCUM)); } else if (gcfg->outputtype == otRF) { oldval = p.w * f.pathlen * ppath[gcfg->w0offset + gcfg->srcnum + 1]; - atomicadd(& field[(idx1dold + tshift * gcfg->dimlen.z)*gcfg->srcnum + i + gcfg->dimlen.w], oldval); + atomicadd(& field[(idx1dold + tshift * gcfg->dimlen.z)*gcfg->srcnum + i + (uint64_t)gcfg->dimlen.z * gcfg->dimlen.w], oldval); } #endif @@ -2706,7 +2706,7 @@ void mcx_run_simulation(Config* cfg, GPUInfo* gpu) { uint sharedbuf = 0; /** \c dimxyz - output volume variable \c field voxel count, Nx*Ny*Nz*Ns where Ns=cfg.srcnum is the pattern number for photon sharing */ - int dimxyz = cfg->dim.x * cfg->dim.y * cfg->dim.z * ((cfg->srctype == MCX_SRC_PATTERN || cfg->srctype == MCX_SRC_PATTERN3D) ? cfg->srcnum : (cfg->srcid == -1) ? (cfg->extrasrclen + 1) : 1); + size_t dimxyz = cfg->dim.x * cfg->dim.y * cfg->dim.z * ((cfg->srctype == MCX_SRC_PATTERN || cfg->srctype == MCX_SRC_PATTERN3D) ? cfg->srcnum : (cfg->srcid == -1) ? (cfg->extrasrclen + 1) : 1); /** \c media - input volume representing the simulation domain, format specified in cfg.mediaformat, read-only */ uint* media = (uint*)(cfg->vol); @@ -2823,7 +2823,7 @@ void mcx_run_simulation(Config* cfg, GPUInfo* gpu) { /** Use the specified GPU's parameters, stored in gpu[gpuid] to determine the maximum time gates that it can hold */ if (gpu[gpuid].maxgate == 0 && dimxyz > 0) { - int needmem = dimxyz + cfg->nthread * sizeof(float4) * 4 + sizeof(float) * cfg->maxdetphoton * hostdetreclen + 10 * 1024 * 1024; /*keep 10M for other things*/ + size_t needmem = dimxyz + cfg->nthread * sizeof(float4) * 4 + sizeof(float) * cfg->maxdetphoton * hostdetreclen + 10 * 1024 * 1024; /*keep 10M for other things*/ gpu[gpuid].maxgate = (gpu[gpuid].globalmem - needmem) / (cfg->dim.x * cfg->dim.y * cfg->dim.z); gpu[gpuid].maxgate = MIN(((cfg->tend - cfg->tstart) / cfg->tstep + 0.5), gpu[gpuid].maxgate); } @@ -2960,11 +2960,17 @@ void mcx_run_simulation(Config* cfg, GPUInfo* gpu) { #pragma omp barrier + dimlen.x = cfg->dim.x; + dimlen.y = cfg->dim.y * cfg->dim.x; + dimlen.z = cfg->dim.x * cfg->dim.y * cfg->dim.z; + dimlen.w = gpu[gpuid].maxgate * ((cfg->srctype == MCX_SRC_PATTERN || cfg->srctype == MCX_SRC_PATTERN3D) ? cfg->srcnum : (cfg->srcid == -1) ? (cfg->extrasrclen + 1) : 1); + /** Here we decide the total output buffer, field's length. it is Nx*Ny*Nz*Nt*Ns */ if (cfg->seed == SEED_FROM_FILE && cfg->replaydet == -1) { - fieldlen = dimxyz * gpu[gpuid].maxgate * cfg->detnum; + dimlen.w *= cfg->detnum; + fieldlen = dimlen.z * dimlen.w; } else { - fieldlen = dimxyz * gpu[gpuid].maxgate; + fieldlen = dimlen.z * dimlen.w; } /** A 1D grid is determined by the total thread number and block size */ @@ -3159,11 +3165,6 @@ void mcx_run_simulation(Config* cfg, GPUInfo* gpu) { cachebox.x = (cp1.x - cp0.x + 1); cachebox.y = (cp1.y - cp0.y + 1) * (cp1.x - cp0.x + 1); - dimlen.x = cfg->dim.x; - dimlen.y = cfg->dim.y * cfg->dim.x; - dimlen.z = cfg->dim.x * cfg->dim.y * cfg->dim.z; - dimlen.w = fieldlen; - param.dimlen = dimlen; param.cachebox = cachebox; @@ -3592,6 +3593,7 @@ is more than what your have specified (%d), please use the -H option to specify * Accumulate volumetric fluence from all threads/devices */ if (cfg->issave2pt) { + size_t i; OutputType* rawfield = (OutputType*)malloc(sizeof(OutputType) * fieldlen * SHADOWCOUNT); CUDA_ASSERT(cudaMemcpy(rawfield, gfield, sizeof(OutputType)*fieldlen * SHADOWCOUNT, cudaMemcpyDeviceToHost)); MCX_FPRINTF(cfg->flog, "transfer complete:\t%d ms\n", GetTimeMillis() - tic); @@ -3602,7 +3604,7 @@ is more than what your have specified (%d), please use the -H option to specify * single-precision output, we need to copy and accumulate two separate floating-point buffers * to minimize round-off errors near the source */ - for (i = 0; i < (int)fieldlen; i++) { //accumulate field, can be done in the GPU + for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU field[i] = rawfield[i]; #ifndef USE_DOUBLE @@ -3624,7 +3626,7 @@ is more than what your have specified (%d), please use the -H option to specify * If respin is used, each repeatition is accumulated to the 2nd half of the buffer */ if (ABS(cfg->respin) > 1) { - for (i = 0; i < (int)fieldlen; i++) { //accumulate field, can be done in the GPU + for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU field[fieldlen + i] += field[i]; } } @@ -3664,12 +3666,14 @@ is more than what your have specified (%d), please use the -H option to specify * For MATLAB mex file, the data is copied to a pre-allocated buffer \c cfg->export* as a return variable */ if (cfg->exportfield) { - for (i = 0; i < (int)fieldlen; i++) + size_t i; + + for (i = 0; i < fieldlen; i++) #pragma omp atomic cfg->exportfield[i] += field[i]; if (cfg->outputtype == otRF && rfimag) { - for (i = 0; i < (int)fieldlen; i++) + for (i = 0; i < fieldlen; i++) #pragma omp atomic cfg->exportfield[i + fieldlen] += rfimag[i]; } @@ -3708,16 +3712,16 @@ is more than what your have specified (%d), please use the -H option to specify kahanc = 0.f; if (cfg->outputtype == otEnergy) { - int fieldlenPsrc = fieldlen / cfg->srcnum; + size_t j, fieldlenPsrc = fieldlen / cfg->srcnum; - for (iter = 0; iter < fieldlenPsrc; iter++) { - mcx_kahanSum(&energyabs[i], &kahanc, cfg->exportfield[iter * cfg->srcnum + i]); + for (j = 0; j < fieldlenPsrc; j++) { + mcx_kahanSum(&energyabs[i], &kahanc, cfg->exportfield[j * cfg->srcnum + i]); } } else { - int j; + size_t j; for (iter = 0; iter < (int)gpu[gpuid].maxgate; iter++) - for (j = 0; j < (int)dimlen.z; j++) { + for (j = 0; j < dimlen.z; j++) { mcx_kahanSum(&energyabs[i], &kahanc, cfg->exportfield[iter * dimxyz + (j * cfg->srcnum + i)]*mcx_updatemua((uint)cfg->vol[j], cfg)); } } diff --git a/src/mcx_utils.c b/src/mcx_utils.c index c5ae8f8d..74818b4f 100644 --- a/src/mcx_utils.c +++ b/src/mcx_utils.c @@ -1190,7 +1190,7 @@ void mcx_printlog(Config* cfg, char* str) { * @param[in] option: if set to 2, only normalize positive values (negative values for diffuse reflectance calculations) */ -void mcx_normalize(float field[], float scale, int fieldlen, int option, int pidx, int srcnum) { +void mcx_normalize(float field[], float scale, size_t fieldlen, int option, int pidx, int srcnum) { int i; for (i = 0; i < fieldlen; i++) { diff --git a/src/mcx_utils.h b/src/mcx_utils.h index 3411a6dd..fbebac8a 100644 --- a/src/mcx_utils.h +++ b/src/mcx_utils.h @@ -299,7 +299,7 @@ void mcx_parsecmd(int argc, char* argv[], Config* cfg); void mcx_usage(Config* cfg, char* exename); void mcx_printheader(Config* cfg); void mcx_loadvolume(char* filename, Config* cfg, int isbuf); -void mcx_normalize(float field[], float scale, int fieldlen, int option, int pidx, int srcnum); +void mcx_normalize(float field[], float scale, size_t fieldlen, int option, int pidx, int srcnum); void mcx_kahanSum(float* sum, float* kahanc, float input); int mcx_readarg(int argc, char* argv[], int id, void* output, const char* type); void mcx_printlog(Config* cfg, char* str); diff --git a/src/mcxlab.cpp b/src/mcxlab.cpp index 316b41bf..38d146e4 100644 --- a/src/mcxlab.cpp +++ b/src/mcxlab.cpp @@ -261,7 +261,7 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { /** Initialize all buffers necessary to store the output variables */ if (nlhs >= 1 && cfg.issave2pt) { - int fieldlen = cfg.dim.x * cfg.dim.y * cfg.dim.z * (int)((cfg.tend - cfg.tstart) / cfg.tstep + 0.5) * cfg.srcnum; + size_t fieldlen = cfg.dim.x * cfg.dim.y * cfg.dim.z * (int)((cfg.tend - cfg.tstart) / cfg.tstep + 0.5) * cfg.srcnum; if (cfg.replay.seed != NULL && cfg.replaydet == -1) { fieldlen *= cfg.detnum; @@ -392,7 +392,7 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { /** if the 1st output presents, output the fluence/energy-deposit volume data */ if (nlhs >= 1) { - int fieldlen; + size_t fieldlen; fielddim[0] = cfg.srcnum * cfg.dim.x; fielddim[1] = cfg.dim.y; fielddim[2] = cfg.dim.z; @@ -410,16 +410,16 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { fielddim[5] *= (cfg.extrasrclen + 1); } - fieldlen = fielddim[0] * fielddim[1] * fielddim[2] * fielddim[3] * fielddim[4] * fielddim[5]; + fieldlen = (size_t)fielddim[0] * fielddim[1] * fielddim[2] * fielddim[3] * fielddim[4] * fielddim[5]; if (cfg.issaveref && cfg.exportfield) { int highdim = fielddim[3] * fielddim[4] * fielddim[5]; - int voxellen = cfg.dim.x * cfg.dim.y * cfg.dim.z; + size_t voxellen = cfg.dim.x * cfg.dim.y * cfg.dim.z; float* dref = (float*)malloc(fieldlen * sizeof(float)); memcpy(dref, cfg.exportfield, fieldlen * sizeof(float)); - for (int voxelid = 0; voxelid < voxellen; voxelid++) { + for (size_t voxelid = 0; voxelid < voxellen; voxelid++) { if (cfg.vol[voxelid]) { for (int gate = 0; gate < highdim; gate++) for (int srcid = 0; srcid < cfg.srcnum; srcid++) {