diff --git a/modules/cudev/include/opencv2/cudev/block/scan.hpp b/modules/cudev/include/opencv2/cudev/block/scan.hpp index 705f875a6e6a..2bfa62e6d6b7 100644 --- a/modules/cudev/include/opencv2/cudev/block/scan.hpp +++ b/modules/cudev/include/opencv2/cudev/block/scan.hpp @@ -135,6 +135,12 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid) } else { + // Read from smem[tid] (T val = smem[tid]) + // and write to smem[tid + 1] (smem[tid + 1] = warpScanInclusive(mask, val)) + // should be explicitly fenced by "__syncwarp" to get rid of + // "cuda-memcheck --tool racecheck" warnings. + __syncwarp(mask); + // calculate inclusive scan and write back to shared memory with offset 1 smem[tid + 1] = warpScanInclusive(mask, val); @@ -197,10 +203,18 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid) int quot = THREADS_NUM / WARP_SIZE; + T val; + if (tid < quot) { // grab top warp elements - T val = smem[tid]; + val = smem[tid]; + } + + __syncthreads(); + + if (tid < quot) + { if (0 == (THREADS_NUM & (WARP_SIZE - 1))) { diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp index d1014b3ceb4b..eeae57d625a1 100644 --- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp @@ -63,7 +63,8 @@ namespace integral_detail __shared__ D smem[NUM_SCAN_THREADS * 2]; __shared__ D carryElem; - carryElem = 0; + if (threadIdx.x == 0) + carryElem = 0; __syncthreads(); @@ -105,7 +106,8 @@ namespace integral_detail __shared__ D smem[NUM_SCAN_THREADS * 2]; __shared__ D carryElem; - carryElem = 0; + if (threadIdx.x == 0) + carryElem = 0; __syncthreads(); diff --git a/modules/cudev/include/opencv2/cudev/warp/scan.hpp b/modules/cudev/include/opencv2/cudev/warp/scan.hpp index c0afb552a932..bab462973f5e 100644 --- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp +++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp @@ -98,7 +98,7 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid) #pragma unroll for (int i = 1; i <= (WARP_SIZE / 2); i *= 2) { - const T val = __shfl_up(data, i, WARP_SIZE); + const T val = shfl_up(data, i); if (laneId >= i) data += val; } diff --git a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp index dd142c6719f7..d317b2c44984 100644 --- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp +++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp @@ -250,6 +250,11 @@ __device__ double shfl_up(double val, uint delta, int width = warpSize) return __hiloint2double(hi, lo); } +__device__ __forceinline__ unsigned long long shfl_up(unsigned long long val, uint delta, int width = warpSize) +{ + return __shfl_up(val, delta, width); +} + #define CV_CUDEV_SHFL_UP_VEC_INST(input_type) \ __device__ __forceinline__ input_type ## 1 shfl_up(const input_type ## 1 & val, uint delta, int width = warpSize) \ { \