diff --git a/modules/cudev/include/opencv2/cudev/block/scan.hpp b/modules/cudev/include/opencv2/cudev/block/scan.hpp
index 705f875a6e6a..2bfa62e6d6b7 100644
--- a/modules/cudev/include/opencv2/cudev/block/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/scan.hpp
@@ -135,6 +135,12 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
             }
             else
             {
+                // Read from smem[tid]              (T val = smem[tid])
+                // and write to smem[tid + 1]       (smem[tid + 1] = warpScanInclusive(mask, val))
+                // should be explicitly fenced by "__syncwarp" to get rid of
+                // "cuda-memcheck --tool racecheck" warnings.
+                __syncwarp(mask);
+
                 // calculate inclusive scan and write back to shared memory with offset 1
                 smem[tid + 1] = warpScanInclusive(mask, val);
 
@@ -197,10 +203,18 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
 
         int quot = THREADS_NUM / WARP_SIZE;
 
+        T val;
+
         if (tid < quot)
         {
             // grab top warp elements
-            T val = smem[tid];
+            val = smem[tid];
+        }
+
+        __syncthreads();
+
+        if (tid < quot)
+        {
 
             if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
             {
diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
index d1014b3ceb4b..eeae57d625a1 100644
--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@@ -63,7 +63,8 @@ namespace integral_detail
         __shared__ D smem[NUM_SCAN_THREADS * 2];
         __shared__ D carryElem;
 
-        carryElem = 0;
+        if (threadIdx.x == 0)
+            carryElem = 0;
 
         __syncthreads();
 
@@ -105,7 +106,8 @@ namespace integral_detail
         __shared__ D smem[NUM_SCAN_THREADS * 2];
         __shared__ D carryElem;
 
-        carryElem = 0;
+        if (threadIdx.x == 0)
+            carryElem = 0;
 
         __syncthreads();
 
diff --git a/modules/cudev/include/opencv2/cudev/warp/scan.hpp b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
index c0afb552a932..bab462973f5e 100644
--- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
@@ -98,7 +98,7 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
     #pragma unroll
     for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
     {
-        const T val = __shfl_up(data, i, WARP_SIZE);
+        const T val = shfl_up(data, i);
         if (laneId >= i)
               data += val;
     }
diff --git a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
index dd142c6719f7..d317b2c44984 100644
--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@@ -250,6 +250,11 @@ __device__ double shfl_up(double val, uint delta, int width = warpSize)
     return __hiloint2double(hi, lo);
 }
 
+__device__ __forceinline__ unsigned long long shfl_up(unsigned long long val, uint delta, int width = warpSize)
+{
+    return __shfl_up(val, delta, width);
+}
+
 #define CV_CUDEV_SHFL_UP_VEC_INST(input_type) \
     __device__ __forceinline__ input_type ## 1 shfl_up(const input_type ## 1 & val, uint delta, int width = warpSize) \
     { \