Skip to content
This repository has been archived by the owner on Mar 22, 2020. It is now read-only.

Commit

Permalink
Merge pull request #18 from xsacha/hunter-3.3.0
Browse files Browse the repository at this point in the history
CUDA 9.0 support, backport
  • Loading branch information
ruslo authored Sep 1, 2017
2 parents 8646be9 + 4575a03 commit 419a9a8
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 11 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,8 @@ OCV_OPTION(WITH_CPUFEATURES "Use cpufeatures Android library" ON
OCV_OPTION(WITH_VTK "Include VTK library support (and build opencv_viz module eiher)" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT AND NOT CMAKE_CROSSCOMPILING) )
OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (NOT IOS AND NOT APPLE) )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" ON IF (NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" ON IF (NOT IOS AND NOT APPLE) )
OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON IF (NOT WINRT) )
OCV_OPTION(WITH_VFW "Include Video for Windows support" ON IF WIN32 )
OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
Expand Down
12 changes: 11 additions & 1 deletion cmake/FindCUDA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -790,8 +790,18 @@ endif()
if(CUDA_VERSION VERSION_GREATER "5.0")
# Since CUDA 5.5 NPP is split into three libraries: nppc, nppi and npps.
# For CUDA 9.0 and newer the single nppi library is not searched for; its
# per-domain parts (nppial, nppicc, ...) are located instead and joined into
# CUDA_nppi_LIBRARY, so code that consumes that variable keeps working.
# Older toolkits keep the plain nppi lookup, which avoids putting
# <lib>-NOTFOUND entries into the list when the split libraries are absent.
find_cuda_helper_libs(nppc)
if(CUDA_VERSION VERSION_LESS "9.0")
  find_cuda_helper_libs(nppi)
else()
  find_cuda_helper_libs(nppial)
  find_cuda_helper_libs(nppicc)
  find_cuda_helper_libs(nppicom)
  find_cuda_helper_libs(nppidei)
  find_cuda_helper_libs(nppif)
  find_cuda_helper_libs(nppig)
  find_cuda_helper_libs(nppim)
  find_cuda_helper_libs(nppist)
  find_cuda_helper_libs(nppisu)
  find_cuda_helper_libs(nppitc)
  set(CUDA_nppi_LIBRARY "${CUDA_nppial_LIBRARY};${CUDA_nppicc_LIBRARY};${CUDA_nppicom_LIBRARY};${CUDA_nppidei_LIBRARY};${CUDA_nppif_LIBRARY};${CUDA_nppig_LIBRARY};${CUDA_nppim_LIBRARY};${CUDA_nppist_LIBRARY};${CUDA_nppisu_LIBRARY};${CUDA_nppitc_LIBRARY}")
endif()
find_cuda_helper_libs(npps)
# Aggregate variable covering every NPP component found above.
set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}")
elseif(NOT CUDA_VERSION VERSION_LESS "4.0")
find_cuda_helper_libs(npp)
Expand Down
12 changes: 7 additions & 5 deletions cmake/OpenCVDetectCUDA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ if(CUDA_FOUND)

message(STATUS "CUDA detected: " ${CUDA_VERSION})

set(_generations "Fermi" "Kepler" "Maxwell" "Pascal")
set(_generations "Fermi" "Kepler" "Maxwell" "Pascal" "Volta")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND _generations "Auto")
endif()
Expand All @@ -70,6 +70,8 @@ if(CUDA_FOUND)
set(__cuda_arch_bin "5.0 5.2")
elseif(CUDA_GENERATION STREQUAL "Pascal")
set(__cuda_arch_bin "6.0 6.1")
elseif(CUDA_GENERATION STREQUAL "Volta")
set(__cuda_arch_bin "7.0")
elseif(CUDA_GENERATION STREQUAL "Auto")
execute_process( COMMAND "${CUDA_NVCC_EXECUTABLE}" "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" "--run"
WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
Expand All @@ -94,17 +96,17 @@ if(CUDA_FOUND)
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT _nvcc_res EQUAL 0)
message(STATUS "Automatic detection of CUDA generation failed. Going to build for all known architectures.")
set(__cuda_arch_bin "5.3 6.2")
set(__cuda_arch_bin "5.3 6.2 7.0")
else()
set(__cuda_arch_bin "${_nvcc_out}")
string(REPLACE "2.1" "2.1(2.0)" __cuda_arch_bin "${__cuda_arch_bin}")
endif()
set(__cuda_arch_ptx "")
else()
if(${CUDA_VERSION} VERSION_LESS "8.0")
set(__cuda_arch_bin "2.0 3.0 3.5 3.7 5.0 5.2")
else()
if(${CUDA_VERSION} VERSION_LESS "9.0")
set(__cuda_arch_bin "2.0 3.0 3.5 3.7 5.0 5.2 6.0 6.1")
else()
set(__cuda_arch_bin "3.0 3.5 3.7 5.0 5.2 6.0 6.1 7.0")
endif()
endif()
endif()
Expand Down
9 changes: 9 additions & 0 deletions modules/core/include/opencv2/core/cuda/warp_shuffle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@

namespace cv { namespace cuda { namespace device
{
// When built with nvcc 9 or newer (__CUDACC_VER_MAJOR__ >= 9), redirect the
// mask-less warp shuffle intrinsics used by the code that follows to their
// *_sync counterparts, always passing 0xFFFFFFFFU as the mask argument.
// The macros are removed again by the #undef lines near the end of this header.
// NOTE(review): presumably needed because CUDA 9 deprecates the mask-less
// forms; a full mask assumes all 32 lanes participate - confirm for every caller.
#if __CUDACC_VER_MAJOR__ >= 9
# define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
#endif
template <typename T>
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
{
Expand Down Expand Up @@ -148,6 +153,10 @@ namespace cv { namespace cuda { namespace device
}
}}}

# undef __shfl
# undef __shfl_up
# undef __shfl_down

//! @endcond

#endif // OPENCV_CUDA_WARP_SHUFFLE_HPP
8 changes: 8 additions & 0 deletions modules/core/include/opencv2/core/private.cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@
#ifdef HAVE_CUDA
# include <cuda.h>
# include <cuda_runtime.h>
// Include cuda_fp16.h. On compilers that define __GNUC__ the -Wstrict-aliasing
// diagnostic is ignored for the duration of the include: the diagnostic state
// is pushed before and popped afterwards, so the rest of the translation unit
// is unaffected. Other compilers include the header without any pragmas.
# if defined (__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wstrict-aliasing"
# include <cuda_fp16.h>
# pragma GCC diagnostic pop
# else
# include <cuda_fp16.h>
# endif /* defined (__GNUC__) */
# include <npp.h>
# include "opencv2/core/cuda_stream_accessor.hpp"
# include "opencv2/core/cuda/common.hpp"
Expand Down
2 changes: 1 addition & 1 deletion modules/cudacodec/src/precomp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
#include "opencv2/core/private.cuda.hpp"

#ifdef HAVE_NVCUVID
#include <nvcuvid.h>
#include <dynlink_nvcuvid.h>

#ifdef _WIN32
#define NOMINMAX
Expand Down
12 changes: 11 additions & 1 deletion modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#define OPENCV_CUDEV_UTIL_SATURATE_CAST_HPP

#include "../common.hpp"
#include "opencv2/core/private.cuda.hpp"

namespace cv { namespace cudev {

Expand Down Expand Up @@ -274,12 +275,21 @@ template <typename T, typename D> __device__ __forceinline__ D cast_fp16(T v);

// Specialization converting a half-precision value, passed as its raw 16-bit
// pattern in a short, to float.
template <> __device__ __forceinline__ float cast_fp16<short, float>(short v)
{
#if __CUDACC_VER_MAJOR__ < 9
    // Older toolkits: the conversion intrinsic takes the bit pattern directly.
    return __half2float(v);
#else
    // nvcc 9+: view the 16 bits as a __half and convert it to float.
    __half* asHalf = (__half*)&v;
    return float(*asHalf);
#endif
}

// Specialization converting a float to half precision and returning the
// resulting 16-bit pattern packed in a short.
template <> __device__ __forceinline__ short cast_fp16<float, short>(float v)
{
#if __CUDACC_VER_MAJOR__ >= 9
    // nvcc 9+: construct the __half from the float, then hand back the bits of
    // the converted value `h` (not of the float argument `v`).
    __half h(v);
    return *(short*)&h;
#else
    // Older toolkits: the intrinsic's result is cast straight to short.
    return (short)__float2half_rn(v);
#endif
}
//! @}

Expand Down
12 changes: 11 additions & 1 deletion modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,14 @@ namespace cv { namespace cudev {

#if CV_CUDEV_ARCH >= 300

// shfl
// When built with nvcc 9 or newer (__CUDACC_VER_MAJOR__ >= 9), redirect the
// mask-less warp shuffle intrinsics (plain, xor, up, down) used by the shfl*
// wrappers below to their *_sync counterparts, always passing 0xFFFFFFFFU as
// the mask argument. The macros are #undef'd again before the closing
// "#endif // CV_CUDEV_ARCH >= 300" of this header.
// NOTE(review): presumably needed because CUDA 9 deprecates the mask-less
// forms; a full mask assumes all 32 lanes participate - confirm for every caller.
#if __CUDACC_VER_MAJOR__ >= 9
# define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
#endif

// shfl
__device__ __forceinline__ uchar shfl(uchar val, int srcLane, int width = warpSize)
{
return (uchar) __shfl((int) val, srcLane, width);
Expand Down Expand Up @@ -419,6 +425,10 @@ CV_CUDEV_SHFL_XOR_VEC_INST(float)
CV_CUDEV_SHFL_XOR_VEC_INST(double)

#undef CV_CUDEV_SHFL_XOR_VEC_INST
#undef __shfl
#undef __shfl_xor
#undef __shfl_up
#undef __shfl_down

#endif // CV_CUDEV_ARCH >= 300

Expand Down

0 comments on commit 419a9a8

Please sign in to comment.