diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3f03e30..8d53cc5 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -37,16 +37,19 @@ jobs: - name: Build run: | if [ ${{ matrix.os }} == "ubuntu-latest" ] ; then - stack build --stack-yaml stack-${{ matrix.stackage }}.yaml --flag inline-c:gsl-example --flag inline-c-cpp:std-vector-example + stack build --stack-yaml stack-${{ matrix.stackage }}.yaml --flag inline-c:gsl-example --flag inline-c-cpp:std-vector-example --flag inline-c-cuda:test-without-cuda else - stack build --stack-yaml stack-${{ matrix.stackage }}.yaml --flag inline-c-cpp:std-vector-example + stack build --stack-yaml stack-${{ matrix.stackage }}.yaml --flag inline-c-cpp:std-vector-example --flag inline-c-cuda:test-without-cuda fi - name: Test run: | if [ ${{ matrix.os }} == "ubuntu-latest" ] ; then - stack test --stack-yaml stack-${{ matrix.stackage }}.yaml --flag inline-c:gsl-example --flag inline-c-cpp:std-vector-example + stack test --stack-yaml stack-${{ matrix.stackage }}.yaml --flag inline-c:gsl-example --flag inline-c-cpp:std-vector-example --flag inline-c-cuda:test-without-cuda ./inline-c-cpp/test-error-message-line-numbers.sh --stack-yaml stack-${{ matrix.stackage }}.yaml else - stack test --stack-yaml stack-${{ matrix.stackage }}.yaml --flag inline-c-cpp:std-vector-example + stack test --stack-yaml stack-${{ matrix.stackage }}.yaml --flag inline-c-cpp:std-vector-example --flag inline-c-cuda:test-without-cuda ./inline-c-cpp/test-error-message-line-numbers.sh --stack-yaml stack-${{ matrix.stackage }}.yaml fi + env: + INLINE_C_CUDA_SUFFIX: cc + INLINE_C_CUDA_COMPILER: g++ diff --git a/inline-c-cuda/inline-c-cuda.cabal b/inline-c-cuda/inline-c-cuda.cabal index 1af8144..c221382 100644 --- a/inline-c-cuda/inline-c-cuda.cabal +++ b/inline-c-cuda/inline-c-cuda.cabal @@ -6,17 +6,21 @@ description: Utilities to inline CUDA code into Haskell using inline-c. tests for example on how to build. license: MIT license-file: LICENSE -author: Francesco Mazzoli -maintainer: f@mazzo.li -copyright: (c) 2015-2016 FP Complete Corporation, (c) 2017-2019 Francesco Mazzoli +author: Junji Hashimoto +maintainer: junji.hashimoto@gmail.com +copyright: (c) 2015-2016 FP Complete Corporation, (c) 2023 Junji Hashimoto category: FFI -tested-with: GHC == 9.2.2 +tested-with: GHC == 9.2.8, GHC == 9.4.7, GHC == 9.6.2 build-type: Simple source-repository head type: git location: https://github.com/fpco/inline-c +flag test-without-cuda + description: Test without cuda + default: False + library exposed-modules: Language.C.Inline.Cuda build-depends: base >=4.7 && <5 @@ -31,7 +35,11 @@ library hs-source-dirs: src default-language: Haskell2010 ghc-options: -Wall - extra-libraries: cudart + if flag(test-without-cuda) + cpp-options: -DTEST_WITHOUT_CUDA + else + extra-libraries: cudart + test-suite tests type: exitcode-stdio-1.0 @@ -48,3 +56,5 @@ test-suite tests , template-haskell , vector default-language: Haskell2010 + if flag(test-without-cuda) + cpp-options: -DTEST_WITHOUT_CUDA diff --git a/inline-c-cuda/src/Language/C/Inline/Cuda.hs b/inline-c-cuda/src/Language/C/Inline/Cuda.hs index a98557e..eb76371 100644 --- a/inline-c-cuda/src/Language/C/Inline/Cuda.hs +++ b/inline-c-cuda/src/Language/C/Inline/Cuda.hs @@ -25,12 +25,16 @@ import qualified Data.Map as Map import Control.Monad.IO.Class (liftIO) import System.Exit (ExitCode(..)) import System.Process (readProcessWithExitCode) +import System.Environment (lookupEnv) +import Data.Maybe (fromMaybe) compileCuda :: String -> TH.Q FilePath compileCuda src = do - cuFile <- TH.addTempFile "cu" + nvcc <- fromMaybe "nvcc" <$> TH.runIO (lookupEnv "INLINE_C_CUDA_COMPILER") + cu <- fromMaybe "cu" <$> TH.runIO (lookupEnv "INLINE_C_CUDA_SUFFIX") oFile <- TH.addTempFile "o" - let (cmd,args) = ("nvcc", ["-c","-o",oFile, cuFile]) + cuFile <- TH.addTempFile cu + let (cmd,args) = (nvcc, ["-c", "-o", oFile, cuFile]) (code, stdout, stderr) <- liftIO $ do writeFile cuFile src readProcessWithExitCode cmd args "" diff --git a/inline-c-cuda/test/tests.hs b/inline-c-cuda/test/tests.hs index 1e5e528..658cb7f 100644 --- a/inline-c-cuda/test/tests.hs +++ b/inline-c-cuda/test/tests.hs @@ -38,6 +38,56 @@ C.context $ C.cudaCtx C.include "" C.include "" +#ifdef TEST_WITHOUT_CUDA + +[C.emitBlock| + +void +vectorAdd(int blocksPerGrid, int threadsPerBlock, const float *A, const float *B, float *C, int numElements) +{ + for(int blockIdx = 0; blockIdx < blocksPerGrid ; blockIdx++){ + int blockDim = threadsPerBlock; + for(int threadIdx = 0; threadIdx < threadsPerBlock ; threadIdx++){ + int i = blockDim * blockIdx + threadIdx; + + if (i < numElements) + { + C[i] = A[i] + B[i]; + } + } + } +} + + +typedef int cudaError_t; +const int cudaSuccess = 1; + +cudaError_t cudaMalloc(void** dst, size_t size){ + *dst = malloc(size); + return cudaSuccess; +} + +cudaError_t cudaFree(void* dst){ + free(dst); + return cudaSuccess; +} + +const int cudaMemcpyHostToDevice = 0; +const int cudaMemcpyDeviceToHost = 1; + +cudaError_t cudaMemcpy(void *dst, void *src, size_t nbytes, int direction){ + memcpy(dst, src, nbytes); + return cudaSuccess; +} + +char* cudaGetErrorString(cudaError_t err){ + return ""; +} + +|] + +#else + [C.emitBlock| __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) @@ -51,6 +101,8 @@ vectorAdd(const float *A, const float *B, float *C, int numElements) } |] +#endif + cudaAllocaArray :: forall b. Int -> (Ptr C.CFloat -> IO b) -> IO b cudaAllocaArray size func = do let csize = fromIntegral size @@ -121,11 +173,19 @@ main = Hspec.hspec $ do } |] cudaMemcpyHostToDevice numElements h_A d_A cudaMemcpyHostToDevice numElements h_B d_B +#ifdef TEST_WITHOUT_CUDA + [C.block| void { + const int threadsPerBlock = 256; + const int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock; + vectorAdd(blocksPerGrid, threadsPerBlock, $(float* d_A), $(float* d_B), $(float* d_C), $(int cNumElements)); + } |] +#else [C.block| void { - int threadsPerBlock = 256; - int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock; + const int threadsPerBlock = 256; + const int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock; vectorAdd<<>>($(float* d_A), $(float* d_B), $(float* d_C), $(int cNumElements)); } |] +#endif cudaMemcpyDeviceToHost numElements d_C h_C lA <- peekArray numElements h_A lB <- peekArray numElements h_B diff --git a/stack-lts-20.yaml b/stack-lts-20.yaml index 9959c4e..ac57769 100644 --- a/stack-lts-20.yaml +++ b/stack-lts-20.yaml @@ -3,4 +3,5 @@ packages: - inline-c - inline-c-cpp - inline-c-objc +- inline-c-cuda - sample-cabal-project diff --git a/stack-lts-21.yaml b/stack-lts-21.yaml index e339fe4..cc543ee 100644 --- a/stack-lts-21.yaml +++ b/stack-lts-21.yaml @@ -3,4 +3,5 @@ packages: - inline-c - inline-c-cpp - inline-c-objc +- inline-c-cuda - sample-cabal-project diff --git a/stack-nightly.yaml b/stack-nightly.yaml index aba2285..141cf1c 100644 --- a/stack-nightly.yaml +++ b/stack-nightly.yaml @@ -3,4 +3,5 @@ packages: - inline-c - inline-c-cpp - inline-c-objc +- inline-c-cuda - sample-cabal-project