Skip to content

Commit

Permalink
Fix cuda (#668)
Browse files Browse the repository at this point in the history
* outdated unittest template testSuite -> suite

* new nimcuda layout, switch to cuda 12.5, and ref type destructor fix

* better cudaMalloc internal proc typing

* remove deprecated use of .data= proc

* fix cuda -> cpu copy proc

* add side effect

* mark-off buggy proc and tests for it
  • Loading branch information
lilkeet authored Oct 22, 2024
1 parent 873ac94 commit 937b290
Show file tree
Hide file tree
Showing 12 changed files with 103 additions and 76 deletions.
2 changes: 1 addition & 1 deletion src/arraymancer/tensor/backend/cublas.nim
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import nimcuda/[cublas_v2, cublas_api],
import nimcuda/cuda12_5/[cublas_v2, cublas_api],
./cuda_global_state,
./cuda

Expand Down
32 changes: 18 additions & 14 deletions src/arraymancer/tensor/backend/cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,18 @@

import ../data_structure,
./global_config,
nimcuda/[nimcuda, cuda_runtime_api, driver_types]
nimcuda/cuda12_5/[check, cuda_runtime_api, driver_types]

export nimcuda, cuda_runtime_api, driver_types
export check, cuda_runtime_api, driver_types

# Data structures to ease interfacing with Cuda and kernels

proc cudaMalloc*[T](size: Natural): ptr T {.noSideEffect, inline.}=
proc cudaMalloc*[T](size: Natural): ptr UncheckedArray[T] {.noSideEffect, inline.}=
## Internal proc.
## Wrap cudaMalloc(var pointer, size) -> error code
let s = size * sizeof(T)
let s = csize_t(size * sizeof(T))
check cudaMalloc(cast[ptr pointer](addr result), s)

proc deallocCuda*[T](p: ref[ptr T]) {.noSideEffect.}=
  ## Finalizer for a GC-tracked device pointer: releases the Cuda
  ## allocation referenced by `p` via `cudaFree`.
  ## No-op when the tracked pointer was never assigned (nil), so it is
  ## safe to run on a storage that failed to allocate.
  if p[].isNil:
    return
  check cudaFree(p[])


# ##############################################################
Expand All @@ -38,7 +35,7 @@ proc newCudaStorage*[T: SomeFloat](length: int): CudaStorage[T] {.noSideEffect.}
result.Flen = length
new(result.Fref_tracking, deallocCuda)
result.Fdata = cast[ptr UncheckedArray[T]](cudaMalloc[T](result.Flen))
result.Fref_tracking[] = result.Fdata
result.Fref_tracking.value = result.Fdata

# #########################################################
# # Sending tensor layout to Cuda Kernel
Expand Down Expand Up @@ -70,7 +67,9 @@ type
## Using arrays instead of seq avoids having to indicate __restrict__ everywhere to indicate no-aliasing
## We also prefer a stack allocated array since the data will be used at every single loop iteration to compute element positions.
## Ultimately it avoids worrying about deallocation too
CudaLayoutArray = ref[ptr cint]
CudaLayoutArrayObj* = object
value*: ptr UncheckedArray[cint]
CudaLayoutArray* = ref CudaLayoutArrayObj


CudaTensorLayout [T: SomeFloat] = object
Expand All @@ -88,6 +87,11 @@ type
data*: ptr T # Data on Cuda device
len*: cint # Number of elements allocated in memory


proc deallocCuda*(p: CudaLayoutArray) {.noSideEffect.}=
## Finalizer for a CudaLayoutArray ref: frees the device-side cint
## buffer held in `p.value` with `cudaFree`.
## Passed to `new` in `layoutOnDevice` (`new result.shape, deallocCuda`)
## so the GPU buffer is released when the GC collects the ref.
## Does nothing when `value` is nil (no allocation was made).
if not p.value.isNil:
check cudaFree(p.value)

proc layoutOnDevice*[T:SomeFloat](t: CudaTensor[T]): CudaTensorLayout[T] {.noSideEffect.}=
## Store a CudaTensor shape, strides, etc information on the GPU
#
Expand All @@ -103,8 +107,8 @@ proc layoutOnDevice*[T:SomeFloat](t: CudaTensor[T]): CudaTensorLayout[T] {.noSid
new result.shape, deallocCuda
new result.strides, deallocCuda

result.shape[] = cudaMalloc[cint](MAXRANK)
result.strides[] = cudaMalloc[cint](MAXRANK)
result.shape.value = cudaMalloc[cint](MAXRANK)
result.strides.value = cudaMalloc[cint](MAXRANK)

var
tmp_shape: array[MAXRANK, cint] # CudaLayoutArray
Expand All @@ -116,6 +120,6 @@ proc layoutOnDevice*[T:SomeFloat](t: CudaTensor[T]): CudaTensorLayout[T] {.noSid


# TODO: use streams and async
let size = t.rank * sizeof(cint)
check cudaMemCpy(result.shape[], addr tmp_shape[0], size, cudaMemcpyHostToDevice)
check cudaMemCpy(result.strides[], addr tmp_strides[0], size, cudaMemcpyHostToDevice)
let size = csize_t(t.rank * sizeof(cint))
check cudaMemCpy(result.shape.value, addr tmp_shape[0], size, cudaMemcpyHostToDevice)
check cudaMemCpy(result.strides.value, addr tmp_strides[0], size, cudaMemcpyHostToDevice)
3 changes: 2 additions & 1 deletion src/arraymancer/tensor/backend/cuda_global_state.nim
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import nimcuda/[nimcuda, cuda_runtime_api, cublas_v2, cublas_api]
import nimcuda/cuda12_5/[check, cuda_runtime_api, cublas_v2, cublas_api,
driver_types]

# ###################################################
# Global Cuda and CuBLAS state
Expand Down
14 changes: 13 additions & 1 deletion src/arraymancer/tensor/data_structure.nim
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,19 @@ import
../laser/dynamic_stack_arrays,
../laser/tensor/datatypes,
nimblas,
nimcuda/cuda12_5/[cuda_runtime_api, check],
# Standard library
std/[complex]

export nimblas.OrderType, complex
export datatypes, dynamic_stack_arrays

type
CudaTensorRefTrackerObj*[T: SomeFloat] = object
value*: ptr UncheckedArray[T]

CudaTensorRefTracker*[T] = ref CudaTensorRefTrackerObj[T]

CudaStorage*[T: SomeFloat] = object
## Opaque seq-like structure for storage on the Cuda backend.
##
Expand All @@ -31,7 +37,7 @@ type
# TODO: Forward declaring this and making this completely private prevent assignment in newCudaStorage from working
Flen*: int
Fdata*: ptr UncheckedArray[T]
Fref_tracking*: ref[ptr UncheckedArray[T]] # We keep ref tracking for the GC in a separate field to avoid double indirection.
Fref_tracking*: CudaTensorRefTracker[T] # We keep ref tracking for the GC in a separate field to avoid double indirection.

CudaTensor*[T: SomeFloat] = object
## Tensor data structure stored on Nvidia GPU (Cuda)
Expand Down Expand Up @@ -73,6 +79,12 @@ type

AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T]


proc deallocCuda*[T](p: CudaTensorRefTracker[T]) {.noSideEffect.}=
## Finalizer for CudaTensorRefTracker: frees the GPU buffer backing a
## CudaStorage (`Fdata`) with `cudaFree`. Registered via
## `new(result.Fref_tracking, deallocCuda)` in `newCudaStorage`, so the
## device memory is released when the GC drops the tracking ref.
## Safe no-op when `value` is nil.
if not p.value.isNil:
check cudaFree(p.value)


# ###############
# Field accessors
# ###############
Expand Down
5 changes: 3 additions & 2 deletions src/arraymancer/tensor/init_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,16 @@ proc cuda*[T:SomeFloat](t: Tensor[T]): CudaTensor[T] {.noinit.}=
cudaMemcpyHostToDevice,
cudaStream0) # cudaStream0 is a cudaStream_t global var

proc cpu*[T:SomeFloat](t: CudaTensor[T]): Tensor[T] {.noSideEffect, noinit.}=
proc cpu*[T:SomeFloat](t: CudaTensor[T]): Tensor[T] {.noinit.}=
## Convert a tensor on a Cuda device to a tensor on Cpu.
# We use blocking copy in this case to make sure
# all data is available for future computation

result.shape = t.shape
result.strides = t.strides
result.offset = t.offset
result.data = newSeqUninit[T](t.storage.Flen) # We copy over all the memory allocated

allocCpuStorage result.storage, t.storage.Flen

let size = csize_t(t.storage.Flen * sizeof(T))

Expand Down
40 changes: 20 additions & 20 deletions src/arraymancer/tensor/private/p_kernels_interface_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ template cuda_assign_binding(kernel_name: string, binding_name: untyped)=
proc `binding_name`[T: SomeFloat](
blocksPerGrid, threadsPerBlock: cint,
rank, len: cint,
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
src_shape, src_strides: ptr cint, src_offset: cint, src_data: ptr T
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
src_shape, src_strides: ptr UncheckedArray[cint], src_offset: cint, src_data: ptr T
) {.importcpp: import_string, noSideEffect.}


Expand Down Expand Up @@ -86,9 +86,9 @@ template cuda_assign_call*[T: SomeFloat](
kernel_name[T](
CUDA_HOF_TPB, CUDA_HOF_BPG,
src.rank, dst.len, # Note: small shortcut, in this case len and size are the same
dst.shape[], dst.strides[],
dst.shape.value, dst.strides.value,
dst.offset, dst.data,
src.shape[], src.strides[],
src.shape.value, src.strides.value,
src.offset, src.data
)

Expand All @@ -106,9 +106,9 @@ template cuda_binary_binding(kernel_name: string, binding_name: untyped)=
proc `binding_name`[T: SomeFloat](
blocksPerGrid, threadsPerBlock: cint,
rank, len: cint,
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
a_shape, a_strides: ptr cint, a_offset: cint, a_data: ptr T,
b_shape, b_strides: ptr cint, b_offset: cint, b_data: ptr T
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
a_shape, a_strides: ptr UncheckedArray[cint], a_offset: cint, a_data: ptr T,
b_shape, b_strides: ptr UncheckedArray[cint], b_offset: cint, b_data: ptr T
) {.importcpp: import_string, noSideEffect.}


Expand Down Expand Up @@ -170,11 +170,11 @@ template cuda_binary_call*[T: SomeFloat](
kernel_name(
CUDA_HOF_TPB, CUDA_HOF_BPG,
src_a.rank, dst.len, # Note: small shortcut, in this case len and size are the same
dst.shape[], dst.strides[],
dst.shape.value, dst.strides.value,
dst.offset, dst.data,
src_a.shape[], src_a.strides[],
src_a.shape.value, src_a.strides.value,
src_a.offset, src_a.data,
src_b.shape[], src_b.strides[],
src_b.shape.value, src_b.strides.value,
src_b.offset, src_b.data
)

Expand All @@ -193,8 +193,8 @@ template cuda_rscal_binding(kernel_name: string, binding_name: untyped)=
proc `binding_name`[T: SomeFloat](
blocksPerGrid, threadsPerBlock: cint,
rank, len: cint,
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
src_shape, src_strides: ptr cint, src_offset: cint, src_data: ptr T,
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
src_shape, src_strides: ptr UncheckedArray[cint], src_offset: cint, src_data: ptr T,
beta: T
) {.importcpp: import_string, noSideEffect.}

Expand Down Expand Up @@ -252,9 +252,9 @@ template cuda_rscal_call*[T: SomeFloat](
kernel_name[T](
CUDA_HOF_TPB, CUDA_HOF_BPG,
src.rank, dst.len, # Note: small shortcut, in this case len and size are the same
dst.shape[], dst.strides[],
dst.shape.value, dst.strides.value,
dst.offset, dst.data,
src.shape[], src.strides[],
src.shape.value, src.strides.value,
src.offset, src.data,
beta
)
Expand All @@ -274,9 +274,9 @@ template cuda_lscal_binding(kernel_name: string, binding_name: untyped)=
proc `binding_name`[T: SomeFloat](
blocksPerGrid, threadsPerBlock: cint,
rank, len: cint,
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
alpha: T,
src_shape, src_strides: ptr cint, src_offset: cint, src_data: ptr T,
src_shape, src_strides: ptr UncheckedArray[cint], src_offset: cint, src_data: ptr T,
) {.importcpp: import_string, noSideEffect.}


Expand Down Expand Up @@ -332,10 +332,10 @@ template cuda_lscal_call*[T: SomeFloat](
kernel_name[T](
CUDA_HOF_TPB, CUDA_HOF_BPG,
src.rank, dst.len, # Note: small shortcut, in this case len and size are the same
dst.shape[], dst.strides[],
dst.shape.value, dst.strides.value,
dst.offset, dst.data,
alpha,
src.shape[], src.strides[],
src.shape.value, src.strides.value,
src.offset, src.data
)

Expand All @@ -352,7 +352,7 @@ template cuda_assignscal_binding(kernel_name: string, binding_name: untyped)=
proc `binding_name`[T: SomeFloat](
blocksPerGrid, threadsPerBlock: cint,
rank, len: cint,
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
scalar: T
) {.importcpp: import_string, noSideEffect.}

Expand Down Expand Up @@ -402,7 +402,7 @@ template cuda_assignscal_call*[T: SomeFloat](
kernel_name[T](
CUDA_HOF_TPB, CUDA_HOF_BPG,
dst.rank, dst.len, # Note: small shortcut, in this case len and size are the same
dst.shape[], dst.strides[],
dst.shape.value, dst.strides.value,
dst.offset, dst.data,
val
)
7 changes: 5 additions & 2 deletions src/arraymancer/tensor/shapeshifting_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,17 @@ proc transpose*(t: CudaTensor): CudaTensor {.noSideEffect.}=

cuda_assign_glue("cuda_asContiguous", "CopyOp", cuda_asContiguous)

proc asContiguous*[T: SomeFloat](t: CudaTensor[T], layout: OrderType = colMajor, force: bool = false):
CudaTensor[T] {.noSideEffect.}=
proc asContiguous*[T: SomeFloat](t: CudaTensor[T], layout: OrderType = rowMajor, force: bool = false):
CudaTensor[T] {.noSideEffect, error: "NOT WORKING RIGHT NOW TODO: FIX".}=
## Transform a tensor with general striding to a Tensor with contiguous layout.
##
## By default CudaTensor will be colMajor (contrary to a cpu tensor).
##
## By default nothing is done if the tensor is already contiguous (C Major or F major)
## The "force" parameter can force re-ordering to a specific layout
# TODO: fix. this proc always outputs rowmajor, no matter the input.
# probably has to do with all the cuda tensors being colmajor by default,
# plus probably some double-negative of two bugs making the other procs work.

if t.isContiguous and not force:
return t
Expand Down
2 changes: 1 addition & 1 deletion tests/tensor/test_accessors_slicer_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import ../../src/arraymancer
import std / unittest


testSuite "CUDA: Testing indexing and slice syntax":
suite "CUDA: Testing indexing and slice syntax":
const
a = @[1, 2, 3, 4, 5]
b = @[1, 2, 3, 4, 5]
Expand Down
2 changes: 1 addition & 1 deletion tests/tensor/test_broadcasting_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import ../../src/arraymancer
import std / [unittest, sugar, sequtils]

testSuite "CUDA: Shapeshifting - broadcasting and non linear algebra elementwise operations":
suite "CUDA: Shapeshifting - broadcasting and non linear algebra elementwise operations":
test "Tensor element-wise multiplication (Hadamard product) and division":
block:
let u = @[-4, 0, 9].toTensor().asType(float32).cuda
Expand Down
2 changes: 1 addition & 1 deletion tests/tensor/test_init_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import ../../src/arraymancer
import std / unittest


testSuite "Cuda init":
suite "Cuda init":
test "Clone function":
let a = [ 7, 4, 3, 1, 8, 6,
8, 1, 6, 2, 6, 6,
Expand Down
2 changes: 1 addition & 1 deletion tests/tensor/test_operators_blas_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import ../../src/arraymancer
import std / [unittest, sugar]

testSuite "CUDA CuBLAS backend (Basic Linear Algebra Subprograms)":
suite "CUDA CuBLAS backend (Basic Linear Algebra Subprograms)":
test "GEMM - General Matrix to Matrix Multiplication":
## TODO: test with slices
let a = [[1.0,2,3],
Expand Down
Loading

0 comments on commit 937b290

Please sign in to comment.