Fix for CUDA codegen (spcl#1442)
This PR addresses spcl#1388: it fixes the Python code generator and the `SharedToGlobal1D`
template so that they generate correct code for writes without reduction.
edopao authored Dec 18, 2023
1 parent bf56e4d commit 7c06755
Showing 3 changed files with 133 additions and 20 deletions.
16 changes: 14 additions & 2 deletions dace/codegen/targets/cuda.py
@@ -1132,10 +1132,22 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst
func=funcname,
type=dst_node.desc(sdfg).dtype.ctype,
bdims=', '.join(_topy(self._block_dims)),
is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'true',
is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false',
accum=accum,
args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + custom_reduction +
_topy(dst_strides) + _topy(copy_shape))), sdfg, state_id, [src_node, dst_node])
elif funcname == 'dace::SharedToGlobal1D':
# special case: use a new template struct that provides functions for copy and reduction
callsite_stream.write(
(' {func}<{type}, {bdims}, {copysize}, {is_async}>{accum}({args});').format(
func=funcname,
type=dst_node.desc(sdfg).dtype.ctype,
bdims=', '.join(_topy(self._block_dims)),
copysize=', '.join(_topy(copy_shape)),
is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false',
accum=accum or '::Copy',
args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction)), sdfg,
state_id, [src_node, dst_node])
else:
callsite_stream.write(
(' {func}<{type}, {bdims}, {copysize}, ' +
@@ -1145,7 +1157,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst
bdims=', '.join(_topy(self._block_dims)),
copysize=', '.join(_topy(copy_shape)),
dststrides=', '.join(_topy(dst_strides)),
is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'true',
is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false',
accum=accum,
args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + custom_reduction)), sdfg,
state_id, [src_node, dst_node])
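
For context, here is a rough sketch of the call sites this new branch emits; the template arguments, element type, and pointer names (`smem`, `gmem`) are made up for illustration and are not taken from the PR. A write without reduction (empty `accum`) now falls back to the new `::Copy` member, while a write-conflict resolution (WCR) selects the templated `Accum` member; in both cases `is_async` is now `false` whenever the destination node still has readers in the state, so the copy ends with a `__syncthreads()`.

// Illustrative only: double elements, a 32x1x1 thread block, a 32-element copy,
// and a destination that is read later in the same state (ASYNC = false).
// Plain write (no WCR) -> the new ::Copy member:
dace::SharedToGlobal1D<double, 32, 1, 1, 32, false>::Copy(smem, 1, gmem, 1);
// Write with a sum reduction -> the templated Accum member:
dace::SharedToGlobal1D<double, 32, 1, 1, 32, false>::template Accum<dace::ReductionType::Sum>(smem, 1, gmem, 1);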
53 changes: 35 additions & 18 deletions dace/runtime/include/dace/cuda/copy.cuh
@@ -736,60 +736,77 @@ namespace dace
int COPY_XLEN, bool ASYNC>
struct SharedToGlobal1D
{
template <typename WCR>
static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int DST_XSTRIDE, WCR wcr)
static constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
static constexpr int TOTAL = COPY_XLEN;
static constexpr int WRITES = TOTAL / BLOCK_SIZE;
static constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;

static DACE_DFI void Copy(const T *smem, int src_xstride, T *ptr, int dst_xstride)
{
// Linear thread ID
int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();

#pragma unroll
for (int i = 0; i < WRITES; ++i) {
*(ptr + (ltid + i * BLOCK_SIZE) * dst_xstride) =
*(smem + (ltid + i * BLOCK_SIZE) * src_xstride);
}

if (REM_WRITES != 0 && ltid < REM_WRITES) {
*(ptr + (ltid + WRITES*BLOCK_SIZE)* dst_xstride) =
*(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride);
}

if (!ASYNC)
__syncthreads();
}

template <typename WCR>
static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int dst_xstride, WCR wcr)
{
// Linear thread ID
int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();
constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
constexpr int TOTAL = COPY_XLEN;
constexpr int WRITES = TOTAL / BLOCK_SIZE;
constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;

#pragma unroll
for (int i = 0; i < WRITES; ++i) {
wcr_custom<T>::template reduce(
wcr, ptr + (ltid + i * BLOCK_SIZE) * DST_XSTRIDE,
wcr, ptr + (ltid + i * BLOCK_SIZE) * dst_xstride,
*(smem + (ltid + i * BLOCK_SIZE) * src_xstride));
}

if (REM_WRITES != 0) {
if (ltid < REM_WRITES)
wcr_custom<T>::template reduce(
ptr + (ltid + WRITES * BLOCK_SIZE)* DST_XSTRIDE,
ptr + (ltid + WRITES * BLOCK_SIZE)* dst_xstride,
*(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride));
}

if (!ASYNC)
__syncthreads();
}

template <ReductionType REDTYPE>
static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int DST_XSTRIDE)
static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int dst_xstride)
{
if (!ASYNC)
__syncthreads();

// Linear thread ID
int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();
constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
constexpr int TOTAL = COPY_XLEN;
constexpr int WRITES = TOTAL / BLOCK_SIZE;
constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;

#pragma unroll
for (int i = 0; i < WRITES; ++i) {
wcr_fixed<REDTYPE, T>::template reduce_atomic(
ptr + (ltid + i * BLOCK_SIZE) * DST_XSTRIDE,
ptr + (ltid + i * BLOCK_SIZE) * dst_xstride,
*(smem + (ltid + i * BLOCK_SIZE) * src_xstride));
}

if (REM_WRITES != 0) {
if (ltid < REM_WRITES)
wcr_fixed<REDTYPE, T>::template reduce_atomic(
ptr + (ltid + WRITES*BLOCK_SIZE)* DST_XSTRIDE,
ptr + (ltid + WRITES*BLOCK_SIZE)* dst_xstride,
*(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride));
}

if (!ASYNC)
__syncthreads();
}
};
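
As a sanity check on the new `Copy` member above, a minimal hand-written kernel using it could look like the following sketch. The kernel, its names, the launch configuration, and the include path are assumptions made for illustration; they are not part of this PR.

// Hypothetical usage sketch, not from the PR. Assumes the header above is
// reachable as <dace/cuda/copy.cuh> and that the kernel is launched with
// 32x1x1 thread blocks.
#include <dace/cuda/copy.cuh>

__global__ void copy_back(const double* __restrict__ in, double* __restrict__ out)
{
    __shared__ double tile[32];
    int i = threadIdx.x;
    tile[i] = in[blockIdx.x * 32 + i];  // stage one 32-element chunk in shared memory
    __syncthreads();                    // make the staged data visible to the whole block
    // Block-cooperative copy of 32 contiguous elements back to global memory;
    // ASYNC = false, so the call itself ends with a __syncthreads(), as in the
    // Copy member defined above.
    dace::SharedToGlobal1D<double, 32, 1, 1, 32, false>::Copy(
        tile, 1, out + blockIdx.x * 32, 1);
}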

84 changes: 84 additions & 0 deletions tests/codegen/cuda_memcopy_test.py
@@ -0,0 +1,84 @@
""" Tests code generation for array copy on GPU target. """
import dace
from dace.transformation.auto import auto_optimize

import pytest
import re

# This test requires the cupy module
cp = pytest.importorskip("cupy")

# initialize random number generator
rng = cp.random.default_rng(42)


@pytest.mark.gpu
def test_gpu_shared_to_global_1D():
M = 32
N = dace.symbol('N')

@dace.program
def transpose_shared_to_global(A: dace.float64[M, N], B: dace.float64[N, M]):
for i in dace.map[0:N]:
local_gather = dace.define_local([M], A.dtype, storage=dace.StorageType.GPU_Shared)
for j in dace.map[0:M]:
local_gather[j] = A[j, i]
B[i, :] = local_gather


sdfg = transpose_shared_to_global.to_sdfg()
auto_optimize.apply_gpu_storage(sdfg)

size_M = M
size_N = 128

A = rng.random((size_M, size_N,))
B = rng.random((size_N, size_M,))

ref = A.transpose()

sdfg(A, B, N=size_N)
    assert cp.allclose(ref, B)

code = sdfg.generate_code()[1].clean_code # Get GPU code (second file)
m = re.search('dace::SharedToGlobal1D<.+>::Copy', code)
assert m is not None


@pytest.mark.gpu
def test_gpu_shared_to_global_1D_accumulate():
M = 32
N = dace.symbol('N')

@dace.program
def transpose_and_add_shared_to_global(A: dace.float64[M, N], B: dace.float64[N, M]):
for i in dace.map[0:N]:
local_gather = dace.define_local([M], A.dtype, storage=dace.StorageType.GPU_Shared)
for j in dace.map[0:M]:
local_gather[j] = A[j, i]
local_gather[:] >> B(M, lambda x, y: x + y)[i, :]


sdfg = transpose_and_add_shared_to_global.to_sdfg()
auto_optimize.apply_gpu_storage(sdfg)

size_M = M
size_N = 128

A = rng.random((size_M, size_N,))
B = rng.random((size_N, size_M,))

ref = A.transpose() + B

sdfg(A, B, N=size_N)
    assert cp.allclose(ref, B)

code = sdfg.generate_code()[1].clean_code # Get GPU code (second file)
m = re.search('dace::SharedToGlobal1D<.+>::template Accum', code)
assert m is not None


if __name__ == '__main__':
test_gpu_shared_to_global_1D()
test_gpu_shared_to_global_1D_accumulate()
