Merge pull request #1357 from spcl/configurable-tid-type

Make thread/block index type configurable

tbennun authored Sep 8, 2023
2 parents f95f816 + e4322d2 commit 66e0e65
Showing 15 changed files with 119 additions and 55 deletions.
1 change: 1 addition & 0 deletions dace/codegen/compiled_sdfg.py
@@ -287,6 +287,7 @@ def get_workspace_sizes(self) -> Dict[dtypes.StorageType, int]:
result: Dict[dtypes.StorageType, int] = {}
for storage in self.external_memory_types:
func = self._lib.get_symbol(f'__dace_get_external_memory_size_{storage.name}')
func.restype = ctypes.c_size_t
result[storage] = func(self._libhandle, *self._lastargs[1])

return result
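Note on the change above: ctypes assumes a C int return type unless restype is set, so without this line a 64-bit size_t result would be truncated on the Python side. A minimal standalone sketch of the pattern, using a hypothetical shared library and symbol name:

    import ctypes

    # Hypothetical library and exported symbol, for illustration only.
    lib = ctypes.CDLL('./libexample.so')
    func = lib.get_workspace_size  # assumed symbol name
    # Without an explicit restype, ctypes truncates the result to a 32-bit int;
    # declaring it as size_t keeps workspace sizes above 2**31 - 1 intact.
    func.restype = ctypes.c_size_t
    size = func()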
31 changes: 31 additions & 0 deletions dace/codegen/cppunparse.py
@@ -78,6 +78,7 @@
import numpy as np
import os
import tokenize
import warnings

import sympy
import dace
@@ -733,6 +734,21 @@ def _Num(self, t):
if isinstance(t.n, complex):
dtype = dtypes.DTYPE_TO_TYPECLASS[complex]

# Handle large integer values
if isinstance(t.n, int):
bits = t.n.bit_length()
if bits == 32: # Integer, potentially unsigned
if t.n >= 0: # unsigned
repr_n += 'U'
else: # signed, 64-bit
repr_n += 'LL'
elif 32 < bits <= 63:
repr_n += 'LL'
elif bits == 64 and t.n >= 0:
repr_n += 'ULL'
elif bits >= 64:
warnings.warn(f'Value wider than 64 bits encountered in expression ({t.n}), emitting as-is')

if repr_n.endswith("j"):
self.write("%s(0, %s)" % (dtype, repr_n.replace("inf", INFSTR)[:-1]))
else:
@@ -831,8 +847,23 @@ def _Tuple(
self.write(")")

unop = {"Invert": "~", "Not": "!", "UAdd": "+", "USub": "-"}
unop_lambda = {'Invert': (lambda x: ~x), 'Not': (lambda x: not x), 'UAdd': (lambda x: +x), 'USub': (lambda x: -x)}

def _UnaryOp(self, t):
# Dispatch constants after applying the operation
if sys.version_info[:2] < (3, 8):
if isinstance(t.operand, ast.Num):
newval = self.unop_lambda[t.op.__class__.__name__](t.operand.n)
newnode = ast.Num(n=newval)
self.dispatch(newnode)
return
else:
if isinstance(t.operand, ast.Constant):
newval = self.unop_lambda[t.op.__class__.__name__](t.operand.value)
newnode = ast.Constant(value=newval)
self.dispatch(newnode)
return

self.write("(")
self.write(self.unop[t.op.__class__.__name__])
self.write(" ")
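The new block in _Num chooses a C integer-literal suffix from the constant's bit length, and _UnaryOp now folds constants before printing them. A standalone sketch (not the DaCe implementation itself) of the suffix rule, with a few example values:

    def literal_suffix(value: int) -> str:
        # Mirrors the rules above: exactly 32 bits and non-negative -> 'U',
        # 33-63 bits -> 'LL', exactly 64 bits and non-negative -> 'ULL';
        # anything wider is emitted as-is (DaCe warns in that case).
        bits = value.bit_length()
        if bits == 32:
            return 'U' if value >= 0 else 'LL'
        if 32 < bits <= 63:
            return 'LL'
        if bits == 64 and value >= 0:
            return 'ULL'
        return ''

    print(literal_suffix(2**31))      # 'U'   (bit_length == 32)
    print(literal_suffix(-(2**35)))   # 'LL'  (needs a 64-bit signed type)
    print(literal_suffix(2**64 - 1))  # 'ULL' (bit_length == 64)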
15 changes: 11 additions & 4 deletions dace/codegen/targets/cuda.py
@@ -1939,6 +1939,13 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
kernel_params: list, function_stream: CodeIOStream, kernel_stream: CodeIOStream):
node = dfg_scope.source_nodes()[0]

# Get the thread/block index type
ttype = Config.get('compiler', 'cuda', 'thread_id_type')
tidtype = getattr(dtypes, ttype, False)
if not isinstance(tidtype, dtypes.typeclass):
raise ValueError(f'Configured type "{ttype}" for ``thread_id_type`` does not match any DaCe data type. '
'See ``dace.dtypes`` for available types (for example ``int32``).')

# allocating shared memory for dynamic threadblock maps
if has_dtbmap:
kernel_stream.write(
@@ -1990,8 +1997,8 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_

expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr)

kernel_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int')
kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype)

# Delinearize beyond the third dimension
if len(krange) > 3:
@@ -2010,8 +2017,8 @@ def generate_kernel_scope(sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
)

expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr)
kernel_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int')
kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype)

# Dispatch internal code
assert CUDACodeGen._in_device_code is False
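The kernel scope generator resolves the configured name to a dace.dtypes typeclass and emits its C type for the block/thread index variables instead of a hard-coded int. A minimal sketch of that lookup (the printed C type string is indicative):

    from dace import dtypes
    from dace.config import Config

    # Resolve the configured name (e.g. 'int32', the default, or 'int64') to a
    # typeclass; anything that is not a typeclass raises, as in the code above.
    ttype = Config.get('compiler', 'cuda', 'thread_id_type')
    tidtype = getattr(dtypes, ttype, False)
    if not isinstance(tidtype, dtypes.typeclass):
        raise ValueError(f'Unknown thread/block index type: {ttype}')

    # tidtype.ctype is what ends up in the generated declaration
    # '<ctype> <varname> = <expr>;', e.g. 'int' for int32.
    print(tidtype.ctype)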
11 changes: 11 additions & 0 deletions dace/config_schema.yml
@@ -413,6 +413,17 @@ required:
a specified larger block size in the third dimension. Default value is
derived from hardware limits on common GPUs.
thread_id_type:
type: str
title: Thread/block index data type
default: int32
description: >
Defines the data type for a thread and block index in the generated code.
The type is based on the type-classes in ``dace.dtypes``. For example,
``uint64`` is equivalent to ``dace.uint64``. Change this setting when large
index types are needed to address memory offsets that are beyond the 32-bit
range, or to reduce memory usage.
#############################################
# General FPGA flags
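A hedged usage sketch for the new setting: selecting 64-bit indices from Python. The environment-variable spelling follows DaCe's usual DACE_<section>_<key> convention and is an assumption here.

    from dace.config import Config

    # Use 64-bit thread/block indices, e.g. when flattened index arithmetic
    # on very large grids could overflow a 32-bit int.
    Config.set('compiler', 'cuda', 'thread_id_type', value='int64')

    # Assumed equivalent via the environment:
    #   export DACE_compiler_cuda_thread_id_type=int64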
2 changes: 1 addition & 1 deletion dace/libraries/blas/environments/cublas.py
@@ -25,7 +25,7 @@ class cuBLAS:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
2 changes: 1 addition & 1 deletion dace/libraries/blas/environments/rocblas.py
@@ -25,7 +25,7 @@ class rocBLAS:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
12 changes: 8 additions & 4 deletions dace/libraries/blas/include/dace_cublas.h
@@ -21,8 +21,10 @@ static void CheckCublasError(cublasStatus_t const& status) {
}

static cublasHandle_t CreateCublasHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cublasHandle_t handle;
CheckCublasError(cublasCreate(&handle));
@@ -65,8 +67,10 @@ class _CublasConstants {
}

_CublasConstants(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
// Allocate constant zero with the largest used size
cudaMalloc(&zero_, sizeof(cuDoubleComplex) * 1);
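Taken together with the environment change above, a missing GPU assignment now reaches the handle helpers as -1, and cudaSetDevice() is only called for non-negative devices, so the currently active device is kept. A simplified Python sketch of the resolution logic (the helper name is illustrative):

    def resolve_device(location):
        # No explicit 'gpu' entry: return -1, meaning "keep the current device";
        # the C++ handle constructors skip cudaSetDevice() for device < 0.
        if not location or 'gpu' not in location:
            return -1
        return int(location['gpu'])

    print(resolve_device(None))        # -1
    print(resolve_device({'gpu': 2}))  # 2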
60 changes: 32 additions & 28 deletions dace/libraries/blas/include/dace_rocblas.h
@@ -24,8 +24,10 @@ static void CheckRocblasError(rocblas_status const& status) {
}

static rocblas_handle CreateRocblasHandle(int device) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
if (device >= 0) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
}
}
rocblas_handle handle;
CheckRocblasError(rocblas_create_handle(&handle));
@@ -68,53 +70,55 @@ class _RocblasConstants {
}

_RocblasConstants(int device) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
if (device >= 0) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
}
}
// Allocate constant zero with the largest used size
hipMalloc(&zero_, sizeof(hipDoubleComplex) * 1);
hipMemset(zero_, 0, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&zero_, sizeof(hipDoubleComplex) * 1);
(void)hipMemset(zero_, 0, sizeof(hipDoubleComplex) * 1);

// Allocate constant one
hipMalloc(&half_pone_, sizeof(__half) * 1);
(void)hipMalloc(&half_pone_, sizeof(__half) * 1);
__half half_pone = __float2half(1.0f);
hipMemcpy(half_pone_, &half_pone, sizeof(__half) * 1,
(void)hipMemcpy(half_pone_, &half_pone, sizeof(__half) * 1,
hipMemcpyHostToDevice);
hipMalloc(&float_pone_, sizeof(float) * 1);
(void)hipMalloc(&float_pone_, sizeof(float) * 1);
float float_pone = 1.0f;
hipMemcpy(float_pone_, &float_pone, sizeof(float) * 1,
(void)hipMemcpy(float_pone_, &float_pone, sizeof(float) * 1,
hipMemcpyHostToDevice);
hipMalloc(&double_pone_, sizeof(double) * 1);
(void)hipMalloc(&double_pone_, sizeof(double) * 1);
double double_pone = 1.0;
hipMemcpy(double_pone_, &double_pone, sizeof(double) * 1,
(void)hipMemcpy(double_pone_, &double_pone, sizeof(double) * 1,
hipMemcpyHostToDevice);
hipMalloc(&complex64_pone_, sizeof(hipComplex) * 1);
(void)hipMalloc(&complex64_pone_, sizeof(hipComplex) * 1);
hipComplex complex64_pone = make_hipFloatComplex(1.0f, 0.0f);
hipMemcpy(complex64_pone_, &complex64_pone, sizeof(hipComplex) * 1,
(void)hipMemcpy(complex64_pone_, &complex64_pone, sizeof(hipComplex) * 1,
hipMemcpyHostToDevice);
hipMalloc(&complex128_pone_, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&complex128_pone_, sizeof(hipDoubleComplex) * 1);
hipDoubleComplex complex128_pone = make_hipDoubleComplex(1.0, 0.0);
hipMemcpy(complex128_pone_, &complex128_pone, sizeof(hipDoubleComplex) * 1,
(void)hipMemcpy(complex128_pone_, &complex128_pone, sizeof(hipDoubleComplex) * 1,
hipMemcpyHostToDevice);

// Allocate custom factors and default to zero
hipMalloc(&custom_alpha_, sizeof(hipDoubleComplex) * 1);
hipMemset(custom_alpha_, 0, sizeof(hipDoubleComplex) * 1);
hipMalloc(&custom_beta_, sizeof(hipDoubleComplex) * 1);
hipMemset(custom_beta_, 0, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&custom_alpha_, sizeof(hipDoubleComplex) * 1);
(void)hipMemset(custom_alpha_, 0, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&custom_beta_, sizeof(hipDoubleComplex) * 1);
(void)hipMemset(custom_beta_, 0, sizeof(hipDoubleComplex) * 1);
}

_RocblasConstants(_RocblasConstants const&) = delete;

~_RocblasConstants() {
hipFree(zero_);
hipFree(half_pone_);
hipFree(float_pone_);
hipFree(double_pone_);
hipFree(complex64_pone_);
hipFree(complex128_pone_);
hipFree(custom_alpha_);
hipFree(custom_beta_);
(void)hipFree(zero_);
(void)hipFree(half_pone_);
(void)hipFree(float_pone_);
(void)hipFree(double_pone_);
(void)hipFree(complex64_pone_);
(void)hipFree(complex128_pone_);
(void)hipFree(custom_alpha_);
(void)hipFree(custom_beta_);
}

_RocblasConstants& operator=(_RocblasConstants const&) = delete;
16 changes: 8 additions & 8 deletions dace/libraries/blas/nodes/gemm.py
@@ -184,11 +184,11 @@ def expansion(node, state, sdfg):
code = ''
if dtype in (dace.complex64, dace.complex128):
code = f'''
{dtype.ctype} alpha = {alpha};
{dtype.ctype} beta = {beta};
{dtype.ctype} __alpha = {alpha};
{dtype.ctype} __beta = {beta};
'''
opt['alpha'] = '&alpha'
opt['beta'] = '&beta'
opt['alpha'] = '&__alpha'
opt['beta'] = '&__beta'

code += ("cblas_{func}(CblasColMajor, {ta}, {tb}, "
"{M}, {N}, {K}, {alpha}, {x}, {lda}, {y}, {ldb}, {beta}, "
@@ -287,12 +287,12 @@ def expansion(cls, node, state, sdfg):

# Set pointer mode to host
call_prefix += f'''{cls.set_pointer_mode}(__dace_{cls.backend}blas_handle, {cls.pointer_host});
{dtype.ctype} alpha = {alpha};
{dtype.ctype} beta = {beta};
{dtype.ctype} __alpha = {alpha};
{dtype.ctype} __beta = {beta};
'''
call_suffix += f'''{cls.set_pointer_mode}(__dace_{cls.backend}blas_handle, {cls.pointer_device});'''
alpha = f'({cdtype} *)&alpha'
beta = f'({cdtype} *)&beta'
alpha = f'({cdtype} *)&__alpha'
beta = f'({cdtype} *)&__beta'
else:
alpha = constants[node.alpha]
beta = constants[node.beta]
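The host-side scalar temporaries gain a double-underscore prefix, presumably so the generated C variables cannot collide with other names in the surrounding expansion (such as a symbol or connector called alpha). A small illustrative sketch of how the prologue string is assembled; the concrete values here are made up:

    # Illustrative inputs; in the expansion these come from the node's
    # alpha/beta properties and the data type of the operands.
    ctype = 'cuDoubleComplex'
    alpha = 'make_cuDoubleComplex(1, 0)'
    beta = 'make_cuDoubleComplex(0, 0)'

    prologue = f'''
    {ctype} __alpha = {alpha};
    {ctype} __beta = {beta};
    '''
    # The BLAS call then receives '&__alpha' / '&__beta' rather than '&alpha' / '&beta'.
    print(prologue)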
2 changes: 1 addition & 1 deletion dace/libraries/lapack/environments/cusolverdn.py
@@ -24,7 +24,7 @@ class cuSolverDn:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
6 changes: 4 additions & 2 deletions dace/libraries/lapack/include/dace_cusolverdn.h
@@ -21,8 +21,10 @@ static void CheckCusolverDnError(cusolverStatus_t const& status) {
}

static cusolverDnHandle_t CreateCusolverDnHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cusolverDnHandle_t handle;
CheckCusolverDnError(cusolverDnCreate(&handle));
2 changes: 1 addition & 1 deletion dace/libraries/linalg/environments/cutensor.py
@@ -24,7 +24,7 @@ class cuTensor:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
6 changes: 4 additions & 2 deletions dace/libraries/linalg/include/dace_cutensor.h
@@ -20,8 +20,10 @@ static void CheckCuTensorError(cutensorStatus_t const& status) {
}

static cutensorHandle_t CreateCuTensorHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cutensorHandle_t handle;
CheckCuTensorError(cutensorInit(&handle));
2 changes: 1 addition & 1 deletion dace/libraries/sparse/environments/cusparse.py
@@ -24,7 +24,7 @@ class cuSPARSE:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
6 changes: 4 additions & 2 deletions dace/libraries/sparse/include/dace_cusparse.h
@@ -20,8 +20,10 @@ static void CheckCusparseError(cusparseStatus_t const& status) {
}

static cusparseHandle_t CreateCusparseHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cusparseHandle_t handle;
CheckCusparseError(cusparseCreate(&handle));
