From 602220eb7fab11fbf9190c7db4568a3371ff1ab7 Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Tue, 29 Aug 2023 20:06:19 -0700
Subject: [PATCH 1/7] Codegen: Make thread/block index type configurable

---
 dace/codegen/targets/cuda.py | 15 +++++++++++----
 dace/config_schema.yml       | 11 +++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index ee49f04d03..a465d2bbc0 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1939,6 +1939,13 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
                               kernel_params: list, function_stream: CodeIOStream, kernel_stream: CodeIOStream):
         node = dfg_scope.source_nodes()[0]
 
+        # Get the thread/block index type
+        ttype = Config.get('compiler', 'cuda', 'thread_id_type')
+        tidtype = getattr(dtypes, ttype, False)
+        if not isinstance(tidtype, dtypes.typeclass):
+            raise ValueError(f'Configured type "{ttype}" for ``thread_id_type`` does not match any DaCe data type. '
+                             'See ``dace.dtypes`` for available types (for example ``int32``).')
+
         # allocating shared memory for dynamic threadblock maps
         if has_dtbmap:
             kernel_stream.write(
@@ -1990,8 +1997,8 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
 
                     expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr)
 
-                    kernel_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, node)
-                    self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int')
+                    kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node)
+                    self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype)
 
                 # Delinearize beyond the third dimension
                 if len(krange) > 3:
@@ -2010,8 +2017,8 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
                     )
                     expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr)
 
-                    kernel_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, node)
-                    self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int')
+                    kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node)
+                    self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype)
 
         # Dispatch internal code
         assert CUDACodeGen._in_device_code is False
diff --git a/dace/config_schema.yml b/dace/config_schema.yml
index e378b6c1f2..08a427aa52 100644
--- a/dace/config_schema.yml
+++ b/dace/config_schema.yml
@@ -413,6 +413,17 @@ required:
                 a specified larger block size in the third dimension. Default value is
                 derived from hardware limits on common GPUs.
 
+        thread_id_type:
+            type: str
+            title: Thread/block index data type
+            default: int32
+            description: >
+                Defines the data type for a thread and block index in the generated code.
+                The type is based on the type-classes in ``dace.dtypes``. For example,
+                ``uint64`` is equivalent to ``dace.uint64``. Change this setting when large
+                index types are needed to address memory offsets that are beyond the 32-bit
+                range, or to reduce memory usage.
+
 
 #############################################
 # General FPGA flags
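
[Review note, illustrative sketch, not part of the patches]
The new setting is read through the DaCe configuration API at code generation
time. A minimal usage sketch, assuming the usual Config.set key-hierarchy call;
the 'int64' value below is illustrative:

    import dace
    from dace.config import Config

    # Use 64-bit thread/block indices for kernels whose flattened index
    # may exceed the 32-bit range; the value must name a dace.dtypes type.
    Config.set('compiler', 'cuda', 'thread_id_type', value='int64')

    # A name that is not a dace.dtypes typeclass (e.g. 'long long') triggers
    # the ValueError introduced above when the kernel scope is generated.
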
From 5f6e371f2905b835da8f594db94bb7b44b0305da Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Tue, 29 Aug 2023 20:06:46 -0700
Subject: [PATCH 2/7] Rename alpha/beta in library node to avoid clashes

---
 dace/libraries/blas/nodes/gemm.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/dace/libraries/blas/nodes/gemm.py b/dace/libraries/blas/nodes/gemm.py
index 2db2055ae5..83be99d78b 100644
--- a/dace/libraries/blas/nodes/gemm.py
+++ b/dace/libraries/blas/nodes/gemm.py
@@ -184,11 +184,11 @@ def expansion(node, state, sdfg):
         code = ''
         if dtype in (dace.complex64, dace.complex128):
             code = f'''
-            {dtype.ctype} alpha = {alpha};
-            {dtype.ctype} beta = {beta};
+            {dtype.ctype} __alpha = {alpha};
+            {dtype.ctype} __beta = {beta};
             '''
-            opt['alpha'] = '&alpha'
-            opt['beta'] = '&beta'
+            opt['alpha'] = '&__alpha'
+            opt['beta'] = '&__beta'
 
         code += ("cblas_{func}(CblasColMajor, {ta}, {tb}, "
                  "{M}, {N}, {K}, {alpha}, {x}, {lda}, {y}, {ldb}, {beta}, "
@@ -287,12 +287,12 @@ def expansion(cls, node, state, sdfg):
 
             # Set pointer mode to host
             call_prefix += f'''{cls.set_pointer_mode}(__dace_{cls.backend}blas_handle, {cls.pointer_host});
-            {dtype.ctype} alpha = {alpha};
-            {dtype.ctype} beta = {beta};
+            {dtype.ctype} __alpha = {alpha};
+            {dtype.ctype} __beta = {beta};
             '''
             call_suffix += f'''{cls.set_pointer_mode}(__dace_{cls.backend}blas_handle, {cls.pointer_device});'''
-            alpha = f'({cdtype} *)&alpha'
-            beta = f'({cdtype} *)&beta'
+            alpha = f'({cdtype} *)&__alpha'
+            beta = f'({cdtype} *)&__beta'
         else:
             alpha = constants[node.alpha]
             beta = constants[node.beta]

From acd58851e66ee561e3a60bef79719a9ca9f7ffaf Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Tue, 29 Aug 2023 20:56:08 -0700
Subject: [PATCH 3/7] Respect return type of get_external_memory_size

---
 dace/codegen/compiled_sdfg.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py
index 9ee0772eeb..22f95d01d7 100644
--- a/dace/codegen/compiled_sdfg.py
+++ b/dace/codegen/compiled_sdfg.py
@@ -287,6 +287,7 @@ def get_workspace_sizes(self) -> Dict[dtypes.StorageType, int]:
         result: Dict[dtypes.StorageType, int] = {}
         for storage in self.external_memory_types:
             func = self._lib.get_symbol(f'__dace_get_external_memory_size_{storage.name}')
+            func.restype = ctypes.c_size_t
             result[storage] = func(self._libhandle, *self._lastargs[1])
         return result
 

From 30fdcf7916f419bbb4484d8eac4342a302592705 Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Tue, 29 Aug 2023 20:56:36 -0700
Subject: [PATCH 4/7] Handle large integer values in C code generation

---
 dace/codegen/cppunparse.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/dace/codegen/cppunparse.py b/dace/codegen/cppunparse.py
index eae0ed229e..31dae08f79 100644
--- a/dace/codegen/cppunparse.py
+++ b/dace/codegen/cppunparse.py
@@ -78,6 +78,7 @@
 import numpy as np
 import os
 import tokenize
+import warnings
 import sympy
 import dace
@@ -733,6 +734,21 @@ def _Num(self, t):
         if isinstance(t.n, complex):
             dtype = dtypes.DTYPE_TO_TYPECLASS[complex]
 
+        # Handle large integer values
+        if isinstance(t.n, int):
+            bits = t.n.bit_length()
+            if bits == 32:  # Integer, potentially unsigned
+                if t.n >= 0:  # unsigned
+                    repr_n += 'U'
+                else:  # signed, 64-bit
+                    repr_n += 'LL'
+            elif 32 < bits <= 63:
+                repr_n += 'LL'
+            elif bits == 64 and t.n >= 0:
+                repr_n += 'ULL'
+            elif bits >= 64:
+                warnings.warn(f'Value wider than 64 bits encountered in expression ({t.n}), '
+                              'emitting as-is')
+
         if repr_n.endswith("j"):
             self.write("%s(0, %s)" % (dtype, repr_n.replace("inf", INFSTR)[:-1]))
         else:
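
[Review note, illustrative sketch, not part of the patches]
A quick self-contained check of the literal-suffix rules introduced in PATCH 4,
driven by Python's int.bit_length() exactly as in the hunk above; the values
are illustrative:

    # Expected C literal suffixes per the bit_length-based rules:
    examples = {
        2**31 - 1: '',     # 31 bits, fits a signed 32-bit int: no suffix
        2**31:     'U',    # 32 bits, non-negative: unsigned 32-bit
        -2**31:    'LL',   # bit_length 32 but negative: promote to signed 64-bit
        2**63 - 1: 'LL',   # 63 bits: signed 64-bit
        2**63:     'ULL',  # 64 bits, non-negative: unsigned 64-bit
        2**64:     None,   # wider than 64 bits: warning, emitted as-is
    }
    for value, suffix in examples.items():
        print(value, value.bit_length(), suffix)
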
From 8a8744e1b55f3f3ddae1c162f645eed6f839ac4d Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Wed, 30 Aug 2023 11:28:46 -0700
Subject: [PATCH 5/7] Environments: Work well with external libraries that set
 their own GPU device

---
 dace/libraries/blas/environments/cublas.py    |  2 +-
 dace/libraries/blas/environments/rocblas.py   |  2 +-
 dace/libraries/blas/include/dace_cublas.h     | 12 ++--
 dace/libraries/blas/include/dace_rocblas.h    | 60 ++++++++++---------
 .../lapack/environments/cusolverdn.py         |  2 +-
 .../lapack/include/dace_cusolverdn.h          |  6 +-
 .../libraries/linalg/environments/cutensor.py |  2 +-
 dace/libraries/linalg/include/dace_cutensor.h |  6 +-
 .../libraries/sparse/environments/cusparse.py |  2 +-
 dace/libraries/sparse/include/dace_cusparse.h |  6 +-
 10 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/dace/libraries/blas/environments/cublas.py b/dace/libraries/blas/environments/cublas.py
index d4ab879e61..ef73b511c0 100644
--- a/dace/libraries/blas/environments/cublas.py
+++ b/dace/libraries/blas/environments/cublas.py
@@ -25,7 +25,7 @@ class cuBLAS:
     def handle_setup_code(node):
         location = node.location
         if not location or "gpu" not in node.location:
-            location = 0
+            location = -1  # -1 means current device
         else:
             try:
                 location = int(location["gpu"])
diff --git a/dace/libraries/blas/environments/rocblas.py b/dace/libraries/blas/environments/rocblas.py
index 5d752ed690..47e16531ff 100644
--- a/dace/libraries/blas/environments/rocblas.py
+++ b/dace/libraries/blas/environments/rocblas.py
@@ -25,7 +25,7 @@ class rocBLAS:
     def handle_setup_code(node):
         location = node.location
         if not location or "gpu" not in node.location:
-            location = 0
+            location = -1  # -1 means current device
         else:
             try:
                 location = int(location["gpu"])
diff --git a/dace/libraries/blas/include/dace_cublas.h b/dace/libraries/blas/include/dace_cublas.h
index 8ec03c2b37..3547a009d2 100644
--- a/dace/libraries/blas/include/dace_cublas.h
+++ b/dace/libraries/blas/include/dace_cublas.h
@@ -21,8 +21,10 @@ static void CheckCublasError(cublasStatus_t const& status) {
 }
 
 static cublasHandle_t CreateCublasHandle(int device) {
-  if (cudaSetDevice(device) != cudaSuccess) {
-    throw std::runtime_error("Failed to set CUDA device.");
+  if (device >= 0) {
+    if (cudaSetDevice(device) != cudaSuccess) {
+      throw std::runtime_error("Failed to set CUDA device.");
+    }
   }
   cublasHandle_t handle;
   CheckCublasError(cublasCreate(&handle));
@@ -65,8 +67,10 @@ class _CublasConstants {
   }
 
   _CublasConstants(int device) {
-    if (cudaSetDevice(device) != cudaSuccess) {
-      throw std::runtime_error("Failed to set CUDA device.");
+    if (device >= 0) {
+      if (cudaSetDevice(device) != cudaSuccess) {
+        throw std::runtime_error("Failed to set CUDA device.");
+      }
     }
     // Allocate constant zero with the largest used size
     cudaMalloc(&zero_, sizeof(cuDoubleComplex) * 1);
diff --git a/dace/libraries/blas/include/dace_rocblas.h b/dace/libraries/blas/include/dace_rocblas.h
index 7a7e4a75ee..00469136a3 100644
--- a/dace/libraries/blas/include/dace_rocblas.h
+++ b/dace/libraries/blas/include/dace_rocblas.h
@@ -24,8 +24,10 @@ static void CheckRocblasError(rocblas_status const& status) {
 }
 
 static rocblas_handle CreateRocblasHandle(int device) {
-  if (hipSetDevice(device) != hipSuccess) {
-    throw std::runtime_error("Failed to set HIP device.");
+  if (device >= 0) {
+    if (hipSetDevice(device) != hipSuccess) {
+      throw std::runtime_error("Failed to set HIP device.");
+    }
   }
   rocblas_handle handle;
   CheckRocblasError(rocblas_create_handle(&handle));
@@ -68,53 +70,55 @@ class _RocblasConstants {
   }
 
   _RocblasConstants(int device) {
-    if (hipSetDevice(device) != hipSuccess) {
-      throw std::runtime_error("Failed to set HIP device.");
+    if (device >= 0) {
+      if (hipSetDevice(device) != hipSuccess) {
+        throw std::runtime_error("Failed to set HIP device.");
+      }
     }
     // Allocate constant zero with the largest used size
-    hipMalloc(&zero_, sizeof(hipDoubleComplex) * 1);
-    hipMemset(zero_, 0, sizeof(hipDoubleComplex) * 1);
+    (void)hipMalloc(&zero_, sizeof(hipDoubleComplex) * 1);
+    (void)hipMemset(zero_, 0, sizeof(hipDoubleComplex) * 1);
     // Allocate constant one
-    hipMalloc(&half_pone_, sizeof(__half) * 1);
+    (void)hipMalloc(&half_pone_, sizeof(__half) * 1);
     __half half_pone = __float2half(1.0f);
-    hipMemcpy(half_pone_, &half_pone, sizeof(__half) * 1,
+    (void)hipMemcpy(half_pone_, &half_pone, sizeof(__half) * 1,
               hipMemcpyHostToDevice);
-    hipMalloc(&float_pone_, sizeof(float) * 1);
+    (void)hipMalloc(&float_pone_, sizeof(float) * 1);
     float float_pone = 1.0f;
-    hipMemcpy(float_pone_, &float_pone, sizeof(float) * 1,
+    (void)hipMemcpy(float_pone_, &float_pone, sizeof(float) * 1,
              hipMemcpyHostToDevice);
-    hipMalloc(&double_pone_, sizeof(double) * 1);
+    (void)hipMalloc(&double_pone_, sizeof(double) * 1);
     double double_pone = 1.0;
-    hipMemcpy(double_pone_, &double_pone, sizeof(double) * 1,
+    (void)hipMemcpy(double_pone_, &double_pone, sizeof(double) * 1,
              hipMemcpyHostToDevice);
-    hipMalloc(&complex64_pone_, sizeof(hipComplex) * 1);
+    (void)hipMalloc(&complex64_pone_, sizeof(hipComplex) * 1);
     hipComplex complex64_pone = make_hipFloatComplex(1.0f, 0.0f);
-    hipMemcpy(complex64_pone_, &complex64_pone, sizeof(hipComplex) * 1,
+    (void)hipMemcpy(complex64_pone_, &complex64_pone, sizeof(hipComplex) * 1,
              hipMemcpyHostToDevice);
-    hipMalloc(&complex128_pone_, sizeof(hipDoubleComplex) * 1);
+    (void)hipMalloc(&complex128_pone_, sizeof(hipDoubleComplex) * 1);
     hipDoubleComplex complex128_pone = make_hipDoubleComplex(1.0, 0.0);
-    hipMemcpy(complex128_pone_, &complex128_pone, sizeof(hipDoubleComplex) * 1,
+    (void)hipMemcpy(complex128_pone_, &complex128_pone, sizeof(hipDoubleComplex) * 1,
              hipMemcpyHostToDevice);
     // Allocate custom factors and default to zero
-    hipMalloc(&custom_alpha_, sizeof(hipDoubleComplex) * 1);
-    hipMemset(custom_alpha_, 0, sizeof(hipDoubleComplex) * 1);
-    hipMalloc(&custom_beta_, sizeof(hipDoubleComplex) * 1);
-    hipMemset(custom_beta_, 0, sizeof(hipDoubleComplex) * 1);
+    (void)hipMalloc(&custom_alpha_, sizeof(hipDoubleComplex) * 1);
+    (void)hipMemset(custom_alpha_, 0, sizeof(hipDoubleComplex) * 1);
+    (void)hipMalloc(&custom_beta_, sizeof(hipDoubleComplex) * 1);
+    (void)hipMemset(custom_beta_, 0, sizeof(hipDoubleComplex) * 1);
   }
 
   _RocblasConstants(_RocblasConstants const&) = delete;
 
   ~_RocblasConstants() {
-    hipFree(zero_);
-    hipFree(half_pone_);
-    hipFree(float_pone_);
-    hipFree(double_pone_);
-    hipFree(complex64_pone_);
-    hipFree(complex128_pone_);
-    hipFree(custom_alpha_);
-    hipFree(custom_beta_);
+    (void)hipFree(zero_);
+    (void)hipFree(half_pone_);
+    (void)hipFree(float_pone_);
+    (void)hipFree(double_pone_);
+    (void)hipFree(complex64_pone_);
+    (void)hipFree(complex128_pone_);
+    (void)hipFree(custom_alpha_);
+    (void)hipFree(custom_beta_);
   }
 
   _RocblasConstants& operator=(_RocblasConstants const&) = delete;
diff --git a/dace/libraries/lapack/environments/cusolverdn.py b/dace/libraries/lapack/environments/cusolverdn.py
index c92c8bf3e7..4daad8062e 100644
--- a/dace/libraries/lapack/environments/cusolverdn.py
+++ b/dace/libraries/lapack/environments/cusolverdn.py
@@ -24,7 +24,7 @@ class cuSolverDn:
     def handle_setup_code(node):
         location = node.location
         if not location or "gpu" not in node.location:
-            location = 0
+            location = -1  # -1 means current device
         else:
             try:
                 location = int(location["gpu"])
diff --git a/dace/libraries/lapack/include/dace_cusolverdn.h b/dace/libraries/lapack/include/dace_cusolverdn.h
index 2da65ffa2f..f262541f0b 100644
--- a/dace/libraries/lapack/include/dace_cusolverdn.h
+++ b/dace/libraries/lapack/include/dace_cusolverdn.h
@@ -21,8 +21,10 @@ static void CheckCusolverDnError(cusolverStatus_t const& status) {
 }
 
 static cusolverDnHandle_t CreateCusolverDnHandle(int device) {
-  if (cudaSetDevice(device) != cudaSuccess) {
-    throw std::runtime_error("Failed to set CUDA device.");
+  if (device >= 0) {
+    if (cudaSetDevice(device) != cudaSuccess) {
+      throw std::runtime_error("Failed to set CUDA device.");
+    }
   }
   cusolverDnHandle_t handle;
   CheckCusolverDnError(cusolverDnCreate(&handle));
diff --git a/dace/libraries/linalg/environments/cutensor.py b/dace/libraries/linalg/environments/cutensor.py
index e3572a0673..0022ec1f57 100644
--- a/dace/libraries/linalg/environments/cutensor.py
+++ b/dace/libraries/linalg/environments/cutensor.py
@@ -24,7 +24,7 @@ class cuTensor:
    def handle_setup_code(node):
         location = node.location
         if not location or "gpu" not in node.location:
-            location = 0
+            location = -1  # -1 means current device
         else:
             try:
                 location = int(location["gpu"])
diff --git a/dace/libraries/linalg/include/dace_cutensor.h b/dace/libraries/linalg/include/dace_cutensor.h
index 8079892285..ddad2feaa3 100644
--- a/dace/libraries/linalg/include/dace_cutensor.h
+++ b/dace/libraries/linalg/include/dace_cutensor.h
@@ -20,8 +20,10 @@ static void CheckCuTensorError(cutensorStatus_t const& status) {
 }
 
 static cutensorHandle_t CreateCuTensorHandle(int device) {
-  if (cudaSetDevice(device) != cudaSuccess) {
-    throw std::runtime_error("Failed to set CUDA device.");
+  if (device >= 0) {
+    if (cudaSetDevice(device) != cudaSuccess) {
+      throw std::runtime_error("Failed to set CUDA device.");
+    }
   }
   cutensorHandle_t handle;
   CheckCuTensorError(cutensorInit(&handle));
diff --git a/dace/libraries/sparse/environments/cusparse.py b/dace/libraries/sparse/environments/cusparse.py
index 0970557944..a731f75bf7 100644
--- a/dace/libraries/sparse/environments/cusparse.py
+++ b/dace/libraries/sparse/environments/cusparse.py
@@ -24,7 +24,7 @@ class cuSPARSE:
     def handle_setup_code(node):
         location = node.location
         if not location or "gpu" not in node.location:
-            location = 0
+            location = -1  # -1 means current device
         else:
             try:
                 location = int(location["gpu"])
diff --git a/dace/libraries/sparse/include/dace_cusparse.h b/dace/libraries/sparse/include/dace_cusparse.h
index 82470089e0..9d28bb4748 100644
--- a/dace/libraries/sparse/include/dace_cusparse.h
+++ b/dace/libraries/sparse/include/dace_cusparse.h
@@ -20,8 +20,10 @@ static void CheckCusparseError(cusparseStatus_t const& status) {
 }
 
 static cusparseHandle_t CreateCusparseHandle(int device) {
-  if (cudaSetDevice(device) != cudaSuccess) {
-    throw std::runtime_error("Failed to set CUDA device.");
+  if (device >= 0) {
+    if (cudaSetDevice(device) != cudaSuccess) {
+      throw std::runtime_error("Failed to set CUDA device.");
+    }
   }
   cusparseHandle_t handle;
   CheckCusparseError(cusparseCreate(&handle));
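
[Review note, illustrative sketch, not part of the patches]
PATCH 5 threads a -1 sentinel through every handle constructor: a negative
device id now means "keep whichever GPU the caller, or an external library,
already made current" instead of forcing device 0. The Python-side dispatch
shared by all five environments reduces to the sketch below; resolve_location
is a hypothetical helper name, the logic mirrors the diffs above:

    def resolve_location(node_location: dict) -> int:
        # No explicit 'gpu' entry on the node: return -1, i.e. skip
        # cudaSetDevice/hipSetDevice and respect the current device.
        if not node_location or 'gpu' not in node_location:
            return -1
        # Explicit placement: the handle constructor sets this device first.
        return int(node_location['gpu'])
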
From 3e9390937f2823f96eb4a960930b0babe4cf3224 Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Thu, 7 Sep 2023 14:02:46 -0700
Subject: [PATCH 6/7] cppunparse: Dispatch constants after applying the operation

---
 dace/codegen/cppunparse.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/dace/codegen/cppunparse.py b/dace/codegen/cppunparse.py
index 31dae08f79..1121aa9f42 100644
--- a/dace/codegen/cppunparse.py
+++ b/dace/codegen/cppunparse.py
@@ -847,8 +847,16 @@ def _Tuple(
         self.write(")")
 
     unop = {"Invert": "~", "Not": "!", "UAdd": "+", "USub": "-"}
+    unop_lambda = {'Invert': (lambda x: ~x), 'Not': (lambda x: not x), 'UAdd': (lambda x: +x), 'USub': (lambda x: -x)}
 
     def _UnaryOp(self, t):
+        # Dispatch constants after applying the operation
+        if t.operand.__class__.__name__ in ('Constant', 'Num'):
+            newval = self.unop_lambda[t.op.__class__.__name__](t.operand.n)
+            newnode = ast.Constant(value=newval)
+            self.dispatch(newnode)
+            return
+
         self.write("(")
         self.write(self.unop[t.op.__class__.__name__])
         self.write(" ")

From e4322d2eeeb8561f2ef99cc305c44737337af183 Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Thu, 7 Sep 2023 14:13:16 -0700
Subject: [PATCH 7/7] Fix for Python version compatibility

---
 dace/codegen/cppunparse.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/dace/codegen/cppunparse.py b/dace/codegen/cppunparse.py
index 1121aa9f42..77dd34d478 100644
--- a/dace/codegen/cppunparse.py
+++ b/dace/codegen/cppunparse.py
@@ -851,11 +851,18 @@ def _Tuple(
 
     def _UnaryOp(self, t):
         # Dispatch constants after applying the operation
-        if t.operand.__class__.__name__ in ('Constant', 'Num'):
-            newval = self.unop_lambda[t.op.__class__.__name__](t.operand.n)
-            newnode = ast.Constant(value=newval)
-            self.dispatch(newnode)
-            return
+        if sys.version_info[:2] < (3, 8):
+            if isinstance(t.operand, ast.Num):
+                newval = self.unop_lambda[t.op.__class__.__name__](t.operand.n)
+                newnode = ast.Num(n=newval)
+                self.dispatch(newnode)
+                return
+        else:
+            if isinstance(t.operand, ast.Constant):
+                newval = self.unop_lambda[t.op.__class__.__name__](t.operand.value)
+                newnode = ast.Constant(value=newval)
+                self.dispatch(newnode)
+                return
 
         self.write("(")
         self.write(self.unop[t.op.__class__.__name__])
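
[Review note, illustrative sketch, not part of the patches]
PATCHES 6 and 7 matter because of PATCH 4: folding a unary minus into the
constant before dispatch lets _Num see the signed value, so -(2**31) receives
the 'LL' suffix instead of being printed as a negated unsigned literal such as
(- 2147483648U). A minimal reproduction of the folding step, assuming
Python 3.8+ where the operand is an ast.Constant:

    import ast

    unop_lambda = {'Invert': (lambda x: ~x), 'Not': (lambda x: not x),
                   'UAdd': (lambda x: +x), 'USub': (lambda x: -x)}

    node = ast.parse('-2147483648', mode='eval').body  # UnaryOp(USub, Constant(2147483648))
    folded = unop_lambda[node.op.__class__.__name__](node.operand.value)
    print(folded, folded.bit_length())                 # -2147483648 32 -> emitted with 'LL'
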