Merge pull request #1357 from spcl/configurable-tid-type

Make thread/block index type configurable

tbennun authored Sep 8, 2023
2 parents f95f816 + e4322d2 commit 66e0e65
Showing 15 changed files with 119 additions and 55 deletions.
1 change: 1 addition & 0 deletions dace/codegen/compiled_sdfg.py
@@ -287,6 +287,7 @@ def get_workspace_sizes(self) -> Dict[dtypes.StorageType, int]:
result: Dict[dtypes.StorageType, int] = {}
for storage in self.external_memory_types:
func = self._lib.get_symbol(f'__dace_get_external_memory_size_{storage.name}')
func.restype = ctypes.c_size_t
result[storage] = func(self._libhandle, *self._lastargs[1])

return result
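Note on the change above: ctypes assumes a C int return type unless restype is set, so without this line a 64-bit size_t result would be truncated on the Python side. A minimal standalone sketch of the pattern, using a hypothetical shared library and symbol name:

    import ctypes

    # Hypothetical library and exported symbol, for illustration only.
    lib = ctypes.CDLL('./libexample.so')
    func = lib.get_workspace_size  # assumed symbol name
    # Without an explicit restype, ctypes truncates the result to a 32-bit int;
    # declaring it as size_t keeps workspace sizes above 2**31 - 1 intact.
    func.restype = ctypes.c_size_t
    size = func()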
31 changes: 31 additions & 0 deletions dace/codegen/cppunparse.py
@@ -78,6 +78,7 @@
import numpy as np
import os
import tokenize
import warnings

import sympy
import dace
@@ -733,6 +734,21 @@ def _Num(self, t):
if isinstance(t.n, complex):
dtype = dtypes.DTYPE_TO_TYPECLASS[complex]

# Handle large integer values
if isinstance(t.n, int):
bits = t.n.bit_length()
if bits == 32: # Integer, potentially unsigned
if t.n >= 0: # unsigned
repr_n += 'U'
else: # signed, 64-bit
repr_n += 'LL'
elif 32 < bits <= 63:
repr_n += 'LL'
elif bits == 64 and t.n >= 0:
repr_n += 'ULL'
elif bits >= 64:
warnings.warn(f'Value wider than 64 bits encountered in expression ({t.n}), emitting as-is')

if repr_n.endswith("j"):
self.write("%s(0, %s)" % (dtype, repr_n.replace("inf", INFSTR)[:-1]))
else:
@@ -831,8 +847,23 @@ def _Tuple(
self.write(")")

unop = {"Invert": "~", "Not": "!", "UAdd": "+", "USub": "-"}
unop_lambda = {'Invert': (lambda x: ~x), 'Not': (lambda x: not x), 'UAdd': (lambda x: +x), 'USub': (lambda x: -x)}

def _UnaryOp(self, t):
# Dispatch constants after applying the operation
if sys.version_info[:2] < (3, 8):
if isinstance(t.operand, ast.Num):
newval = self.unop_lambda[t.op.__class__.__name__](t.operand.n)
newnode = ast.Num(n=newval)
self.dispatch(newnode)
return
else:
if isinstance(t.operand, ast.Constant):
newval = self.unop_lambda[t.op.__class__.__name__](t.operand.value)
newnode = ast.Constant(value=newval)
self.dispatch(newnode)
return

self.write("(")
self.write(self.unop[t.op.__class__.__name__])
self.write(" ")
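The new block in _Num chooses a C integer-literal suffix from the constant's bit length, and _UnaryOp now folds constants before printing them. A standalone sketch (not the DaCe implementation itself) of the suffix rule, with a few example values:

    def literal_suffix(value: int) -> str:
        # Mirrors the rules above: exactly 32 bits and non-negative -> 'U',
        # 33-63 bits -> 'LL', exactly 64 bits and non-negative -> 'ULL';
        # anything wider is emitted as-is (DaCe warns in that case).
        bits = value.bit_length()
        if bits == 32:
            return 'U' if value >= 0 else 'LL'
        if 32 < bits <= 63:
            return 'LL'
        if bits == 64 and value >= 0:
            return 'ULL'
        return ''

    print(literal_suffix(2**31))      # 'U'   (bit_length == 32)
    print(literal_suffix(-(2**35)))   # 'LL'  (needs a 64-bit signed type)
    print(literal_suffix(2**64 - 1))  # 'ULL' (bit_length == 64)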
15 changes: 11 additions & 4 deletions dace/codegen/targets/cuda.py
@@ -1939,6 +1939,13 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
kernel_params: list, function_stream: CodeIOStream, kernel_stream: CodeIOStream):
node = dfg_scope.source_nodes()[0]

# Get the thread/block index type
ttype = Config.get('compiler', 'cuda', 'thread_id_type')
tidtype = getattr(dtypes, ttype, False)
if not isinstance(tidtype, dtypes.typeclass):
raise ValueError(f'Configured type "{ttype}" for ``thread_id_type`` does not match any DaCe data type. '
'See ``dace.dtypes`` for available types (for example ``int32``).')

# allocating shared memory for dynamic threadblock maps
if has_dtbmap:
kernel_stream.write(
@@ -1990,8 +1997,8 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_

expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr)

kernel_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int')
kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype)

# Delinearize beyond the third dimension
if len(krange) > 3:
@@ -2010,8 +2017,8 @@ def generate_kernel_scope(sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
)

expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr)
kernel_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int')
kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype)

# Dispatch internal code
assert CUDACodeGen._in_device_code is False
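The kernel scope generator resolves the configured name to a dace.dtypes typeclass and emits its C type for the block/thread index variables instead of a hard-coded int. A minimal sketch of that lookup (the printed C type string is indicative):

    from dace import dtypes
    from dace.config import Config

    # Resolve the configured name (e.g. 'int32', the default, or 'int64') to a
    # typeclass; anything that is not a typeclass raises, as in the code above.
    ttype = Config.get('compiler', 'cuda', 'thread_id_type')
    tidtype = getattr(dtypes, ttype, False)
    if not isinstance(tidtype, dtypes.typeclass):
        raise ValueError(f'Unknown thread/block index type: {ttype}')

    # tidtype.ctype is what ends up in the generated declaration
    # '<ctype> <varname> = <expr>;', e.g. 'int' for int32.
    print(tidtype.ctype)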
11 changes: 11 additions & 0 deletions dace/config_schema.yml
@@ -413,6 +413,17 @@ required:
a specified larger block size in the third dimension. Default value is
derived from hardware limits on common GPUs.
thread_id_type:
type: str
title: Thread/block index data type
default: int32
description: >
Defines the data type for a thread and block index in the generated code.
The type is based on the type-classes in ``dace.dtypes``. For example,
``uint64`` is equivalent to ``dace.uint64``. Change this setting when large
index types are needed to address memory offsets that are beyond the 32-bit
range, or to reduce memory usage.
#############################################
# General FPGA flags
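A hedged usage sketch for the new setting: selecting 64-bit indices from Python. The environment-variable spelling follows DaCe's usual DACE_<section>_<key> convention and is an assumption here.

    from dace.config import Config

    # Use 64-bit thread/block indices, e.g. when flattened index arithmetic
    # on very large grids could overflow a 32-bit int.
    Config.set('compiler', 'cuda', 'thread_id_type', value='int64')

    # Assumed equivalent via the environment:
    #   export DACE_compiler_cuda_thread_id_type=int64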
2 changes: 1 addition & 1 deletion dace/libraries/blas/environments/cublas.py
@@ -25,7 +25,7 @@ class cuBLAS:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
2 changes: 1 addition & 1 deletion dace/libraries/blas/environments/rocblas.py
@@ -25,7 +25,7 @@ class rocBLAS:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
12 changes: 8 additions & 4 deletions dace/libraries/blas/include/dace_cublas.h
@@ -21,8 +21,10 @@ static void CheckCublasError(cublasStatus_t const& status) {
}

static cublasHandle_t CreateCublasHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cublasHandle_t handle;
CheckCublasError(cublasCreate(&handle));
@@ -65,8 +67,10 @@ class _CublasConstants {
}

_CublasConstants(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
// Allocate constant zero with the largest used size
cudaMalloc(&zero_, sizeof(cuDoubleComplex) * 1);
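Taken together with the environment change above, a missing GPU assignment now reaches the handle helpers as -1, and cudaSetDevice() is only called for non-negative devices, so the currently active device is kept. A simplified Python sketch of the resolution logic (the helper name is illustrative):

    def resolve_device(location):
        # No explicit 'gpu' entry: return -1, meaning "keep the current device";
        # the C++ handle constructors skip cudaSetDevice() for device < 0.
        if not location or 'gpu' not in location:
            return -1
        return int(location['gpu'])

    print(resolve_device(None))        # -1
    print(resolve_device({'gpu': 2}))  # 2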
60 changes: 32 additions & 28 deletions dace/libraries/blas/include/dace_rocblas.h
@@ -24,8 +24,10 @@ static void CheckRocblasError(rocblas_status const& status) {
}

static rocblas_handle CreateRocblasHandle(int device) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
if (device >= 0) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
}
}
rocblas_handle handle;
CheckRocblasError(rocblas_create_handle(&handle));
@@ -68,53 +70,55 @@ class _RocblasConstants {
}

_RocblasConstants(int device) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
if (device >= 0) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
}
}
// Allocate constant zero with the largest used size
hipMalloc(&zero_, sizeof(hipDoubleComplex) * 1);
hipMemset(zero_, 0, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&zero_, sizeof(hipDoubleComplex) * 1);
(void)hipMemset(zero_, 0, sizeof(hipDoubleComplex) * 1);

// Allocate constant one
hipMalloc(&half_pone_, sizeof(__half) * 1);
(void)hipMalloc(&half_pone_, sizeof(__half) * 1);
__half half_pone = __float2half(1.0f);
hipMemcpy(half_pone_, &half_pone, sizeof(__half) * 1,
(void)hipMemcpy(half_pone_, &half_pone, sizeof(__half) * 1,
hipMemcpyHostToDevice);
hipMalloc(&float_pone_, sizeof(float) * 1);
(void)hipMalloc(&float_pone_, sizeof(float) * 1);
float float_pone = 1.0f;
hipMemcpy(float_pone_, &float_pone, sizeof(float) * 1,
(void)hipMemcpy(float_pone_, &float_pone, sizeof(float) * 1,
hipMemcpyHostToDevice);
hipMalloc(&double_pone_, sizeof(double) * 1);
(void)hipMalloc(&double_pone_, sizeof(double) * 1);
double double_pone = 1.0;
hipMemcpy(double_pone_, &double_pone, sizeof(double) * 1,
(void)hipMemcpy(double_pone_, &double_pone, sizeof(double) * 1,
hipMemcpyHostToDevice);
hipMalloc(&complex64_pone_, sizeof(hipComplex) * 1);
(void)hipMalloc(&complex64_pone_, sizeof(hipComplex) * 1);
hipComplex complex64_pone = make_hipFloatComplex(1.0f, 0.0f);
hipMemcpy(complex64_pone_, &complex64_pone, sizeof(hipComplex) * 1,
(void)hipMemcpy(complex64_pone_, &complex64_pone, sizeof(hipComplex) * 1,
hipMemcpyHostToDevice);
hipMalloc(&complex128_pone_, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&complex128_pone_, sizeof(hipDoubleComplex) * 1);
hipDoubleComplex complex128_pone = make_hipDoubleComplex(1.0, 0.0);
hipMemcpy(complex128_pone_, &complex128_pone, sizeof(hipDoubleComplex) * 1,
(void)hipMemcpy(complex128_pone_, &complex128_pone, sizeof(hipDoubleComplex) * 1,
hipMemcpyHostToDevice);

// Allocate custom factors and default to zero
hipMalloc(&custom_alpha_, sizeof(hipDoubleComplex) * 1);
hipMemset(custom_alpha_, 0, sizeof(hipDoubleComplex) * 1);
hipMalloc(&custom_beta_, sizeof(hipDoubleComplex) * 1);
hipMemset(custom_beta_, 0, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&custom_alpha_, sizeof(hipDoubleComplex) * 1);
(void)hipMemset(custom_alpha_, 0, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&custom_beta_, sizeof(hipDoubleComplex) * 1);
(void)hipMemset(custom_beta_, 0, sizeof(hipDoubleComplex) * 1);
}

_RocblasConstants(_RocblasConstants const&) = delete;

~_RocblasConstants() {
hipFree(zero_);
hipFree(half_pone_);
hipFree(float_pone_);
hipFree(double_pone_);
hipFree(complex64_pone_);
hipFree(complex128_pone_);
hipFree(custom_alpha_);
hipFree(custom_beta_);
(void)hipFree(zero_);
(void)hipFree(half_pone_);
(void)hipFree(float_pone_);
(void)hipFree(double_pone_);
(void)hipFree(complex64_pone_);
(void)hipFree(complex128_pone_);
(void)hipFree(custom_alpha_);
(void)hipFree(custom_beta_);
}

_RocblasConstants& operator=(_RocblasConstants const&) = delete;
16 changes: 8 additions & 8 deletions dace/libraries/blas/nodes/gemm.py
@@ -184,11 +184,11 @@ def expansion(node, state, sdfg):
code = ''
if dtype in (dace.complex64, dace.complex128):
code = f'''
{dtype.ctype} alpha = {alpha};
{dtype.ctype} beta = {beta};
{dtype.ctype} __alpha = {alpha};
{dtype.ctype} __beta = {beta};
'''
opt['alpha'] = '&alpha'
opt['beta'] = '&beta'
opt['alpha'] = '&__alpha'
opt['beta'] = '&__beta'

code += ("cblas_{func}(CblasColMajor, {ta}, {tb}, "
"{M}, {N}, {K}, {alpha}, {x}, {lda}, {y}, {ldb}, {beta}, "
@@ -287,12 +287,12 @@ def expansion(cls, node, state, sdfg):

# Set pointer mode to host
call_prefix += f'''{cls.set_pointer_mode}(__dace_{cls.backend}blas_handle, {cls.pointer_host});
{dtype.ctype} alpha = {alpha};
{dtype.ctype} beta = {beta};
{dtype.ctype} __alpha = {alpha};
{dtype.ctype} __beta = {beta};
'''
call_suffix += f'''{cls.set_pointer_mode}(__dace_{cls.backend}blas_handle, {cls.pointer_device});'''
alpha = f'({cdtype} *)&alpha'
beta = f'({cdtype} *)&beta'
alpha = f'({cdtype} *)&__alpha'
beta = f'({cdtype} *)&__beta'
else:
alpha = constants[node.alpha]
beta = constants[node.beta]
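The host-side scalar temporaries gain a double-underscore prefix, presumably so the generated C variables cannot collide with other names in the surrounding expansion (such as a symbol or connector called alpha). A small illustrative sketch of how the prologue string is assembled; the concrete values here are made up:

    # Illustrative inputs; in the expansion these come from the node's
    # alpha/beta properties and the data type of the operands.
    ctype = 'cuDoubleComplex'
    alpha = 'make_cuDoubleComplex(1, 0)'
    beta = 'make_cuDoubleComplex(0, 0)'

    prologue = f'''
    {ctype} __alpha = {alpha};
    {ctype} __beta = {beta};
    '''
    # The BLAS call then receives '&__alpha' / '&__beta' rather than '&alpha' / '&beta'.
    print(prologue)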
2 changes: 1 addition & 1 deletion dace/libraries/lapack/environments/cusolverdn.py
@@ -24,7 +24,7 @@ class cuSolverDn:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
6 changes: 4 additions & 2 deletions dace/libraries/lapack/include/dace_cusolverdn.h
@@ -21,8 +21,10 @@ static void CheckCusolverDnError(cusolverStatus_t const& status) {
}

static cusolverDnHandle_t CreateCusolverDnHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cusolverDnHandle_t handle;
CheckCusolverDnError(cusolverDnCreate(&handle));
2 changes: 1 addition & 1 deletion dace/libraries/linalg/environments/cutensor.py
@@ -24,7 +24,7 @@ class cuTensor:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
6 changes: 4 additions & 2 deletions dace/libraries/linalg/include/dace_cutensor.h
@@ -20,8 +20,10 @@ static void CheckCuTensorError(cutensorStatus_t const& status) {
}

static cutensorHandle_t CreateCuTensorHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cutensorHandle_t handle;
CheckCuTensorError(cutensorInit(&handle));
2 changes: 1 addition & 1 deletion dace/libraries/sparse/environments/cusparse.py
@@ -24,7 +24,7 @@ class cuSPARSE:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
6 changes: 4 additions & 2 deletions dace/libraries/sparse/include/dace_cusparse.h
@@ -20,8 +20,10 @@ static void CheckCusparseError(cusparseStatus_t const& status) {
}

static cusparseHandle_t CreateCusparseHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cusparseHandle_t handle;
CheckCusparseError(cusparseCreate(&handle));
