From 7cb93f29820c8ad8caba5f75122d912192050f0f Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 29 Oct 2024 16:40:23 -0700 Subject: [PATCH] Python frontend stability and inline storage specification (#1711) The PR adds a new syntax to support inline storage specification with the `@` operator, supporting the following statements: `a = np.ones(M) @ dace.StorageType.CPU_ThreadLocal`. This PR also fixes multiple minor issues in the Python frontend: * `WarpTiling` did not respect sequential map schedules * Non-sequence inputs for `numpy.fill` variants (e.g., `numpy.zeros(N)`) * NumPy replacement syntax errors would sometimes not have source information * Fix type inference for nested scopes in Python frontend * Dynamic thread block scheduling does not support multi-dimensional maps * Default schedule inference should use dynamic thread blocks if they exist * Type hints with storage type not being adhered to by the Python frontend * Validation issue #1562 The following changes were added as skipped tests and deferred to future PRs: * Dynamic map range related issues: Fix deferred to #1696 * Dynamic thread block scheduling would not pass object to nested functions: Fix deferred to future PR, see #1189 for more information --- dace/codegen/targets/cuda.py | 74 ++++++------ dace/codegen/tools/type_inference.py | 4 +- dace/dtypes.py | 2 - dace/frontend/python/newast.py | 31 +++-- dace/frontend/python/replacements.py | 114 ++++++++---------- dace/sdfg/infer_types.py | 10 +- dace/sdfg/sdfg.py | 8 +- dace/sdfg/validation.py | 3 +- dace/transformation/dataflow/warp_tiling.py | 4 + dace/transformation/helpers.py | 17 ++- tests/dynamic_tb_map_cudatest.py | 70 ++++++++++- tests/numpy/array_creation_test.py | 49 +++++++- tests/numpy/map_syntax_test.py | 52 ++++++++ .../device_annotations_test.py | 49 ++++++-- tests/sdfg/cycles_test.py | 19 ++- 15 files changed, 370 insertions(+), 136 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index f080f2cc62..1cf8919d74 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -23,8 +23,8 @@ from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute from dace.config import Config from dace.frontend import operations -from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, - is_array_stream_view, is_devicelevel_gpu, nodes, scope_contains_scope) +from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, is_array_stream_view, + is_devicelevel_gpu, nodes, scope_contains_scope) from dace.sdfg import utils as sdutil from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import ControlFlowRegion, StateSubgraphView @@ -68,6 +68,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): dispatcher = self._dispatcher self.create_grid_barrier = False + self.dynamic_tbmap_type = None self.extra_nsdfg_args = [] CUDACodeGen._in_device_code = False self._cpu_codegen: Optional['CPUCodeGen'] = None @@ -892,8 +893,8 @@ def increment(streams): return max_streams, max_events - def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, - dst_node: nodes.Node, dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, + def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, dst_node: nodes.Node, + dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, callsite_stream: CodeIOStream) -> None: u, uconn, v, vconn, memlet = edge @@ -1163,11 +1164,8 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St copysize=', '.join(_topy(copy_shape)), is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', accum=accum or '::Copy', - args=', '.join( - [src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction - ) - ), - cfg, state_id, [src_node, dst_node]) + args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + + custom_reduction)), cfg, state_id, [src_node, dst_node]) else: callsite_stream.write( (' {func}<{type}, {bdims}, {copysize}, ' + @@ -1236,8 +1234,12 @@ def _begin_streams(self, sdfg, state): result.add(e.dst._cuda_stream) return result - def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, - function_stream: CodeIOStream, callsite_stream: CodeIOStream, + def generate_state(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, generate_state_footer: bool = False) -> None: # Two modes: device-level state and if this state has active streams if CUDACodeGen._in_device_code: @@ -1361,8 +1363,7 @@ def generate_devicelevel_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: "&& threadIdx.x == 0) " "{ // sub-graph begin", cfg, state.block_id) elif write_scope == 'block': - callsite_stream.write("if (threadIdx.x == 0) " - "{ // sub-graph begin", cfg, state.block_id) + callsite_stream.write("if (threadIdx.x == 0) " "{ // sub-graph begin", cfg, state.block_id) else: callsite_stream.write("{ // subgraph begin", cfg, state.block_id) else: @@ -1985,16 +1986,13 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S # allocating shared memory for dynamic threadblock maps if has_dtbmap: - kernel_stream.write( - '__shared__ dace::' - 'DynamicMap<{fine_grained}, {block_size}>' - '::shared_type dace_dyn_map_shared;'.format( - fine_grained=('true' - if Config.get_bool('compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), - block_size=functools.reduce( - (lambda x, y: x * y), - [int(x) for x in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',')])), cfg, - state_id, node) + self.dynamic_tbmap_type = ( + f'dace::DynamicMap<{"true" if Config.get_bool("compiler", "cuda", "dynamic_map_fine_grained") else "false"}, ' + f'{functools.reduce((lambda x, y: x * y), [int(x) for x in Config.get("compiler", "cuda", "dynamic_map_block_size").split(",")])}>' + '::shared_type') + kernel_stream.write(f'__shared__ {self.dynamic_tbmap_type} dace_dyn_map_shared;', cfg, state_id, node) + else: + self.dynamic_tbmap_type = None # Add extra opening brace (dynamic map ranges, closed in MapExit # generator) @@ -2072,8 +2070,8 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S # Generate conditions for this block's execution using min and max # element, e.g., skipping out-of-bounds threads in trailing block - # unless thsi is handled by another map down the line - if (not has_tbmap and not has_dtbmap and node.map.schedule != dtypes.ScheduleType.GPU_Persistent): + # unless this is handled by another map down the line + if ((not has_tbmap or has_dtbmap) and node.map.schedule != dtypes.ScheduleType.GPU_Persistent): dsym_end = [d + bs - 1 for d, bs in zip(dsym, self._block_dims)] minels = krange.min_element() maxels = krange.max_element() @@ -2090,10 +2088,12 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S condition += '%s < %s' % (v, _topy(maxel + 1)) if len(condition) > 0: self._kernel_grid_conditions.append(f'if ({condition}) {{') - kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) + if not has_dtbmap: + kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) else: self._kernel_grid_conditions.append('{') - kernel_stream.write('{', cfg, state_id, scope_entry) + if not has_dtbmap: + kernel_stream.write('{', cfg, state_id, scope_entry) self._dispatcher.dispatch_subgraph(sdfg, cfg, @@ -2112,6 +2112,7 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S self._kernel_state = None CUDACodeGen._in_device_code = False self._grid_dims = None + self.dynamic_tbmap_type = None def get_next_scope_entries(self, dfg, scope_entry): parent_scope_entry = dfg.entry_node(scope_entry) @@ -2179,10 +2180,8 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco current_sdfg = current_state.parent if not outer_scope: raise ValueError(f'Failed to find the outer scope of {scope_entry}') - callsite_stream.write( - 'if ({} < {}) {{'.format(outer_scope.map.params[0], - _topy(subsets.Range(outer_scope.map.range[::-1]).max_element()[0] + 1)), cfg, - state_id, scope_entry) + for cond in self._kernel_grid_conditions: + callsite_stream.write(cond, cfg, state_id, scope_entry) # NOTE: Dynamic map inputs must be defined both outside and inside the dynamic Map schedule. # They define inside the schedule the bounds of the any nested Maps. @@ -2205,8 +2204,9 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco '__dace_dynmap_begin = {begin};\n' '__dace_dynmap_end = {end};'.format(begin=dynmap_begin, end=dynmap_end), cfg, state_id, scope_entry) - # close if - callsite_stream.write('}', cfg, state_id, scope_entry) + # Close kernel grid conditions + for _ in self._kernel_grid_conditions: + callsite_stream.write('}', cfg, state_id, scope_entry) callsite_stream.write( 'dace::DynamicMap<{fine_grained}, {bsize}>::' @@ -2215,7 +2215,7 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco 'auto {param}) {{'.format(fine_grained=('true' if Config.get_bool( 'compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), bsize=total_block_size, - kmapIdx=outer_scope.map.params[0], + kmapIdx=outer_scope.map.params[-1], param=dynmap_var), cfg, state_id, scope_entry) for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): @@ -2556,8 +2556,8 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco for cond in self._kernel_grid_conditions: callsite_stream.write(cond, cfg, state_id, scope_entry) - def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: if self.node_dispatch_predicate(sdfg, dfg, node): # Dynamically obtain node generator according to class name gen = getattr(self, '_generate_' + type(node).__name__, False) @@ -2594,6 +2594,8 @@ def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) if self.create_grid_barrier: result.append(('cub::GridBarrier&', '__gbar', '__gbar')) + if self.dynamic_tbmap_type: + result.append((f'{self.dynamic_tbmap_type}&', 'dace_dyn_map_shared', 'dace_dyn_map_shared')) # Add data from nested SDFGs to kernel arguments result.extend([(atype, aname, aname) for atype, aname, _ in self.extra_nsdfg_args]) diff --git a/dace/codegen/tools/type_inference.py b/dace/codegen/tools/type_inference.py index 893866522f..8f8dd84151 100644 --- a/dace/codegen/tools/type_inference.py +++ b/dace/codegen/tools/type_inference.py @@ -9,7 +9,7 @@ import numpy as np import ast -from dace import dtypes +from dace import data, dtypes from dace import symbolic from dace.codegen import cppunparse from dace.symbolic import symbol, SymExpr, symstr @@ -286,6 +286,8 @@ def _Name(t, symbols, inferred_symbols): inferred_type = dtypes.typeclass(inferred_type.type) elif isinstance(inferred_type, symbolic.symbol): inferred_type = inferred_type.dtype + elif isinstance(inferred_type, data.Data): + inferred_type = inferred_type.dtype elif t_id in inferred_symbols: inferred_type = inferred_symbols[t_id] return inferred_type diff --git a/dace/dtypes.py b/dace/dtypes.py index c5f9bb4732..a016ac60e2 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1,10 +1,8 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ A module that contains various DaCe type definitions. """ -from __future__ import print_function import ctypes import aenum import inspect -import itertools import numpy import re from collections import OrderedDict diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index cacf15d785..78890c9cdd 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1489,19 +1489,19 @@ def _symbols_from_params(self, params: List[Tuple[str, Union[str, dtypes.typecla else: values = str(val).split(':') if len(values) == 1: - result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.globals, **dyn_inputs})) + result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.defined, **dyn_inputs})) elif len(values) == 2: result[name] = symbolic.symbol( name, dtypes.result_type_of(infer_expr_type(values[0], { - **self.globals, + **self.defined, **dyn_inputs }), infer_expr_type(values[1], { - **self.globals, + **self.defined, **dyn_inputs }))) elif len(values) == 3: - result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.globals, **dyn_inputs})) + result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.defined, **dyn_inputs})) else: raise DaceSyntaxError( self, None, "Invalid number of arguments in a range iterator. " @@ -3258,18 +3258,23 @@ def visit_AnnAssign(self, node: ast.AnnAssign): dtype = astutils.evalnode(node.annotation, {**self.globals, **self.defined}) if isinstance(dtype, data.Data): simple_type = dtype.dtype + storage = dtype.storage else: simple_type = dtype + storage = dtypes.StorageType.Default if not isinstance(simple_type, dtypes.typeclass): raise TypeError except: dtype = None + storage = dtypes.StorageType.Default type_name = rname(node.annotation) warnings.warn('typeclass {} is not supported'.format(type_name)) if node.value is None and dtype is not None: # Annotating type without assignment self.annotated_types[rname(node.target)] = dtype return - self._visit_assign(node, node.target, None, dtype=dtype) + results = self._visit_assign(node, node.target, None, dtype=dtype) + if storage != dtypes.StorageType.Default: + self.sdfg.arrays[results[0][0]].storage = storage def _visit_assign(self, node, node_target, op, dtype=None, is_return=False): # Get targets (elts) and results @@ -3563,6 +3568,8 @@ def _visit_assign(self, node, node_target, op, dtype=None, is_return=False): self.cfg_target.add_edge(self.last_block, output_indirection, dace.sdfg.InterstateEdge()) self.last_block = output_indirection + return results + def visit_AugAssign(self, node: ast.AugAssign): self._visit_assign(node, node.target, augassign_ops[type(node.op).__name__]) @@ -4623,10 +4630,16 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): self._add_state('call_%d' % node.lineno) self.last_block.set_default_lineinfo(self.current_lineinfo) - if found_ufunc: - result = func(self, node, self.sdfg, self.last_block, ufunc_name, args, keywords) - else: - result = func(self, self.sdfg, self.last_block, *args, **keywords) + try: + if found_ufunc: + result = func(self, node, self.sdfg, self.last_block, ufunc_name, args, keywords) + else: + result = func(self, self.sdfg, self.last_block, *args, **keywords) + except DaceSyntaxError as ex: + # Attach source information to exception + if ex.node is None: + ex.node = node + raise self.last_block.set_default_lineinfo(None) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 5e6118a34b..537fef97bf 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -322,24 +322,30 @@ def _numpy_full(pv: ProgramVisitor, is_data = True vtype = sdfg.arrays[fill_value].dtype dtype = dtype or vtype + + # Handle one-dimensional inputs + if isinstance(shape, (Number, str)) or symbolic.issymbolic(shape): + shape = [shape] + + if any(isinstance(s, str) for s in shape): + raise DaceSyntaxError( + pv, None, f'Data-dependent shape {shape} is currently not allowed. Only constants ' + 'and symbolic values can be used.') + name, _ = sdfg.add_temp_transient(shape, dtype) if is_data: state.add_mapped_tasklet( - '_numpy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, dict(__inp=dace.Memlet(data=fill_value, subset='0')), "__out = __inp", dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) else: state.add_mapped_tasklet( - '_numpy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, {}, + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, {}, "__out = {}".format(fill_value), dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) @@ -459,10 +465,8 @@ def _numpy_flip(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, axis inpidx = ','.join([f'__i{i}' for i in range(ndim)]) outidx = ','.join([f'{s} - __i{i} - 1' if a else f'__i{i}' for i, (a, s) in enumerate(zip(axis, desc.shape))]) state.add_mapped_tasklet(name="_numpy_flip_", - map_ranges={ - f'__i{i}': f'0:{s}:1' - for i, s in enumerate(desc.shape) - }, + map_ranges={f'__i{i}': f'0:{s}:1' + for i, s in enumerate(desc.shape)}, inputs={'__inp': Memlet(f'{arr}[{inpidx}]')}, code='__out = __inp', outputs={'__out': Memlet(f'{arr_copy}[{outidx}]')}, @@ -532,10 +536,8 @@ def _numpy_rot90(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, k=1 outidx = ','.join(out_indices) state.add_mapped_tasklet(name="_rot90_", - map_ranges={ - f'__i{i}': f'0:{s}:1' - for i, s in enumerate(desc.shape) - }, + map_ranges={f'__i{i}': f'0:{s}:1' + for i, s in enumerate(desc.shape)}, inputs={'__inp': Memlet(f'{arr}[{inpidx}]')}, code='__out = __inp', outputs={'__out': Memlet(f'{arr_copy}[{outidx}]')}, @@ -644,7 +646,8 @@ def _elementwise(pv: 'ProgramVisitor', else: state.add_mapped_tasklet( name="_elementwise_", - map_ranges={f'__i{dim}': f'0:{N}' for dim, N in enumerate(inparr.shape)}, + map_ranges={f'__i{dim}': f'0:{N}' + for dim, N in enumerate(inparr.shape)}, inputs={'__inp': Memlet.simple(in_array, ','.join([f'__i{dim}' for dim in range(len(inparr.shape))]))}, code=code, outputs={'__out': Memlet.simple(out_array, ','.join([f'__i{dim}' for dim in range(len(inparr.shape))]))}, @@ -694,10 +697,8 @@ def _simple_call(sdfg: SDFG, state: SDFGState, inpname: str, func: str, restype: else: state.add_mapped_tasklet( name=func, - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(inparr.shape) - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(inparr.shape)}, inputs={'__inp': Memlet.simple(inpname, ','.join(['__i%d' % i for i in range(len(inparr.shape))]))}, code='__out = {f}(__inp)'.format(f=func), outputs={'__out': Memlet.simple(outname, ','.join(['__i%d' % i for i in range(len(inparr.shape))]))}, @@ -1046,27 +1047,22 @@ def _argminmax(pv: ProgramVisitor, code = "__init = _val_and_idx(val={}, idx=-1)".format( dtypes.min_value(a_arr.dtype) if func == 'max' else dtypes.max_value(a_arr.dtype)) - nest.add_state().add_mapped_tasklet(name="_arg{}_convert_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) if i != axis - }, - inputs={}, - code=code, - outputs={ - '__init': - Memlet.simple( - reduced_structs, - ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) - }, - external_edges=True) + nest.add_state().add_mapped_tasklet( + name="_arg{}_convert_".format(func), + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape) if i != axis}, + inputs={}, + code=code, + outputs={ + '__init': Memlet.simple(reduced_structs, + ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) + }, + external_edges=True) nest.add_state().add_mapped_tasklet( name="_arg{}_reduce_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape)}, inputs={'__in': Memlet.simple(a, ','.join('__i%d' % i for i in range(len(a_arr.shape))))}, code="__out = _val_and_idx(idx={}, val=__in)".format("__i%d" % axis), outputs={ @@ -1086,10 +1082,8 @@ def _argminmax(pv: ProgramVisitor, nest.add_state().add_mapped_tasklet( name="_arg{}_extract_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) if i != axis - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape) if i != axis}, inputs={ '__in': Memlet.simple(reduced_structs, ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) @@ -1212,10 +1206,9 @@ def _unop(sdfg: SDFG, state: SDFGState, op1: str, opcode: str, opname: str): opcode = 'not' name, _ = sdfg.add_temp_transient(arr1.shape, restype, arr1.storage) - state.add_mapped_tasklet("_%s_" % opname, { - '__i%d' % i: '0:%s' % s - for i, s in enumerate(arr1.shape) - }, {'__in1': Memlet.simple(op1, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, + state.add_mapped_tasklet("_%s_" % opname, {'__i%d' % i: '0:%s' % s + for i, s in enumerate(arr1.shape)}, + {'__in1': Memlet.simple(op1, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, '__out = %s __in1' % opcode, {'__out': Memlet.simple(name, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, external_edges=True) @@ -4316,10 +4309,8 @@ def _ndarray_fill(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, va shape = sdfg.arrays[arr].shape state.add_mapped_tasklet( '_numpy_fill_', - map_ranges={ - f"__i{dim}": f"0:{s}" - for dim, s in enumerate(shape) - }, + map_ranges={f"__i{dim}": f"0:{s}" + for dim, s in enumerate(shape)}, inputs=inputs, code=f"__out = {body}", outputs={'__out': dace.Memlet.simple(arr, ",".join([f"__i{dim}" for dim in range(len(shape))]))}, @@ -4544,6 +4535,13 @@ def _ndarray_astype(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, return _datatype_converter(sdfg, state, arr, dtype)[0] +@oprepo.replaces_operator('Array', 'MatMult', otherclass='StorageType') +def _cast_storage(visitor: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, arr: str, stype: dace.StorageType) -> str: + desc = sdfg.arrays[arr] + desc.storage = stype + return arr + + # Replacements that need ufuncs ############################################### # TODO: Fix by separating to different modules and importing @@ -4747,13 +4745,7 @@ def _tensordot(pv: 'ProgramVisitor', @oprepo.replaces("cupy._core.core.ndarray") @oprepo.replaces("cupy.ndarray") -def _define_cupy_local( - pv: "ProgramVisitor", - sdfg: SDFG, - state: SDFGState, - shape: Shape, - dtype: typeclass -): +def _define_cupy_local(pv: "ProgramVisitor", sdfg: SDFG, state: SDFGState, shape: Shape, dtype: typeclass): """Defines a local array in a DaCe program.""" if not isinstance(shape, (list, tuple)): shape = [shape] @@ -4781,10 +4773,8 @@ def _cupy_full(pv: ProgramVisitor, name, _ = sdfg.add_temp_transient(shape, dtype, storage=dtypes.StorageType.GPU_Global) state.add_mapped_tasklet( - '_cupy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, {}, + '_cupy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, {}, "__out = {}".format(fill_value), dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) diff --git a/dace/sdfg/infer_types.py b/dace/sdfg/infer_types.py index cf58cf76cc..97010e95a7 100644 --- a/dace/sdfg/infer_types.py +++ b/dace/sdfg/infer_types.py @@ -116,8 +116,7 @@ def infer_connector_types(sdfg: SDFG): for e in state.out_edges(node): cname = e.src_conn if cname and node.out_connectors[cname] is None: - raise TypeError('Ambiguous or uninferable type in' - ' connector "%s" of node "%s"' % (cname, node)) + raise TypeError('Ambiguous or uninferable type in' ' connector "%s" of node "%s"' % (cname, node)) ############################################################################# @@ -301,6 +300,12 @@ def _set_default_schedule_in_scope(state: SDFGState, else: child_schedule = _determine_child_schedule(parent_schedules) + # Special case for dynamic thread-block neighboring schedules + if child_schedule == dtypes.ScheduleType.GPU_ThreadBlock: + from dace.transformation.helpers import gpu_map_has_explicit_dyn_threadblocks # Avoid import loops + if gpu_map_has_explicit_dyn_threadblocks(state, parent_node): + child_schedule = dtypes.ScheduleType.GPU_ThreadBlock_Dynamic + # Set child schedule type in scope for node in child_nodes[parent_node]: # Set default schedule types @@ -393,6 +398,7 @@ def _get_storage_from_parent(data_name: str, sdfg: SDFG) -> dtypes.StorageType: raise ValueError(f'Could not find data descriptor {data_name} in parent SDFG') + def infer_aliasing(node: nodes.NestedSDFG, sdfg: SDFG, state: SDFGState) -> None: """ Infers aliasing information on nested SDFG arrays based on external edges and connectors. diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 38a41236a6..f25a6e24d5 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -761,13 +761,13 @@ def add_symbol(self, name, stype, find_new_name: bool = False): if name in self.symbols: raise FileExistsError(f'Symbol "{name}" already exists in SDFG') if name in self.arrays: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a data descriptor.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a data descriptor.') if name in self._subarrays: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a subarray.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a subarray.') if name in self._rdistrarrays: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a RedistrArray.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a RedistrArray.') if name in self._pgrids: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a ProcessGrid.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a ProcessGrid.') if not isinstance(stype, dtypes.typeclass): stype = dtypes.dtype_to_typeclass(stype) self.symbols[name] = stype diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index f02a5003e9..2df9e17445 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -396,7 +396,6 @@ def validate_state(state: 'dace.sdfg.SDFGState', symbols = symbols or {} initialized_transients = (initialized_transients if initialized_transients is not None else {'__pystate'}) references = references or set() - scope = state.scope_dict() # Obtain whether we are already in an accelerator context if not hasattr(context, 'in_gpu'): @@ -426,6 +425,8 @@ def validate_state(state: 'dace.sdfg.SDFGState', if state.has_cycles(): raise InvalidSDFGError('State should be acyclic but contains cycles', sdfg, state_id) + scope = state.scope_dict() + for nid, node in enumerate(state.nodes()): # Reference check if id(node) in references: diff --git a/dace/transformation/dataflow/warp_tiling.py b/dace/transformation/dataflow/warp_tiling.py index 362b51d9ac..f9091950e3 100644 --- a/dace/transformation/dataflow/warp_tiling.py +++ b/dace/transformation/dataflow/warp_tiling.py @@ -55,6 +55,10 @@ def apply(self, graph: SDFGState, sdfg: SDFG) -> nodes.MapEntry: # Stride and offset all internal maps maps_to_stride = xfh.get_internal_scopes(graph, new_me, immediate=True) for nstate, nmap in maps_to_stride: + # Skip sequential maps + if nmap.schedule == dtypes.ScheduleType.Sequential: + continue + nsdfg = nstate.parent nsdfg_node = nsdfg.parent_nsdfg_node diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index 6ca4602079..b7bf49e62b 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -934,11 +934,7 @@ def replicate_scope(sdfg: SDFG, state: SDFGState, scope: ScopeSubgraphView) -> S return ScopeSubgraphView(state, new_nodes, new_entry) -def offset_map(state: SDFGState, - entry: nodes.MapEntry, - dim: int, - offset: symbolic.SymbolicType, - negative: bool = True): +def offset_map(state: SDFGState, entry: nodes.MapEntry, dim: int, offset: symbolic.SymbolicType, negative: bool = True): """ Offsets a map parameter and its contents by a value. @@ -1270,6 +1266,17 @@ def gpu_map_has_explicit_threadblocks(state: SDFGState, entry: nodes.EntryNode) return False +def gpu_map_has_explicit_dyn_threadblocks(state: SDFGState, entry: nodes.EntryNode) -> bool: + """ + Returns True if GPU_Device map has explicit thread-block maps nested within. + """ + internal_maps = get_internal_scopes(state, entry) + if any(m.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic for _, m in internal_maps): + return True + + return False + + def reconnect_edge_through_map( state: SDFGState, edge: graph.MultiConnectorEdge[Memlet], new_node: Union[nodes.EntryNode, nodes.ExitNode], keep_src: bool) -> Tuple[graph.MultiConnectorEdge[Memlet], graph.MultiConnectorEdge[Memlet]]: diff --git a/tests/dynamic_tb_map_cudatest.py b/tests/dynamic_tb_map_cudatest.py index b24e5f2ea6..edc1eac9f2 100644 --- a/tests/dynamic_tb_map_cudatest.py +++ b/tests/dynamic_tb_map_cudatest.py @@ -12,10 +12,8 @@ @dace.program(dace.uint32[H + 1], dace.uint32[nnz], dace.float32[nnz], dace.float32[W], dace.float32[H]) def spmv(A_row, A_col, A_val, x, b): - @dace.mapscope(_[0:H]) def compute_row(i): - @dace.map(_[A_row[i]:A_row[i + 1]]) def compute(j): a << A_val[j] @@ -292,8 +290,76 @@ def sddvm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i assert np.allclose(val, ref.data) +@pytest.mark.gpu +def test_dynamic_multidim_map(): + @dace.program + def tester(a: dace.float32[H, W, nnz]): + A = dace.ndarray([H, W, nnz], dtype=dace.float32, storage=dace.StorageType.GPU_Global) + A[:] = a + for i, j in dace.map[0:H, 0:W] @ dace.ScheduleType.GPU_Device: + for k in dace.map[0:nnz] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic: + A[i, j, k] = i * 110 + j * 11 + k + a[:] = A + + a = np.zeros((10, 11, 65), dtype=np.float32) + tester(a) + assert np.allclose(a, np.fromfunction(lambda i, j, k: i * 110 + j * 11 + k, (10, 11, 65), dtype=np.float32)) + + +@pytest.mark.skip('Nested maps with work-stealing thread-block schedule are currently unsupported') +def test_dynamic_nested_map(): + @dace.program + def nested2(A: dace.float32[W], i: dace.int32, j: dace.int32): + A[j] = i * 10 + j + + @dace.program + def nested1(A: dace.float32[W], i: dace.int32): + for j in dace.map[0:W] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic: + nested2(A, i, j) + + @dace.program + def dynamic_nested_map(a: dace.float32[H, W]): + A = dace.ndarray([H, W], dtype=dace.float32, storage=dace.StorageType.GPU_Global) + A[:] = a + for i in dace.map[0:H] @ dace.ScheduleType.GPU_Device: + nested1(A[i], i) + + a[:] = A + + a = np.zeros((10, 11), dtype=np.float32) + sdfg = dynamic_nested_map.to_sdfg(simplify=False) + for _, _, arr in sdfg.arrays_recursive(): + if arr.storage in (dace.StorageType.GPU_Shared, dace.StorageType.Default): + arr.storage = dace.StorageType.Register + sdfg(a, H=10, W=11) + assert np.allclose(a, np.fromfunction(lambda i, j: i * 10 + j, (10, 11), dtype=np.float32)) + + +@pytest.mark.gpu +def test_dynamic_default_schedule(): + N = dace.symbol('N') + + @dace.program + def tester(a: dace.float32[N, 10]): + A = dace.ndarray([N, 10], dtype=dace.float32, storage=dace.StorageType.GPU_Global) + A[:] = a + for i in dace.map[0:N] @ dace.ScheduleType.GPU_Device: + smem = np.empty((10, ), dtype=np.float32) @ dace.StorageType.GPU_Shared + smem[:] = 1 + for j in dace.map[0:10] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic: + A[i, j] = i * 65 + smem[j] + a[:] = A + + a = np.zeros((65, 10), dtype=np.float32) + tester(a) + assert np.allclose(a, np.fromfunction(lambda i, j: i * 65 + 1, (65, 10), dtype=np.float32)) + + if __name__ == '__main__': test_dynamic_map() test_dynamic_maps() test_nested_dynamic_map() test_dynamic_map_with_step() + test_dynamic_multidim_map() + # test_dynamic_nested_map() + test_dynamic_default_schedule() diff --git a/tests/numpy/array_creation_test.py b/tests/numpy/array_creation_test.py index 85908c7a1f..7329b48b3f 100644 --- a/tests/numpy/array_creation_test.py +++ b/tests/numpy/array_creation_test.py @@ -1,7 +1,9 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import dace +from dace.frontend.python.common import DaceSyntaxError import numpy as np from common import compare_numpy_output +import pytest # M = dace.symbol('M') # N = dace.symbol('N') @@ -154,7 +156,7 @@ def test_arange_6(): def program_strides_0(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(2, 1)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -168,7 +170,7 @@ def test_strides_0(): def program_strides_1(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(4, 2)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -182,7 +184,7 @@ def test_strides_1(): def program_strides_2(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(1, 2)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -196,7 +198,7 @@ def test_strides_2(): def program_strides_3(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(2, 4)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -206,6 +208,42 @@ def test_strides_3(): assert np.allclose(A, [[0, 1], [2, 3]]) +def test_zeros_symbolic_size_scalar(): + K = dace.symbol('K') + + @dace.program + def zeros_symbolic_size(): + return np.zeros((K), dtype=np.uint32) + + out = zeros_symbolic_size(K=10) + assert (list(out.shape) == [10]) + assert (out.dtype == np.uint32) + + +def test_ones_scalar_size_scalar(): + + @dace.program + def ones_scalar_size(k: dace.int32): + a = np.ones(k, dtype=np.uint32) + return np.sum(a) + + with pytest.raises(DaceSyntaxError): + out = ones_scalar_size(20) + assert out == 20 + + +def test_ones_scalar_size(): + + @dace.program + def ones_scalar_size(k: dace.int32): + a = np.ones((k, k), dtype=np.uint32) + return np.sum(a) + + with pytest.raises(DaceSyntaxError): + out = ones_scalar_size(20) + assert out == 20 * 20 + + if __name__ == "__main__": test_empty() test_empty_like1() @@ -233,3 +271,6 @@ def test_strides_3(): test_strides_1() test_strides_2() test_strides_3() + test_zeros_symbolic_size_scalar() + test_ones_scalar_size_scalar() + test_ones_scalar_size() diff --git a/tests/numpy/map_syntax_test.py b/tests/numpy/map_syntax_test.py index fe7af1d644..27a0cfe018 100644 --- a/tests/numpy/map_syntax_test.py +++ b/tests/numpy/map_syntax_test.py @@ -1,6 +1,7 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import numpy as np import dace +import pytest M, N, K = (dace.symbol(name) for name in ['M', 'N', 'K']) @@ -35,6 +36,57 @@ def test_map_python(): assert np.allclose(A[:, 1:], B[:, 1:]) +@pytest.mark.skip('Fails due to bug in Python frontend') +def test_nested_map_with_indirection(): + N = dace.symbol('N') + + @dace.program + def indirect_to_indirect(arr1: dace.float64[N], ind: dace.int32[10], arr2: dace.float64[N]): + for i in dace.map[0:9]: + begin, end, stride = ind[i], ind[i + 1], 1 + for _ in dace.map[0:1]: + for j in dace.map[begin:end:stride]: + arr2[j] = arr1[j] + i + + a = np.random.rand(50) + b = np.zeros(50) + ind = np.array([0, 5, 10, 15, 20, 25, 30, 35, 40, 45], dtype=np.int32) + sdfg = indirect_to_indirect.to_sdfg(simplify=False) + sdfg(a, ind, b) + + ref = np.zeros(50) + for i in range(9): + begin, end = ind[i], ind[i + 1] + ref[begin:end] = a[begin:end] + i + + assert np.allclose(b, ref) + + +@pytest.mark.skip('Fails due to bug in Python frontend') +def test_dynamic_map_range_scalar(): + """ + From issue #650. + """ + + @dace.program + def test(A: dace.float64[20], B: dace.float64[20]): + N = dace.define_local_scalar(dace.int32) + N = 5 + for i in dace.map[0:N]: + for j in dace.map[0:N]: + with dace.tasklet: + a << A[i] + b >> B[j] + b = a + 1 + + A = np.random.rand(20) + B = np.zeros(20) + test(A, B) + assert np.allclose(B[:5], A[:5] + 1) + + if __name__ == '__main__': test_copy3d() test_map_python() + # test_nested_map_with_indirection() + # test_dynamic_map_range_scalar() diff --git a/tests/python_frontend/device_annotations_test.py b/tests/python_frontend/device_annotations_test.py index 65c8501b23..d6b512f00b 100644 --- a/tests/python_frontend/device_annotations_test.py +++ b/tests/python_frontend/device_annotations_test.py @@ -1,16 +1,19 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. import dace import pytest +import numpy as np from dace.dtypes import StorageType, DeviceType, ScheduleType from dace import dtypes -cupy = pytest.importorskip("cupy") +try: + import cupy +except (ImportError, ModuleNotFoundError): + cupy = None @pytest.mark.gpu def test_storage(): - @dace.program def add(X: dace.float32[32, 32] @ StorageType.GPU_Global): return X + 1 @@ -46,7 +49,6 @@ def add2(X: dace.float32[32, 32] @ StorageType.GPU_Global): @pytest.mark.gpu def test_pythonmode(): - def runs_on_gpu(a: dace.float64[20] @ StorageType.GPU_Global, b: dace.float64[20] @ StorageType.GPU_Global): # This map will become a GPU kernel for i in dace.map[0:20] @ ScheduleType.GPU_Device: @@ -58,7 +60,40 @@ def runs_on_gpu(a: dace.float64[20] @ StorageType.GPU_Global, b: dace.float64[20 assert cupy.allclose(gpu_b, gpu_a + 1) +def test_inline_storage_hint(): + N = dace.symbol('N') + + @dace.program + def tester(): + b = np.ones(N, dtype=np.float32) @ dace.StorageType.CPU_ThreadLocal + return b + 1 + + sdfg = tester.to_sdfg(simplify=False) + assert sdfg.arrays['b'].storage == StorageType.CPU_ThreadLocal + + b = tester(N=10) + assert np.allclose(b, 2) + + +def test_annotated_storage_hint(): + N = dace.symbol('N') + + @dace.program + def tester(): + b: dace.float32[N] @ dace.StorageType.CPU_ThreadLocal = np.ones(N, dtype=np.float32) + return b + 1 + + sdfg = tester.to_sdfg(simplify=False) + assert sdfg.arrays['b'].storage == StorageType.CPU_ThreadLocal + + b = tester(N=10) + assert np.allclose(b, 2) + + if __name__ == "__main__": - test_storage() - test_schedule() - test_pythonmode() + if cupy is not None: + test_storage() + test_schedule() + test_pythonmode() + test_inline_storage_hint() + test_annotated_storage_hint() diff --git a/tests/sdfg/cycles_test.py b/tests/sdfg/cycles_test.py index 480392ab2d..b01aec55fd 100644 --- a/tests/sdfg/cycles_test.py +++ b/tests/sdfg/cycles_test.py @@ -2,7 +2,7 @@ import pytest import dace - +from dace.sdfg.validation import InvalidSDFGError def test_cycles(): with pytest.raises(ValueError, match="Found cycles.*"): @@ -29,6 +29,23 @@ def test_cycles_memlet_path(): sdfg.validate() +def test_cycles_1562(): + """ + Test for issue #1562. + """ + with pytest.raises(InvalidSDFGError, match="cycles"): + sdfg = dace.SDFG("foo") + state = sdfg.add_state() + mentry_2, mexit_2 = state.add_map("map_2", dict(i="0:9")) + mentry_6, mexit_6 = state.add_map("map_6", dict(i="0:9")) + mentry_8, mexit_8 = state.add_map("map_8", dict(i="0:9")) + state.add_edge(mentry_8, "OUT_0", mentry_6, "IN_0", dace.Memlet(data="bla", subset='0:9')) + state.add_edge(mentry_6, "OUT_0", mentry_2, "IN_0", dace.Memlet(data="bla", subset='0:9')) + state.add_edge(mentry_2, "OUT_0", mentry_6, "IN_0", dace.Memlet(data="bla", subset='0:9')) + sdfg.validate() + + if __name__ == '__main__': test_cycles() test_cycles_memlet_path() + test_cycles_1562()