diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index b31834ae61..88d2f97947 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -2433,6 +2433,7 @@ def visit_Continue(self, node: ast.Continue): def visit_If(self, node: ast.If): # Add a guard state self._add_state('if_guard') + self.last_state.debuginfo = self.current_lineinfo # Generate conditions cond, cond_else = self._visit_test(node.test) diff --git a/dace/sdfg/analysis/cfg.py b/dace/sdfg/analysis/cfg.py index 20b61446df..f2926e7bc3 100644 --- a/dace/sdfg/analysis/cfg.py +++ b/dace/sdfg/analysis/cfg.py @@ -219,14 +219,13 @@ def _stateorder_topological_sort(sdfg: SDFG, """ # Traverse states in custom order visited = visited or set() - if stop is not None: - visited.add(stop) stack = [start] while stack: node = stack.pop() - if node in visited: + if node in visited or node is stop: continue yield node + visited.add(node) oe = sdfg.out_edges(node) if len(oe) == 0: # End state @@ -265,6 +264,9 @@ def _stateorder_topological_sort(sdfg: SDFG, mergestate = stop for branch in oe: + if branch.dst is mergestate: + # If we hit the merge state (if without else), defer to end of branch traversal + continue for s in _stateorder_topological_sort(sdfg, branch.dst, ptree, @@ -273,8 +275,7 @@ def _stateorder_topological_sort(sdfg: SDFG, visited=visited): yield s visited.add(s) - if mergestate != stop: - stack.append(mergestate) + stack.append(mergestate) def stateorder_topological_sort(sdfg: SDFG) -> Iterator[SDFGState]: diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 82ef220cbe..471296cd42 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -196,15 +196,22 @@ def condition_sympy(self): self._cond_sympy = symbolic.pystr_to_symbolic(self.condition.as_string) return self._cond_sympy - @property - def free_symbols(self) -> Set[str]: - """ Returns a set of symbols used in this edge's properties. """ + def read_symbols(self) -> Set[str]: + """ + Returns a set of symbols read in this edge (including symbols in the condition and assignment values). + """ # Symbols in conditions and assignments result = set(map(str, dace.symbolic.symbols_in_ast(self.condition.code[0]))) for assign in self.assignments.values(): result |= symbolic.free_symbols_and_functions(assign) - return result - set(self.assignments.keys()) + return result + + @property + def free_symbols(self) -> Set[str]: + """ Returns a set of symbols used in this edge's properties. """ + return self.read_symbols() - set(self.assignments.keys()) + def replace_dict(self, repl: Dict[str, str], replace_keys=True) -> None: """ diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 2ceade4b5c..8bf8f870eb 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -141,7 +141,9 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None): undef_syms = set(edge.data.free_symbols) - set(symbols.keys()) if len(undef_syms) > 0: eid = sdfg.edge_id(edge) - raise InvalidSDFGInterstateEdgeError("Undefined symbols in edge: %s" % undef_syms, sdfg, eid) + raise InvalidSDFGInterstateEdgeError( + f'Undefined symbols in edge: {undef_syms}. 
Add those with ' + '`sdfg.add_symbol()` or define outside with `dace.symbol()`', sdfg, eid) # Validate inter-state edge names issyms = edge.data.new_symbols(sdfg, symbols) @@ -231,8 +233,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', raise InvalidSDFGError("Invalid state name", sdfg, state_id) if state._parent != sdfg: - raise InvalidSDFGError("State does not point to the correct " - "parent", sdfg, state_id) + raise InvalidSDFGError("State does not point to the correct " "parent", sdfg, state_id) # Unreachable ######################################## @@ -618,7 +619,6 @@ def validate_state(state: 'dace.sdfg.SDFGState', class InvalidSDFGError(Exception): """ A class of exceptions thrown when SDFG validation fails. """ - def __init__(self, message: str, sdfg: 'SDFG', state_id: int): self.message = message self.sdfg = sdfg @@ -641,8 +641,7 @@ def _getlineinfo(self, obj) -> str: if lineinfo.start_line >= 0: if lineinfo.start_column > 0: - return (f'File "{lineinfo.filename}", line {lineinfo.start_line}, ' - f'column {lineinfo.start_column}') + return (f'File "{lineinfo.filename}", line {lineinfo.start_line}, ' f'column {lineinfo.start_column}') return f'File "{lineinfo.filename}", line {lineinfo.start_line}' return f'File "{lineinfo.filename}"' @@ -670,7 +669,6 @@ def __str__(self): class InvalidSDFGInterstateEdgeError(InvalidSDFGError): """ Exceptions of invalid inter-state edges in an SDFG. """ - def __init__(self, message: str, sdfg: 'SDFG', edge_id: int): self.message = message self.sdfg = sdfg @@ -687,15 +685,31 @@ def __str__(self): str(e.src), str(e.dst), ) + locinfo_src = self._getlineinfo(e.src) + locinfo_dst = self._getlineinfo(e.dst) + else: + edgestr = '' + locinfo_src = locinfo_dst = '' + + if locinfo_src or locinfo_dst: + if locinfo_src == locinfo_dst: + locinfo = f'at {locinfo_src}' + elif locinfo_src and not locinfo_dst: + locinfo = f'at {locinfo_src}' + elif locinfo_dst and not locinfo_src: + locinfo = f'at {locinfo_dst}' + else: + locinfo = f'between\n {locinfo_src}\n and\n {locinfo_dst}' + + locinfo = f'\nOriginating from source code {locinfo}' else: - edgestr = "" + locinfo = '' - return "%s%s" % (self.message, edgestr) + return f'{self.message}{edgestr}{locinfo}' class InvalidSDFGNodeError(InvalidSDFGError): """ Exceptions of invalid nodes in an SDFG state. """ - def __init__(self, message: str, sdfg: 'SDFG', state_id: int, node_id: int): self.message = message self.sdfg = sdfg @@ -729,14 +743,12 @@ class NodeNotExpandedError(InvalidSDFGNodeError): Exception that is raised whenever a library node was not expanded before code generation. """ - def __init__(self, sdfg: 'SDFG', state_id: int, node_id: int): super().__init__('Library node not expanded', sdfg, state_id, node_id) class InvalidSDFGEdgeError(InvalidSDFGError): """ Exceptions of invalid edges in an SDFG state.
""" - def __init__(self, message: str, sdfg: 'SDFG', state_id: int, edge_id: int): self.message = message self.sdfg = sdfg diff --git a/dace/transformation/dataflow/map_fission.py b/dace/transformation/dataflow/map_fission.py index 0e2dd868d5..9021e29203 100644 --- a/dace/transformation/dataflow/map_fission.py +++ b/dace/transformation/dataflow/map_fission.py @@ -8,7 +8,7 @@ from dace.sdfg import nodes, graph as gr from dace.sdfg import utils as sdutil from dace.sdfg.graph import OrderedDiGraph -from dace.sdfg.propagation import propagate_memlets_state +from dace.sdfg.propagation import propagate_memlets_state, propagate_subset from dace.symbolic import pystr_to_symbolic from dace.transformation import transformation, helpers from typing import List, Optional, Tuple @@ -413,6 +413,14 @@ def apply(self, graph: sd.SDFGState, sdfg: sd.SDFG): # Correct connectors and memlets in nested SDFGs to account for # missing outside map if self.expr_index == 1: + + # NOTE: In the following scope dictionary, we mark the new MapEntries as existing in their own scope. + # This makes it easier to detect edges that are outside the new Map scopes (after MapFission). + scope_dict = state.scope_dict() + for k, v in scope_dict.items(): + if isinstance(k, nodes.MapEntry) and k in new_map_entries and v is None: + scope_dict[k] = k + to_correct = ([(e, e.src) for e in external_edges_entry] + [(e, e.dst) for e in external_edges_exit]) corrected_nodes = set() for edge, node in to_correct: @@ -442,6 +450,12 @@ def apply(self, graph: sd.SDFGState, sdfg: sd.SDFG): for e in state.memlet_tree(internal_edge): e.data.subset.offset(desc.offset, False) e.data.subset = helpers.unsqueeze_memlet(e.data, outer_edge.data).subset + # NOTE: If the edge is outside of the new Map scope, then try to propagate it. This is + # needed for edges directly connecting AccessNodes, because the standard memlet + # propagation will stop at the first AccessNode outside the Map scope. For example, see + # `test.transformations.mapfission_test.MapFissionTest.test_array_copy_outside_scope`. + if not (scope_dict[e.src] and scope_dict[e.dst]): + e.data = propagate_subset([e.data], desc, outer_map.params, outer_map.range) # Only after offsetting memlets we can modify the # overall offset @@ -455,9 +469,21 @@ def apply(self, graph: sd.SDFGState, sdfg: sd.SDFG): for edge in state.all_edges(node): for e in state.memlet_tree(edge): # Prepend map dimensions to memlet - e.data.subset = subsets.Range([(pystr_to_symbolic(d) - r[0], pystr_to_symbolic(d) - r[0], 1) - for d, r in zip(outer_map.params, outer_map.range)] + - e.data.subset.ranges) + # NOTE: Do this only for the subset corresponding to `node.data`. If the edge is copying + # to/from another AccessNode, the other data may not need extra dimensions. For example, see + # `test.transformations.mapfission_test.MapFissionTest.test_array_copy_outside_scope`. 
+ if e.data.data == node.data: + if e.data.subset: + e.data.subset = subsets.Range([(pystr_to_symbolic(d) - r[0], + pystr_to_symbolic(d) - r[0], 1) + for d, r in zip(outer_map.params, outer_map.range)] + + e.data.subset.ranges) + else: + if e.data.other_subset: + e.data.other_subset = subsets.Range( + [(pystr_to_symbolic(d) - r[0], pystr_to_symbolic(d) - r[0], 1) + for d, r in zip(outer_map.params, outer_map.range)] + + e.data.other_subset.ranges) # If nested SDFG, reconnect nodes around map and modify memlets if self.expr_index == 1: @@ -486,3 +512,7 @@ def apply(self, graph: sd.SDFGState, sdfg: sd.SDFG): # Remove outer map graph.remove_nodes_from([map_entry, map_exit]) + + # NOTE: It is better to manually call memlet propagation here to ensure that all subsets are properly updated. + # This can solve issues when, e.g., applying MapFission through `SDFG.apply_transformations_repeated`. + propagate_memlets_state(sdfg, graph) diff --git a/dace/transformation/interstate/gpu_transform_sdfg.py b/dace/transformation/interstate/gpu_transform_sdfg.py index 3a47c74c9a..a6389da8d9 100644 --- a/dace/transformation/interstate/gpu_transform_sdfg.py +++ b/dace/transformation/interstate/gpu_transform_sdfg.py @@ -32,7 +32,7 @@ def _recursive_out_check(node, state, gpu_scalars): scalset = scalset.union(sset) scalout = scalout and ssout continue - if desc.shape == (1,): # Pseudo-scalar + if desc.shape == (1, ): # Pseudo-scalar scalout = False sset, ssout = _recursive_out_check(last_edge.dst, state, gpu_scalars) scalset = scalset.union(sset) @@ -66,7 +66,7 @@ def _recursive_in_check(node, state, gpu_scalars): scalset = scalset.union(sset) scalout = scalout and ssout continue - if desc.shape == (1,): # Pseudo-scalar + if desc.shape == (1, ): # Pseudo-scalar scalout = False sset, ssout = _recursive_in_check(last_edge.src, state, gpu_scalars) scalset = scalset.union(sset) @@ -81,10 +81,6 @@ def _recursive_in_check(node, state, gpu_scalars): return scalset, scalout -def _codenode_condition(node): - return isinstance(node, (nodes.LibraryNode, nodes.NestedSDFG)) and node.schedule == dtypes.ScheduleType.GPU_Default - - @make_properties class GPUTransformSDFG(transformation.MultiStateTransformation): """ Implements the GPUTransformSDFG transformation. @@ -305,33 +301,63 @@ def apply(self, _, sdfg: sd.SDFG): ####################################################### # Step 5: Collect free tasklets and check for scalars that have to be moved to the GPU + # Also recursively call GPUTransformSDFG on NestedSDFGs that have GPU device schedule but are not actually + # inside a GPU kernel. gpu_scalars = {} + nsdfgs = [] changed = True # Iterates over Tasklets that not inside a GPU kernel. Such Tasklets must be moved inside a GPU kernel only # if they write to GPU memory. The check takes into account the fact that GPU kernels can read host-based # Scalars, but cannot write to them. while changed: changed = False - for node, state in sdfg.all_nodes_recursive(): - if isinstance(node, nodes.Tasklet): - if node in global_code_nodes[state]: - continue - if state.entry_node(node) is None and not scope.is_devicelevel_gpu_kernel( - state.parent, state, node): - scalars, scalar_output = _recursive_out_check(node, state, gpu_scalars) - sset, ssout = _recursive_in_check(node, state, gpu_scalars) - scalars = scalars.union(sset) - scalar_output = scalar_output and ssout - csdfg = state.parent - # If the tasklet is not adjacent only to scalars or it is in a GPU scope. 
- # The latter includes NestedSDFGs that have a GPU-Device schedule but are not in a GPU kernel. - if (not scalar_output - or (csdfg.parent is not None - and csdfg.parent_nsdfg_node.schedule == dtypes.ScheduleType.GPU_Default)): - global_code_nodes[state].append(node) - gpu_scalars.update({k: None for k in scalars}) - changed = True + for state in sdfg.states(): + for node in state.nodes(): + # Handle NestedSDFGs later. + if isinstance(node, nodes.NestedSDFG): + if state.entry_node(node) is None and not scope.is_devicelevel_gpu_kernel( + state.parent, state, node): + nsdfgs.append((node, state)) + elif isinstance(node, nodes.Tasklet): + if node in global_code_nodes[state]: + continue + if state.entry_node(node) is None and not scope.is_devicelevel_gpu_kernel( + state.parent, state, node): + scalars, scalar_output = _recursive_out_check(node, state, gpu_scalars) + sset, ssout = _recursive_in_check(node, state, gpu_scalars) + scalars = scalars.union(sset) + scalar_output = scalar_output and ssout + csdfg = state.parent + # If the tasklet is not adjacent only to scalars or it is in a GPU scope. + # The latter includes NestedSDFGs that have a GPU-Device schedule but are not in a GPU kernel. + if (not scalar_output + or (csdfg.parent is not None + and csdfg.parent_nsdfg_node.schedule == dtypes.ScheduleType.GPU_Default)): + global_code_nodes[state].append(node) + gpu_scalars.update({k: None for k in scalars}) + changed = True + + # Apply GPUTransformSDFG recursively to NestedSDFGs. + for node, state in nsdfgs: + excl_copyin = set() + for e in state.in_edges(node): + src = state.memlet_path(e)[0].src + if isinstance(src, nodes.AccessNode) and sdfg.arrays[src.data].storage in gpu_storage: + excl_copyin.add(e.dst_conn) + node.sdfg.arrays[e.dst_conn].storage = sdfg.arrays[src.data].storage + excl_copyout = set() + for e in state.out_edges(node): + dst = state.memlet_path(e)[-1].dst + if isinstance(dst, nodes.AccessNode) and sdfg.arrays[dst.data].storage in gpu_storage: + excl_copyout.add(e.src_conn) + node.sdfg.arrays[e.src_conn].storage = sdfg.arrays[dst.data].storage + # TODO: Do we want to copy here the options from the top-level SDFG? + node.sdfg.apply_transformations( + GPUTransformSDFG, { + 'exclude_copyin': ','.join([str(n) for n in excl_copyin]), + 'exclude_copyout': ','.join([str(n) for n in excl_copyout]) + }) ####################################################### # Step 6: Modify transient data storage @@ -350,26 +376,9 @@ def apply(self, _, sdfg: sd.SDFG): if sdict[node] is None and nodedesc.storage not in gpu_storage: - # Ensure that scalars not already GPU-marked are actually used in a GPU scope. + # Scalars were already checked. 
if isinstance(nodedesc, data.Scalar) and not node.data in gpu_scalars: - used_in_gpu_scope = False - for e in state.in_edges(node): - if _codenode_condition(state.memlet_path(e)[0].src): - used_in_gpu_scope = True - break - if not used_in_gpu_scope: - for e in state.out_edges(node): - if _codenode_condition(state.memlet_path(e)[-1].dst): - used_in_gpu_scope = True - break - if not used_in_gpu_scope: - continue - for e in state.all_edges(node): - for node in (e.src, e.dst): - if isinstance(node, nodes.Tasklet): - if (state.entry_node(node) is None and not scope.is_devicelevel_gpu( - state.parent, state, node, with_gpu_default=True)): - global_code_nodes[state].append(node) + continue # NOTE: the cloned arrays match too but it's the same storage so we don't care nodedesc.storage = dtypes.StorageType.GPU_Global @@ -470,5 +479,5 @@ def apply(self, _, sdfg: sd.SDFG): # Step 9: Simplify if not self.simplify: return - + sdfg.simplify() diff --git a/dace/transformation/interstate/loop_to_map.py b/dace/transformation/interstate/loop_to_map.py index 8f1bfe5d39..ac9dfeb920 100644 --- a/dace/transformation/interstate/loop_to_map.py +++ b/dace/transformation/interstate/loop_to_map.py @@ -114,25 +114,41 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi if symbolic.contains_sympy_functions(expr): return False + in_order_states = list(cfg.stateorder_topological_sort(sdfg)) + loop_begin_idx = in_order_states.index(begin) + loop_end_idx = in_order_states.index(body_end) + + if loop_end_idx < loop_begin_idx: # Malformed loop + return False + # Find all loop-body states - states = set() - to_visit = [begin] - while to_visit: - state = to_visit.pop(0) - for _, dst, _ in sdfg.out_edges(state): - if dst not in states and dst is not guard: - to_visit.append(dst) - states.add(state) + states: List[SDFGState] = in_order_states[loop_begin_idx:loop_end_idx + 1] assert (body_end in states) - write_set = set() + write_set: Set[str] = set() for state in states: _, wset = state.read_and_write_sets() write_set |= wset + # Collect symbol reads and writes from inter-state assignments + symbols_that_may_be_used: Set[str] = {itervar} + used_before_assignment: Set[str] = set() + for state in states: + for e in sdfg.out_edges(state): + # Collect read-before-assigned symbols (this works because the states are always in order, + # see above call to `stateorder_topological_sort`) + read_symbols = e.data.read_symbols() + read_symbols -= symbols_that_may_be_used + used_before_assignment |= read_symbols + # If symbol was read before it is assigned, the loop cannot be parallel + if e.data.assignments.keys() & used_before_assignment: + return False + + symbols_that_may_be_used |= e.data.assignments.keys() + # Get access nodes from other states to isolate local loop variables - other_access_nodes = set() + other_access_nodes: Set[str] = set() for state in sdfg.nodes(): if state in states: continue @@ -141,7 +157,7 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi for state in states: other_access_nodes |= set(n.data for n in state.data_nodes() if not sdfg.arrays[n.data].transient) - write_memlets = defaultdict(list) + write_memlets: Dict[str, List[memlet.Memlet]] = defaultdict(list) itersym = symbolic.pystr_to_symbolic(itervar) a = sp.Wild('a', exclude=[itersym]) @@ -185,7 +201,7 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi return False # Consider reads in inter-state edges (could be in assignments or in condition) - isread_set = 
set() + isread_set: Set[memlet.Memlet] = set() for s in states: for e in sdfg.all_edges(s): isread_set |= set(e.data.get_read_memlets(sdfg.arrays)) @@ -195,26 +211,34 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi mmlt.subset): return False - # Check that the iteration variable is not used on other edges or states - # before it is reassigned - prior_states = True - for state in cfg.stateorder_topological_sort(sdfg): - # Skip all states up to guard - if prior_states: - if state is begin: - prior_states = False - continue - # We do not need to check the loop-body states - if state in states: - continue - if itervar in state.free_symbols: - return False - # Don't continue in this direction, as the variable has - # now been reassigned - # TODO: Handle case of subset of out_edges - if all(itervar in e.data.assignments for e in sdfg.out_edges(state)): + # Check that the iteration variable and other symbols are not used on other edges or states + # before they are reassigned + for state in in_order_states[loop_end_idx + 1:]: + # Don't continue in this direction, as all loop symbols have been reassigned + if not symbols_that_may_be_used: break + # Check state contents + if symbols_that_may_be_used & state.free_symbols: + return False + + # Check inter-state edges + reassigned_symbols: Set[str] = None + for e in sdfg.out_edges(state): + if symbols_that_may_be_used & e.data.read_symbols(): + return False + + # Check for symbols that are set by all outgoing edges + # TODO: Handle case of subset of out_edges + if reassigned_symbols is None: + reassigned_symbols = set(e.data.assignments.keys()) + else: + reassigned_symbols &= e.data.assignments.keys() + + # Remove reassigned symbols + if reassigned_symbols is not None: + symbols_that_may_be_used -= reassigned_symbols + return True def test_read_memlet(self, sdfg: SDFG, itersym: symbolic.SymbolicType, itervar: str, start: symbolic.SymbolicType, @@ -390,7 +414,8 @@ def apply(self, _, sdfg: sd.SDFG): # Fix SDFG symbols for sym in sdfg.free_symbols - fsymbols: - del sdfg.symbols[sym] + if sym in sdfg.symbols: + del sdfg.symbols[sym] for sym, dtype in nsymbols.items(): nsdfg.symbols[sym] = dtype diff --git a/dace/transformation/passes/constant_propagation.py b/dace/transformation/passes/constant_propagation.py index 08f258514b..d1630a8f6f 100644 --- a/dace/transformation/passes/constant_propagation.py +++ b/dace/transformation/passes/constant_propagation.py @@ -171,15 +171,22 @@ def collect_constants(self, # Traverse SDFG topologically for state in optional_progressbar(sdfg.topological_sort(start_state), 'Collecting constants', sdfg.number_of_nodes(), self.progress): - if state in result: + # NOTE: We must always check the start-state regardless if there are initial symbols. This is necessary + # when the start-state is a scope's guard instead of a special initialization state, i.e., when the start- + # state has incoming edges that may involve the initial symbols. 
See also: + # `tests.passes.constant_propagation_test.test_for_with_external_init_nested_start_with_guard`` + if state in result and state is not start_state: continue # Get predecessors in_edges = sdfg.in_edges(state) if len(in_edges) == 1: # Special case, propagate as-is - result[state] = {} + if state not in result: # Condition evaluates to False when state is the start-state + result[state] = {} + # First the prior state - self._propagate(result[state], result[in_edges[0].src]) + if in_edges[0].src in result: # Condition evaluates to False when state is the start-state + self._propagate(result[state], result[in_edges[0].src]) # Then assignments on the incoming edge self._propagate(result[state], self._data_independent_assignments(in_edges[0].data, arrays)) @@ -205,7 +212,8 @@ def collect_constants(self, else: assignments[aname] = aval - result[state] = {} + if state not in result: # Condition may evaluate to False when state is the start-state + result[state] = {} self._propagate(result[state], assignments) return result diff --git a/tests/passes/constant_propagation_test.py b/tests/passes/constant_propagation_test.py index fab842897c..fc22dd7f96 100644 --- a/tests/passes/constant_propagation_test.py +++ b/tests/passes/constant_propagation_test.py @@ -356,7 +356,7 @@ def test_for_with_external_init_nested(): N = dace.symbol('N') - sdfg = dace.SDFG('for_with_external_init') + sdfg = dace.SDFG('for_with_external_init_nested') sdfg.add_array('A', (N, ), dace.int32) init = sdfg.add_state('init', is_start_state=True) main = sdfg.add_state('main') @@ -393,6 +393,49 @@ def test_for_with_external_init_nested(): assert np.allclose(val1, ref) +def test_for_with_external_init_nested_start_with_guard(): + """ + This test differs from the one above in lacking an initialization SDFGState in the NestedSDFG. Instead, the guard + of the nested for-loop is explicitly set as the start-state of the NestedSDFG. 
+ """ + + N = dace.symbol('N') + + sdfg = dace.SDFG('for_with_external_init_nested_start_with_guard') + sdfg.add_array('A', (N, ), dace.int32) + init = sdfg.add_state('init', is_start_state=True) + main = sdfg.add_state('main') + sdfg.add_edge(init, main, dace.InterstateEdge(assignments={'i': '1'})) + + nsdfg = dace.SDFG('nested_sdfg') + nsdfg.add_array('inner_A', (N,), dace.int32) + nguard = nsdfg.add_state('nested_guard', is_start_state=True) + nbody = nsdfg.add_state('nested_body') + nexit = nsdfg.add_state('nested_exit') + nsdfg.add_edge(nguard, nbody, dace.InterstateEdge(condition='i <= N')) + nsdfg.add_edge(nbody, nguard, dace.InterstateEdge(assignments={'i': 'i+1'})) + nsdfg.add_edge(nguard, nexit, dace.InterstateEdge(condition='i > N')) + + na = nbody.add_access('inner_A') + nt = nbody.add_tasklet('tasklet', {}, {'__out'}, '__out = i-1') + nbody.add_edge(nt, '__out', na, None, dace.Memlet('inner_A[i-1]')) + + a = main.add_access('A') + t = main.add_nested_sdfg(nsdfg, None, {}, {'inner_A'}, {'N': 'N', 'i': 'i'}) + main.add_edge(t, 'inner_A', a, None, dace.Memlet.from_array('A', sdfg.arrays['A'])) + + sdfg.validate() + + ref = np.arange(10, dtype=np.int32) + val0 = np.ndarray((10, ), dtype=np.int32) + sdfg(A=val0, N=10) + assert np.allclose(val0, ref) + ConstantPropagation().apply_pass(sdfg, {}) + val1 = np.ndarray((10, ), dtype=np.int32) + sdfg(A=val1, N=10) + assert np.allclose(val1, ref) + + if __name__ == '__main__': test_simple_constants() test_nested_constants() @@ -408,3 +451,4 @@ def test_for_with_external_init_nested(): test_allocation_varying(True) test_for_with_external_init() test_for_with_external_init_nested() + test_for_with_external_init_nested_start_with_guard() diff --git a/tests/transformations/gpu_transform_test.py b/tests/transformations/gpu_transform_test.py index 91403c0457..d6814273a6 100644 --- a/tests/transformations/gpu_transform_test.py +++ b/tests/transformations/gpu_transform_test.py @@ -2,6 +2,8 @@ """ Unit tests for the GPU to-device transformation. """ import dace +import numpy as np +import pytest from dace.transformation.interstate import GPUTransformSDFG @@ -26,5 +28,36 @@ def program(A: dace.float64[20, 20]): assert desc.lifetime is not dace.AllocationLifetime.SDFG +@pytest.mark.gpu +def test_scalar_to_symbol_in_nested_sdfg(): + """ + GPUTransformSDFG will automatically create copy-out states for GPU scalars that are used in host-side interstate + edges. However, this process may only be applied in top-level SDFGs and not in NestedSDFGs that have GPU-device + schedule but are not part of a single GPU kernel, leading to illegal memory accesses. + """ + + @dace.program + def nested_program(a: dace.int32, out: dace.int32[10]): + for i in range(10): + if a < 5: + out[i] = 0 + a *= 2 + else: + out[i] = 10 + a /= 2 + + @dace.program + def main_program(a: dace.int32): + out = np.ndarray((10,), dtype=np.int32) + nested_program(a, out) + return out + + sdfg = main_program.to_sdfg(simplify=False) + sdfg.apply_transformations(GPUTransformSDFG) + out = sdfg(a=4) + assert np.array_equal(out, np.array([0, 10] * 5, dtype=np.int32)) + + if __name__ == '__main__': test_toplevel_transient_lifetime() + test_scalar_to_symbol_in_nested_sdfg() diff --git a/tests/transformations/loop_to_map_test.py b/tests/transformations/loop_to_map_test.py index 2a2b51beab..b2940b259d 100644 --- a/tests/transformations/loop_to_map_test.py +++ b/tests/transformations/loop_to_map_test.py @@ -1,17 +1,19 @@ # Copyright 2020-2020 ETH Zurich and the DaCe authors. All rights reserved. 
import argparse -import dace -import numpy as np import os import tempfile + +import numpy as np +import pytest + +import dace from dace.sdfg import nodes from dace.transformation.interstate import LoopToMap def make_sdfg(with_wcr, map_in_guard, reverse_loop, use_variable, assign_after, log_path): - sdfg = dace.SDFG(f"loop_to_map_test_{with_wcr}_{map_in_guard}_" - f"{reverse_loop}_{use_variable}_{assign_after}") + sdfg = dace.SDFG(f"loop_to_map_test_{with_wcr}_{map_in_guard}_{reverse_loop}_{use_variable}_{assign_after}") sdfg.set_global_code("#include \n#include ") init = sdfg.add_state("init") @@ -162,7 +164,6 @@ def test_loop_to_map_variable_reassigned(n=None): def test_output_copy(): - @dace.program def l2mtest_copy(A: dace.float64[20, 20]): for i in range(1, 20): @@ -182,7 +183,6 @@ def l2mtest_copy(A: dace.float64[20, 20]): def test_output_accumulate(): - @dace.program def l2mtest_accumulate(A: dace.float64[20, 20]): for i in range(1, 20): @@ -240,7 +240,6 @@ def detect_greater(i: _[0:size]): def test_empty_loop(): - @dace.program def empty_loop(): for i in range(10): @@ -291,8 +290,8 @@ def test_interstate_dep(): def test_need_for_tasklet(): sdfg = dace.SDFG('needs_tasklet') - aname, _ = sdfg.add_array('A', (10,), dace.int32) - bname, _ = sdfg.add_array('B', (10,), dace.int32) + aname, _ = sdfg.add_array('A', (10, ), dace.int32) + bname, _ = sdfg.add_array('B', (10, ), dace.int32) body = sdfg.add_state('body') _, _, _ = sdfg.add_loop(None, body, None, 'i', '0', 'i < 10', 'i + 1', None) anode = body.add_access(aname) @@ -305,11 +304,11 @@ def test_need_for_tasklet(): if isinstance(n, nodes.Tasklet): found = True break - + assert found A = np.arange(10, dtype=np.int32) - B = np.empty((10,), dtype=np.int32) + B = np.empty((10, ), dtype=np.int32) sdfg(A=A, B=B) assert np.array_equal(B, np.arange(9, -1, -1, dtype=np.int32)) @@ -332,7 +331,7 @@ def test_need_for_transient(): if isinstance(n, nodes.AccessNode) and n.data not in (aname, bname): found = True break - + assert found A = np.arange(100, dtype=np.int32).reshape(10, 10).copy() @@ -341,8 +340,141 @@ def test_need_for_transient(): for i in range(10): start = i * 10 - assert np.array_equal(B[i], np.arange(start + 9, start -1, -1, dtype=np.int32)) + assert np.array_equal(B[i], np.arange(start + 9, start - 1, -1, dtype=np.int32)) + +def test_iteration_variable_used_outside(): + N = dace.symbol("N", dace.int32) + + @dace.program + def tester(A: dace.float64[N], output: dace.float64[1]): + i = -1 + + for i in range(N): + A[i] += 1 + + if i > 10: + output[0] = 1.0 + + sdfg = tester.to_sdfg(simplify=True) + assert sdfg.apply_transformations(LoopToMap) == 0 + + +def test_symbol_race(): + + # Adapted from npbench's crc16 test + # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/crc16/crc16_dace.py + poly: dace.uint16 = 0x8408 + + @dace.program + def tester(data: dace.int32[20]): + crc: dace.uint16 = 0xFFFF + for i in range(20): + b = data[i] + cur_byte = 0xFF & b + for _ in range(0, 8): + if (crc & 0x0001) ^ (cur_byte & 0x0001): + crc = (crc >> 1) ^ poly + else: + crc >>= 1 + cur_byte >>= 1 + crc = (~crc & 0xFFFF) + crc = (crc << 8) | ((crc >> 8) & 0xFF) + + sdfg = tester.to_sdfg(simplify=True) + assert sdfg.apply_transformations(LoopToMap) == 0 + + +def test_symbol_write_before_read(): + sdfg = dace.SDFG('tester') + init = sdfg.add_state(is_start_state=True) + body_start = sdfg.add_state() + body = sdfg.add_state() + body_end = sdfg.add_state() + sdfg.add_loop(init, body_start, None, 'i', '0', 'i < 20', 'i + 1', 
loop_end_state=body_end) + + # Internal loop structure + sdfg.add_edge(body_start, body, dace.InterstateEdge(assignments=dict(j='0'))) + sdfg.add_edge(body, body_end, dace.InterstateEdge(assignments=dict(j='j + 1'))) + + assert sdfg.apply_transformations(LoopToMap) == 1 + + +@pytest.mark.parametrize('overwrite', (False, True)) +def test_symbol_array_mix(overwrite): + sdfg = dace.SDFG('tester') + sdfg.add_transient('tmp', [1], dace.float64) + sdfg.add_symbol('sym', dace.float64) + init = sdfg.add_state(is_start_state=True) + body_start = sdfg.add_state() + body = sdfg.add_state() + body_end = sdfg.add_state() + after = sdfg.add_state() + sdfg.add_loop(init, body_start, after, 'i', '0', 'i < 20', 'i + 1', loop_end_state=body_end) + + sdfg.out_edges(init)[0].data.assignments['sym'] = '0.0' + + # Internal loop structure + t = body_start.add_tasklet('def', {}, {'o'}, 'o = i') + body_start.add_edge(t, 'o', body_start.add_write('tmp'), None, dace.Memlet('tmp')) + + if overwrite: + sdfg.add_edge(body_start, body, dace.InterstateEdge(assignments=dict(sym='tmp'))) + else: + sdfg.add_edge(body_start, body, dace.InterstateEdge(assignments=dict(sym='sym + tmp'))) + sdfg.add_edge(body, body_end, dace.InterstateEdge(assignments=dict(sym='sym + 1.0'))) + + assert sdfg.apply_transformations(LoopToMap) == (1 if overwrite else 0) + +@pytest.mark.parametrize('parallel', (False, True)) +def test_symbol_array_mix_2(parallel): + sdfg = dace.SDFG('tester') + sdfg.add_array('A', [20], dace.float64) + sdfg.add_array('B', [20], dace.float64) + sdfg.add_symbol('sym', dace.float64) + init = sdfg.add_state(is_start_state=True) + body_start = sdfg.add_state() + body_end = sdfg.add_state() + after = sdfg.add_state() + sdfg.add_loop(init, body_start, after, 'i', '1', 'i < 20', 'i + 1', loop_end_state=body_end) + + sdfg.out_edges(init)[0].data.assignments['sym'] = '0.0' + + # Internal loop structure + if not parallel: + t = body_start.add_tasklet('def', {}, {'o'}, 'o = i') + body_start.add_edge(t, 'o', body_start.add_write('A'), None, dace.Memlet('A[i]')) + + sdfg.add_edge(body_start, body_end, dace.InterstateEdge(assignments=dict(sym='A[i - 1]'))) + t = body_start.add_tasklet('use', {}, {'o'}, 'o = sym') + body_start.add_edge(t, 'o', body_start.add_write('B'), None, dace.Memlet('B[i]')) + + assert sdfg.apply_transformations(LoopToMap) == (1 if parallel else 0) + + +@pytest.mark.parametrize('overwrite', (False, True)) +def test_internal_symbol_used_outside(overwrite): + sdfg = dace.SDFG('tester') + init = sdfg.add_state(is_start_state=True) + body_start = sdfg.add_state() + body = sdfg.add_state() + body_end = sdfg.add_state() + after = sdfg.add_state() + sdfg.add_loop(init, body_start, after, 'i', '0', 'i < 20', 'i + 1', loop_end_state=body_end) + + # Internal loop structure + sdfg.add_edge(body_start, body, dace.InterstateEdge(assignments=dict(j='0'))) + sdfg.add_edge(body, body_end, dace.InterstateEdge(assignments=dict(j='j + 1'))) + + # Use after + after_1 = sdfg.add_state() + after_1.add_tasklet('use', {}, {}, 'printf("%d\\n", j)') + + if overwrite: + sdfg.add_edge(after, after_1, dace.InterstateEdge(assignments=dict(j='5'))) + else: + sdfg.add_edge(after, after_1, dace.InterstateEdge()) + assert sdfg.apply_transformations(LoopToMap) == (1 if overwrite else 0) if __name__ == "__main__": @@ -365,3 +497,12 @@ def test_need_for_transient(): test_interstate_dep() test_need_for_tasklet() test_need_for_transient() + test_iteration_variable_used_outside() + test_symbol_race() + test_symbol_write_before_read() + 
test_symbol_array_mix(False) + test_symbol_array_mix(True) + test_symbol_array_mix_2(False) + test_symbol_array_mix_2(True) + test_internal_symbol_used_outside(False) + test_internal_symbol_used_outside(True) diff --git a/tests/transformations/mapfission_test.py b/tests/transformations/mapfission_test.py index beeb64a9bb..a7faa9c882 100644 --- a/tests/transformations/mapfission_test.py +++ b/tests/transformations/mapfission_test.py @@ -394,6 +394,54 @@ def map_with_if_2(A: dace.int32[10]): val1 = np.ndarray((10, ), dtype=np.int32) sdfg(A=val1) self.assertTrue(np.array_equal(val1, ref)) + + def test_array_copy_outside_scope(self): + + """ + This test checks for two issues occurring when MapFission is applied to a NestedSDFG with a state-subgraph + containing copies among AccessNodes. In such cases, these copies may end up outside the scope of the generated + Maps (after MapFission), potentially leading to the following errors: + 1. The memlet subset corresponding to a NestedSDFG connector (input/output) may have its dimensionality + erroneously increased. + 2. The memlet subset corresponding to a NestedSDFG connector (input/output) may not be propagated even if it uses + the Map's parameters. + """ + + sdfg = dace.SDFG('array_copy_outside_scope') + iname, _ = sdfg.add_array('inp', (10,), dtype=dace.int32) + oname, _ = sdfg.add_array('out', (10,), dtype=dace.int32) + + nsdfg = dace.SDFG('nested_sdfg') + niname, nidesc = nsdfg.add_array('ninp', (1,), dtype=dace.int32) + ntname, ntdesc = nsdfg.add_scalar('ntmp', dtype=dace.int32, transient=True) + noname, nodesc = nsdfg.add_array('nout', (1,), dtype=dace.int32) + + nstate = nsdfg.add_state('nmain') + ninode = nstate.add_access(niname) + ntnode = nstate.add_access(ntname) + nonode = nstate.add_access(noname) + tasklet = nstate.add_tasklet('tasklet', {'__inp'}, {'__out'}, '__out = __inp + 1') + nstate.add_edge(ninode, None, tasklet, '__inp', dace.Memlet.from_array(niname, nidesc)) + nstate.add_edge(tasklet, '__out', ntnode, None, dace.Memlet.from_array(ntname, ntdesc)) + nstate.add_nedge(ntnode, nonode, dace.Memlet.from_array(noname, nodesc)) + + state = sdfg.add_state('main') + inode = state.add_access(iname) + onode = state.add_access(oname) + me, mx = state.add_map('map', {'i': '0:10'}) + snode = state.add_nested_sdfg(nsdfg, None, {'ninp'}, {'nout'}) + state.add_memlet_path(inode, me, snode, memlet=dace.Memlet(data=iname, subset='i'), dst_conn='ninp') + state.add_memlet_path(snode, mx, onode, memlet=dace.Memlet(data=oname, subset='i'), src_conn='nout') + + # Issue no. 1 will be caught by validation after MapFission + sdfg.apply_transformations(MapFission) + + # Issue no. 2 will be caught by code-generation due to `i` existing in a memlet outside the Map's scope. + A = np.arange(10, dtype=np.int32) + B = np.empty((10,), dtype=np.int32) + sdfg(inp=A, out=B) + assert np.array_equal(A+1, B) + if __name__ == '__main__':
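Addendum (illustrative sketch, not part of the patch): the LoopToMap changes above rely on the new `InterstateEdge.read_symbols()` method from the `dace/sdfg/sdfg.py` hunk, which returns every symbol an edge reads (condition plus assignment right-hand sides), while the existing `free_symbols` property additionally subtracts the assigned keys. A minimal sketch of the behavior expected under this patch, with invented symbol names `i`, `N`, `j`, `k`:

import dace

# Hypothetical inter-state edge: the condition reads `i` and `N`; the assignments write `j` and `k`,
# and the value assigned to `k` reads `j`.
edge = dace.InterstateEdge(condition='i < N', assignments={'j': 'i + 1', 'k': 'j + 1'})

print(edge.read_symbols())  # expected: {'i', 'N', 'j'} -- all symbols read by the edge
print(edge.free_symbols)    # expected: {'i', 'N'} -- the assigned keys ('j', 'k') are removed

This is why `LoopToMap.can_be_applied` scans loop-body edges with `read_symbols()` rather than `free_symbols`: a symbol such as `j` that is both read and reassigned on the same edge would otherwise be hidden from the read-before-assignment check.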