diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 8896a191fe..8821628000 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -218,6 +218,8 @@ def get_state_struct(self) -> ctypes.Structure: :return: the ctypes.Structure representation of the state struct. """ + if not self._libhandle: + raise ValueError('Library was not initialized') return ctypes.cast(self._libhandle, ctypes.POINTER(self._try_parse_state_struct())).contents diff --git a/dace/codegen/targets/fpga.py b/dace/codegen/targets/fpga.py index 31cfc6e13f..b920b0e9d5 100644 --- a/dace/codegen/targets/fpga.py +++ b/dace/codegen/targets/fpga.py @@ -1110,12 +1110,8 @@ def generate_nested_state(self, sdfg, state, nest_name, subgraphs, function_stre def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_stream): if not self._in_device_code: - # If we're not already generating kernel code we need to set up the - # kernel launch - subgraphs = [dfg_scope] - return self.generate_kernel(sdfg, sdfg.node(state_id), - dfg_scope.source_nodes()[0].map.label.replace(" ", "_"), subgraphs, - function_stream, callsite_stream) + # If we're not already generating kernel code, fail + raise cgx.CodegenError('FPGA kernel needs to be generated inside a device state.') self.generate_node(sdfg, dfg_scope, state_id, dfg_scope.source_nodes()[0], function_stream, callsite_stream) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 808ba26d6a..091c893d5b 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -20,7 +20,6 @@ from dace.sdfg import SDFG, ScopeSubgraphView, SDFGState, nodes from dace.sdfg import scope as sdscope from dace.sdfg import utils -from dace.sdfg.infer_types import set_default_schedule_and_storage_types from dace.transformation.passes.analysis import StateReachability @@ -425,7 +424,13 @@ def _get_schedule(self, scope: Union[nodes.EntryNode, SDFGState, SDFG]) -> dtype sdfg: SDFG = (scope if isinstance(scope, SDFG) else scope.parent) if sdfg.parent_nsdfg_node is None: return TOP_SCHEDULE - return (sdfg.parent_nsdfg_node.schedule or TOP_SCHEDULE) + + # Go one SDFG up + pstate = sdfg.parent + pscope = pstate.entry_node(sdfg.parent_nsdfg_node) + if pscope is not None: + return self._get_schedule(pscope) + return self._get_schedule(pstate) else: raise TypeError diff --git a/dace/dtypes.py b/dace/dtypes.py index 00192eeb9b..a86a746884 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -217,6 +217,18 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore } +# Maps from StorageType to a preferred ScheduleType for helping determine schedules. +# If mapped to None or does not exist in this dictionary, does not affect decision. +# Scalar data containers also do not affect this decision. 
+STORAGEDEFAULT_SCHEDULE = { + StorageType.CPU_Heap: ScheduleType.CPU_Multicore, + StorageType.CPU_ThreadLocal: ScheduleType.CPU_Multicore, + StorageType.GPU_Global: ScheduleType.GPU_Device, + StorageType.GPU_Shared: ScheduleType.GPU_ThreadBlock, + StorageType.FPGA_Global: ScheduleType.FPGA_Device, + StorageType.SVE_Register: ScheduleType.SVE_Map, +} + # Translation of types to C types _CTYPES = { None: "void", diff --git a/dace/libraries/blas/nodes/gemm.py b/dace/libraries/blas/nodes/gemm.py index 4a49397255..767cd53429 100644 --- a/dace/libraries/blas/nodes/gemm.py +++ b/dace/libraries/blas/nodes/gemm.py @@ -87,7 +87,7 @@ def make_sdfg(node, parent_state, parent_sdfg): init_state = sdfg.add_state(node.label + "_initstate") state = sdfg.add_state_after(init_state, node.label + "_state") - if node.beta != 0: + if '_cin' in node.in_connectors: sdfg.add_array("_cin", shape_c, dtype_c, strides=cdata[-1], storage=cdata[1].storage) mul_out, mul_out_array = "_c", array_c @@ -1050,7 +1050,7 @@ def validate(self, sdfg, state): # Numpy replacement @oprepo.replaces('dace.libraries.blas.gemm') @oprepo.replaces('dace.libraries.blas.Gemm') -def gemv_libnode(pv: 'ProgramVisitor', +def gemm_libnode(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, A, diff --git a/dace/libraries/blas/nodes/matmul.py b/dace/libraries/blas/nodes/matmul.py index a937af0a81..185beee1a0 100644 --- a/dace/libraries/blas/nodes/matmul.py +++ b/dace/libraries/blas/nodes/matmul.py @@ -143,6 +143,8 @@ def expansion(node, state, sdfg): from dace.libraries.blas.nodes.gemm import Gemm beta = node.beta cin = True + if '_cin' not in node.in_connectors: + cin = False if c[0].data.wcr: from dace.frontend import operations redtype = operations.detect_reduction_type(c[0].data.wcr) diff --git a/dace/sdfg/infer_types.py b/dace/sdfg/infer_types.py index 5cf1f5f7a4..105e1d12e9 100644 --- a/dace/sdfg/infer_types.py +++ b/dace/sdfg/infer_types.py @@ -3,11 +3,11 @@ from dace import data, dtypes from dace.codegen.tools import type_inference from dace.memlet import Memlet -from dace.sdfg import SDFG, SDFGState, nodes +from dace.sdfg import SDFG, SDFGState, nodes, validation from dace.sdfg import nodes -from dace.sdfg.graph import Edge +from dace.sdfg.graph import Edge, SubgraphView from dace.sdfg.utils import dfs_topological_sort -from typing import Callable, Dict, List, Optional, Set +from typing import Callable, Dict, List, Optional, Set, Union ############################################################################# # Connector type inference @@ -123,156 +123,272 @@ def infer_connector_types(sdfg: SDFG): # Default schedule and storage type inference -def set_default_schedule_and_storage_types(sdfg: SDFG, toplevel_schedule: dtypes.ScheduleType): +def set_default_schedule_and_storage_types(scope: Union[SDFG, SDFGState, nodes.EntryNode], + parent_schedules: List[dtypes.ScheduleType] = None, + use_parent_schedule: bool = False, + state: SDFGState = None, + child_nodes: Dict[nodes.Node, List[nodes.Node]] = None): """ Sets default storage and schedule types throughout SDFG in-place. - Replaces `ScheduleType.Default` and `StorageType.Default` + Replaces ``ScheduleType.Default`` and ``StorageType.Default`` with the corresponding types according to the parent scope's schedule. 
The defaults for storage types are determined by the ``dtypes.SCOPEDEFAULT_STORAGE`` dictionary (for example, a GPU device - schedule, by default, will allocate containers on the shared memory); and - similarly for schedules by ``dtypes.SCOPEDEFAULT_SCHEDULE`` (e.g., a map - nested in a CPU multi-core map will by default run within a single thread). - - :param sdfg: The SDFG to infer. - :param toplevel_schedule: The default top-level schedule for "global" nodes - (without parent scope nodes). + schedule, by default, will allocate containers on the shared memory). + Following storage type inference for a scope, nested scopes (e.g., map entry, nested SDFG) + are evaluated using the ``dtypes.STORAGEDEFAULT_SCHEDULE`` dictionary (for example, a + default map with only GPU arrays connected to it will execute on the GPU). This decision + is superseded if the schedule is specified in ``dtypes.SCOPEDEFAULT_SCHEDULE`` (e.g., + a map nested in a CPU multi-core map will by default run within a single thread). + If no default schedule is found while traversing the parent scopes, the chosen schedule will be + determined based on the SDFG's device, as specified in ``dtypes.DEFAULT_TOPLEVEL_STORAGE`` and + ``dtypes.DEFAULT_TOPLEVEL_SCHEDULE``. + May raise ``InvalidSDFGNodeError`` if a default scope is ambiguous based on surrounding + storage types. + :param scope: The SDFG, state, or scope to infer. + :param parent_schedules: A list of ScheduleType elements representing + an ordered list of schedules, from the global schedule + on the top-level SDFG (usually ``None``), up to this + point. + :param use_parent_schedule: If True, uses the parent scope's schedule type + directly, instead of the default schedule type. + Used when expanding nested SDFGs to preserve their + top-level schedule. + :param state: (Use when working with a single scope) The parent state. + :param child_nodes: (Use when working with a single scope) A mapping of each scope entry + node to its children. """ - _set_default_schedule_types(sdfg, toplevel_schedule) - _set_default_storage_types(sdfg, toplevel_schedule) - - -def _scopes_with_tbmaps(state: SDFGState, scopes: List[nodes.EntryNode]): - """ Returns a set of scopes where a thread-block (or dynamic thread-block) - sub-scopes exist. Used, e.g., to modify storage defaults. 
""" - scopes_with_tbmaps = set() - for scope_entry in scopes: - subgraph = state.scope_subgraph(scope_entry) - has_tb_map = False - # Append thread-block maps from subgraph and nested SDFGs - for node in subgraph.nodes(): - if isinstance(node, nodes.EntryNode) and node.schedule in (dtypes.ScheduleType.GPU_ThreadBlock, - dtypes.ScheduleType.GPU_ThreadBlock_Dynamic): - has_tb_map = True - break - elif isinstance(node, nodes.NestedSDFG): - for n in node.sdfg.all_nodes_recursive(): - if isinstance(node, - nodes.EntryNode) and node.schedule in (dtypes.ScheduleType.GPU_ThreadBlock, - dtypes.ScheduleType.GPU_ThreadBlock_Dynamic): - has_tb_map = True - break - if has_tb_map: - break - if has_tb_map: - scopes_with_tbmaps.add(scope_entry) - return scopes_with_tbmaps - - -def _set_default_schedule_in_scope(parent_node: nodes.Node, - parent_schedule: dtypes.ScheduleType, - reverse_scope_dict: Dict[nodes.Node, List[nodes.Node]], - use_parent_schedule: bool = False): - for node in reverse_scope_dict[parent_node]: - if use_parent_schedule: - child_schedule = parent_schedule - if parent_schedule in (dtypes.ScheduleType.Default, dtypes.ScheduleType.GPU_Default): - child_schedule = dtypes.SCOPEDEFAULT_SCHEDULE[parent_schedule] + parent_schedules = parent_schedules or [None] + + # TODO(later): Remove GPU_Default + if parent_schedules[-1] == dtypes.ScheduleType.GPU_Default and use_parent_schedule: + use_parent_schedule = False + + if isinstance(scope, SDFG): + # Set device for default top-level schedules and storages + for state in scope.nodes(): + set_default_schedule_and_storage_types(state, + parent_schedules, + use_parent_schedule=use_parent_schedule, + state=state, + child_nodes=state.scope_children()) + + # Take care of remaining scalars without access nodes + for aname, desc in scope.arrays.items(): + # If not transient in a nested SDFG, take storage from parent, regardless of current type + if not desc.transient and scope.parent_sdfg is not None: + desc.storage = _get_storage_from_parent(aname, scope) + elif ((desc.transient or scope.parent_sdfg is None) and desc.storage == dtypes.StorageType.Default): + # Indeterminate storage type, set to register + desc.storage = dtypes.StorageType.Register + return + + # Setup arguments + parent_node = None if isinstance(scope, SDFGState) else scope + if state is None: + if isinstance(scope, SDFGState): + state = scope else: - child_schedule = dtypes.SCOPEDEFAULT_SCHEDULE[parent_schedule] - # Set default schedule type - if isinstance(node, nodes.MapEntry): - if node.map.schedule is dtypes.ScheduleType.Default: - node.map.schedule = child_schedule - # Also traverse children (recursively) - _set_default_schedule_in_scope(node, node.map.schedule, reverse_scope_dict) - elif isinstance(node, nodes.ConsumeEntry): - if node.consume.schedule is dtypes.ScheduleType.Default: - node.consume.schedule = child_schedule - - # Also traverse children (recursively) - _set_default_schedule_in_scope(node, node.consume.schedule, reverse_scope_dict) - elif isinstance(node, nodes.NestedSDFG): - # Nested SDFGs retain same schedule as their parent scope - if node.schedule is dtypes.ScheduleType.Default: - node.schedule = parent_schedule - _set_default_schedule_types(node.sdfg, node.schedule) - elif getattr(node, 'schedule', False): - if node.schedule is dtypes.ScheduleType.Default: - node.schedule = (child_schedule - if isinstance(node, nodes.EntryNode) or parent_schedule is None else parent_schedule) - - -def _set_default_schedule_types(sdfg: SDFG, toplevel_schedule: dtypes.ScheduleType, 
use_parent_schedule: bool = False): - for state in sdfg.nodes(): - reverse_scope_dict = state.scope_children() + raise ValueError('SDFG state cannot be None when inferring a scope') + if child_nodes is None: + child_nodes = state.scope_children() + + ############################################ + + # Set default storage types in this scope + _set_default_storage_in_scope(state, parent_node, parent_schedules, child_nodes) + + # Set default schedules in this scope based on parent schedule and inferred storage types + nested_scopes = _set_default_schedule_in_scope(state, parent_node, parent_schedules, child_nodes, + use_parent_schedule) + + # Loop over internal nested SDFGs and scope entry nodes + for nnode in nested_scopes: + # Continue through nested SDFGs + if isinstance(nnode, nodes.NestedSDFG): + nscope = nnode.sdfg + child_nodes = None + extra_parent_schedules = [] + # TODO(later): Remove GPU_Default + if nnode.schedule == dtypes.ScheduleType.GPU_Default: + extra_parent_schedules.append(nnode.schedule) + else: + nscope = nnode + extra_parent_schedules = [nnode.schedule] + set_default_schedule_and_storage_types(nscope, + parent_schedules + extra_parent_schedules, + use_parent_schedule=False, + state=state, + child_nodes=child_nodes) + + +def _determine_child_schedule(parent_schedules: List[dtypes.ScheduleType]) -> Optional[dtypes.ScheduleType]: + for sched in reversed(parent_schedules): + if sched is not None and sched in dtypes.SCOPEDEFAULT_SCHEDULE: + child_sched = dtypes.SCOPEDEFAULT_SCHEDULE[sched] + if child_sched is not None: + return child_sched + return None + + +def _determine_child_storage(parent_schedules: List[dtypes.ScheduleType]) -> Optional[dtypes.StorageType]: + for sched in reversed(parent_schedules): + if (sched is not None and sched in dtypes.SCOPEDEFAULT_STORAGE and sched != dtypes.ScheduleType.Sequential): + child_sched = dtypes.SCOPEDEFAULT_STORAGE[sched] + if child_sched is not None: + return child_sched + return None + + +def _determine_schedule_from_storage(state: SDFGState, node: nodes.Node) -> Optional[dtypes.ScheduleType]: + child_schedule = None + memlets: Set[str] = set() + if node is None or isinstance(node, nodes.NestedSDFG): # State or nested SDFG + pass + elif isinstance(node, nodes.EntryNode): + # Test for storage of the scope by collecting all neighboring memlets + memlets = set(e.data.data for e in state.out_edges(node) if not e.data.is_empty()) + exit_node = state.exit_node(node) + memlets.update(e.data.data for e in state.in_edges(exit_node) if not e.data.is_empty()) + else: + # Other nodes only need neighboring memlets + memlets = set(e.data.data for e in state.all_edges(node) if not e.data.is_empty()) + + # From memlets, use non-scalar data descriptors for decision + constraints: Set[dtypes.ScheduleType] = set() + sdfg = state.parent + for dname in memlets: + if isinstance(sdfg.arrays[dname], data.Scalar): + continue # Skip scalars + + storage = sdfg.arrays[dname].storage + if storage not in dtypes.STORAGEDEFAULT_SCHEDULE: + continue + sched = dtypes.STORAGEDEFAULT_SCHEDULE[storage] + if sched is None: + continue + constraints.add(sched) + + if not constraints: # No constraints found + child_schedule = None + elif len(constraints) > 1: + raise validation.InvalidSDFGNodeError( + f'Cannot determine default schedule for node {node}. 
' + 'Multiple arrays that point to it say that it should be the following schedules: ' + f'{constraints}', state.parent, state.parent.node_id(state), state.node_id(node)) + else: + child_schedule = next(iter(constraints)) - # Start with top-level nodes and call recursively - _set_default_schedule_in_scope(None, toplevel_schedule, reverse_scope_dict, use_parent_schedule) + # If no valid schedules are found and there are no conflicts with storage, use default top-level schedule + if child_schedule is None: + child_schedule = dtypes.SCOPEDEFAULT_SCHEDULE[None] + return child_schedule -def _set_default_storage_types(sdfg: SDFG, toplevel_schedule: dtypes.ScheduleType): - for state in sdfg.nodes(): - scope_dict = state.scope_dict() - scopes_with_tbmaps = _scopes_with_tbmaps(state, [ - n - for n in state.nodes() if isinstance(n, nodes.MapEntry) and n.schedule in [dtypes.ScheduleType.GPU_Device] - ]) - - for node in state.nodes(): - if not isinstance(node, nodes.AccessNode): - continue - desc = node.desc(sdfg) - # Only set transients if nested - if ((desc.transient or sdfg.parent_sdfg is None) and desc.storage is dtypes.StorageType.Default): - # Special cases - parent_node = scope_dict[node] - if parent_node is None: - parent_schedule = toplevel_schedule - else: - parent_schedule = parent_node.map.schedule - # Skip sequential maps to determine storage - while parent_schedule == dtypes.ScheduleType.Sequential: - parent_node = scope_dict[parent_node] - if parent_node is None: - parent_schedule = toplevel_schedule - break - parent_schedule = parent_node.map.schedule - # Determine default GPU schedule based on existence of - # thread-block maps - if parent_schedule == dtypes.ScheduleType.GPU_Device: - if parent_node not in scopes_with_tbmaps: - parent_schedule = dtypes.ScheduleType.GPU_ThreadBlock - # End of special cases - - # Set default storage type - desc.storage = dtypes.SCOPEDEFAULT_STORAGE[parent_schedule] - - # Take care of remaining arrays/scalars, e.g., code->code edges - for desc in sdfg.arrays.values(): - if ((desc.transient or sdfg.parent_sdfg is None) and desc.storage is dtypes.StorageType.Default): - desc.storage = dtypes.StorageType.Register - for state in sdfg.nodes(): - # Loop again after all default storages have been set to set nested - # SDFGs - for node in state.nodes(): - if not isinstance(node, nodes.NestedSDFG): - continue - for name, desc in node.sdfg.arrays.items(): - if (not desc.transient and desc.storage is dtypes.StorageType.Default): - # Find connector and ensure storage types match - for e in state.in_edges(node): - if e.dst_conn == name: - desc.storage = sdfg.arrays[e.data.data].storage - break - for e in state.out_edges(node): - if e.src_conn == name: - desc.storage = sdfg.arrays[e.data.data].storage - break - _set_default_storage_types(node.sdfg, node.schedule) +def _set_default_schedule_in_scope(state: SDFGState, + parent_node: nodes.Node, + parent_schedules: List[dtypes.ScheduleType], + child_nodes: Dict[nodes.Node, List[nodes.Node]], + use_parent_schedule: bool = False) -> List[Union[nodes.EntryNode, nodes.NestedSDFG]]: + nested_scopes: List[Union[nodes.EntryNode, nodes.NestedSDFG]] = [] + + # Try to determine schedule based on parent schedule(s) + if use_parent_schedule: + child_schedule = parent_schedules[-1] + else: + child_schedule = _determine_child_schedule(parent_schedules) + + # Set child schedule type in scope + for node in child_nodes[parent_node]: + # Set default schedule types + if isinstance(node, (nodes.EntryNode, nodes.NestedSDFG)): + 
nested_scopes.append(node) + if node.schedule == dtypes.ScheduleType.Default: + # If parent schedules do not determine child schedule, + # test for storage of the scope by collecting all neighboring memlets + if child_schedule is None: + local_child_schedule = _determine_schedule_from_storage(state, node) + else: + local_child_schedule = child_schedule + node.schedule = local_child_schedule + elif getattr(node, 'schedule', False) and not isinstance(node, nodes.ExitNode): + if node.schedule == dtypes.ScheduleType.Default: + if child_schedule is None: + local_child_schedule = _determine_schedule_from_storage(state, node) + else: + local_child_schedule = child_schedule + node.schedule = local_child_schedule + + return nested_scopes + + +def _set_default_storage_in_scope(state: SDFGState, parent_node: Optional[nodes.Node], + parent_schedules: List[dtypes.ScheduleType], child_nodes: Dict[nodes.Node, + List[nodes.Node]]): + # Special case for GPU maps without explicit thread-block assignment + if (dtypes.ScheduleType.GPU_Device in parent_schedules + and dtypes.ScheduleType.GPU_ThreadBlock not in parent_schedules + and dtypes.ScheduleType.GPU_ThreadBlock_Dynamic not in parent_schedules): + from dace.transformation.helpers import gpu_map_has_explicit_threadblocks # Avoid import loops + # Find GPU scopes without thread-block maps + if not gpu_map_has_explicit_threadblocks(state, parent_node): + # Do not modify external list + parent_schedules = parent_schedules + [dtypes.ScheduleType.GPU_ThreadBlock] + # End of special case + + sdfg = state.parent + child_storage = _determine_child_storage(parent_schedules) + if child_storage is None: + child_storage = dtypes.SCOPEDEFAULT_STORAGE[None] + + exit_nodes = [state.exit_node(n) for n in child_nodes[parent_node] if isinstance(n, nodes.EntryNode)] + scope_subgraph = SubgraphView(state, child_nodes[parent_node] + exit_nodes) + + # Loop over access nodes + for node in scope_subgraph.nodes(): + if not isinstance(node, nodes.AccessNode): + continue + desc = node.desc(sdfg) + # If not transient in a nested SDFG, take storage from parent, regardless of current type + if not desc.transient and sdfg.parent is not None: + desc.storage = _get_storage_from_parent(node.data, sdfg) + elif desc.storage == dtypes.StorageType.Default: + desc.storage = child_storage + + # Take care of code->code edges that do not have access nodes + for edge in scope_subgraph.edges(): + if not edge.data.is_empty(): + desc = sdfg.arrays[edge.data.data] + # If not transient in a nested SDFG, take storage from parent, regardless of current type + if not desc.transient and sdfg.parent is not None: + desc.storage = _get_storage_from_parent(edge.data.data, sdfg) + elif desc.storage == dtypes.StorageType.Default: + desc.storage = child_storage + + +def _get_storage_from_parent(data_name: str, sdfg: SDFG) -> dtypes.StorageType: + """ + Retrieves the storage type of an array from its parent SDFG. + :param data_name: The name of the data descriptor. + :param sdfg: The parent SDFG. + :return: The storage type of the data descriptor. 
+ """ + nsdfg_node = sdfg.parent_nsdfg_node + parent_state = sdfg.parent + parent_sdfg = parent_state.parent + + # Find data descriptor in parent SDFG + if data_name in nsdfg_node.in_connectors: + e = next(iter(parent_state.in_edges_by_connector(nsdfg_node, data_name))) + return parent_sdfg.arrays[e.data.data].storage + elif data_name in nsdfg_node.out_connectors: + e = next(iter(parent_state.out_edges_by_connector(nsdfg_node, data_name))) + return parent_sdfg.arrays[e.data.data].storage + + raise ValueError(f'Could not find data descriptor {data_name} in parent SDFG') def infer_aliasing(node: nodes.NestedSDFG, sdfg: SDFG, state: SDFGState) -> None: """ diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index c58837fdff..0b62c96c0b 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1501,13 +1501,19 @@ def is_fpga_kernel(sdfg, state): if ("is_FPGA_kernel" in state.location and state.location["is_FPGA_kernel"] == False): return False data_nodes = state.data_nodes() - if len(data_nodes) == 0: - return False + at_least_one_fpga_array = False for n in data_nodes: - if n.desc(sdfg).storage not in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, - dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): + desc = n.desc(sdfg) + if desc.storage in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, + dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): + at_least_one_fpga_array = True + if isinstance(desc, dt.Scalar): + continue + if desc.storage not in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, + dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): return False - return True + + return at_least_one_fpga_array def postdominators( diff --git a/dace/transformation/auto/auto_optimize.py b/dace/transformation/auto/auto_optimize.py index f0fe22e181..6177e9e38e 100644 --- a/dace/transformation/auto/auto_optimize.py +++ b/dace/transformation/auto/auto_optimize.py @@ -627,7 +627,7 @@ def auto_optimize(sdfg: SDFG, if s in sdfg.free_symbols: if isinstance(v, (int, float)): known_symbols[s] = v - if isinstance(v, sympy.core.numbers.Integer): + if isinstance(v, sympy.Integer): try: known_symbols[s] = int(v) except TypeError: diff --git a/dace/transformation/interstate/multistate_inline.py b/dace/transformation/interstate/multistate_inline.py index 4c20be1568..74dd51a483 100644 --- a/dace/transformation/interstate/multistate_inline.py +++ b/dace/transformation/interstate/multistate_inline.py @@ -143,8 +143,8 @@ def apply(self, outer_state: SDFGState, sdfg: SDFG): nsdfg_node = self.nested_sdfg nsdfg: SDFG = nsdfg_node.sdfg - if nsdfg_node.schedule is not dtypes.ScheduleType.Default: - infer_types.set_default_schedule_and_storage_types(nsdfg, nsdfg_node.schedule) + if nsdfg_node.schedule != dtypes.ScheduleType.Default: + infer_types.set_default_schedule_and_storage_types(nsdfg, [nsdfg_node.schedule]) ####################################################### # Collect and update top-level SDFG metadata diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index b85877120b..a63b37aa19 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -248,8 +248,8 @@ def apply(self, state: SDFGState, sdfg: SDFG): nsdfg: SDFG = nsdfg_node.sdfg nstate: SDFGState = nsdfg.nodes()[0] - if nsdfg_node.schedule is not dtypes.ScheduleType.Default: - 
infer_types.set_default_schedule_and_storage_types(nsdfg, nsdfg_node.schedule) + if nsdfg_node.schedule != dtypes.ScheduleType.Default: + infer_types.set_default_schedule_and_storage_types(nsdfg, [nsdfg_node.schedule]) nsdfg_scope_entry = state.entry_node(nsdfg_node) nsdfg_scope_exit = (state.exit_node(nsdfg_scope_entry) if nsdfg_scope_entry is not None else None) diff --git a/dace/transformation/subgraph/gpu_persistent_fusion.py b/dace/transformation/subgraph/gpu_persistent_fusion.py index 6096ff9572..1cf93469bb 100644 --- a/dace/transformation/subgraph/gpu_persistent_fusion.py +++ b/dace/transformation/subgraph/gpu_persistent_fusion.py @@ -246,6 +246,7 @@ def apply(self, sdfg: SDFG): kernel_args_read, kernel_args_write, ) + nested_sdfg.schedule = ScheduleType.GPU_Persistent # Create and connect read only data access nodes for arg in kernel_args_read: diff --git a/dace/transformation/transformation.py b/dace/transformation/transformation.py index 6634276f46..75e591cb1e 100644 --- a/dace/transformation/transformation.py +++ b/dace/transformation/transformation.py @@ -627,15 +627,15 @@ def apply(self, state, sdfg, *args, **kwargs): else: raise TypeError("Node expansion must be a CodeNode or an SDFG") - # Fix nested schedules - if isinstance(expansion, nd.NestedSDFG): - infer_types._set_default_schedule_types(expansion.sdfg, expansion.schedule, True) - infer_types._set_default_storage_types(expansion.sdfg, expansion.schedule) - expansion.environments = copy.copy(set(map(lambda a: a.full_class_path(), type(self).environments))) sdutil.change_edge_dest(state, node, expansion) sdutil.change_edge_src(state, node, expansion) state.remove_node(node) + + # Fix nested schedules + if isinstance(expansion, nd.NestedSDFG): + infer_types.set_default_schedule_and_storage_types(expansion.sdfg, [expansion.schedule], True) + type(self).postprocessing(sdfg, state, expansion) def to_json(self, parent=None) -> Dict[str, Any]: diff --git a/tests/parse_state_struct_test.py b/tests/parse_state_struct_test.py index 969420d693..89bb2550f8 100644 --- a/tests/parse_state_struct_test.py +++ b/tests/parse_state_struct_test.py @@ -13,9 +13,11 @@ from dace import dtypes from dace.codegen import codeobject, targets, compiler, compiled_sdfg - @pytest.fixture def cuda_helper(): + return _cuda_helper() + +def _cuda_helper(): helper_code = """ #include @@ -89,3 +91,6 @@ def persistent_transient(A: dace.float32[3, 3]): compiledsdfg(A=A, __return=result) assert np.allclose(result, A @ B) + +if __name__ =='__main__': + test_preallocate_transients_in_state_struct(_cuda_helper()) diff --git a/tests/sdfg/schedule_inference_test.py b/tests/sdfg/schedule_inference_test.py new file mode 100644 index 0000000000..1b1b3422d8 --- /dev/null +++ b/tests/sdfg/schedule_inference_test.py @@ -0,0 +1,180 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Tests for default storage/schedule inference. 
""" +import dace +from dace.sdfg.validation import InvalidSDFGNodeError +from dace.sdfg.infer_types import set_default_schedule_and_storage_types +from dace.transformation.helpers import get_parent_map +import pytest + + +def test_default_schedule_autodetect(): + + @dace.program + def add(a: dace.float32[10, 10], b: dace.float32[10, 10]): + return a + b @ b + + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, (dace.nodes.LibraryNode, dace.nodes.MapEntry)): + assert node.schedule == dace.ScheduleType.CPU_Multicore + + +def test_gpu_schedule_autodetect(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, + b: dace.float32[10, 10] @ dace.StorageType.GPU_Global): + return a + b @ b + + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, (dace.nodes.LibraryNode, dace.nodes.MapEntry)): + assert node.schedule == dace.ScheduleType.GPU_Device + + +def test_gpu_schedule_scalar_autodetect(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, + b: dace.float32[10, 10] @ dace.StorageType.GPU_Global, c: dace.float32[10] @ dace.StorageType.CPU_Heap): + return a + b @ b + c[0] + + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, (dace.nodes.LibraryNode, dace.nodes.MapEntry)): + assert node.schedule == dace.ScheduleType.GPU_Device + + +def test_gpu_schedule_scalar_autodetect_2(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, b: dace.float32): + return a + b + + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, (dace.nodes.LibraryNode, dace.nodes.MapEntry)): + assert node.schedule == dace.ScheduleType.GPU_Device + + +def test_nested_map_in_loop_schedule(): + + @dace.program + def top(a: dace.float64[20, 20], b: dace.float64[20, 20], c: dace.float64[20, 20]): + for i in dace.map[0:20] @ dace.ScheduleType.GPU_Device: + for _ in range(5): + c[i] += a[i] + b[i] + + sdfg = top.to_sdfg(simplify=False) + + set_default_schedule_and_storage_types(sdfg, None) + for node, state in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry): + if get_parent_map(state, node) is None: + assert node.schedule == dace.ScheduleType.GPU_Device + else: + assert node.schedule == dace.ScheduleType.GPU_ThreadBlock + + +def test_nested_storage(): + + @dace.program + def nested(a: dace.float64[20, 20], b: dace.float64[20, 20]): + tmp = dace.define_local([20, 20], dace.float64) + tmp[:] = a + b[:] = tmp + + @dace.program + def top(a: dace.float64[20, 20], b: dace.float64[20, 20]): + nested(a, b) + + sdfg = top.to_sdfg(simplify=False) + + set_default_schedule_and_storage_types(sdfg, None) + for node, state in sdfg.all_nodes_recursive(): + nsdfg = state.parent + if isinstance(node, dace.nodes.AccessNode): + assert node.desc(nsdfg).storage == dace.StorageType.CPU_Heap + + +def test_nested_storage_equivalence(): + + @dace.program + def nested(a: dace.float64[20, 20], b: dace.float64[20, 20]): + b[:] = a + + @dace.program + def top(a: dace.float64[20, 20] @ dace.StorageType.CPU_Heap, b: dace.float64[20, 20] @ dace.StorageType.CPU_Pinned): + nested(a, b) + + sdfg = top.to_sdfg(simplify=False) + + set_default_schedule_and_storage_types(sdfg, None) + for node, state in 
sdfg.all_nodes_recursive(): + nsdfg = state.parent + if isinstance(node, dace.nodes.AccessNode): + if state.out_degree(node) > 0: # Check for a in external and internal scopes + assert node.desc(nsdfg).storage == dace.StorageType.CPU_Heap + elif state.in_degree(node) > 0: # Check for b in external and internal scopes + assert node.desc(nsdfg).storage == dace.StorageType.CPU_Pinned + + +def test_ambiguous_schedule(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, b: dace.float32[10, 10]): + return a + b + + with pytest.raises(InvalidSDFGNodeError): + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + + +def test_ambiguous_schedule_2(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, + b: dace.float32[10, 10] @ dace.StorageType.GPU_Global, c: dace.float32[10] @ dace.StorageType.CPU_Heap): + return a + b @ b + c + + with pytest.raises(InvalidSDFGNodeError): + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + + +def test_semi_ambiguous_schedule(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, + b: dace.float32[10, 10] @ dace.StorageType.GPU_Global): + for i in dace.map[0:10] @ dace.ScheduleType.GPU_Device: + shared = dace.define_local([10], dace.float32) + for j in dace.map[0:10]: # Should be inferred as thread-block + b[i, j] = a[i, j] + shared[j] + + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + for node, state in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry): + if get_parent_map(state, node) is None: + assert node.schedule == dace.ScheduleType.GPU_Device + else: + assert node.schedule == dace.ScheduleType.GPU_ThreadBlock + + +if __name__ == '__main__': + test_default_schedule_autodetect() + test_gpu_schedule_autodetect() + test_gpu_schedule_scalar_autodetect() + test_gpu_schedule_scalar_autodetect_2() + test_nested_kernel_computation() + test_nested_map_in_loop_schedule() + test_nested_storage() + test_nested_storage_equivalence() + test_ambiguous_schedule() + test_ambiguous_schedule_2() + test_semi_ambiguous_schedule()
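
Reviewer note: a minimal usage sketch of the reworked inference entry point, mirroring the new tests above. The `@ dace.StorageType...` annotation syntax and the `set_default_schedule_and_storage_types(sdfg, None)` call are taken from this diff; the toy program itself is illustrative and not part of the change.

import dace
from dace.sdfg.infer_types import set_default_schedule_and_storage_types

@dace.program
def scale(a: dace.float64[32] @ dace.StorageType.GPU_Global,
          b: dace.float64[32] @ dace.StorageType.GPU_Global):
    for i in dace.map[0:32]:  # schedule intentionally left as Default
        b[i] = 2 * a[i]

sdfg = scale.to_sdfg()
# With no parent schedules given, the default map schedule is derived from the
# storage of the non-scalar arrays it touches: GPU_Global maps to GPU_Device
# through dtypes.STORAGEDEFAULT_SCHEDULE. Connecting a default map to both GPU
# and CPU non-scalar arrays would instead raise InvalidSDFGNodeError, as
# exercised by test_ambiguous_schedule above.
set_default_schedule_and_storage_types(sdfg, None)
for node, _ in sdfg.all_nodes_recursive():
    if isinstance(node, dace.nodes.MapEntry):
        assert node.schedule == dace.ScheduleType.GPU_Device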