From 653ec33634617b7738be8214acda18df8bd9a356 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philip=20M=C3=BCller?= <147368808+philip-paul-mueller@users.noreply.github.com>
Date: Thu, 17 Oct 2024 17:03:03 +0200
Subject: [PATCH 01/43] Updated `InlineMultistateSDFG` (#1689)

The `can_be_applied()` function did not consider the symbol map when the
shapes of the arrays were compared. This commit fixes this behaviour by
first applying a replacement step before the comparison.

Furthermore, the commit removes all the commented-out code.
---
 .../interstate/multistate_inline.py           | 223 +-----------------
 1 file changed, 7 insertions(+), 216 deletions(-)

diff --git a/dace/transformation/interstate/multistate_inline.py b/dace/transformation/interstate/multistate_inline.py
index 42dccd8616..f637f479dc 100644
--- a/dace/transformation/interstate/multistate_inline.py
+++ b/dace/transformation/interstate/multistate_inline.py
@@ -10,7 +10,7 @@
 from dace.sdfg.graph import MultiConnectorEdge
 from dace.sdfg import InterstateEdge, SDFG, SDFGState
 from dace.sdfg import utils as sdutil, infer_types
-from dace.sdfg.replace import replace_datadesc_names
+from dace.sdfg.replace import replace_datadesc_names, replace_properties_dict
 from dace.transformation import transformation, helpers
 from dace.properties import make_properties
 from dace import data
@@ -103,7 +103,10 @@ def can_be_applied(self, state: SDFGState, expr_index, sdfg, permissive=False):
             if isinstance(outer_desc, data.View):
                 return False

-            inner_desc = nested_sdfg.sdfg.arrays[edge.dst_conn]
+            # We cannot compare the shapes directly; we have to take the symbol map
+            # into account. Clone the descriptor, because the replacement is done in place.
+            inner_desc = nested_sdfg.sdfg.arrays[edge.dst_conn].clone()
+            symbolic.safe_replace(nested_sdfg.symbol_mapping, lambda m: replace_properties_dict(inner_desc, m))
             if (outer_desc.shape != inner_desc.shape or outer_desc.strides != inner_desc.strides):
                 return False

@@ -121,7 +124,8 @@ def can_be_applied(self, state: SDFGState, expr_index, sdfg, permissive=False):
             if isinstance(outer_desc, data.View):
                 return False

-            inner_desc = nested_sdfg.sdfg.arrays[edge.src_conn]
+            inner_desc = nested_sdfg.sdfg.arrays[edge.src_conn].clone()
+            symbolic.safe_replace(nested_sdfg.symbol_mapping, lambda m: replace_properties_dict(inner_desc, m))
             if (outer_desc.shape != inner_desc.shape or outer_desc.strides != inner_desc.strides):
                 return False

@@ -208,27 +212,6 @@ def apply(self, outer_state: SDFGState, sdfg: SDFG):
         #######################################################
         # Collect and modify access nodes as necessary

-        # Access nodes that need to be reshaped
-        # reshapes: Set(str) = set()
-        # for aname, array in nsdfg.arrays.items():
-        #     if array.transient:
-        #         continue
-        #     edge = None
-        #     if aname in inputs:
-        #         edge = inputs[aname]
-        #         if len(array.shape) > len(edge.data.subset):
-        #             reshapes.add(aname)
-        #             continue
-        #     if aname in outputs:
-        #         edge = outputs[aname]
-        #         if len(array.shape) > len(edge.data.subset):
-        #             reshapes.add(aname)
-        #             continue
-        #     if edge is not None and not InlineMultistateSDFG._check_strides(
-        #             array.strides, sdfg.arrays[edge.data.data].strides,
-        #             edge.data, nsdfg_node):
-        #         reshapes.add(aname)
-
         # Mapping from nested transient name to top-level name
         transients: Dict[str, str] = {}

@@ -281,50 +264,6 @@ def apply(self, outer_state: SDFGState, sdfg: SDFG):
             symbolic.safe_replace(repldict, lambda m: replace_datadesc_names(nsdfg, m), value_as_string=True)

-        # Add views whenever reshapes are necessary
-        # for dname in reshapes:
-        #
desc = nsdfg.arrays[dname] - # # To avoid potential confusion, rename protected __return keyword - # if dname.startswith('__return'): - # newname = f'{nsdfg.name}_ret{dname[8:]}' - # else: - # newname = dname - # newname, _ = sdfg.add_view(newname, - # desc.shape, - # desc.dtype, - # storage=desc.storage, - # strides=desc.strides, - # offset=desc.offset, - # debuginfo=desc.debuginfo, - # allow_conflicts=desc.allow_conflicts, - # total_size=desc.total_size, - # alignment=desc.alignment, - # may_alias=desc.may_alias, - # find_new_name=True) - # repldict[dname] = newname - - # Add extra access nodes for out/in view nodes - # inv_reshapes = {repldict[r]: r for r in reshapes} - # for nstate in nsdfg.nodes(): - # for node in nstate.nodes(): - # if isinstance(node, - # nodes.AccessNode) and node.data in inv_reshapes: - # if nstate.in_degree(node) > 0 and nstate.out_degree( - # node) > 0: - # # Such a node has to be in the output set - # edge = outputs[inv_reshapes[node.data]] - - # # Redirect outgoing edges through access node - # out_edges = list(nstate.out_edges(node)) - # anode = nstate.add_access(edge.data.data) - # vnode = nstate.add_access(node.data) - # nstate.add_nedge(node, anode, edge.data) - # nstate.add_nedge(anode, vnode, edge.data) - # for e in out_edges: - # nstate.remove_edge(e) - # nstate.add_edge(vnode, e.src_conn, e.dst, - # e.dst_conn, e.data) - # Make unique names for states statenames = set(s.label for s in sdfg.nodes()) for nstate in nsdfg.nodes(): @@ -364,46 +303,6 @@ def apply(self, outer_state: SDFGState, sdfg: SDFG): sdfg.start_state = sdfg.node_id(source) # TODO: Modify memlets by offsetting - # If both source and sink nodes are inputs/outputs, reconnect once - # edges_to_ignore = self._modify_access_to_access(new_incoming_edges, - # nsdfg, nstate, state, - # orig_data) - - # source_to_outer = {n: e.src for n, e in new_incoming_edges.items()} - # sink_to_outer = {n: e.dst for n, e in new_outgoing_edges.items()} - # # If a source/sink node is one of the inputs/outputs, reconnect it, - # # replacing memlets in outgoing/incoming paths - # modified_edges = set() - # modified_edges |= self._modify_memlet_path(new_incoming_edges, nstate, - # state, sink_to_outer, True, - # edges_to_ignore) - # modified_edges |= self._modify_memlet_path(new_outgoing_edges, nstate, - # state, source_to_outer, - # False, edges_to_ignore) - - # # Reshape: add connections to viewed data - # self._modify_reshape_data(reshapes, repldict, inputs, nstate, state, - # True) - # self._modify_reshape_data(reshapes, repldict, outputs, nstate, state, - # False) - - # Modify all other internal edges pertaining to input/output nodes - # for nstate in nsdfg.nodes(): - # for node in nstate.nodes(): - # if isinstance(node, nodes.AccessNode): - # if node.data in input_set or node.data in output_set: - # if node.data in input_set: - # outer_edge = inputs[input_set[node.data]] - # else: - # outer_edge = outputs[output_set[node.data]] - - # for edge in state.all_edges(node): - # if (edge not in modified_edges - # and edge.data.data == node.data): - # for e in state.memlet_tree(edge): - # if e.data.data == node.data: - # e._data = helpers.unsqueeze_memlet( - # e.data, outer_edge.data) # Replace nested SDFG parents with new SDFG for nstate in nsdfg.nodes(): @@ -420,111 +319,3 @@ def apply(self, outer_state: SDFGState, sdfg: SDFG): sdfg._cfg_list = sdfg.reset_cfg_list() return nsdfg.nodes() - - # def _modify_access_to_access( - # self, - # input_edges: Dict[nodes.Node, MultiConnectorEdge], - # nsdfg: SDFG, - # nstate: 
SDFGState, - # state: SDFGState, - # orig_data: Dict[Union[nodes.AccessNode, MultiConnectorEdge], str], - # ) -> Set[MultiConnectorEdge]: - # """ - # Deals with access->access edges where both sides are non-transient. - # """ - # result = set() - # for node, top_edge in input_edges.items(): - # for inner_edge in nstate.out_edges(node): - # if inner_edge.dst not in orig_data: - # continue - # inner_data = orig_data[inner_edge.dst] - # if (isinstance(inner_edge.dst, nodes.AccessNode) - # and not nsdfg.arrays[inner_data].transient): - # matching_edge: MultiConnectorEdge = next( - # state.out_edges_by_connector(top_edge.dst, inner_data)) - # # Create memlet by unsqueezing both w.r.t. src and dst - # # subsets - # in_memlet = helpers.unsqueeze_memlet( - # inner_edge.data, top_edge.data) - # out_memlet = helpers.unsqueeze_memlet( - # inner_edge.data, matching_edge.data) - # new_memlet = in_memlet - # new_memlet.other_subset = out_memlet.subset - - # # Connect with new edge - # state.add_edge(top_edge.src, top_edge.src_conn, - # matching_edge.dst, matching_edge.dst_conn, - # new_memlet) - # result.add(inner_edge) - - # return result - - # def _modify_memlet_path( - # self, - # new_edges: Dict[nodes.Node, MultiConnectorEdge], - # nstate: SDFGState, - # state: SDFGState, - # inner_to_outer: Dict[nodes.Node, MultiConnectorEdge], - # inputs: bool, - # edges_to_ignore: Set[MultiConnectorEdge], - # ) -> Set[MultiConnectorEdge]: - # """ Modifies memlet paths in an inlined SDFG. Returns set of modified - # edges. - # """ - # result = set() - # for node, top_edge in new_edges.items(): - # inner_edges = (nstate.out_edges(node) - # if inputs else nstate.in_edges(node)) - # for inner_edge in inner_edges: - # if inner_edge in edges_to_ignore: - # continue - # new_memlet = helpers.unsqueeze_memlet(inner_edge.data, - # top_edge.data) - # if inputs: - # if inner_edge.dst in inner_to_outer: - # dst = inner_to_outer[inner_edge.dst] - # else: - # dst = inner_edge.dst - - # new_edge = state.add_edge(top_edge.src, top_edge.src_conn, - # dst, inner_edge.dst_conn, - # new_memlet) - # mtree = state.memlet_tree(new_edge) - # else: - # if inner_edge.src in inner_to_outer: - # # don't add edges twice - # continue - - # new_edge = state.add_edge(inner_edge.src, - # inner_edge.src_conn, top_edge.dst, - # top_edge.dst_conn, new_memlet) - # mtree = state.memlet_tree(new_edge) - - # # Modify all memlets going forward/backward - # def traverse(mtree_node): - # result.add(mtree_node.edge) - # mtree_node.edge._data = helpers.unsqueeze_memlet( - # mtree_node.edge.data, top_edge.data) - # for child in mtree_node.children: - # traverse(child) - - # for child in mtree.children: - # traverse(child) - - # return result - - # def _modify_reshape_data(self, reshapes: Set[str], repldict: Dict[str, str], - # new_edges: Dict[str, MultiConnectorEdge], - # nstate: SDFGState, state: SDFGState, inputs: bool): - # anodes = nstate.source_nodes() if inputs else nstate.sink_nodes() - # reshp = {repldict[r]: r for r in reshapes} - # for node in anodes: - # if not isinstance(node, nodes.AccessNode): - # continue - # if node.data not in reshp: - # continue - # edge = new_edges[reshp[node.data]] - # if inputs: - # state.add_edge(edge.src, edge.src_conn, node, None, edge.data) - # else: - # state.add_edge(node, None, edge.dst, edge.dst_conn, edge.data) From 4fbeba4155c6e317cbad54b08c2d4a75fe5e6985 Mon Sep 17 00:00:00 2001 From: Pratyai Mazumder Date: Fri, 18 Oct 2024 12:14:06 +0200 Subject: [PATCH 02/43] Some very minor improvement in one error handling 
and one warning message. (#1686)

1. Do not throw an error if `clear_instrumentation_reports()` does not have
anything to clear. (The function is useful for avoiding the accumulation of
many obsolete profile-data files over time.)
2. Put some more information into a warning message.
---
 dace/sdfg/sdfg.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py
index 77ad8b31b5..38a41236a6 100644
--- a/dace/sdfg/sdfg.py
+++ b/dace/sdfg/sdfg.py
@@ -949,7 +949,11 @@ def clear_instrumentation_reports(self):
         Clears the instrumentation report folder of this SDFG.
         """
         path = os.path.join(self.build_folder, 'perf')
-        for fname in os.listdir(path):
+        try:
+            files = os.listdir(path)
+        except FileNotFoundError:
+            return
+        for fname in files:
             if not fname.startswith('report-'):
                 continue
             os.unlink(os.path.join(path, fname))
@@ -2288,8 +2292,8 @@ def compile(self, output_file=None, validate=True) -> 'CompiledSDFG':
                 sdfg.name = f'{self.name}_{index}'
                 index += 1
             if self.name != sdfg.name:
-                warnings.warn('SDFG "%s" is already loaded by another object, '
-                              'recompiling under a different name.' % self.name)
+                warnings.warn(f"SDFG '{self.name}' is already loaded by another object, recompiling under a different "
+                              f"name '{sdfg.name}'.")

             try:
                 # Fill in scope entry/exit connectors

From 975a0657ce2904ed5c12693c3cd9debbc89b7ab0 Mon Sep 17 00:00:00 2001
From: edopao
Date: Fri, 18 Oct 2024 12:47:37 +0200
Subject: [PATCH 03/43] Extend TrivialTaskletElimination for map scope (#1650)

Extend the transformation `TrivialTaskletElimination` to the case where the
input or the output of the copy-tasklet is a map node. The following SDFG:

[image: SDFG before the transformation]

is transformed to this SDFG:

[image: SDFG after the transformation]
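For illustration, here is a minimal sketch of the newly supported pattern. It is condensed from the `test_trivial_tasklet_with_map` unit test added in this patch (the node names and the broadcast structure mirror that test; only the temporary-scalar names differ):

```python
import dace
from dace.transformation.dataflow.trivial_tasklet_elimination import TrivialTaskletElimination

N = 10
ty = dace.int32
sdfg = dace.SDFG("copy_in_map")
sdfg.add_symbol("s", ty)
sdfg.add_array("v", (N,), ty)
st = sdfg.add_state()

# A scalar is initialized outside the map and broadcast inside it.
tmp1_name, _ = sdfg.add_scalar("tmp1", ty, transient=True)
tmp1 = st.add_access(tmp1_name)
init = st.add_tasklet("init", {}, {"out"}, "out = s")
st.add_edge(init, "out", tmp1, None, dace.Memlet("tmp1[0]"))

me, mx = st.add_map("bcast", dict(i=f"0:{N}"))

# MapEntry -> copy tasklet -> AccessNode: matches the new read_map pattern.
copy_tasklet = st.add_tasklet("copy", {"inp"}, {"out"}, "out = inp")
st.add_memlet_path(tmp1, me, copy_tasklet, dst_conn="inp", memlet=dace.Memlet("tmp1[0]"))
tmp2_name, _ = sdfg.add_scalar("tmp2", ty, transient=True)
tmp2 = st.add_access(tmp2_name)
st.add_edge(copy_tasklet, "out", tmp2, None, dace.Memlet("tmp2[0]"))

# AccessNode -> copy tasklet -> MapExit: matches the new write_map pattern.
bcast_tasklet = st.add_tasklet("bcast", {"inp"}, {"out"}, "out = inp")
st.add_edge(tmp2, None, bcast_tasklet, "inp", dace.Memlet("tmp2[0]"))
st.add_memlet_path(bcast_tasklet, mx, st.add_access("v"), src_conn="out", memlet=dace.Memlet("v[i]"))
sdfg.validate()

# Both trivial copy tasklets inside the map scope are eliminated;
# only the non-trivial init tasklet remains.
assert sdfg.apply_transformations_repeated(TrivialTaskletElimination) == 2
```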
---
 .../dataflow/trivial_tasklet_elimination.py   |  48 ++++---
 .../trivial_tasklet_elimination_test.py       | 129 ++++++++++++++++++
 2 files changed, 160 insertions(+), 17 deletions(-)
 create mode 100644 tests/transformations/trivial_tasklet_elimination_test.py

diff --git a/dace/transformation/dataflow/trivial_tasklet_elimination.py b/dace/transformation/dataflow/trivial_tasklet_elimination.py
index b4c23524e2..6a84959f7d 100644
--- a/dace/transformation/dataflow/trivial_tasklet_elimination.py
+++ b/dace/transformation/dataflow/trivial_tasklet_elimination.py
@@ -17,48 +17,62 @@ class TrivialTaskletElimination(transformation.SingleStateTransformation):
     """

     read = transformation.PatternNode(nodes.AccessNode)
+    read_map = transformation.PatternNode(nodes.MapEntry)
     tasklet = transformation.PatternNode(nodes.Tasklet)
     write = transformation.PatternNode(nodes.AccessNode)
+    write_map = transformation.PatternNode(nodes.MapExit)

     @classmethod
     def expressions(cls):
-        return [sdutil.node_path_graph(cls.read, cls.tasklet, cls.write)]
+        return [
+            sdutil.node_path_graph(cls.read, cls.tasklet, cls.write),
+            sdutil.node_path_graph(cls.read_map, cls.tasklet, cls.write),
+            sdutil.node_path_graph(cls.read, cls.tasklet, cls.write_map),
+        ]

     def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
-        read = self.read
+        read = self.read_map if expr_index == 1 else self.read
         tasklet = self.tasklet
-        write = self.write
-        # Do not apply on Streams
-        if isinstance(sdfg.arrays[read.data], data.Stream):
-            return False
-        if isinstance(sdfg.arrays[write.data], data.Stream):
+        write = self.write_map if expr_index == 2 else self.write
+        if len(tasklet.in_connectors) != 1:
             return False
         if len(graph.in_edges(tasklet)) != 1:
             return False
-        if len(graph.out_edges(tasklet)) != 1:
-            return False
-        if graph.edges_between(tasklet, write)[0].data.wcr:
-            return False
-        if len(tasklet.in_connectors) != 1:
-            return False
         if len(tasklet.out_connectors) != 1:
             return False
+        if len(graph.out_edges(tasklet)) != 1:
+            return False
         in_conn = list(tasklet.in_connectors.keys())[0]
         out_conn = list(tasklet.out_connectors.keys())[0]
         if tasklet.code.as_string != f'{out_conn} = {in_conn}':
             return False
-
+        read_memlet = graph.edges_between(read, tasklet)[0].data
+        read_desc = sdfg.arrays[read_memlet.data]
+        write_memlet = graph.edges_between(tasklet, write)[0].data
+        if write_memlet.wcr:
+            return False
+        write_desc = sdfg.arrays[write_memlet.data]
+        # Do not apply on streams
+        if isinstance(read_desc, data.Stream):
+            return False
+        if isinstance(write_desc, data.Stream):
+            return False
+        # Keep the copy-tasklet connected to the map node if the source and destination
+        # nodes have different data types (implicit type cast)
+        if expr_index != 0 and read_desc.dtype != write_desc.dtype:
+            return False
+
         return True

     def apply(self, graph, sdfg):
-        read = self.read
+        read = self.read_map if self.expr_index == 1 else self.read
         tasklet = self.tasklet
-        write = self.write
+        write = self.write_map if self.expr_index == 2 else self.write
         in_edge = graph.edges_between(read, tasklet)[0]
         out_edge = graph.edges_between(tasklet, write)[0]
         graph.remove_edge(in_edge)
         graph.remove_edge(out_edge)
         out_edge.data.other_subset = in_edge.data.subset
-        graph.add_nedge(read, write, out_edge.data)
+        graph.add_edge(read, in_edge.src_conn, write, out_edge.dst_conn, out_edge.data)
         graph.remove_node(tasklet)
diff --git a/tests/transformations/trivial_tasklet_elimination_test.py b/tests/transformations/trivial_tasklet_elimination_test.py
new file mode 100644
index 0000000000..8f97b51b7e
--- /dev/null
+++ b/tests/transformations/trivial_tasklet_elimination_test.py
@@ -0,0 +1,129 @@
+# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved.
+import dace +from dace.transformation.dataflow.trivial_tasklet_elimination import TrivialTaskletElimination + + +N = 10 + + +def test_trivial_tasklet(): + ty_ = dace.int32 + sdfg = dace.SDFG("trivial_tasklet") + sdfg.add_symbol("s", ty_) + sdfg.add_array("v", (N,), ty_) + st = sdfg.add_state() + + tmp1_name, _ = sdfg.add_scalar(sdfg.temp_data_name(), ty_, transient=True) + tmp1_node = st.add_access(tmp1_name) + init_tasklet = st.add_tasklet("init", {}, {"out"}, "out = s") + st.add_edge(init_tasklet, "out", tmp1_node, None, dace.Memlet(tmp1_node.data)) + + tmp2_name, _ = sdfg.add_scalar(sdfg.temp_data_name(), ty_, transient=True) + tmp2_node = st.add_access(tmp2_name) + copy_tasklet = st.add_tasklet("copy", {"inp"}, {"out"}, "out = inp") + st.add_edge(tmp1_node, None, copy_tasklet, "inp", dace.Memlet(tmp1_node.data)) + st.add_edge(copy_tasklet, "out", tmp2_node, None, dace.Memlet(tmp2_node.data)) + + bcast_tasklet, _, _ = st.add_mapped_tasklet( + "bcast", + dict(i=f"0:{N}"), + inputs={"inp": dace.Memlet(f"{tmp2_node.data}[0]")}, + input_nodes={tmp2_node.data: tmp2_node}, + code="out = inp", + outputs={"out": dace.Memlet("v[i]")}, + external_edges=True, + ) + + sdfg.validate() + tasklet_nodes = {x for x in st.nodes() if isinstance(x, dace.nodes.Tasklet)} + assert tasklet_nodes == {init_tasklet, copy_tasklet, bcast_tasklet} + + count = sdfg.apply_transformations_repeated(TrivialTaskletElimination) + assert count == 1 + + assert len(st.out_edges(tmp1_node)) == 1 + assert st.out_edges(tmp1_node)[0].dst == tmp2_node + + tasklet_nodes = {x for x in st.nodes() if isinstance(x, dace.nodes.Tasklet)} + assert tasklet_nodes == {init_tasklet, bcast_tasklet} + + +def test_trivial_tasklet_with_map(): + ty_ = dace.int32 + sdfg = dace.SDFG("trivial_tasklet_with_map") + sdfg.add_symbol("s", ty_) + sdfg.add_array("v", (N,), ty_) + st = sdfg.add_state() + + tmp1_name, _ = sdfg.add_scalar(sdfg.temp_data_name(), ty_, transient=True) + tmp1_node = st.add_access(tmp1_name) + init_tasklet = st.add_tasklet("init", {}, {"out"}, "out = s") + st.add_edge(init_tasklet, "out", tmp1_node, None, dace.Memlet(tmp1_node.data)) + + me, mx = st.add_map("bcast", dict(i=f"0:{N}")) + + copy_tasklet = st.add_tasklet("copy", {"inp"}, {"out"}, "out = inp") + st.add_memlet_path(tmp1_node, me, copy_tasklet, dst_conn="inp", memlet=dace.Memlet(f"{tmp1_node.data}[0]")) + tmp2_name, _ = sdfg.add_scalar(sdfg.temp_data_name(), ty_, transient=True) + tmp2_node = st.add_access(tmp2_name) + st.add_edge(copy_tasklet, "out", tmp2_node, None, dace.Memlet(tmp2_node.data)) + + bcast_tasklet = st.add_tasklet("bcast", {"inp"}, {"out"}, "out = inp") + st.add_edge(tmp2_node, None, bcast_tasklet, "inp", dace.Memlet(tmp2_node.data)) + st.add_memlet_path(bcast_tasklet, mx, st.add_access("v"), src_conn="out", memlet=dace.Memlet("v[i]")) + + sdfg.validate() + tasklet_nodes = {x for x in st.nodes() if isinstance(x, dace.nodes.Tasklet)} + assert tasklet_nodes == {init_tasklet, copy_tasklet, bcast_tasklet} + + count = sdfg.apply_transformations_repeated(TrivialTaskletElimination) + assert count == 2 + + tasklet_nodes = {x for x in st.nodes() if isinstance(x, dace.nodes.Tasklet)} + assert tasklet_nodes == {init_tasklet} + + assert len(st.in_edges(tmp2_node)) == 1 + assert st.in_edges(tmp2_node)[0].src == me + + assert len(st.out_edges(tmp2_node)) == 1 + assert st.out_edges(tmp2_node)[0].dst == mx + + +def test_trivial_tasklet_with_implicit_cast(): + ty32_ = dace.int32 + ty64_ = dace.int64 + sdfg = dace.SDFG("trivial_tasklet_with_implicit_cast") + 
sdfg.add_symbol("s", ty32_)
+    sdfg.add_array("v", (N,), ty32_)
+    st = sdfg.add_state()
+
+    tmp1_name, _ = sdfg.add_scalar(sdfg.temp_data_name(), ty32_, transient=True)
+    tmp1_node = st.add_access(tmp1_name)
+    init_tasklet = st.add_tasklet("init", {}, {"out"}, "out = s")
+    st.add_edge(init_tasklet, "out", tmp1_node, None, dace.Memlet(tmp1_node.data))
+
+    me, mx = st.add_map("bcast", dict(i=f"0:{N}"))
+
+    copy_tasklet = st.add_tasklet("copy", {"inp"}, {"out"}, "out = inp")
+    st.add_memlet_path(tmp1_node, me, copy_tasklet, dst_conn="inp", memlet=dace.Memlet(f"{tmp1_node.data}[0]"))
+    tmp2_name, _ = sdfg.add_scalar(sdfg.temp_data_name(), ty64_, transient=True)
+    tmp2_node = st.add_access(tmp2_name)
+    st.add_edge(copy_tasklet, "out", tmp2_node, None, dace.Memlet(tmp2_node.data))
+
+    bcast_tasklet = st.add_tasklet("bcast", {"inp"}, {"out"}, "out = inp")
+    st.add_edge(tmp2_node, None, bcast_tasklet, "inp", dace.Memlet(tmp2_node.data))
+    st.add_memlet_path(bcast_tasklet, mx, st.add_access("v"), src_conn="out", memlet=dace.Memlet("v[i]"))
+
+    sdfg.validate()
+    tasklet_nodes = {x for x in st.nodes() if isinstance(x, dace.nodes.Tasklet)}
+    assert tasklet_nodes == {init_tasklet, copy_tasklet, bcast_tasklet}
+
+    # Not applied because of the data type mismatch between the read/write nodes
+    count = sdfg.apply_transformations_repeated(TrivialTaskletElimination)
+    assert count == 0
+
+
+if __name__ == '__main__':
+    test_trivial_tasklet()
+    test_trivial_tasklet_with_map()
+    test_trivial_tasklet_with_implicit_cast()

From 380554f709f0cffe6407dab9a9ee60655264aa9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philip=20M=C3=BCller?= <147368808+philip-paul-mueller@users.noreply.github.com>
Date: Wed, 23 Oct 2024 17:17:50 +0200
Subject: [PATCH 04/43] Fix to Read and Write Sets (#1678)

During my work on the [new map fusion](https://github.com/spcl/dace/issues/1643) I discovered a bug in `SDFGState._read_and_write_set()`. Originally I solved it there, but it was decided to move the fix into its own PR.

Let's look at the first, admittedly silly example, which is not useful on its own. The main point here is that the `data` attribute of the Memlet refers not to the source of the connection but to its destination.

![test_1](https://github.com/user-attachments/assets/740ee4fc-cfe5-4844-a999-e316cb8f9c16)

BTW: The web viewer outputs something like `B[0] -> [0, 0]`; however, the parser of the Memlet constructor does not understand this. It must be written as `B[0] -> 0, 0`, i.e., the second set of brackets must be omitted. This should be changed!

From the above we would expect the following sets:
- Reads:
	- `A`: `[Range (0, 0)]`
	- `B`: should not be listed in this set, because it is fully read and written and is thus excluded.
- Writes:
	- `B`: `[Range (0)]`
	- `C`: `[Range (0, 0), Range (1, 1)]`

However, the current implementation gives us:
- Reads: `{'A': [Range (0)], 'B': [Range (1, 1)]}`
- Writes: `{'B': [Range (0)], 'C': [Range (1, 1), Range (0)]}`

The current behaviour is wrong because:
- `A` is a `2x2` array, so the read set should also have two dimensions.
- `B` is a scalar, but its range in the read set has two dimensions; furthermore, it should not be present at all.
- `C`: the first member of the write set (`Range(1, 1)`) is correct, while the second (`Range(0)`) is horribly wrong.
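For reference, the first example can be reproduced with a few lines of the SDFG API. This sketch mirrors the `test_read_and_write_set_filter` unit test added in this patch (which, as merged, does use the bracketed `->` form), and the subsets in the comments are the ones that test asserts after the fix:

```python
import dace

sdfg = dace.SDFG('graph')
state = sdfg.add_state('state')
sdfg.add_array('A', [2, 2], dace.float64)
sdfg.add_scalar('B', dace.float64)
sdfg.add_array('C', [2, 2], dace.float64)
A, B, C = (state.add_access(name) for name in ('A', 'B', 'C'))

# In the first two edges, the memlet's `data` ('B', 'C') names the edge's
# destination; in the third edge, `data` ('B') names the source.
state.add_nedge(A, B, dace.Memlet("B[0] -> [0, 0]"))
state.add_nedge(B, C, dace.Memlet("C[1, 1] -> [0]"))
state.add_nedge(B, C, dace.Memlet("B[0] -> [0, 0]"))
sdfg.validate()

read_set, write_set = state._read_and_write_sets()
# After this fix:
#   read_set  == {'A': [Range (0, 0)], 'B': [Range (0)]}
#   write_set == {'B': [Range (0)], 'C': [Range (0, 0), Range (1, 1)]}
```

Note that `B` does remain in the read set here: the filtering of a single read that is fully covered by a write was removed rather than fixed (see the NOTE in the implementation below).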
The second example is even simpler.

![test_2](https://github.com/user-attachments/assets/da3d03af-6f10-411f-952e-ab057ed057c6)

From the SDFG we expect the following sets:
- Reads:
	- `A`: `[Range(0, 0)]`
- Writes:
	- `B`: `[Range(0)]`

It is important that in the above example `other_subset` is `None` and `data` is set to `A`, so it is not one of those "crazy" non-standard Memlets we saw in the first test. However, the current implementation gives us:
- Reads: `{'A': [Range (0, 0)]}`
- Writes: `{'B': [Range (0, 0)]}`

This clearly shows that what the implementation does is not correct.
---
 dace/sdfg/state.py                            | 103 +++++++++------
 tests/sdfg/state_test.py                      |  93 ++++++++++++-
 .../move_loop_into_map_test.py                |  64 ++++++++-
 .../transformations/prune_connectors_test.py  |  22 +---
 .../refine_nested_access_test.py              | 108 ++++++++++++++++
 5 files changed, 328 insertions(+), 62 deletions(-)

diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py
index 2ae6109b31..09e7607d65 100644
--- a/dace/sdfg/state.py
+++ b/dace/sdfg/state.py
@@ -745,51 +745,82 @@ def update_if_not_none(dic, update):

         return defined_syms

+
     def _read_and_write_sets(self) -> Tuple[Dict[AnyStr, List[Subset]], Dict[AnyStr, List[Subset]]]:
         """
         Determines what data is read and written in this subgraph, returning
         dictionaries from data containers to all subsets that are read/written.
         """
+        from dace.sdfg import utils  # Avoid cyclic import
+
+        # Ensure that the `{src,dst}_subset` fields are properly set.
+        #  TODO: Find where the problems are.
+        for edge in self.edges():
+            edge.data.try_initialize(self.sdfg, self, edge)
+
         read_set = collections.defaultdict(list)
         write_set = collections.defaultdict(list)
-        from dace.sdfg import utils  # Avoid cyclic import
-        subgraphs = utils.concurrent_subgraphs(self)
-        for sg in subgraphs:
-            rs = collections.defaultdict(list)
-            ws = collections.defaultdict(list)
-            # Traverse in topological order, so data that is written before it
-            # is read is not counted in the read set
-            for n in utils.dfs_topological_sort(sg, sources=sg.source_nodes()):
-                if isinstance(n, nd.AccessNode):
-                    in_edges = sg.in_edges(n)
-                    out_edges = sg.out_edges(n)
-                    # Filter out memlets which go out but the same data is written to the AccessNode by another memlet
-                    for out_edge in list(out_edges):
-                        for in_edge in list(in_edges):
-                            if (in_edge.data.data == out_edge.data.data
-                                    and in_edge.data.dst_subset.covers(out_edge.data.src_subset)):
-                                out_edges.remove(out_edge)
-                                break
-
-                    for e in in_edges:
-                        # skip empty memlets
-                        if e.data.is_empty():
-                            continue
-                        # Store all subsets that have been written
-                        ws[n.data].append(e.data.subset)
-                    for e in out_edges:
-                        # skip empty memlets
-                        if e.data.is_empty():
-                            continue
-                        rs[n.data].append(e.data.subset)
-            # Union all subgraphs, so an array that was excluded from the read
-            # set because it was written first is still included if it is read
-            # in another subgraph
-            for data, accesses in rs.items():
+
+        # NOTE: In a previous version, a _single_ read (i.e., a leaving Memlet) that was
+        #   fully covered by a single write (i.e., an incoming Memlet) was removed from
+        #   the read set and only the write survived. However, this was never fully nor
+        #   correctly implemented and caused problems.
+        #   So this filtering was removed.

+        for subgraph in utils.concurrent_subgraphs(self):
+            subgraph_read_set = collections.defaultdict(list)  # Read and write sets of this subgraph.
+            subgraph_write_set = collections.defaultdict(list)
+            for n in utils.dfs_topological_sort(subgraph, sources=subgraph.source_nodes()):
+                if not isinstance(n, nd.AccessNode):
+                    # Reads and writes can only be done through access nodes,
+                    #  so ignore every other node.
+                    continue
+
+                # Get a list of all incoming (writes) and outgoing (reads) edges of the
+                #  access node; ignore all empty memlets, as they do not carry any data.
+                in_edges = [in_edge for in_edge in subgraph.in_edges(n) if not in_edge.data.is_empty()]
+                out_edges = [out_edge for out_edge in subgraph.out_edges(n) if not out_edge.data.is_empty()]
+
+                # Extract the subsets that describe where we read and write the data,
+                #  and store them for the later filtering.
+                # NOTE: In certain cases the corresponding subset might be None; in this case
+                #   we assume that the whole array is written, which is the default behaviour.
+                ac_desc = n.desc(self.sdfg)
+                ac_size = ac_desc.total_size
+                in_subsets = dict()
+                for in_edge in in_edges:
+                    # Ensure that, if the destination subset is not given, our assumption that
+                    #  the whole array is written is valid, by testing whether the memlet
+                    #  transfers the whole array.
+                    assert (in_edge.data.dst_subset is not None) or (in_edge.data.num_elements() == ac_size)
+                    in_subsets[in_edge] = (
+                        sbs.Range.from_array(ac_desc)
+                        if in_edge.data.dst_subset is None
+                        else in_edge.data.dst_subset
+                    )
+                out_subsets = dict()
+                for out_edge in out_edges:
+                    assert (out_edge.data.src_subset is not None) or (out_edge.data.num_elements() == ac_size)
+                    out_subsets[out_edge] = (
+                        sbs.Range.from_array(ac_desc)
+                        if out_edge.data.src_subset is None
+                        else out_edge.data.src_subset
+                    )
+
+                # Update the read and write sets of the subgraph.
+                if in_edges:
+                    subgraph_write_set[n.data].extend(in_subsets.values())
+                if out_edges:
+                    subgraph_read_set[n.data].extend(out_subsets[out_edge] for out_edge in out_edges)
+
+            # Add the subgraph's read and write sets to the final ones.
+            for data, accesses in subgraph_read_set.items():
                 read_set[data] += accesses
-            for data, accesses in ws.items():
+            for data, accesses in subgraph_write_set.items():
                 write_set[data] += accesses
-        return read_set, write_set
+
+        return copy.deepcopy((read_set, write_set))
+

     def read_and_write_sets(self) -> Tuple[Set[AnyStr], Set[AnyStr]]:
         """
diff --git a/tests/sdfg/state_test.py b/tests/sdfg/state_test.py
index 7ba43ac4c0..4bde3788e0 100644
--- a/tests/sdfg/state_test.py
+++ b/tests/sdfg/state_test.py
@@ -1,5 +1,6 @@
 # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
import dace +from dace import subsets as sbs from dace.transformation.helpers import find_sdfg_control_flow @@ -19,7 +20,9 @@ def test_read_write_set(): state.add_memlet_path(rw_b, task2, dst_conn='B', memlet=dace.Memlet('B[2]')) state.add_memlet_path(task2, write_c, src_conn='C', memlet=dace.Memlet('C[2]')) - assert 'B' not in state.read_and_write_sets()[0] + read_set, write_set = state.read_and_write_sets() + assert {'B', 'A'} == read_set + assert {'C', 'B'} == write_set def test_read_write_set_y_formation(): @@ -41,7 +44,10 @@ def test_read_write_set_y_formation(): state.add_memlet_path(rw_b, task2, dst_conn='B', memlet=dace.Memlet(data='B', subset='0')) state.add_memlet_path(task2, write_c, src_conn='C', memlet=dace.Memlet(data='C', subset='0')) - assert 'B' not in state.read_and_write_sets()[0] + read_set, write_set = state.read_and_write_sets() + assert {'B', 'A'} == read_set + assert {'C', 'B'} == write_set + def test_deepcopy_state(): N = dace.symbol('N') @@ -58,6 +64,87 @@ def double_loop(arr: dace.float32[N]): sdfg.validate() +def test_read_and_write_set_filter(): + sdfg = dace.SDFG('graph') + state = sdfg.add_state('state') + sdfg.add_array('A', [2, 2], dace.float64) + sdfg.add_scalar('B', dace.float64) + sdfg.add_array('C', [2, 2], dace.float64) + A, B, C = (state.add_access(name) for name in ('A', 'B', 'C')) + + state.add_nedge( + A, + B, + dace.Memlet("B[0] -> [0, 0]"), + ) + state.add_nedge( + B, + C, + dace.Memlet("C[1, 1] -> [0]"), + ) + state.add_nedge( + B, + C, + dace.Memlet("B[0] -> [0, 0]"), + ) + sdfg.validate() + + expected_reads = { + "A": [sbs.Range.from_string("0, 0")], + "B": [sbs.Range.from_string("0")], + } + expected_writes = { + "B": [sbs.Range.from_string("0")], + "C": [sbs.Range.from_string("0, 0"), sbs.Range.from_string("1, 1")], + } + read_set, write_set = state._read_and_write_sets() + + for expected_sets, computed_sets in [(expected_reads, read_set), (expected_writes, write_set)]: + assert expected_sets.keys() == computed_sets.keys(), f"Expected the set to contain '{expected_sets.keys()}' but got '{computed_sets.keys()}'." + for access_data in expected_sets.keys(): + for exp in expected_sets[access_data]: + found_match = False + for res in computed_sets[access_data]: + if res == exp: + found_match = True + break + assert found_match, f"Could not find the subset '{exp}' only got '{computed_sets}'" + + +def test_read_and_write_set_selection(): + sdfg = dace.SDFG('graph') + state = sdfg.add_state('state') + sdfg.add_array('A', [2, 2], dace.float64) + sdfg.add_scalar('B', dace.float64) + A, B = (state.add_access(name) for name in ('A', 'B')) + + state.add_nedge( + A, + B, + dace.Memlet("A[0, 0]"), + ) + sdfg.validate() + + expected_reads = { + "A": [sbs.Range.from_string("0, 0")], + } + expected_writes = { + "B": [sbs.Range.from_string("0")], + } + read_set, write_set = state._read_and_write_sets() + + for expected_sets, computed_sets in [(expected_reads, read_set), (expected_writes, write_set)]: + assert expected_sets.keys() == computed_sets.keys(), f"Expected the set to contain '{expected_sets.keys()}' but got '{computed_sets.keys()}'." 
for access_data in expected_sets.keys():
+            for exp in expected_sets[access_data]:
+                found_match = False
+                for res in computed_sets[access_data]:
+                    if res == exp:
+                        found_match = True
+                        break
+                assert found_match, f"Could not find the subset '{exp}' only got '{computed_sets}'"
+
+
 def test_add_mapped_tasklet():
     sdfg = dace.SDFG("test_add_mapped_tasklet")
     state = sdfg.add_state(is_start_block=True)
@@ -82,6 +169,8 @@ def test_add_mapped_tasklet():


 if __name__ == '__main__':
+    test_read_and_write_set_selection()
+    test_read_and_write_set_filter()
     test_read_write_set()
     test_read_write_set_y_formation()
     test_deepcopy_state()
diff --git a/tests/transformations/move_loop_into_map_test.py b/tests/transformations/move_loop_into_map_test.py
index dca775bb7a..ad51941cb0 100644
--- a/tests/transformations/move_loop_into_map_test.py
+++ b/tests/transformations/move_loop_into_map_test.py
@@ -2,6 +2,7 @@
 import dace
 from dace.transformation.interstate import MoveLoopIntoMap
 import unittest
+import copy
 import numpy as np

 I = dace.symbol("I")
@@ -147,7 +148,12 @@ def test_apply_multiple_times_1(self):
         self.assertTrue(np.allclose(val, ref))

     def test_more_than_a_map(self):
-        """ `out` is read and written indirectly by the MapExit, potentially leading to a RW dependency. """
+        """
+        `out` is read and written indirectly by the MapExit, potentially leading to a RW dependency.
+
+        Note that there is actually no dependency; however, because the transformation relies
+        on `SDFGState.read_and_write_sets()`, it cannot detect this and thus cannot be applied.
+        """
         sdfg = dace.SDFG('more_than_a_map')
         _, aarr = sdfg.add_array('A', (3, 3), dace.float64)
         _, barr = sdfg.add_array('B', (3, 3), dace.float64)
         _, oarr = sdfg.add_array('out', (3, 3), dace.float64)
         _, tarr = sdfg.add_array('tmp', (3, 3), dace.float64, transient=True)
         body = sdfg.add_state('map_state')
         aread = body.add_access('A')
         oread = body.add_access('out')
         bread = body.add_access('B')
         twrite = body.add_access('tmp')
         owrite = body.add_access('out')
         body.add_mapped_tasklet('op',
                                 dict(i='0:3', j='0:3'),
                                 dict(__in1=dace.Memlet('out[i, j]'), __in2=dace.Memlet('B[i, j]')),
                                 '__out = __in1 - __in2',
                                 dict(__out=dace.Memlet('tmp[i, j]')),
                                 external_edges=True,
                                 input_nodes=dict(out=oread, B=bread),
                                 output_nodes=dict(tmp=twrite))
-        body.add_nedge(aread, oread, dace.Memlet.from_array('A', aarr))
+        body.add_nedge(aread, oread, dace.Memlet.from_array('A', oarr))
         body.add_nedge(twrite, owrite, dace.Memlet.from_array('out', oarr))
         sdfg.add_loop(None, body, None, '_', '0', '_ < 10', '_ + 1')
-        count = sdfg.apply_transformations(MoveLoopIntoMap)
-        self.assertFalse(count > 0)
+
+        count = sdfg.apply_transformations(MoveLoopIntoMap, validate_all=True, validate=True)
+        self.assertTrue(count == 0)

     def test_more_than_a_map_1(self):
         """
@@ -269,6 +276,55 @@ def test_more_than_a_map_3(self):
         count = sdfg.apply_transformations(MoveLoopIntoMap)
         self.assertFalse(count > 0)

+    def test_more_than_a_map_4(self):
+        """
+        This test is very similar to `test_more_than_a_map()`, but one memlet is different,
+        leading to a RW dependency that blocks the transformation.
+ """ + sdfg = dace.SDFG('more_than_a_map') + _, aarr = sdfg.add_array('A', (3, 3), dace.float64) + _, barr = sdfg.add_array('B', (3, 3), dace.float64) + _, oarr = sdfg.add_array('out', (3, 3), dace.float64) + _, tarr = sdfg.add_array('tmp', (3, 3), dace.float64, transient=True) + body = sdfg.add_state('map_state') + aread = body.add_access('A') + oread = body.add_access('out') + bread = body.add_access('B') + twrite = body.add_access('tmp') + owrite = body.add_access('out') + body.add_mapped_tasklet('op', + dict(i='0:3', j='0:3'), + dict(__in1=dace.Memlet('out[i, j]'), __in2=dace.Memlet('B[i, j]')), + '__out = __in1 - __in2', + dict(__out=dace.Memlet('tmp[i, j]')), + external_edges=True, + input_nodes=dict(out=oread, B=bread), + output_nodes=dict(tmp=twrite)) + body.add_nedge(aread, oread, dace.Memlet('A[Mod(_, 3), 0:3] -> [Mod(_ + 1, 3), 0:3]', aarr)) + body.add_nedge(twrite, owrite, dace.Memlet.from_array('out', oarr)) + sdfg.add_loop(None, body, None, '_', '0', '_ < 10', '_ + 1') + + sdfg_args_ref = { + "A": np.array(np.random.rand(3, 3), dtype=np.float64), + "B": np.array(np.random.rand(3, 3), dtype=np.float64), + "out": np.array(np.random.rand(3, 3), dtype=np.float64), + } + sdfg_args_res = copy.deepcopy(sdfg_args_ref) + + # Perform the reference execution + sdfg(**sdfg_args_ref) + + # Apply the transformation and execute the SDFG again. + count = sdfg.apply_transformations(MoveLoopIntoMap, validate_all=True, validate=True) + sdfg(**sdfg_args_res) + + for name in sdfg_args_ref.keys(): + self.assertTrue( + np.allclose(sdfg_args_ref[name], sdfg_args_res[name]), + f"Miss match for {name}", + ) + self.assertFalse(count > 0) + if __name__ == '__main__': unittest.main() diff --git a/tests/transformations/prune_connectors_test.py b/tests/transformations/prune_connectors_test.py index 63bbe5843f..b7b287d77e 100644 --- a/tests/transformations/prune_connectors_test.py +++ b/tests/transformations/prune_connectors_test.py @@ -153,7 +153,6 @@ def _make_read_write_sdfg( Depending on `conforming_memlet` the memlet that copies `inner_A` into `inner_B` will either be associated to `inner_A` (`True`) or `inner_B` (`False`). - This choice has consequences on if the transformation can apply or not. Notes: This is most likely a bug, see [issue#1643](https://github.com/spcl/dace/issues/1643), @@ -332,16 +331,6 @@ def test_unused_retval_2(): assert np.allclose(a, 1) -def test_read_write_1(): - # Because the memlet is conforming, we can apply the transformation. - sdfg = _make_read_write_sdfg(True) - - assert first_mode == PruneConnectors.can_be_applied_to(nsdfg=nsdfg, sdfg=osdfg, expr_index=0, permissive=False) - - - - - def test_prune_connectors_with_dependencies(): sdfg = dace.SDFG('tester') A, A_desc = sdfg.add_array('A', [4], dace.float64) @@ -420,18 +409,11 @@ def test_prune_connectors_with_dependencies(): assert np.allclose(np_d, np_d_) -def test_read_write_1(): - # Because the memlet is conforming, we can apply the transformation. +def test_read_write(): sdfg, nsdfg = _make_read_write_sdfg(True) + assert not PruneConnectors.can_be_applied_to(nsdfg=nsdfg, sdfg=sdfg, expr_index=0, permissive=False) - assert PruneConnectors.can_be_applied_to(nsdfg=nsdfg, sdfg=sdfg, expr_index=0, permissive=False) - sdfg.apply_transformations_repeated(PruneConnectors, validate=True, validate_all=True) - - -def test_read_write_2(): - # Because the memlet is not conforming, we can not apply the transformation. 
sdfg, nsdfg = _make_read_write_sdfg(False)
-
     assert not PruneConnectors.can_be_applied_to(nsdfg=nsdfg, sdfg=sdfg, expr_index=0, permissive=False)
diff --git a/tests/transformations/refine_nested_access_test.py b/tests/transformations/refine_nested_access_test.py
index d9fb9a7392..81640665ed 100644
--- a/tests/transformations/refine_nested_access_test.py
+++ b/tests/transformations/refine_nested_access_test.py
@@ -156,7 +156,115 @@ def inner_sdfg(A: dace.int32[5], B: dace.int32[5, 5], idx_a: int, idx_b: int):
     assert np.allclose(ref, val)


+def _make_rna_read_and_write_set_sdfg(diff_in_out: bool) -> dace.SDFG:
+    """Generates the SDFG for the `test_rna_read_and_write_sets_*()` tests.
+
+    If `diff_in_out` is `False`, then the output is also used as temporary storage
+    within the nested SDFG. Because of the definition of the read/write sets,
+    this usage of the temporary storage is not picked up, and it is only considered
+    a write.
+
+    If `diff_in_out` is `True`, then a different storage container, which is classified
+    as an output, is used as temporary storage.
+
+    This test was added during [PR#1678](https://github.com/spcl/dace/pull/1678).
+    """
+
+    def _make_nested_sdfg(diff_in_out: bool) -> dace.SDFG:
+        sdfg = dace.SDFG("inner_sdfg")
+        state = sdfg.add_state(is_start_block=True)
+        sdfg.add_array("A", dtype=dace.float64, shape=(2,), transient=False)
+        sdfg.add_array("T1", dtype=dace.float64, shape=(2,), transient=False)
+
+        A = state.add_access("A")
+        T1_output = state.add_access("T1")
+        if diff_in_out:
+            sdfg.add_array("T2", dtype=dace.float64, shape=(2,), transient=False)
+            T1_input = state.add_access("T2")
+        else:
+            T1_input = state.add_access("T1")
+
+        tsklt = state.add_tasklet(
+            "comp",
+            inputs={"__in1": None, "__in2": None},
+            outputs={"__out": None},
+            code="__out = __in1 + __in2",
+        )
+
+        state.add_edge(A, None, tsklt, "__in1", dace.Memlet("A[1]"))
+        # An alternative would be to write to a different location here.
+        #  Then, the data would be added to the access node.
+        state.add_edge(A, None, T1_input, None, dace.Memlet("A[0] -> [0]"))
+        state.add_edge(T1_input, None, tsklt, "__in2", dace.Memlet(T1_input.data + "[0]"))
+        state.add_edge(tsklt, "__out", T1_output, None, dace.Memlet(T1_output.data + "[1]"))
+        return sdfg
+
+    sdfg = dace.SDFG("Parent_SDFG")
+    state = sdfg.add_state(is_start_block=True)
+
+    sdfg.add_array("A", dtype=dace.float64, shape=(2,), transient=False)
+    sdfg.add_array("T1", dtype=dace.float64, shape=(2,), transient=False)
+    sdfg.add_array("T2", dtype=dace.float64, shape=(2,), transient=False)
+    A = state.add_access("A")
+    T1 = state.add_access("T1")
+
+    nested_sdfg = _make_nested_sdfg(diff_in_out)
+
+    nsdfg = state.add_nested_sdfg(
+        nested_sdfg,
+        parent=sdfg,
+        inputs={"A"},
+        outputs={"T2", "T1"} if diff_in_out else {"T1"},
+        symbol_mapping={},
+    )
+
+    state.add_edge(A, None, nsdfg, "A", dace.Memlet("A[0:2]"))
+    state.add_edge(nsdfg, "T1", T1, None, dace.Memlet("T1[0:2]"))
+
+    if diff_in_out:
+        state.add_edge(nsdfg, "T2", state.add_access("T2"), None, dace.Memlet("T2[0:2]"))
+    sdfg.validate()
+    return sdfg
+
+
+def test_rna_read_and_write_sets_double_use():
+    # The transformation does not apply because we access element `0` of both arrays that we
+    #  pass inside the nested SDFG.
+    sdfg = _make_rna_read_and_write_set_sdfg(False)
+    nb_applied = sdfg.apply_transformations_repeated(
+        [RefineNestedAccess],
+        validate=True,
+        validate_all=True,
+    )
+    assert nb_applied == 0
+
+
+def test_rna_read_and_write_sets_different_storage():
+
+    # A dedicated temporary storage is used.
+    sdfg = _make_rna_read_and_write_set_sdfg(True)
+
+    nb_applied = sdfg.apply_transformations_repeated(
+        [RefineNestedAccess],
+        validate=True,
+        validate_all=True,
+    )
+    assert nb_applied > 0
+
+    args = {
+        "A": np.array(np.random.rand(2), dtype=np.float64, copy=True),
+        "T2": np.array(np.random.rand(2), dtype=np.float64, copy=True),
+        "T1": np.zeros(2, dtype=np.float64),
+    }
+    ref = args["A"][0] + args["A"][1]
+    sdfg(**args)
+    res = args["T1"][1]
+    assert np.allclose(res, ref), f"Expected '{ref}' but got '{res}'."
+
+
 if __name__ == '__main__':
     test_refine_dataflow()
     test_refine_interstate()
     test_free_symbols_only_by_indices()
+    test_rna_read_and_write_sets_different_storage()
+    test_rna_read_and_write_sets_double_use()

From 0217f26ff89ea86944a83539c7c47568bd7463c2 Mon Sep 17 00:00:00 2001
From: Pratyai Mazumder
Date: Thu, 24 Oct 2024 07:11:37 +0200
Subject: [PATCH 05/43] Make `is_empty()` and `propagate_subset()` not rely
 unnecessarily on `src` and `dst` (#1699)

---
 dace/memlet.py           | 20 +++++++++-----------
 dace/sdfg/propagation.py | 13 +++++++++----
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/dace/memlet.py b/dace/memlet.py
index 1e39b4179d..f78da3a6b7 100644
--- a/dace/memlet.py
+++ b/dace/memlet.py
@@ -230,7 +230,7 @@ def is_empty(self) -> bool:
             primarily used for connecting nodes to scopes without transferring
             data to them.
         """
-        return (self.data is None and self.src_subset is None and self.dst_subset is None)
+        return (self.data is None and self.subset is None and self.other_subset is None)

     @property
     def num_accesses(self):
@@ -561,20 +561,18 @@ def used_symbols(self, all_symbols: bool, edge=None) -> Set[str]:
                     view_edge = True

         if not view_edge:
-            if self.src_subset:
-                result |= self.src_subset.free_symbols
-
-            if self.dst_subset:
-                result |= self.dst_subset.free_symbols
+            if self.subset:
+                result |= self.subset.free_symbols
+            if self.other_subset:
+                result |= self.other_subset.free_symbols
         else:
             # View edges do not require the end of the range nor strides
-            if self.src_subset:
-                for rb, _, _ in self.src_subset.ndrange():
+            if self.subset:
+                for rb, _, _ in self.subset.ndrange():
                     if symbolic.issymbolic(rb):
                         result |= set(map(str, rb.free_symbols))
-
-            if self.dst_subset:
-                for rb, _, _ in self.dst_subset.ndrange():
+            if self.other_subset:
+                for rb, _, _ in self.other_subset.ndrange():
                     if symbolic.issymbolic(rb):
                         result |= set(map(str, rb.free_symbols))

diff --git a/dace/sdfg/propagation.py b/dace/sdfg/propagation.py
index f62bb6eb58..a24db0c72b 100644
--- a/dace/sdfg/propagation.py
+++ b/dace/sdfg/propagation.py
@@ -1430,10 +1430,15 @@ def propagate_subset(memlets: List[Memlet],

         tmp_subset = None

         subset = None
-        if use_dst and md.dst_subset is not None:
-            subset = md.dst_subset
-        elif not use_dst and md.src_subset is not None:
-            subset = md.src_subset
+        src, dst = md.subset, md.other_subset
+        if md._is_data_src is not None:
+            # Ideally, this should always be the case. In practice, it is not always so; if the
+            # memlet is uninitialized for some reason, we just explicitly fall back to `subset`
+            # and `other_subset` to retain the prior behaviour.
+ src, dst = md.src_subset, md.dst_subset + if use_dst and dst is not None: + subset = dst + elif not use_dst and src is not None: + subset = src else: subset = md.subset From 4f5655390e17010d93bc64f35bacef762e4aef98 Mon Sep 17 00:00:00 2001 From: iBug Date: Thu, 24 Oct 2024 22:46:53 +0800 Subject: [PATCH 06/43] fix(codegen/prettycode): Use base_indentation as intended (#1697) --- dace/codegen/prettycode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/codegen/prettycode.py b/dace/codegen/prettycode.py index de143f5e86..0fc4ebe3f1 100644 --- a/dace/codegen/prettycode.py +++ b/dace/codegen/prettycode.py @@ -14,7 +14,7 @@ class CodeIOStream(StringIO): nodes. """ def __init__(self, base_indentation=0): super(CodeIOStream, self).__init__() - self._indent = 0 + self._indent = base_indentation self._spaces = int(Config.get('compiler', 'indentation_spaces')) self._lineinfo = Config.get_bool('compiler', 'codegen_lineinfo') From 2bf537a2c8e9764baeaf6fd0b978bbe0486dbfc3 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 24 Oct 2024 08:17:24 -0700 Subject: [PATCH 07/43] Rename `master` branch to `main` --- .github/workflows/fpga-ci.yml | 6 +++--- .github/workflows/general-ci.yml | 6 +++--- .github/workflows/gpu-ci.yml | 6 +++--- .github/workflows/heterogeneous-ci.yml | 6 +++--- .github/workflows/pyFV3-ci.yml | 6 +++--- CONTRIBUTING.md | 2 +- README.md | 18 +++++++++--------- dace/frontend/python/README.md | 2 +- doc/codegen/codegen.rst | 6 +++--- doc/extensions/extensions.rst | 8 ++++---- doc/frontend/daceprograms.rst | 4 ++-- doc/ide/cli.rst | 2 +- doc/optimization/gpu.rst | 4 ++-- doc/optimization/optimization.rst | 6 +++--- doc/optimization/profiling.rst | 4 ++-- doc/optimization/vscode.rst | 2 +- doc/sdfg/ir.rst | 2 +- doc/sdfg/transformations.rst | 2 +- doc/setup/integration.rst | 2 +- doc/setup/quickstart.rst | 4 ++-- tutorials/benchmarking.ipynb | 2 +- 21 files changed, 50 insertions(+), 50 deletions(-) diff --git a/.github/workflows/fpga-ci.yml b/.github/workflows/fpga-ci.yml index d03d044b30..29be0ec1f1 100644 --- a/.github/workflows/fpga-ci.yml +++ b/.github/workflows/fpga-ci.yml @@ -2,11 +2,11 @@ name: FPGA Tests on: push: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] pull_request: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] merge_group: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] jobs: test-fpga: diff --git a/.github/workflows/general-ci.yml b/.github/workflows/general-ci.yml index f7b44e6978..2dcffc6484 100644 --- a/.github/workflows/general-ci.yml +++ b/.github/workflows/general-ci.yml @@ -2,11 +2,11 @@ name: General Tests on: push: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] pull_request: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] merge_group: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] jobs: test: diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index ce7f9b628e..2a1ccb43ef 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -2,11 +2,11 @@ name: GPU Tests on: push: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] pull_request: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] merge_group: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] env: CUDACXX: /usr/local/cuda/bin/nvcc diff --git a/.github/workflows/heterogeneous-ci.yml b/.github/workflows/heterogeneous-ci.yml index 7c65e90718..5f7dbff77e 100644 --- a/.github/workflows/heterogeneous-ci.yml +++ 
b/.github/workflows/heterogeneous-ci.yml @@ -2,11 +2,11 @@ name: Heterogeneous Tests on: push: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] pull_request: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] merge_group: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] env: CUDA_HOME: /usr/local/cuda diff --git a/.github/workflows/pyFV3-ci.yml b/.github/workflows/pyFV3-ci.yml index 2b98327381..f58fdf85ac 100644 --- a/.github/workflows/pyFV3-ci.yml +++ b/.github/workflows/pyFV3-ci.yml @@ -2,11 +2,11 @@ name: NASA/NOAA pyFV3 repository build test on: push: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] pull_request: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] merge_group: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] defaults: run: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6bf69495b1..313b3f0f21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -47,7 +47,7 @@ For automatic styling, we use the [yapf](https://github.com/google/yapf) file fo We use [pytest](https://www.pytest.org/) for our testing infrastructure. All tests under the `tests/` folder (and any subfolders within) are automatically read and run. The files must be under the right subfolder based on the component being tested (e.g., `tests/sdfg/` for IR-related tests), and must have the right -suffix: either `*_test.py` or `*_cudatest.py`. See [pytest.ini](https://github.com/spcl/dace/blob/master/pytest.ini) +suffix: either `*_test.py` or `*_cudatest.py`. See [pytest.ini](https://github.com/spcl/dace/blob/main/pytest.ini) for more information, and for the markers we use to specify software/hardware requirements. The structure of the test file must follow `pytest` standards (i.e., free functions called `test_*`), and diff --git a/README.md b/README.md index 41b059c953..ef4bdec1db 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![FPGA Tests](https://github.com/spcl/dace/actions/workflows/fpga-ci.yml/badge.svg)](https://github.com/spcl/dace/actions/workflows/fpga-ci.yml) [![Documentation Status](https://readthedocs.org/projects/spcldace/badge/?version=latest)](https://spcldace.readthedocs.io/en/latest/?badge=latest) [![PyPI version](https://badge.fury.io/py/dace.svg)](https://badge.fury.io/py/dace) -[![codecov](https://codecov.io/gh/spcl/dace/branch/master/graph/badge.svg)](https://codecov.io/gh/spcl/dace) +[![codecov](https://codecov.io/gh/spcl/dace/branch/main/graph/badge.svg)](https://codecov.io/gh/spcl/dace) ![D](dace.svg)aCe - Data-Centric Parallel Programming @@ -11,7 +11,7 @@ _Decoupling domain science from performance optimization._ -DaCe is a [fast](https://nbviewer.org/github/spcl/dace/blob/master/tutorials/benchmarking.ipynb) parallel programming +DaCe is a [fast](https://nbviewer.org/github/spcl/dace/blob/main/tutorials/benchmarking.ipynb) parallel programming framework that takes code in Python/NumPy and other programming languages, and maps it to high-performance **CPU, GPU, and FPGA** programs, which can be optimized to achieve state-of-the-art. Internally, DaCe uses the Stateful DataFlow multiGraph (SDFG) *data-centric intermediate @@ -61,13 +61,13 @@ be used in any C ABI compatible language (C/C++, FORTRAN, etc.). 
For more information on how to use DaCe, see the [samples](samples) or tutorials below: -* [Getting Started](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/getting_started.ipynb) -* [Benchmarks, Instrumentation, and Performance Comparison with Other Python Compilers](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/benchmarking.ipynb) -* [Explicit Dataflow in Python](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/explicit.ipynb) -* [NumPy API Reference](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/numpy_frontend.ipynb) -* [SDFG API](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/sdfg_api.ipynb) -* [Using and Creating Transformations](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/transformations.ipynb) -* [Extending the Code Generator](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/codegen.ipynb) +* [Getting Started](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/getting_started.ipynb) +* [Benchmarks, Instrumentation, and Performance Comparison with Other Python Compilers](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/benchmarking.ipynb) +* [Explicit Dataflow in Python](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/explicit.ipynb) +* [NumPy API Reference](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/numpy_frontend.ipynb) +* [SDFG API](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/sdfg_api.ipynb) +* [Using and Creating Transformations](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/transformations.ipynb) +* [Extending the Code Generator](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/codegen.ipynb) Publication ----------- diff --git a/dace/frontend/python/README.md b/dace/frontend/python/README.md index bd57e36519..aa176f687c 100644 --- a/dace/frontend/python/README.md +++ b/dace/frontend/python/README.md @@ -4,7 +4,7 @@ The Python-Frontend aims to assist users in creating SDFGs from Python code relatively quickly. You may read a list of supported Python features [here](python_supported_features.md). The frontend supports also operations among DaCe arrays, in a manner similar to NumPy. A short tutorial can be bound -[here](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/numpy_frontend.ipynb). +[here](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/numpy_frontend.ipynb). Please note that the Python-Frontend is still in an early version. For any issues and feature requests, you can create an issue in the main DaCe project. You can also address any questions you have to alziogas@inf.ethz.ch diff --git a/doc/codegen/codegen.rst b/doc/codegen/codegen.rst index a000022ee6..f3058c1440 100644 --- a/doc/codegen/codegen.rst +++ b/doc/codegen/codegen.rst @@ -32,8 +32,8 @@ There are many features that are enabled by generating code from SDFGs: .. note:: - You can also extend the code generator with new backends externally, see the `Customizing Code Generation tutorial `_ - and the `Tensor Core sample `_ for more information. + You can also extend the code generator with new backends externally, see the `Customizing Code Generation tutorial `_ + and the `Tensor Core sample `_ for more information. After the code is generated, ``compiler.py`` will invoke CMake on the build folder (e.g., ``.dacecache//build``) @@ -145,7 +145,7 @@ necessary headers. 
The runtime is used for: match Python interfaces. This is especially useful to generate matching code when calling functions such as ``range`` inside Tasklets. -The folder also contains other files and helper functions, refer to its contents `on GitHub `_ +The folder also contains other files and helper functions, refer to its contents `on GitHub `_ for more information. diff --git a/doc/extensions/extensions.rst b/doc/extensions/extensions.rst index 4644bef109..3f73a924bc 100644 --- a/doc/extensions/extensions.rst +++ b/doc/extensions/extensions.rst @@ -17,10 +17,10 @@ The three key mechanisms of extensibility are class inheritance, :ref:`replaceme For more examples of how to extend DaCe, see the following resources: - * Library nodes: `Einsum specialization library node `_ - * Transformations: `Using and Creating Transformations `_ - * Code generators: `Extending the Code Generator `_ - * Frontend extensions (enumerations and replacements): `Tensor Core code sample `_ + * Library nodes: `Einsum specialization library node `_ + * Transformations: `Using and Creating Transformations `_ + * Code generators: `Extending the Code Generator `_ + * Frontend extensions (enumerations and replacements): `Tensor Core code sample `_ .. .. toctree .. :maxdepth: 1 diff --git a/doc/frontend/daceprograms.rst b/doc/frontend/daceprograms.rst index c21ac34722..4229fe422d 100644 --- a/doc/frontend/daceprograms.rst +++ b/doc/frontend/daceprograms.rst @@ -9,7 +9,7 @@ This includes standard Python code (loops, functions, context managers, etc.), b and (most) functions. .. note:: - For more examples, see the `Getting Started `_ + For more examples, see the `Getting Started `_ Jupyter Notebook tutorial. Usage @@ -349,7 +349,7 @@ Explicit Dataflow Mode The DaCe Python frontend allows users to write SDFG tasklets and memlets directly in Python code. -For more example uses, see the `Explicit Dataflow `_ +For more example uses, see the `Explicit Dataflow `_ tutorial. Memlets diff --git a/doc/ide/cli.rst b/doc/ide/cli.rst index d73d32fdfc..1f63397841 100644 --- a/doc/ide/cli.rst +++ b/doc/ide/cli.rst @@ -123,4 +123,4 @@ nothing is given, the tool will time the entire execution of each program using +---------------------------+--------------+-----------------------------------------------------------+ For a more detailed guide on how to profile SDFGs and work with the resulting data, see :ref:`profiling` and -`this tutorial `_. +`this tutorial `_. diff --git a/doc/optimization/gpu.rst b/doc/optimization/gpu.rst index a08877de3b..f94d377b51 100644 --- a/doc/optimization/gpu.rst +++ b/doc/optimization/gpu.rst @@ -170,7 +170,7 @@ Optimizing GPU SDFGs When optimizing GPU SDFGs, there are a few things to keep in mind. Below is a non-exhaustive list of common GPU optimization practices and how DaCe helps achieve them. To see some of these optimizations in action, check out the ``optimize_for_gpu`` -function in the `Matrix Multiplication optimization example `_. +function in the `Matrix Multiplication optimization example `_. * **Minimize host<->GPU transfers**: It is important to keep as much data as possible on the GPU across the application. This is especially true for data that is accessed frequently, such as data that is used in a loop. @@ -234,7 +234,7 @@ function in the `Matrix Multiplication optimization example `_ + in your code. See the `Tensor Core code sample `_ to see how to make use of such units. 
* **Advanced GPU Map schedules**: DaCe provides two additional built-in map schedules: :class:`~dace.dtypes.ScheduleType.GPU_ThreadBlock_Dynamic` diff --git a/doc/optimization/optimization.rst b/doc/optimization/optimization.rst index f1eb84005b..592ab5e6fc 100644 --- a/doc/optimization/optimization.rst +++ b/doc/optimization/optimization.rst @@ -36,9 +36,9 @@ tunes the data layout of arrays. The following resources are available to help you optimize your SDFG: - * Using transformations: `Using and Creating Transformations `_ - * Creating optimized schedules that can match optimized libraries: `Matrix multiplication CPU and GPU optimization example `_ - * Auto-tuning and instrumentation: `Tuning data layouts sample `_ + * Using transformations: `Using and Creating Transformations `_ + * Creating optimized schedules that can match optimized libraries: `Matrix multiplication CPU and GPU optimization example `_ + * Auto-tuning and instrumentation: `Tuning data layouts sample `_ The following subsections provide more information on the different types of optimization methods: diff --git a/doc/optimization/profiling.rst b/doc/optimization/profiling.rst index 497dc81ae8..617b3a9cb9 100644 --- a/doc/optimization/profiling.rst +++ b/doc/optimization/profiling.rst @@ -5,7 +5,7 @@ Profiling and Instrumentation .. note:: - For more information and examples, see the `Benchmarking and Instrumentation `_ tutorial. + For more information and examples, see the `Benchmarking and Instrumentation `_ tutorial. Simple profiling ---------------- @@ -120,7 +120,7 @@ There are more instrumentation types available, such as fine-grained GPU kernel Instrumentation can also collect performance counters on CPUs and GPUs using `LIKWID `_. The :class:`~dace.dtypes.InstrumentationType.LIKWID_Counters` instrumentation type can be configured to collect a wide variety of performance counters on CPUs and GPUs. An example use can be found in the -`LIKWID instrumentation code sample `_. +`LIKWID instrumentation code sample `_. Instrumentation file format diff --git a/doc/optimization/vscode.rst b/doc/optimization/vscode.rst index 1b72effbcc..07f7797b4e 100644 --- a/doc/optimization/vscode.rst +++ b/doc/optimization/vscode.rst @@ -145,5 +145,5 @@ transformations |add-xform-by-folder-btn|. The latter recursively traverses the for any Python source code files and attempts to load each one as a transformation. For more information on how to use and author data-centric transformations, -see :ref:`transforming` and the `Using and Creating Transformations `_ +see :ref:`transforming` and the `Using and Creating Transformations `_ tutorial. diff --git a/doc/sdfg/ir.rst b/doc/sdfg/ir.rst index 61dc8d4858..1a7a8368cb 100644 --- a/doc/sdfg/ir.rst +++ b/doc/sdfg/ir.rst @@ -627,7 +627,7 @@ override default implementations for a library node type, or for an entire libra Internally, an expansion is a subclass of :class:`~dace.transformation.transformation.ExpandTransformation`. It is responsible for creating a new SDFG that implements the library node, and for connecting the inputs and outputs of the library node to the new SDFG. An example of such an expansion is Einstein summation specialization -(`see full file `_): +(`see full file `_): .. 
code-block:: python diff --git a/doc/sdfg/transformations.rst b/doc/sdfg/transformations.rst index 0a9791ca66..470d413271 100644 --- a/doc/sdfg/transformations.rst +++ b/doc/sdfg/transformations.rst @@ -23,7 +23,7 @@ All transformations extend the :class:`~dace.transformation.transformation.Trans Transformations can have properties and those can be used when applying them: for example, tile sizes in :class:`~dace.transformation.dataflow.tiling.MapTiling`. -For more information on how to use and author data-centric transformations, see the `Using and Creating Transformations `_ +For more information on how to use and author data-centric transformations, see the `Using and Creating Transformations `_ tutorial. diff --git a/doc/setup/integration.rst b/doc/setup/integration.rst index 3e1fc5fa70..78607feda9 100644 --- a/doc/setup/integration.rst +++ b/doc/setup/integration.rst @@ -79,7 +79,7 @@ you to call the SDFG's entry point function, perform basic type checking, and ar Python callback to function pointer, etc.). Since the compiled SDFG is a low-level interface, it is much faster to call than the Python interface. -`We show this behavior in the Benchmarking tutorial `_. +`We show this behavior in the Benchmarking tutorial `_. However, it requires caution as opposed to calling the ``@dace.program`` or the ``SDFG`` object because: * Each array return value is represented internally as a single array (not reallocated every call) and will be diff --git a/doc/setup/quickstart.rst b/doc/setup/quickstart.rst index 4a54de720c..70f24cbfb1 100644 --- a/doc/setup/quickstart.rst +++ b/doc/setup/quickstart.rst @@ -36,5 +36,5 @@ From here on out, you can optimize (:ref:`interactively `, :ref:`program your code. -For more examples of how to use DaCe, see the `samples `_ and -`tutorials `_ folders on GitHub. +For more examples of how to use DaCe, see the `samples `_ and +`tutorials `_ folders on GitHub. diff --git a/tutorials/benchmarking.ipynb b/tutorials/benchmarking.ipynb index f2330957a3..59302e8090 100644 --- a/tutorials/benchmarking.ipynb +++ b/tutorials/benchmarking.ipynb @@ -1260,7 +1260,7 @@ "source": [ "### Instrumentation API\n", "\n", - "The Instrumentation API allows more fine-grained control over measuring program metrics. It creates a JSON report in `.dacecache//perf`, which can be obtained with the API or viewed with any Chrome Tracing capable viewer. More usage information and how to use the API to tune programs can be found in the [program tuning sample](https://github.com/spcl/dace/blob/master/samples/optimization/tuning.py)." + "The Instrumentation API allows more fine-grained control over measuring program metrics. It creates a JSON report in `.dacecache//perf`, which can be obtained with the API or viewed with any Chrome Tracing capable viewer. More usage information and how to use the API to tune programs can be found in the [program tuning sample](https://github.com/spcl/dace/blob/main/samples/optimization/tuning.py)." 
 ]
  },
  {

From 057a6804ea2da60b053895e490cf230d0ef90225 Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Thu, 24 Oct 2024 21:13:20 -0700
Subject: [PATCH 08/43] Use codecov tokens (#1707)

---
 .github/workflows/fpga-ci.yml          | 3 +++
 .github/workflows/general-ci.yml       | 6 +++++-
 .github/workflows/gpu-ci.yml           | 1 +
 .github/workflows/heterogeneous-ci.yml | 1 +
 4 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/fpga-ci.yml b/.github/workflows/fpga-ci.yml
index 29be0ec1f1..ef8e5348da 100644
--- a/.github/workflows/fpga-ci.yml
+++ b/.github/workflows/fpga-ci.yml
@@ -8,6 +8,9 @@ on:
   merge_group:
     branches: [ main, ci-fix ]
 
+env:
+  CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+
 jobs:
   test-fpga:
     if: ${{ !contains(github.event.pull_request.labels.*.name, 'no-ci') }}
diff --git a/.github/workflows/general-ci.yml b/.github/workflows/general-ci.yml
index 2dcffc6484..faf0a727be 100644
--- a/.github/workflows/general-ci.yml
+++ b/.github/workflows/general-ci.yml
@@ -85,4 +85,8 @@ jobs:
         ./tests/polybench_test.sh
         ./tests/xform_test.sh
         coverage combine .; coverage report; coverage xml
-        ./codecov
+
+    - uses: codecov/codecov-action@v4
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+        verbose: true
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index 2a1ccb43ef..527e004478 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -11,6 +11,7 @@ on:
 env:
   CUDACXX: /usr/local/cuda/bin/nvcc
   MKLROOT: /opt/intel/oneapi/mkl/latest/
+  CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
 
 jobs:
diff --git a/.github/workflows/heterogeneous-ci.yml b/.github/workflows/heterogeneous-ci.yml
index 5f7dbff77e..99b566e21f 100644
--- a/.github/workflows/heterogeneous-ci.yml
+++ b/.github/workflows/heterogeneous-ci.yml
@@ -12,6 +12,7 @@ env:
   CUDA_HOME: /usr/local/cuda
   CUDACXX: nvcc
   MKLROOT: /opt/intel/oneapi/mkl/latest/
+  CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
 
 jobs:
   test-heterogeneous:

From 813a2f435cacf509d43be8e109498f7526d06d0c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philip=20M=C3=BCller?= <147368808+philip-paul-mueller@users.noreply.github.com>
Date: Fri, 25 Oct 2024 17:25:06 +0200
Subject: [PATCH 09/43] Modified `SDFGState.unordered_arglist()` (#1708)

This PR fixes how arguments are detected in scopes. Technically this only
affects GPU code generation, but that is a side effect of how the code is
generated. In GPU mode a `Map` is translated into one kernel, so a call
signature must be computed (CPU code generation is not affected because no
function call is produced). To compute this signature, the
`unordered_arglist()` function scans what is needed inside the scope.
However, this was not implemented correctly.

Assume that the AccessNode for array `A` is outside the map, and that inside
the map a temporary scalar `tmp_in` is defined and initialized to
`tmp_in = A[__i0, __i1]`, see also this image:

![arglist_situation](https://github.com/user-attachments/assets/fdf54dea-4ef5-49be-8ce2-33b78ce5962d)

If the `data` property of the Memlet that connects the MapEntry with the
AccessNode for `tmp_in` references `A`, then the (old) function would find
that `A` is needed inside, although there is no AccessNode for `A` inside
the map. If, however, this Memlet refers to `tmp_in` (which is not standard,
but should be allowed), then the old version would not pick `A` up, which
would then lead to a code generation error.

This PR modifies the function such that these cases are handled. This is done
by following all edges that are adjacent to the MapEntry (from the inside) to
where they actually originate.
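To illustrate, here is a minimal sketch of the edge that triggered the issue,
mirroring the new test added below (`map_entry` and `tmp_in` are the nodes
constructed in that test; the connector name is illustrative):

```python
# The Memlet between the MapEntry and the inner AccessNode names the
# destination (`tmp_in`) rather than the source (`A`); this is legal, but the
# old `unordered_arglist()` missed `A` in the kernel signature in this case.
state.add_edge(map_entry, out_conn, tmp_in, None,
               dace.Memlet(data="tmp_in", subset="0", other_subset="__i1"))
```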
---
 dace/sdfg/state.py                      |  60 ++++++--
 tests/codegen/argumet_signature_test.py | 197 ++++++++++++++++++++++++
 2 files changed, 247 insertions(+), 10 deletions(-)
 create mode 100644 tests/codegen/argumet_signature_test.py

diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py
index 09e7607d65..b982dfd718 100644
--- a/dace/sdfg/state.py
+++ b/dace/sdfg/state.py
@@ -849,6 +849,8 @@ def unordered_arglist(self,
         for node in self.nodes():
             if isinstance(node, nd.AccessNode):
                 descs[node.data] = node.desc(sdfg)
+                # NOTE: In case of multiple nodes of the same data this will
+                #  override previously found nodes.
                 descs_with_nodes[node.data] = node
                 if isinstance(node.desc(sdfg), dt.Scalar):
                     scalars_with_nodes.add(node.data)
@@ -865,19 +867,57 @@ def unordered_arglist(self,
                 else:
                     data_args[node.data] = desc
 
-        # Add data arguments from memlets, if do not appear in any of the nodes
-        # (i.e., originate externally)
+        # Add data arguments from memlets, if they do not appear in any of the nodes (i.e., originate externally)
+        # TODO: Investigate if scanning the adjacent edges of the input and output connectors is better.
         for edge in self.edges():
-            if edge.data.data is not None and edge.data.data not in descs:
-                desc = sdfg.arrays[edge.data.data]
-                if isinstance(desc, dt.Scalar):
-                    # Ignore code->code edges.
-                    if (isinstance(edge.src, nd.CodeNode) and isinstance(edge.dst, nd.CodeNode)):
-                        continue
+            if edge.data.is_empty():
+                continue
+
+            elif edge.data.data not in descs:
+                # The edge reads data from the outside, and the Memlet directly indicates what is read.
+                if (isinstance(edge.src, nd.CodeNode) and isinstance(edge.dst, nd.CodeNode)):
+                    continue  # Ignore code->code edges.
+                additional_descs = {edge.data.data: sdfg.arrays[edge.data.data]}
+
+            elif isinstance(edge.dst, (nd.AccessNode, nd.CodeNode)) and isinstance(edge.src, nd.EntryNode):
+                # Special case of the above: an AccessNode reads data from the outside, but
+                #  the Memlet references the data on the inside. Thus we have to follow the data
+                #  to where it originates from.
+                # NOTE: We have to use a memlet path, because we have to go "against the flow".
+                #  Furthermore, in a valid SDFG the data will only come from one source anyway.
+                top_source_edge = self.graph.memlet_path(edge)[0]
+                if not isinstance(top_source_edge.src, nd.AccessNode):
+                    continue
+                additional_descs = (
+                    {top_source_edge.src.data: top_source_edge.src.desc(sdfg)}
+                    if top_source_edge.src.data not in descs
+                    else {}
+                )
+
+            elif isinstance(edge.dst, nd.ExitNode) and isinstance(edge.src, (nd.AccessNode, nd.CodeNode)):
+                # Same case as above, but for outgoing Memlets.
+                # NOTE: We have to use a memlet tree here, because the data could potentially
+                #  go to multiple sources. We have to do it this way, because if we would call
+                #  `memlet_tree()` here, then we would just get the edge back.
+                additional_descs = {}
+                connector_to_look = "OUT_" + edge.dst_conn[3:]
+                for oedge in self.graph.out_edges_by_connector(edge.dst, connector_to_look):
+                    if (
+                        (not oedge.data.is_empty()) and (oedge.data.data not in descs)
+                        and (oedge.data.data not in additional_descs)
+                    ):
+                        additional_descs[oedge.data.data] = sdfg.arrays[oedge.data.data]
+
+            else:
+                # Case is ignored.
+                continue
 
-                    scalar_args[edge.data.data] = desc
+            # Now processing the list of newly found data.
+ for aname, additional_desc in additional_descs.items(): + if isinstance(additional_desc, dt.Scalar): + scalar_args[aname] = additional_desc else: - data_args[edge.data.data] = desc + data_args[aname] = additional_desc # Loop over locally-used data descriptors for name, desc in descs.items(): diff --git a/tests/codegen/argumet_signature_test.py b/tests/codegen/argumet_signature_test.py new file mode 100644 index 0000000000..376724439f --- /dev/null +++ b/tests/codegen/argumet_signature_test.py @@ -0,0 +1,197 @@ +import dace +import copy + +def test_argument_signature_test(): + """Tests if the argument signature is computed correctly. + + The test is focused on if data dependencies are picked up if they are only + referenced indirectly. This effect is only directly visible for GPU. + The test also runs on GPU, but will only compile for GPU. + """ + + def make_sdfg() -> dace.SDFG: + sdfg = dace.SDFG("Repr") + state = sdfg.add_state(is_start_block=True) + N = dace.symbol(sdfg.add_symbol("N", dace.int32)) + for name in "BC": + sdfg.add_array( + name=name, + dtype=dace.float64, + shape=(N, N), + strides=(N, 1), + transient=False, + ) + + # `A` uses a stride that is not used by any of the other arrays. + # However, the stride is used if we want to index array `A`. + second_stride_A = dace.symbol(sdfg.add_symbol("second_stride_A", dace.int32)) + sdfg.add_array( + name="A", + dtype=dace.float64, + shape=(N,), + strides=(second_stride_A,), + transient=False, + + ) + + # Also array `D` uses a stride that is not used by any other array. + second_stride_D = dace.symbol(sdfg.add_symbol("second_stride_D", dace.int32)) + sdfg.add_array( + name="D", + dtype=dace.float64, + shape=(N, N), + strides=(second_stride_D, 1), + transient=False, + + ) + + # Simplest way to generate a mapped Tasklet, we will later modify it. + state.add_mapped_tasklet( + "computation", + map_ranges={"__i0": "0:N", "__i1": "0:N"}, + inputs={ + "__in0": dace.Memlet("A[__i1]"), + "__in1": dace.Memlet("B[__i0, __i1]"), + }, + code="__out = __in0 + __in1", + outputs={"__out": dace.Memlet("C[__i0, __i1]")}, + external_edges=True, + ) + + # Instead of going from the MapEntry to the Tasklet we will go through + # an temporary AccessNode that is only used inside the map scope. + # Thus there is no direct reference to `A` inside the map scope, that would + # need `second_stride_A`. + sdfg.add_scalar("tmp_in", transient=True, dtype=dace.float64) + tmp_in = state.add_access("tmp_in") + for e in state.edges(): + if e.dst_conn == "__in0": + iedge = e + break + state.add_edge( + iedge.src, + iedge.src_conn, + tmp_in, + None, + # The important thing is that the Memlet, that connects the MapEntry with the + # AccessNode, does not refers to the memory outside (its source) but to the transient + # inside (its destination) + dace.Memlet(data="tmp_in", subset="0", other_subset="__i1"), # This does not work! + #dace.Memlet(data="A", subset="__i1", other_subset="0"), # This would work! + ) + state.add_edge( + tmp_in, + None, + iedge.dst, + iedge.dst_conn, + dace.Memlet(f"{tmp_in.data}[0]"), + ) + state.remove_edge(iedge) + + # Here we are doing something similar as for `A`, but this time for the output. + # The output of the Tasklet is stored inside a temporary scalar. + # From that scalar we then go to `C`, here the Memlet on the inside is still + # referring to `C`, thus it is referenced directly. + # We also add a second output that goes to `D` , but the inner Memlet does + # not refer to `D` but to the temporary. 
Thus there is no direct mention of + # `D` inside the map scope. + sdfg.add_scalar("tmp_out", transient=True, dtype=dace.float64) + tmp_out = state.add_access("tmp_out") + for e in state.edges(): + if e.src_conn == "__out": + oedge = e + assert oedge.data.data == "C" + break + + state.add_edge( + oedge.src, + oedge.src_conn, + tmp_out, + None, + dace.Memlet(data="tmp_out", subset="0"), + ) + state.add_edge( + tmp_out, + None, + oedge.dst, + oedge.dst_conn, + dace.Memlet(data="C", subset="__i0, __i1"), + ) + + # Now we create a new output that uses `tmp_out` but goes into `D`. + # The memlet on the inside will not use `D` but `tmp_out`. + state.add_edge( + tmp_out, + None, + oedge.dst, + "IN_D", + dace.Memlet(data=tmp_out.data, subset="0", other_subset="__i1, __i0"), + ) + state.add_edge( + oedge.dst, + "OUT_D", + state.add_access("D"), + None, + dace.Memlet(data="D", subset="__i0, __i1", other_subset="0"), + ) + oedge.dst.add_in_connector("IN_D", force=True) + oedge.dst.add_out_connector("OUT_D", force=True) + state.remove_edge(oedge) + + # Without this the test does not work properly + # It is related to [Issue#1703](https://github.com/spcl/dace/issues/1703) + sdfg.validate() + for edge in state.edges(): + edge.data.try_initialize(edge=edge, sdfg=sdfg, state=state) + + for array in sdfg.arrays.values(): + if isinstance(array, dace.data.Array): + array.storage = dace.StorageType.GPU_Global + else: + array.storage = dace.StorageType.Register + sdfg.apply_gpu_transformations(simplify=False) + sdfg.validate() + + return sdfg + + # Build the SDFG + sdfg = make_sdfg() + + map_entry = None + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, dace.nodes.MapEntry): + map_entry = node + break + if map_entry is not None: + break + + # Now get the argument list of the map. + res_arglist = { k:v for k, v in state.scope_subgraph(map_entry).arglist().items()} + + ref_arglist = { + 'A': dace.data.Array, + 'B': dace.data.Array, + 'C': dace.data.Array, + 'D': dace.data.Array, + 'N': dace.data.Scalar, + 'second_stride_A': dace.data.Scalar, + 'second_stride_D': dace.data.Scalar, + } + + assert len(ref_arglist) == len(res_arglist), f"Expected {len(ref_arglist)} but got {len(res_arglist)}" + for aname in ref_arglist.keys(): + atype_ref = ref_arglist[aname] + atype_res = res_arglist[aname] + assert isinstance(atype_res, atype_ref), f"Expected '{aname}' to have type {atype_ref}, but it had {type(atype_res)}." + + # If we have cupy we will also compile it. 
+ try: + import cupy as cp + except ImportError: + return + + csdfg = sdfg.compile() + +if __name__ == "__main__": + test_argument_signature_test() From 2070d393993e2db9d49e278a1052d6d9972cbb6d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 17:53:01 -0700 Subject: [PATCH 10/43] Bump urllib3 from 2.0.7 to 2.2.2 (#1600) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7332dc0419..3cc37cc468 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,6 @@ ply==3.11 PyYAML==6.0.1 six==1.16.0 sympy==1.9 -urllib3==2.0.7 +urllib3==2.2.2 websockets==11.0.3 zipp==3.15.0 From d8ddc756820c5fd42ea10da219edc11af1552c0a Mon Sep 17 00:00:00 2001 From: Philipp Schaad Date: Tue, 29 Oct 2024 16:56:41 +0100 Subject: [PATCH 11/43] Warn on potential data races (#1712) Re-implement changes made by @luca-patrignani in #1541, see that PR for more information. Author: @luca-patrignani --- dace/config_schema.yml | 7 + dace/sdfg/validation.py | 51 ++- .../sdfg/warn_on_potential_data_race_test.py | 316 ++++++++++++++++++ 3 files changed, 367 insertions(+), 7 deletions(-) create mode 100644 tests/sdfg/warn_on_potential_data_race_test.py diff --git a/dace/config_schema.yml b/dace/config_schema.yml index da35e61997..7afb06a50a 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -919,6 +919,13 @@ required: description: > Check for undefined symbols in memlets during SDFG validation. + check_race_conditions: + type: bool + default: false + title: Check race conditions + description: > + Check for potential race conditions during validation. + ############################################# # Features for unit testing diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index e75099276f..f02a5003e9 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -1,17 +1,22 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. """ Exception classes and methods for validation of SDFGs. 
""" + import copy -from dace.dtypes import DebugInfo import os -from typing import TYPE_CHECKING, Dict, List, Set import warnings +from collections import defaultdict +from typing import TYPE_CHECKING, Dict, List, Set + +import networkx as nx + from dace import dtypes, subsets, symbolic +from dace.dtypes import DebugInfo if TYPE_CHECKING: import dace + from dace.memlet import Memlet from dace.sdfg import SDFG from dace.sdfg import graph as gr - from dace.memlet import Memlet from dace.sdfg.state import ControlFlowRegion ########################################### @@ -34,8 +39,8 @@ def validate_control_flow_region(sdfg: 'SDFG', symbols: dict, references: Set[int] = None, **context: bool): - from dace.sdfg.state import SDFGState, ControlFlowRegion, ConditionalBlock from dace.sdfg.scope import is_in_scope + from dace.sdfg.state import ConditionalBlock, ControlFlowRegion, SDFGState if len(region.source_nodes()) > 1 and region.start_block is None: raise InvalidSDFGError("Starting block undefined", sdfg, None) @@ -200,7 +205,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context # Avoid import loop from dace import data as dt from dace.codegen.targets import fpga - from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga + from dace.sdfg.scope import is_devicelevel_fpga, is_devicelevel_gpu references = references or set() @@ -383,7 +388,8 @@ def validate_state(state: 'dace.sdfg.SDFGState', from dace.sdfg import SDFG from dace.sdfg import nodes as nd from dace.sdfg import utils as sdutil - from dace.sdfg.scope import scope_contains_scope, is_devicelevel_gpu, is_devicelevel_fpga + from dace.sdfg.scope import (is_devicelevel_fpga, is_devicelevel_gpu, + scope_contains_scope) sdfg = sdfg or state.parent state_id = state_id if state_id is not None else state.parent_graph.node_id(state) @@ -839,6 +845,37 @@ def validate_state(state: 'dace.sdfg.SDFGState', continue raise error + if Config.get_bool('experimental.check_race_conditions'): + node_labels = [] + write_accesses = defaultdict(list) + read_accesses = defaultdict(list) + for node in state.data_nodes(): + node_labels.append(node.label) + write_accesses[node.label].extend( + [{'subset': e.data.dst_subset, 'node': node, 'wcr': e.data.wcr} for e in state.in_edges(node)]) + read_accesses[node.label].extend( + [{'subset': e.data.src_subset, 'node': node} for e in state.out_edges(node)]) + + for node_label in node_labels: + writes = write_accesses[node_label] + reads = read_accesses[node_label] + # Check write-write data races. + for i in range(len(writes)): + for j in range(i+1, len(writes)): + same_or_unreachable_nodes = (writes[i]['node'] == writes[j]['node'] or + not nx.has_path(state.nx, writes[i]['node'], writes[j]['node'])) + no_wcr = writes[i]['wcr'] is None and writes[j]['wcr'] is None + if same_or_unreachable_nodes and no_wcr: + subsets_intersect = subsets.intersects(writes[i]['subset'], writes[j]['subset']) + if subsets_intersect: + warnings.warn(f'Memlet range overlap while writing to "{node}" in state "{state.label}"') + # Check read-write data races. 
+ for write in writes: + for read in reads: + if (not nx.has_path(state.nx, read['node'], write['node']) and + subsets.intersects(write['subset'], read['subset'])): + warnings.warn(f'Memlet range overlap while writing to "{node}" in state "{state.label}"') + ######################################## diff --git a/tests/sdfg/warn_on_potential_data_race_test.py b/tests/sdfg/warn_on_potential_data_race_test.py new file mode 100644 index 0000000000..8f17409a2f --- /dev/null +++ b/tests/sdfg/warn_on_potential_data_race_test.py @@ -0,0 +1,316 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. + +import warnings +import dace +import pytest + +def test_memlet_range_not_overlap_ranges(): + sdfg = dace.SDFG('memlet_range_not_overlap_ranges') + state = sdfg.add_state() + N = dace.symbol("N", dtype=dace.int32) + sdfg.add_array("A", (N//2,), dace.int32) + A = state.add_access("A") + sdfg.add_array("B", (N,), dace.int32) + B = state.add_access("B") + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N//2"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k+N//2")}, + map_ranges={"k": "0:N//2"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with dace.config.set_temporary("experimental.check_race_conditions", value=True): + sdfg.validate() + + +def test_memlet_range_write_write_overlap_ranges(): + sdfg = dace.SDFG('memlet_range_overlap_ranges') + state = sdfg.add_state() + N = dace.symbol("N", dtype=dace.int32) + sdfg.add_array("A", (N,), dace.int32) + A = state.add_access("A") + sdfg.add_array("B", (N,), dace.int32) + B = state.add_access("B") + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with pytest.warns(UserWarning): + with dace.config.set_temporary("experimental.check_race_conditions", value=True): + sdfg.validate() + +def test_memlet_range_write_read_overlap_ranges(): + sdfg = dace.SDFG('memlet_range_write_read_overlap_ranges') + state = sdfg.add_state() + N = dace.symbol("N", dtype=dace.int32) + sdfg.add_array("A", (N,), dace.int32) + A_read = state.add_read("A") + A_write = state.add_write("A") + sdfg.add_array("B", (N,), dace.int32) + B = state.add_access("B") + sdfg.add_array("C", (N,), dace.int32) + C = state.add_access("C") + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A_read}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="a = c - 20", + inputs={"c": dace.Memlet(data="C", 
subset="k")}, + outputs={"a": dace.Memlet(data="A", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"C": C}, + output_nodes={"A": A_write} + ) + + with pytest.warns(UserWarning): + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_memlet_overlap_ranges_two_access_nodes(): + sdfg = dace.SDFG('memlet_range_write_read_overlap_ranges') + state = sdfg.add_state() + N = dace.symbol("N", dtype=dace.int32) + sdfg.add_array("A", (N,), dace.int32) + A1 = state.add_access("A") + A2 = state.add_access("A") + sdfg.add_array("B", (N,), dace.int32) + B1 = state.add_access("B") + B2 = state.add_access("B") + + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A1}, + output_nodes={"B": B1} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A2}, + output_nodes={"B": B2} + ) + + with pytest.warns(UserWarning): + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_memlet_overlap_symbolic_ranges(): + sdfg = dace.SDFG('memlet_overlap_symbolic_ranges') + state = sdfg.add_state() + N = dace.symbol("N", dtype=dace.int32) + sdfg.add_array("A", (2*N,), dace.int32) + A = state.add_access("A") + sdfg.add_array("B", (2*N,), dace.int32) + B = state.add_access("B") + + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:2*N"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with pytest.warns(UserWarning): + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_constant_memlet_overlap(): + sdfg = dace.SDFG('constant_memlet_overlap') + state = sdfg.add_state() + sdfg.add_array("A", (12,), dace.int32) + A = state.add_access("A") + sdfg.add_array("B", (12,), dace.int32) + B = state.add_access("B") + + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "3:10"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "6:12"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with pytest.warns(UserWarning): + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_constant_memlet_almost_overlap(): + sdfg = dace.SDFG('constant_memlet_almost_overlap') + state = sdfg.add_state() + sdfg.add_array("A", (20,), dace.int32) + A = state.add_access("A") 
+ sdfg.add_array("B", (20,), dace.int32) + B = state.add_access("B") + + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "3:10"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "10:20"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_elementwise_map(): + sdfg = dace.SDFG('elementwise_map') + state = sdfg.add_state() + sdfg.add_array("A", (20,), dace.int32) + A_read = state.add_read("A") + A_write = state.add_write("A") + + state.add_mapped_tasklet( + name="first_tasklet", + code="aa = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"aa": dace.Memlet(data="A", subset="k")}, + map_ranges={"k": "0:20"}, + external_edges=True, + input_nodes={"A": A_read}, + output_nodes={"A": A_write} + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_memlet_overlap_with_wcr(): + sdfg = dace.SDFG('memlet_overlap_with_wcr') + state = sdfg.add_state() + sdfg.add_array("A", (20,), dace.int32) + sdfg.add_array("B", (1,), dace.int32) + A = state.add_read("A") + B = state.add_write("B") + + state.add_mapped_tasklet( + name="first_reduction", + code="b = a", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="0", wcr="lambda old, new: old + new")}, + map_ranges={"k": "0:20"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + state.add_mapped_tasklet( + name="second_reduction", + code="b = a", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="0", wcr="lambda old, new: old + new")}, + map_ranges={"k": "0:20"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + + +if __name__ == '__main__': + test_memlet_range_not_overlap_ranges() + test_memlet_range_write_write_overlap_ranges() + test_memlet_range_write_read_overlap_ranges() + test_memlet_overlap_ranges_two_access_nodes() + test_memlet_overlap_symbolic_ranges() + test_constant_memlet_overlap() + test_constant_memlet_almost_overlap() + test_elementwise_map() + test_memlet_overlap_with_wcr() From 7cb93f29820c8ad8caba5f75122d912192050f0f Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 29 Oct 2024 16:40:23 -0700 Subject: [PATCH 12/43] Python frontend stability and inline storage specification (#1711) The PR adds a new syntax to support inline storage specification with the `@` operator, supporting the following statements: `a = np.ones(M) @ dace.StorageType.CPU_ThreadLocal`. 
This PR also fixes multiple minor issues in the Python frontend: * `WarpTiling` did not respect sequential map schedules * Non-sequence inputs for `numpy.fill` variants (e.g., `numpy.zeros(N)`) * NumPy replacement syntax errors would sometimes not have source information * Fix type inference for nested scopes in Python frontend * Dynamic thread block scheduling does not support multi-dimensional maps * Default schedule inference should use dynamic thread blocks if they exist * Type hints with storage type not being adhered to by the Python frontend * Validation issue #1562 The following changes were added as skipped tests and deferred to future PRs: * Dynamic map range related issues: Fix deferred to #1696 * Dynamic thread block scheduling would not pass object to nested functions: Fix deferred to future PR, see #1189 for more information --- dace/codegen/targets/cuda.py | 74 ++++++------ dace/codegen/tools/type_inference.py | 4 +- dace/dtypes.py | 2 - dace/frontend/python/newast.py | 31 +++-- dace/frontend/python/replacements.py | 114 ++++++++---------- dace/sdfg/infer_types.py | 10 +- dace/sdfg/sdfg.py | 8 +- dace/sdfg/validation.py | 3 +- dace/transformation/dataflow/warp_tiling.py | 4 + dace/transformation/helpers.py | 17 ++- tests/dynamic_tb_map_cudatest.py | 70 ++++++++++- tests/numpy/array_creation_test.py | 49 +++++++- tests/numpy/map_syntax_test.py | 52 ++++++++ .../device_annotations_test.py | 49 ++++++-- tests/sdfg/cycles_test.py | 19 ++- 15 files changed, 370 insertions(+), 136 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index f080f2cc62..1cf8919d74 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -23,8 +23,8 @@ from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute from dace.config import Config from dace.frontend import operations -from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, - is_array_stream_view, is_devicelevel_gpu, nodes, scope_contains_scope) +from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, is_array_stream_view, + is_devicelevel_gpu, nodes, scope_contains_scope) from dace.sdfg import utils as sdutil from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import ControlFlowRegion, StateSubgraphView @@ -68,6 +68,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): dispatcher = self._dispatcher self.create_grid_barrier = False + self.dynamic_tbmap_type = None self.extra_nsdfg_args = [] CUDACodeGen._in_device_code = False self._cpu_codegen: Optional['CPUCodeGen'] = None @@ -892,8 +893,8 @@ def increment(streams): return max_streams, max_events - def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, - dst_node: nodes.Node, dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, + def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, dst_node: nodes.Node, + dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, callsite_stream: CodeIOStream) -> None: u, uconn, v, vconn, memlet = edge @@ -1163,11 +1164,8 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St copysize=', '.join(_topy(copy_shape)), is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', accum=accum or '::Copy', - args=', '.join( - [src_expr] + 
_topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction - ) - ), - cfg, state_id, [src_node, dst_node]) + args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + + custom_reduction)), cfg, state_id, [src_node, dst_node]) else: callsite_stream.write( (' {func}<{type}, {bdims}, {copysize}, ' + @@ -1236,8 +1234,12 @@ def _begin_streams(self, sdfg, state): result.add(e.dst._cuda_stream) return result - def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, - function_stream: CodeIOStream, callsite_stream: CodeIOStream, + def generate_state(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, generate_state_footer: bool = False) -> None: # Two modes: device-level state and if this state has active streams if CUDACodeGen._in_device_code: @@ -1361,8 +1363,7 @@ def generate_devicelevel_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: "&& threadIdx.x == 0) " "{ // sub-graph begin", cfg, state.block_id) elif write_scope == 'block': - callsite_stream.write("if (threadIdx.x == 0) " - "{ // sub-graph begin", cfg, state.block_id) + callsite_stream.write("if (threadIdx.x == 0) " "{ // sub-graph begin", cfg, state.block_id) else: callsite_stream.write("{ // subgraph begin", cfg, state.block_id) else: @@ -1985,16 +1986,13 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S # allocating shared memory for dynamic threadblock maps if has_dtbmap: - kernel_stream.write( - '__shared__ dace::' - 'DynamicMap<{fine_grained}, {block_size}>' - '::shared_type dace_dyn_map_shared;'.format( - fine_grained=('true' - if Config.get_bool('compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), - block_size=functools.reduce( - (lambda x, y: x * y), - [int(x) for x in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',')])), cfg, - state_id, node) + self.dynamic_tbmap_type = ( + f'dace::DynamicMap<{"true" if Config.get_bool("compiler", "cuda", "dynamic_map_fine_grained") else "false"}, ' + f'{functools.reduce((lambda x, y: x * y), [int(x) for x in Config.get("compiler", "cuda", "dynamic_map_block_size").split(",")])}>' + '::shared_type') + kernel_stream.write(f'__shared__ {self.dynamic_tbmap_type} dace_dyn_map_shared;', cfg, state_id, node) + else: + self.dynamic_tbmap_type = None # Add extra opening brace (dynamic map ranges, closed in MapExit # generator) @@ -2072,8 +2070,8 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S # Generate conditions for this block's execution using min and max # element, e.g., skipping out-of-bounds threads in trailing block - # unless thsi is handled by another map down the line - if (not has_tbmap and not has_dtbmap and node.map.schedule != dtypes.ScheduleType.GPU_Persistent): + # unless this is handled by another map down the line + if ((not has_tbmap or has_dtbmap) and node.map.schedule != dtypes.ScheduleType.GPU_Persistent): dsym_end = [d + bs - 1 for d, bs in zip(dsym, self._block_dims)] minels = krange.min_element() maxels = krange.max_element() @@ -2090,10 +2088,12 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S condition += '%s < %s' % (v, _topy(maxel + 1)) if len(condition) > 0: self._kernel_grid_conditions.append(f'if ({condition}) {{') - kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) + if not has_dtbmap: + kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) else: 
self._kernel_grid_conditions.append('{') - kernel_stream.write('{', cfg, state_id, scope_entry) + if not has_dtbmap: + kernel_stream.write('{', cfg, state_id, scope_entry) self._dispatcher.dispatch_subgraph(sdfg, cfg, @@ -2112,6 +2112,7 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S self._kernel_state = None CUDACodeGen._in_device_code = False self._grid_dims = None + self.dynamic_tbmap_type = None def get_next_scope_entries(self, dfg, scope_entry): parent_scope_entry = dfg.entry_node(scope_entry) @@ -2179,10 +2180,8 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco current_sdfg = current_state.parent if not outer_scope: raise ValueError(f'Failed to find the outer scope of {scope_entry}') - callsite_stream.write( - 'if ({} < {}) {{'.format(outer_scope.map.params[0], - _topy(subsets.Range(outer_scope.map.range[::-1]).max_element()[0] + 1)), cfg, - state_id, scope_entry) + for cond in self._kernel_grid_conditions: + callsite_stream.write(cond, cfg, state_id, scope_entry) # NOTE: Dynamic map inputs must be defined both outside and inside the dynamic Map schedule. # They define inside the schedule the bounds of the any nested Maps. @@ -2205,8 +2204,9 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco '__dace_dynmap_begin = {begin};\n' '__dace_dynmap_end = {end};'.format(begin=dynmap_begin, end=dynmap_end), cfg, state_id, scope_entry) - # close if - callsite_stream.write('}', cfg, state_id, scope_entry) + # Close kernel grid conditions + for _ in self._kernel_grid_conditions: + callsite_stream.write('}', cfg, state_id, scope_entry) callsite_stream.write( 'dace::DynamicMap<{fine_grained}, {bsize}>::' @@ -2215,7 +2215,7 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco 'auto {param}) {{'.format(fine_grained=('true' if Config.get_bool( 'compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), bsize=total_block_size, - kmapIdx=outer_scope.map.params[0], + kmapIdx=outer_scope.map.params[-1], param=dynmap_var), cfg, state_id, scope_entry) for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): @@ -2556,8 +2556,8 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco for cond in self._kernel_grid_conditions: callsite_stream.write(cond, cfg, state_id, scope_entry) - def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: if self.node_dispatch_predicate(sdfg, dfg, node): # Dynamically obtain node generator according to class name gen = getattr(self, '_generate_' + type(node).__name__, False) @@ -2594,6 +2594,8 @@ def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) if self.create_grid_barrier: result.append(('cub::GridBarrier&', '__gbar', '__gbar')) + if self.dynamic_tbmap_type: + result.append((f'{self.dynamic_tbmap_type}&', 'dace_dyn_map_shared', 'dace_dyn_map_shared')) # Add data from nested SDFGs to kernel arguments result.extend([(atype, aname, aname) for atype, aname, _ in self.extra_nsdfg_args]) diff --git a/dace/codegen/tools/type_inference.py b/dace/codegen/tools/type_inference.py index 893866522f..8f8dd84151 
100644 --- a/dace/codegen/tools/type_inference.py +++ b/dace/codegen/tools/type_inference.py @@ -9,7 +9,7 @@ import numpy as np import ast -from dace import dtypes +from dace import data, dtypes from dace import symbolic from dace.codegen import cppunparse from dace.symbolic import symbol, SymExpr, symstr @@ -286,6 +286,8 @@ def _Name(t, symbols, inferred_symbols): inferred_type = dtypes.typeclass(inferred_type.type) elif isinstance(inferred_type, symbolic.symbol): inferred_type = inferred_type.dtype + elif isinstance(inferred_type, data.Data): + inferred_type = inferred_type.dtype elif t_id in inferred_symbols: inferred_type = inferred_symbols[t_id] return inferred_type diff --git a/dace/dtypes.py b/dace/dtypes.py index c5f9bb4732..a016ac60e2 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1,10 +1,8 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ A module that contains various DaCe type definitions. """ -from __future__ import print_function import ctypes import aenum import inspect -import itertools import numpy import re from collections import OrderedDict diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index cacf15d785..78890c9cdd 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1489,19 +1489,19 @@ def _symbols_from_params(self, params: List[Tuple[str, Union[str, dtypes.typecla else: values = str(val).split(':') if len(values) == 1: - result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.globals, **dyn_inputs})) + result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.defined, **dyn_inputs})) elif len(values) == 2: result[name] = symbolic.symbol( name, dtypes.result_type_of(infer_expr_type(values[0], { - **self.globals, + **self.defined, **dyn_inputs }), infer_expr_type(values[1], { - **self.globals, + **self.defined, **dyn_inputs }))) elif len(values) == 3: - result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.globals, **dyn_inputs})) + result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.defined, **dyn_inputs})) else: raise DaceSyntaxError( self, None, "Invalid number of arguments in a range iterator. 
" @@ -3258,18 +3258,23 @@ def visit_AnnAssign(self, node: ast.AnnAssign): dtype = astutils.evalnode(node.annotation, {**self.globals, **self.defined}) if isinstance(dtype, data.Data): simple_type = dtype.dtype + storage = dtype.storage else: simple_type = dtype + storage = dtypes.StorageType.Default if not isinstance(simple_type, dtypes.typeclass): raise TypeError except: dtype = None + storage = dtypes.StorageType.Default type_name = rname(node.annotation) warnings.warn('typeclass {} is not supported'.format(type_name)) if node.value is None and dtype is not None: # Annotating type without assignment self.annotated_types[rname(node.target)] = dtype return - self._visit_assign(node, node.target, None, dtype=dtype) + results = self._visit_assign(node, node.target, None, dtype=dtype) + if storage != dtypes.StorageType.Default: + self.sdfg.arrays[results[0][0]].storage = storage def _visit_assign(self, node, node_target, op, dtype=None, is_return=False): # Get targets (elts) and results @@ -3563,6 +3568,8 @@ def _visit_assign(self, node, node_target, op, dtype=None, is_return=False): self.cfg_target.add_edge(self.last_block, output_indirection, dace.sdfg.InterstateEdge()) self.last_block = output_indirection + return results + def visit_AugAssign(self, node: ast.AugAssign): self._visit_assign(node, node.target, augassign_ops[type(node.op).__name__]) @@ -4623,10 +4630,16 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): self._add_state('call_%d' % node.lineno) self.last_block.set_default_lineinfo(self.current_lineinfo) - if found_ufunc: - result = func(self, node, self.sdfg, self.last_block, ufunc_name, args, keywords) - else: - result = func(self, self.sdfg, self.last_block, *args, **keywords) + try: + if found_ufunc: + result = func(self, node, self.sdfg, self.last_block, ufunc_name, args, keywords) + else: + result = func(self, self.sdfg, self.last_block, *args, **keywords) + except DaceSyntaxError as ex: + # Attach source information to exception + if ex.node is None: + ex.node = node + raise self.last_block.set_default_lineinfo(None) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 5e6118a34b..537fef97bf 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -322,24 +322,30 @@ def _numpy_full(pv: ProgramVisitor, is_data = True vtype = sdfg.arrays[fill_value].dtype dtype = dtype or vtype + + # Handle one-dimensional inputs + if isinstance(shape, (Number, str)) or symbolic.issymbolic(shape): + shape = [shape] + + if any(isinstance(s, str) for s in shape): + raise DaceSyntaxError( + pv, None, f'Data-dependent shape {shape} is currently not allowed. 
Only constants ' + 'and symbolic values can be used.') + name, _ = sdfg.add_temp_transient(shape, dtype) if is_data: state.add_mapped_tasklet( - '_numpy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, dict(__inp=dace.Memlet(data=fill_value, subset='0')), "__out = __inp", dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) else: state.add_mapped_tasklet( - '_numpy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, {}, + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, {}, "__out = {}".format(fill_value), dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) @@ -459,10 +465,8 @@ def _numpy_flip(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, axis inpidx = ','.join([f'__i{i}' for i in range(ndim)]) outidx = ','.join([f'{s} - __i{i} - 1' if a else f'__i{i}' for i, (a, s) in enumerate(zip(axis, desc.shape))]) state.add_mapped_tasklet(name="_numpy_flip_", - map_ranges={ - f'__i{i}': f'0:{s}:1' - for i, s in enumerate(desc.shape) - }, + map_ranges={f'__i{i}': f'0:{s}:1' + for i, s in enumerate(desc.shape)}, inputs={'__inp': Memlet(f'{arr}[{inpidx}]')}, code='__out = __inp', outputs={'__out': Memlet(f'{arr_copy}[{outidx}]')}, @@ -532,10 +536,8 @@ def _numpy_rot90(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, k=1 outidx = ','.join(out_indices) state.add_mapped_tasklet(name="_rot90_", - map_ranges={ - f'__i{i}': f'0:{s}:1' - for i, s in enumerate(desc.shape) - }, + map_ranges={f'__i{i}': f'0:{s}:1' + for i, s in enumerate(desc.shape)}, inputs={'__inp': Memlet(f'{arr}[{inpidx}]')}, code='__out = __inp', outputs={'__out': Memlet(f'{arr_copy}[{outidx}]')}, @@ -644,7 +646,8 @@ def _elementwise(pv: 'ProgramVisitor', else: state.add_mapped_tasklet( name="_elementwise_", - map_ranges={f'__i{dim}': f'0:{N}' for dim, N in enumerate(inparr.shape)}, + map_ranges={f'__i{dim}': f'0:{N}' + for dim, N in enumerate(inparr.shape)}, inputs={'__inp': Memlet.simple(in_array, ','.join([f'__i{dim}' for dim in range(len(inparr.shape))]))}, code=code, outputs={'__out': Memlet.simple(out_array, ','.join([f'__i{dim}' for dim in range(len(inparr.shape))]))}, @@ -694,10 +697,8 @@ def _simple_call(sdfg: SDFG, state: SDFGState, inpname: str, func: str, restype: else: state.add_mapped_tasklet( name=func, - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(inparr.shape) - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(inparr.shape)}, inputs={'__inp': Memlet.simple(inpname, ','.join(['__i%d' % i for i in range(len(inparr.shape))]))}, code='__out = {f}(__inp)'.format(f=func), outputs={'__out': Memlet.simple(outname, ','.join(['__i%d' % i for i in range(len(inparr.shape))]))}, @@ -1046,27 +1047,22 @@ def _argminmax(pv: ProgramVisitor, code = "__init = _val_and_idx(val={}, idx=-1)".format( dtypes.min_value(a_arr.dtype) if func == 'max' else dtypes.max_value(a_arr.dtype)) - nest.add_state().add_mapped_tasklet(name="_arg{}_convert_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) if i != axis - }, - inputs={}, - code=code, - outputs={ - '__init': - Memlet.simple( - reduced_structs, - ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) - }, - external_edges=True) + nest.add_state().add_mapped_tasklet( 
+ name="_arg{}_convert_".format(func), + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape) if i != axis}, + inputs={}, + code=code, + outputs={ + '__init': Memlet.simple(reduced_structs, + ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) + }, + external_edges=True) nest.add_state().add_mapped_tasklet( name="_arg{}_reduce_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape)}, inputs={'__in': Memlet.simple(a, ','.join('__i%d' % i for i in range(len(a_arr.shape))))}, code="__out = _val_and_idx(idx={}, val=__in)".format("__i%d" % axis), outputs={ @@ -1086,10 +1082,8 @@ def _argminmax(pv: ProgramVisitor, nest.add_state().add_mapped_tasklet( name="_arg{}_extract_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) if i != axis - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape) if i != axis}, inputs={ '__in': Memlet.simple(reduced_structs, ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) @@ -1212,10 +1206,9 @@ def _unop(sdfg: SDFG, state: SDFGState, op1: str, opcode: str, opname: str): opcode = 'not' name, _ = sdfg.add_temp_transient(arr1.shape, restype, arr1.storage) - state.add_mapped_tasklet("_%s_" % opname, { - '__i%d' % i: '0:%s' % s - for i, s in enumerate(arr1.shape) - }, {'__in1': Memlet.simple(op1, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, + state.add_mapped_tasklet("_%s_" % opname, {'__i%d' % i: '0:%s' % s + for i, s in enumerate(arr1.shape)}, + {'__in1': Memlet.simple(op1, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, '__out = %s __in1' % opcode, {'__out': Memlet.simple(name, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, external_edges=True) @@ -4316,10 +4309,8 @@ def _ndarray_fill(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, va shape = sdfg.arrays[arr].shape state.add_mapped_tasklet( '_numpy_fill_', - map_ranges={ - f"__i{dim}": f"0:{s}" - for dim, s in enumerate(shape) - }, + map_ranges={f"__i{dim}": f"0:{s}" + for dim, s in enumerate(shape)}, inputs=inputs, code=f"__out = {body}", outputs={'__out': dace.Memlet.simple(arr, ",".join([f"__i{dim}" for dim in range(len(shape))]))}, @@ -4544,6 +4535,13 @@ def _ndarray_astype(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, return _datatype_converter(sdfg, state, arr, dtype)[0] +@oprepo.replaces_operator('Array', 'MatMult', otherclass='StorageType') +def _cast_storage(visitor: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, arr: str, stype: dace.StorageType) -> str: + desc = sdfg.arrays[arr] + desc.storage = stype + return arr + + # Replacements that need ufuncs ############################################### # TODO: Fix by separating to different modules and importing @@ -4747,13 +4745,7 @@ def _tensordot(pv: 'ProgramVisitor', @oprepo.replaces("cupy._core.core.ndarray") @oprepo.replaces("cupy.ndarray") -def _define_cupy_local( - pv: "ProgramVisitor", - sdfg: SDFG, - state: SDFGState, - shape: Shape, - dtype: typeclass -): +def _define_cupy_local(pv: "ProgramVisitor", sdfg: SDFG, state: SDFGState, shape: Shape, dtype: typeclass): """Defines a local array in a DaCe program.""" if not isinstance(shape, (list, tuple)): shape = [shape] @@ -4781,10 +4773,8 @@ def _cupy_full(pv: ProgramVisitor, name, _ = sdfg.add_temp_transient(shape, dtype, storage=dtypes.StorageType.GPU_Global) state.add_mapped_tasklet( - '_cupy_full_', { - 
"__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, {}, + '_cupy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, {}, "__out = {}".format(fill_value), dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) diff --git a/dace/sdfg/infer_types.py b/dace/sdfg/infer_types.py index cf58cf76cc..97010e95a7 100644 --- a/dace/sdfg/infer_types.py +++ b/dace/sdfg/infer_types.py @@ -116,8 +116,7 @@ def infer_connector_types(sdfg: SDFG): for e in state.out_edges(node): cname = e.src_conn if cname and node.out_connectors[cname] is None: - raise TypeError('Ambiguous or uninferable type in' - ' connector "%s" of node "%s"' % (cname, node)) + raise TypeError('Ambiguous or uninferable type in' ' connector "%s" of node "%s"' % (cname, node)) ############################################################################# @@ -301,6 +300,12 @@ def _set_default_schedule_in_scope(state: SDFGState, else: child_schedule = _determine_child_schedule(parent_schedules) + # Special case for dynamic thread-block neighboring schedules + if child_schedule == dtypes.ScheduleType.GPU_ThreadBlock: + from dace.transformation.helpers import gpu_map_has_explicit_dyn_threadblocks # Avoid import loops + if gpu_map_has_explicit_dyn_threadblocks(state, parent_node): + child_schedule = dtypes.ScheduleType.GPU_ThreadBlock_Dynamic + # Set child schedule type in scope for node in child_nodes[parent_node]: # Set default schedule types @@ -393,6 +398,7 @@ def _get_storage_from_parent(data_name: str, sdfg: SDFG) -> dtypes.StorageType: raise ValueError(f'Could not find data descriptor {data_name} in parent SDFG') + def infer_aliasing(node: nodes.NestedSDFG, sdfg: SDFG, state: SDFGState) -> None: """ Infers aliasing information on nested SDFG arrays based on external edges and connectors. 
diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 38a41236a6..f25a6e24d5 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -761,13 +761,13 @@ def add_symbol(self, name, stype, find_new_name: bool = False): if name in self.symbols: raise FileExistsError(f'Symbol "{name}" already exists in SDFG') if name in self.arrays: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a data descriptor.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a data descriptor.') if name in self._subarrays: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a subarray.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a subarray.') if name in self._rdistrarrays: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a RedistrArray.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a RedistrArray.') if name in self._pgrids: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a ProcessGrid.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a ProcessGrid.') if not isinstance(stype, dtypes.typeclass): stype = dtypes.dtype_to_typeclass(stype) self.symbols[name] = stype diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index f02a5003e9..2df9e17445 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -396,7 +396,6 @@ def validate_state(state: 'dace.sdfg.SDFGState', symbols = symbols or {} initialized_transients = (initialized_transients if initialized_transients is not None else {'__pystate'}) references = references or set() - scope = state.scope_dict() # Obtain whether we are already in an accelerator context if not hasattr(context, 'in_gpu'): @@ -426,6 +425,8 @@ def validate_state(state: 'dace.sdfg.SDFGState', if state.has_cycles(): raise InvalidSDFGError('State should be acyclic but contains cycles', sdfg, state_id) + scope = state.scope_dict() + for nid, node in enumerate(state.nodes()): # Reference check if id(node) in references: diff --git a/dace/transformation/dataflow/warp_tiling.py b/dace/transformation/dataflow/warp_tiling.py index 362b51d9ac..f9091950e3 100644 --- a/dace/transformation/dataflow/warp_tiling.py +++ b/dace/transformation/dataflow/warp_tiling.py @@ -55,6 +55,10 @@ def apply(self, graph: SDFGState, sdfg: SDFG) -> nodes.MapEntry: # Stride and offset all internal maps maps_to_stride = xfh.get_internal_scopes(graph, new_me, immediate=True) for nstate, nmap in maps_to_stride: + # Skip sequential maps + if nmap.schedule == dtypes.ScheduleType.Sequential: + continue + nsdfg = nstate.parent nsdfg_node = nsdfg.parent_nsdfg_node diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index 6ca4602079..b7bf49e62b 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -934,11 +934,7 @@ def replicate_scope(sdfg: SDFG, state: SDFGState, scope: ScopeSubgraphView) -> S return ScopeSubgraphView(state, new_nodes, new_entry) -def offset_map(state: SDFGState, - entry: nodes.MapEntry, - dim: int, - offset: symbolic.SymbolicType, - negative: bool = True): +def offset_map(state: SDFGState, entry: nodes.MapEntry, dim: int, offset: symbolic.SymbolicType, negative: bool = True): """ Offsets a map parameter and its contents by a value. 
@@ -1270,6 +1266,17 @@ def gpu_map_has_explicit_threadblocks(state: SDFGState, entry: nodes.EntryNode)
     return False
 
 
+def gpu_map_has_explicit_dyn_threadblocks(state: SDFGState, entry: nodes.EntryNode) -> bool:
+    """
+    Returns True if a GPU_Device map has explicit dynamic thread-block maps nested within.
+    """
+    internal_maps = get_internal_scopes(state, entry)
+    if any(m.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic for _, m in internal_maps):
+        return True
+
+    return False
+
+
 def reconnect_edge_through_map(
         state: SDFGState, edge: graph.MultiConnectorEdge[Memlet], new_node: Union[nodes.EntryNode, nodes.ExitNode],
         keep_src: bool) -> Tuple[graph.MultiConnectorEdge[Memlet], graph.MultiConnectorEdge[Memlet]]:
diff --git a/tests/dynamic_tb_map_cudatest.py b/tests/dynamic_tb_map_cudatest.py
index b24e5f2ea6..edc1eac9f2 100644
--- a/tests/dynamic_tb_map_cudatest.py
+++ b/tests/dynamic_tb_map_cudatest.py
@@ -12,10 +12,8 @@
 @dace.program(dace.uint32[H + 1], dace.uint32[nnz], dace.float32[nnz], dace.float32[W], dace.float32[H])
 def spmv(A_row, A_col, A_val, x, b):
-
     @dace.mapscope(_[0:H])
     def compute_row(i):
-
         @dace.map(_[A_row[i]:A_row[i + 1]])
         def compute(j):
             a << A_val[j]
@@ -292,8 +290,76 @@ def sddvm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i
     assert np.allclose(val, ref.data)
 
 
+@pytest.mark.gpu
+def test_dynamic_multidim_map():
+
+    @dace.program
+    def tester(a: dace.float32[H, W, nnz]):
+        A = dace.ndarray([H, W, nnz], dtype=dace.float32, storage=dace.StorageType.GPU_Global)
+        A[:] = a
+        for i, j in dace.map[0:H, 0:W] @ dace.ScheduleType.GPU_Device:
+            for k in dace.map[0:nnz] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic:
+                A[i, j, k] = i * 110 + j * 11 + k
+        a[:] = A
+
+    a = np.zeros((10, 11, 65), dtype=np.float32)
+    tester(a)
+    assert np.allclose(a, np.fromfunction(lambda i, j, k: i * 110 + j * 11 + k, (10, 11, 65), dtype=np.float32))
+
+
+@pytest.mark.skip('Nested maps with work-stealing thread-block schedule are currently unsupported')
+def test_dynamic_nested_map():
+
+    @dace.program
+    def nested2(A: dace.float32[W], i: dace.int32, j: dace.int32):
+        A[j] = i * 10 + j
+
+    @dace.program
+    def nested1(A: dace.float32[W], i: dace.int32):
+        for j in dace.map[0:W] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic:
+            nested2(A, i, j)
+
+    @dace.program
+    def dynamic_nested_map(a: dace.float32[H, W]):
+        A = dace.ndarray([H, W], dtype=dace.float32, storage=dace.StorageType.GPU_Global)
+        A[:] = a
+        for i in dace.map[0:H] @ dace.ScheduleType.GPU_Device:
+            nested1(A[i], i)
+
+        a[:] = A
+
+    a = np.zeros((10, 11), dtype=np.float32)
+    sdfg = dynamic_nested_map.to_sdfg(simplify=False)
+    for _, _, arr in sdfg.arrays_recursive():
+        if arr.storage in (dace.StorageType.GPU_Shared, dace.StorageType.Default):
+            arr.storage = dace.StorageType.Register
+    sdfg(a, H=10, W=11)
+    assert np.allclose(a, np.fromfunction(lambda i, j: i * 10 + j, (10, 11), dtype=np.float32))
+
+
+@pytest.mark.gpu
+def test_dynamic_default_schedule():
+    N = dace.symbol('N')
+
+    @dace.program
+    def tester(a: dace.float32[N, 10]):
+        A = dace.ndarray([N, 10], dtype=dace.float32, storage=dace.StorageType.GPU_Global)
+        A[:] = a
+        for i in dace.map[0:N] @ dace.ScheduleType.GPU_Device:
+            smem = np.empty((10, ), dtype=np.float32) @ dace.StorageType.GPU_Shared
+            smem[:] = 1
+            for j in dace.map[0:10] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic:
+                A[i, j] = i * 65 + smem[j]
+        a[:] = A
+
+    a = np.zeros((65, 10), dtype=np.float32)
+    tester(a)
+    assert np.allclose(a, np.fromfunction(lambda i, j: i * 65 + 1, (65, 10), dtype=np.float32))
+
+ if __name__ == '__main__': test_dynamic_map() test_dynamic_maps() test_nested_dynamic_map() test_dynamic_map_with_step() + test_dynamic_multidim_map() + # test_dynamic_nested_map() + test_dynamic_default_schedule() diff --git a/tests/numpy/array_creation_test.py b/tests/numpy/array_creation_test.py index 85908c7a1f..7329b48b3f 100644 --- a/tests/numpy/array_creation_test.py +++ b/tests/numpy/array_creation_test.py @@ -1,7 +1,9 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import dace +from dace.frontend.python.common import DaceSyntaxError import numpy as np from common import compare_numpy_output +import pytest # M = dace.symbol('M') # N = dace.symbol('N') @@ -154,7 +156,7 @@ def test_arange_6(): def program_strides_0(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(2, 1)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -168,7 +170,7 @@ def test_strides_0(): def program_strides_1(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(4, 2)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -182,7 +184,7 @@ def test_strides_1(): def program_strides_2(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(1, 2)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -196,7 +198,7 @@ def test_strides_2(): def program_strides_3(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(2, 4)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -206,6 +208,42 @@ def test_strides_3(): assert np.allclose(A, [[0, 1], [2, 3]]) +def test_zeros_symbolic_size_scalar(): + K = dace.symbol('K') + + @dace.program + def zeros_symbolic_size(): + return np.zeros((K), dtype=np.uint32) + + out = zeros_symbolic_size(K=10) + assert (list(out.shape) == [10]) + assert (out.dtype == np.uint32) + + +def test_ones_scalar_size_scalar(): + + @dace.program + def ones_scalar_size(k: dace.int32): + a = np.ones(k, dtype=np.uint32) + return np.sum(a) + + with pytest.raises(DaceSyntaxError): + out = ones_scalar_size(20) + assert out == 20 + + +def test_ones_scalar_size(): + + @dace.program + def ones_scalar_size(k: dace.int32): + a = np.ones((k, k), dtype=np.uint32) + return np.sum(a) + + with pytest.raises(DaceSyntaxError): + out = ones_scalar_size(20) + assert out == 20 * 20 + + if __name__ == "__main__": test_empty() test_empty_like1() @@ -233,3 +271,6 @@ def test_strides_3(): test_strides_1() test_strides_2() test_strides_3() + test_zeros_symbolic_size_scalar() + test_ones_scalar_size_scalar() + test_ones_scalar_size() diff --git a/tests/numpy/map_syntax_test.py b/tests/numpy/map_syntax_test.py index fe7af1d644..27a0cfe018 100644 --- a/tests/numpy/map_syntax_test.py +++ b/tests/numpy/map_syntax_test.py @@ -1,6 +1,7 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
import numpy as np import dace +import pytest M, N, K = (dace.symbol(name) for name in ['M', 'N', 'K']) @@ -35,6 +36,57 @@ def test_map_python(): assert np.allclose(A[:, 1:], B[:, 1:]) +@pytest.mark.skip('Fails due to bug in Python frontend') +def test_nested_map_with_indirection(): + N = dace.symbol('N') + + @dace.program + def indirect_to_indirect(arr1: dace.float64[N], ind: dace.int32[10], arr2: dace.float64[N]): + for i in dace.map[0:9]: + begin, end, stride = ind[i], ind[i + 1], 1 + for _ in dace.map[0:1]: + for j in dace.map[begin:end:stride]: + arr2[j] = arr1[j] + i + + a = np.random.rand(50) + b = np.zeros(50) + ind = np.array([0, 5, 10, 15, 20, 25, 30, 35, 40, 45], dtype=np.int32) + sdfg = indirect_to_indirect.to_sdfg(simplify=False) + sdfg(a, ind, b) + + ref = np.zeros(50) + for i in range(9): + begin, end = ind[i], ind[i + 1] + ref[begin:end] = a[begin:end] + i + + assert np.allclose(b, ref) + + +@pytest.mark.skip('Fails due to bug in Python frontend') +def test_dynamic_map_range_scalar(): + """ + From issue #650. + """ + + @dace.program + def test(A: dace.float64[20], B: dace.float64[20]): + N = dace.define_local_scalar(dace.int32) + N = 5 + for i in dace.map[0:N]: + for j in dace.map[0:N]: + with dace.tasklet: + a << A[i] + b >> B[j] + b = a + 1 + + A = np.random.rand(20) + B = np.zeros(20) + test(A, B) + assert np.allclose(B[:5], A[:5] + 1) + + if __name__ == '__main__': test_copy3d() test_map_python() + # test_nested_map_with_indirection() + # test_dynamic_map_range_scalar() diff --git a/tests/python_frontend/device_annotations_test.py b/tests/python_frontend/device_annotations_test.py index 65c8501b23..d6b512f00b 100644 --- a/tests/python_frontend/device_annotations_test.py +++ b/tests/python_frontend/device_annotations_test.py @@ -1,16 +1,19 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. 
import dace import pytest +import numpy as np from dace.dtypes import StorageType, DeviceType, ScheduleType from dace import dtypes -cupy = pytest.importorskip("cupy") +try: + import cupy +except (ImportError, ModuleNotFoundError): + cupy = None @pytest.mark.gpu def test_storage(): - @dace.program def add(X: dace.float32[32, 32] @ StorageType.GPU_Global): return X + 1 @@ -46,7 +49,6 @@ def add2(X: dace.float32[32, 32] @ StorageType.GPU_Global): @pytest.mark.gpu def test_pythonmode(): - def runs_on_gpu(a: dace.float64[20] @ StorageType.GPU_Global, b: dace.float64[20] @ StorageType.GPU_Global): # This map will become a GPU kernel for i in dace.map[0:20] @ ScheduleType.GPU_Device: @@ -58,7 +60,40 @@ def runs_on_gpu(a: dace.float64[20] @ StorageType.GPU_Global, b: dace.float64[20 assert cupy.allclose(gpu_b, gpu_a + 1) +def test_inline_storage_hint(): + N = dace.symbol('N') + + @dace.program + def tester(): + b = np.ones(N, dtype=np.float32) @ dace.StorageType.CPU_ThreadLocal + return b + 1 + + sdfg = tester.to_sdfg(simplify=False) + assert sdfg.arrays['b'].storage == StorageType.CPU_ThreadLocal + + b = tester(N=10) + assert np.allclose(b, 2) + + +def test_annotated_storage_hint(): + N = dace.symbol('N') + + @dace.program + def tester(): + b: dace.float32[N] @ dace.StorageType.CPU_ThreadLocal = np.ones(N, dtype=np.float32) + return b + 1 + + sdfg = tester.to_sdfg(simplify=False) + assert sdfg.arrays['b'].storage == StorageType.CPU_ThreadLocal + + b = tester(N=10) + assert np.allclose(b, 2) + + if __name__ == "__main__": - test_storage() - test_schedule() - test_pythonmode() + if cupy is not None: + test_storage() + test_schedule() + test_pythonmode() + test_inline_storage_hint() + test_annotated_storage_hint() diff --git a/tests/sdfg/cycles_test.py b/tests/sdfg/cycles_test.py index 480392ab2d..b01aec55fd 100644 --- a/tests/sdfg/cycles_test.py +++ b/tests/sdfg/cycles_test.py @@ -2,7 +2,7 @@ import pytest import dace - +from dace.sdfg.validation import InvalidSDFGError def test_cycles(): with pytest.raises(ValueError, match="Found cycles.*"): @@ -29,6 +29,23 @@ def test_cycles_memlet_path(): sdfg.validate() +def test_cycles_1562(): + """ + Test for issue #1562. 
+    """
+    with pytest.raises(InvalidSDFGError, match="cycles"):
+        sdfg = dace.SDFG("foo")
+        state = sdfg.add_state()
+        mentry_2, mexit_2 = state.add_map("map_2", dict(i="0:9"))
+        mentry_6, mexit_6 = state.add_map("map_6", dict(i="0:9"))
+        mentry_8, mexit_8 = state.add_map("map_8", dict(i="0:9"))
+        state.add_edge(mentry_8, "OUT_0", mentry_6, "IN_0", dace.Memlet(data="bla", subset='0:9'))
+        state.add_edge(mentry_6, "OUT_0", mentry_2, "IN_0", dace.Memlet(data="bla", subset='0:9'))
+        state.add_edge(mentry_2, "OUT_0", mentry_6, "IN_0", dace.Memlet(data="bla", subset='0:9'))
+        sdfg.validate()
+
+
 if __name__ == '__main__':
     test_cycles()
     test_cycles_memlet_path()
+    test_cycles_1562()

From 1d8a693ef9a76af21b53e3a97c18cbf66b7ad92b Mon Sep 17 00:00:00 2001
From: Christos Kotsalos
Date: Wed, 30 Oct 2024 06:31:02 +0100
Subject: [PATCH 13/43] infer_symbols_from_datadescriptor: modification to infer offset (#1525)

Small modification to infer the offset, in addition to the shape and
strides, from the data descriptor (needed in gt4py).

---
 dace/frontend/python/parser.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dace/frontend/python/parser.py b/dace/frontend/python/parser.py
index d99be1265d..d03759fa8e 100644
--- a/dace/frontend/python/parser.py
+++ b/dace/frontend/python/parser.py
@@ -92,14 +92,15 @@ def infer_symbols_from_datadescriptor(sdfg: SDFG,
             desc = sdfg.arrays[arg_name]
             if not hasattr(desc, 'shape') or not hasattr(arg_val, 'shape'):
                 continue
-            symbolic_values = list(desc.shape) + list(getattr(desc, 'strides', []))
+            symbolic_values = list(desc.shape) + list(getattr(desc, 'strides', [])) + list(getattr(desc, 'offset', []))
             given_values = list(arg_val.shape)
             given_strides = []
             if hasattr(arg_val, 'strides'):
                 # NumPy arrays use bytes in strides
                 factor = getattr(arg_val, 'itemsize', 1)
                 given_strides = [s // factor for s in arg_val.strides]
-            given_values += given_strides
+            given_offset = [o for o in arg_val.offset] if hasattr(arg_val, 'offset') else []
+            given_values += given_strides + given_offset
 
             for sym_dim, real_dim in zip(symbolic_values, given_values):
                 repldict = {}

From 3c164c44900a4b89685ff77105b6e63f2ed9759b Mon Sep 17 00:00:00 2001
From: Yakup Koray Budanaz
Date: Wed, 30 Oct 2024 06:31:49 +0100
Subject: [PATCH 14/43] Add CFG to generate_scope in tutorials (#1706)

The CFG is missing from the codegen tutorial's `generate_scope`, resulting
in an error during code generation. Adding `cfg` as a parameter fixes it.
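
For reference, a minimal sketch of the corrected hook (all names follow the
tutorial cell changed below): `cfg` is the control flow region currently
being generated, e.g., the SDFG itself or a loop region, and it must also be
forwarded to `dispatch_subgraph`:

    def generate_scope(self, sdfg: dace.SDFG, cfg: dace.ControlFlowRegion,
                       scope: ScopeSubgraphView, state_id: int,
                       function_stream: CodeIOStream, callsite_stream: CodeIOStream):
        # ... emit the loop headers for this scope here ...
        self.dispatcher.dispatch_subgraph(sdfg, cfg, scope, state_id,
                                          function_stream, callsite_stream,
                                          skip_entry_node=True)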
---
 tutorials/codegen.ipynb | 44 +++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/tutorials/codegen.ipynb b/tutorials/codegen.ipynb
index a6effd7996..2c79f1a2e0 100644
--- a/tutorials/codegen.ipynb
+++ b/tutorials/codegen.ipynb
@@ -480,48 +480,50 @@
     "        self.frame = frame_codegen\n",
     "        # Can be used to dispatch other code generators for allocation/nodes\n",
     "        self.dispatcher = frame_codegen.dispatcher\n",
-    "        \n",
+    "\n",
     "        ################################################################\n",
-    "        # Register handlers/hooks through dispatcher: Can be used for \n",
+    "        # Register handlers/hooks through dispatcher: Can be used for\n",
    "        # nodes, memory copy/allocation, scopes, states, and more.\n",
-    "        \n",
+    "\n",
     "        # In this case, register scopes\n",
     "        self.dispatcher.register_map_dispatcher(dace.ScheduleType.LoopyLoop, self)\n",
-    "        \n",
+    "\n",
     "        # You can similarly use register_{array,copy,node,state}_dispatcher\n",
-    "    \n",
-    "    # A scope dispatcher will trigger a method called generate_scope whenever \n",
+    "\n",
+    "    # A scope dispatcher will trigger a method called generate_scope whenever\n",
     "    # an SDFG has a scope with that schedule\n",
-    "    def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView,\n",
-    "                       state_id: int, function_stream: CodeIOStream,\n",
-    "                       callsite_stream: CodeIOStream):\n",
+    "    def generate_scope(self, sdfg: dace.SDFG, cfg: dace.ControlFlowRegion,\n",
+    "                       scope: ScopeSubgraphView, state_id: int,\n",
+    "                       function_stream: CodeIOStream, callsite_stream: CodeIOStream):\n",
     "        # The parameters here are:\n",
     "        # sdfg: The SDFG we are currently generating.\n",
+    "        # cfg: The current control flow graph (CFG) we are currently generating. For example,\n",
+    "        #      it can be the SDFG or a loop region.\n",
     "        # scope: The subgraph of the state containing only the scope (map contents)\n",
     "        #        we want to generate the code for.\n",
-    "        # state_id: The state in the SDFG the subgraph is taken from (i.e., \n",
+    "        # state_id: The state in the SDFG the subgraph is taken from (i.e.,\n",
     "        #           `sdfg.node(state_id)` is the same as `scope.graph`)\n",
     "        # function_stream: A cursor to the global code (which can be used to define\n",
     "        #                  functions, hence the name).\n",
     "        # callsite_stream: A cursor to the current location in the code, most of\n",
     "        #                  the code is generated here.\n",
-    "        \n",
+    "\n",
     "        # We can get the map entry node from the scope graph\n",
     "        entry_node = scope.source_nodes()[0]\n",
-    "        \n",
+    "\n",
     "        # First, generate an opening brace (for instrumentation and dynamic map ranges)\n",
     "        callsite_stream.write('{', sdfg, state_id, entry_node)\n",
-    "        \n",
+    "\n",
     "        ################################################################\n",
-    "        # Generate specific code: We will generate a reversed loop with a \n",
+    "        # Generate specific code: We will generate a reversed loop with a\n",
     "        # comment for each dimension of the map. For the sake of simplicity,\n",
     "        # dynamic map ranges are not supported.\n",
-    "        \n",
+    "\n",
     "        for param, rng in zip(entry_node.map.params, entry_node.map.range):\n",
     "            # We use the sym2cpp function from the cpp support functions\n",
     "            # to convert symbolic expressions to proper C++\n",
     "            begin, end, stride = (sym2cpp(r) for r in rng)\n",
-    "            \n",
+    "\n",
     "            # Every write is optionally (but recommended to be) tagged with\n",
     "            # 1-3 extra arguments, serving as line information to match\n",
     "            # SDFG, state, and graph nodes/edges to written code.\n",
@@ -529,17 +531,17 @@
     "                for (auto {param} = {end}; {param} >= {begin}; {param} -= {stride}) {{''',\n",
     "                sdfg, state_id, entry_node\n",
     "            )\n",
-    "            \n",
+    "\n",
     "            # NOTE: CodeIOStream will automatically take care of indentation for us.\n",
-    "            \n",
-    "            \n",
+    "\n",
+    "\n",
     "        # Now that the loops have been defined, use the dispatcher to invoke any\n",
     "        # code generator (including this one) that is registered to deal with\n",
     "        # the internal nodes in the subgraph. We skip the MapEntry node.\n",
-    "        self.dispatcher.dispatch_subgraph(sdfg, scope, state_id,\n",
+    "        self.dispatcher.dispatch_subgraph(sdfg, cfg, scope, state_id,\n",
     "                                          function_stream, callsite_stream,\n",
     "                                          skip_entry_node=True)\n",
-    "        \n",
+    "\n",
     "        # NOTE: Since skip_exit_node above is set to False, closing braces will\n",
     "        # be automatically generated"
   ]

From 1343a6e6440d808644ffccd5937ae1fcb136b92e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philip=20M=C3=BCller?= <147368808+philip-paul-mueller@users.noreply.github.com>
Date: Wed, 30 Oct 2024 16:51:40 +0100
Subject: [PATCH 15/43] Better `CopyToMap` (#1675)

By default, the transformation uses a linearization followed by a
delinearization approach. While this is needed to copy certain shapes, it is
unnecessarily complicated for memlets such as
`a[0:10, 20:30] -> 40:50, 60:70`. This PR adds special cases where the
source and destination subsets have the same size and turns them into simple
copies. It also supports the case where some dimensions are one, i.e.,
memlets such as `a[0:10, 0:10] -> 0:10, 1, 0:20`. Tests were added for all
cases.

Most importantly, the transformation can now also be applied if the strides
are the same (through the new `ignore_strides` property); previously, this
case was blocked.

This PR helps to _avoid_ errors that are related to
[Issue#1674](https://github.com/spcl/dace/issues/1674), but it is not a fix
or a solution to it.

---
 dace/transformation/dataflow/copy_to_map.py |  90 ++++++++---
 tests/transformations/copy_to_map_test.py   | 164 +++++++++++++++++++-
 2 files changed, 229 insertions(+), 25 deletions(-)

diff --git a/dace/transformation/dataflow/copy_to_map.py b/dace/transformation/dataflow/copy_to_map.py
index 5b4260ad55..9c4dbce627 100644
--- a/dace/transformation/dataflow/copy_to_map.py
+++ b/dace/transformation/dataflow/copy_to_map.py
@@ -1,12 +1,13 @@
 # Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
-from dace import dtypes, symbolic, data, subsets, Memlet
+from dace import dtypes, symbolic, data, subsets, Memlet, properties
 from dace.sdfg.scope import is_devicelevel_gpu
 from dace.transformation import transformation as xf
 from dace.sdfg import SDFGState, SDFG, nodes, utils as sdutil
 from typing import Tuple
+import itertools
 
-
+@properties.make_properties
 class CopyToMap(xf.SingleStateTransformation):
     """ Converts an access node -> access node copy into a map. Useful for generating manual code and
@@ -14,6 +15,10 @@ class CopyToMap(xf.SingleStateTransformation):
     """
     a = xf.PatternNode(nodes.AccessNode)
     b = xf.PatternNode(nodes.AccessNode)
+    ignore_strides = properties.Property(
+        default=False,
+        desc='Ignore the strides of the data containers; defaults to `False`.',
+    )
 
     @classmethod
     def expressions(cls):
@@ -31,7 +36,10 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi
         if isinstance(self.b.desc(sdfg), data.View):
             if sdutil.get_view_node(graph, self.b) == self.a:
                 return False
-        if self.a.desc(sdfg).strides == self.b.desc(sdfg).strides:
+        if (not self.ignore_strides) and self.a.desc(sdfg).strides == self.b.desc(sdfg).strides:
             return False
+        # Ensure that the edge goes from `a` to `b`.
+        if not any(edge.dst is self.b for edge in graph.out_edges(self.a)):
+            return False
 
         return True
@@ -62,31 +70,69 @@ def delinearize_linearize(self, desc: data.Array, copy_shape: Tuple[symbolic.Sym
         return subsets.Range([(ind, ind, 1) for ind in cur_index])
 
     def apply(self, state: SDFGState, sdfg: SDFG):
-        adesc = self.a.desc(sdfg)
-        bdesc = self.b.desc(sdfg)
-        edge = state.edges_between(self.a, self.b)[0]
+        avnode = self.a
+        av = avnode.data
+        adesc = avnode.desc(sdfg)
+        bvnode = self.b
+        bv = bvnode.data
+        bdesc = bvnode.desc(sdfg)
+
+        edge = state.edges_between(avnode, bvnode)[0]
+        src_subset = edge.data.get_src_subset(edge, state)
+        if src_subset is None:
+            src_subset = subsets.Range.from_array(adesc)
+        src_subset_size = src_subset.size()
+        red_src_subset_size = tuple(s for s in src_subset_size if s != 1)
+
+        dst_subset = edge.data.get_dst_subset(edge, state)
+        if dst_subset is None:
+            dst_subset = subsets.Range.from_array(bdesc)
+        dst_subset_size = dst_subset.size()
+        red_dst_subset_size = tuple(s for s in dst_subset_size if s != 1)
 
         if len(adesc.shape) >= len(bdesc.shape):
-            copy_shape = edge.data.get_src_subset(edge, state).size()
+            copy_shape = src_subset_size
             copy_a = True
         else:
-            copy_shape = edge.data.get_dst_subset(edge, state).size()
+            copy_shape = dst_subset_size
             copy_a = False
-        maprange = {f'__i{i}': (0, s - 1, 1) for i, s in enumerate(copy_shape)}
-
-        av = self.a.data
-        bv = self.b.data
-        avnode = self.a
-        bvnode = self.b
-
-        # Linearize and delinearize to get index expression for other side
-        if copy_a:
-            a_index = [symbolic.pystr_to_symbolic(f'__i{i}') for i in range(len(copy_shape))]
-            b_index = self.delinearize_linearize(bdesc, copy_shape, edge.data.get_dst_subset(edge, state))
+        if tuple(src_subset_size) == tuple(dst_subset_size):
+            # The two subsets have exactly the same shape, so we can just copy with an offset.
+            #  We use different index variables for the tests only.
+            maprange = {f'__j{i}': (0, s - 1, 1) for i, s in enumerate(copy_shape)}
+            a_index = [symbolic.pystr_to_symbolic(f'__j{i} + ({src_subset[i][0]})') for i in range(len(copy_shape))]
+            b_index = [symbolic.pystr_to_symbolic(f'__j{i} + ({dst_subset[i][0]})') for i in range(len(copy_shape))]
+        elif red_src_subset_size == red_dst_subset_size and (len(red_dst_subset_size) > 0):
+            # If we remove all size-1 dimensions, the two subsets have the same size.
+            #  This is essentially the memlet `a[0:10, 2, 0:10] -> 0:10, 10:20`.
+            #  We use different index variables only for the tests, but we would have to
+            #  recreate the indices anyway.
+            maprange = {f'__j{i}': (0, s - 1, 1) for i, s in enumerate(red_src_subset_size)}
+            cnt = itertools.count(0)
+            a_index = [
+                symbolic.pystr_to_symbolic(f'{src_subset[i][0]}')
+                if s == 1
+                else symbolic.pystr_to_symbolic(f'__j{next(cnt)} + ({src_subset[i][0]})')
+                for i, s in enumerate(src_subset_size)
+            ]
+            cnt = itertools.count(0)
+            b_index = [
+                symbolic.pystr_to_symbolic(f'{dst_subset[i][0]}')
+                if s == 1
+                else symbolic.pystr_to_symbolic(f'__j{next(cnt)} + ({dst_subset[i][0]})')
+                for i, s in enumerate(dst_subset_size)
+            ]
         else:
-            a_index = self.delinearize_linearize(adesc, copy_shape, edge.data.get_src_subset(edge, state))
-            b_index = [symbolic.pystr_to_symbolic(f'__i{i}') for i in range(len(copy_shape))]
+            # We have to delinearize and linearize.
+            #  We use different index variables for the tests.
+            maprange = {f'__i{i}': (0, s - 1, 1) for i, s in enumerate(copy_shape)}
+            if copy_a:
+                a_index = [symbolic.pystr_to_symbolic(f'__i{i}') for i in range(len(copy_shape))]
+                b_index = self.delinearize_linearize(bdesc, copy_shape, edge.data.get_dst_subset(edge, state))
+            else:
+                a_index = self.delinearize_linearize(adesc, copy_shape, edge.data.get_src_subset(edge, state))
+                b_index = [symbolic.pystr_to_symbolic(f'__i{i}') for i in range(len(copy_shape))]
 
         a_subset = subsets.Range([(ind, ind, 1) for ind in a_index])
         b_subset = subsets.Range([(ind, ind, 1) for ind in b_index])
@@ -101,7 +147,7 @@ def apply(self, state: SDFGState, sdfg: SDFG):
                 schedule = dtypes.ScheduleType.GPU_Device
 
         # Add copy map
-        t, _, _ = state.add_mapped_tasklet('copy',
+        t, _, _ = state.add_mapped_tasklet(f'copy_{av}_{bv}',
                                            maprange,
                                            dict(__inp=Memlet(data=av, subset=a_subset)),
                                            '__out = __inp',
diff --git a/tests/transformations/copy_to_map_test.py b/tests/transformations/copy_to_map_test.py
index 2b237d84d5..a0931fa1b8 100644
--- a/tests/transformations/copy_to_map_test.py
+++ b/tests/transformations/copy_to_map_test.py
@@ -4,6 +4,8 @@
 import copy
 import pytest
 import numpy as np
+import re
+from typing import Tuple, Optional
 
 
 def _copy_to_map(storage: dace.StorageType):
@@ -102,9 +104,165 @@ def test_preprocess():
     assert np.allclose(out, inp)
 
 
+def _perform_non_lin_delin_test(
+    sdfg: dace.SDFG,
+) -> bool:
+    """Performs a test for the special case of CopyToMap that bypasses linearizing and delinearizing.
+    """
+    assert sdfg.number_of_nodes() == 1
+    state: dace.SDFGState = sdfg.states()[0]
+    assert state.number_of_nodes() == 2
+    assert state.number_of_edges() == 1
+    assert all(isinstance(node, dace.nodes.AccessNode) for node in state.nodes())
+    sdfg.validate()
+
+    a = np.random.rand(*sdfg.arrays["a"].shape)
+    b_unopt = np.random.rand(*sdfg.arrays["b"].shape)
+    b_opt = b_unopt.copy()
+    sdfg(a=a, b=b_unopt)
+
+    nb_runs = sdfg.apply_transformations_repeated(CopyToMap, validate=True, options={"ignore_strides": True})
+    assert nb_runs == 1, f"Expected 1 application, but {nb_runs} were performed."
+
+    # Now look for the tasklet and check if the memlets follow the expected
+    #  simple pattern.
+ tasklet: dace.nodes.Tasklet = next(iter([node for node in state.nodes() if isinstance(node, dace.nodes.Tasklet)])) + pattern: re.Pattern = re.compile(r"(__j[0-9])|(__j[0-9]+\s*\+\s*[0-9]+)|([0-9]+)") + + assert state.in_degree(tasklet) == 1 + assert state.out_degree(tasklet) == 1 + in_edge = next(iter(state.in_edges(tasklet))) + out_edge = next(iter(state.out_edges(tasklet))) + + assert all(pattern.fullmatch(str(idxs[0]).strip()) for idxs in in_edge.data.src_subset), f"IN: {in_edge.data.src_subset}" + assert all(pattern.fullmatch(str(idxs[0]).strip()) for idxs in out_edge.data.dst_subset), f"OUT: {out_edge.data.dst_subset}" + + # Now call it again after the optimization. + sdfg(a=a, b=b_opt) + assert np.allclose(b_unopt, b_opt) + + return True + +def _make_non_lin_delin_sdfg( + shape_a: Tuple[int, ...], + shape_b: Optional[Tuple[int, ...]] = None +) -> Tuple[dace.SDFG, dace.SDFGState, dace.nodes.AccessNode, dace.nodes.AccessNode]: + + if shape_b is None: + shape_b = shape_a + + sdfg = dace.SDFG("bypass1") + state = sdfg.add_state(is_start_block=True) + + ac = [] + for name, shape in [('a', shape_a), ('b', shape_b)]: + sdfg.add_array( + name=name, + shape=shape, + dtype=dace.float64, + transient=False, + ) + ac.append(state.add_access(name)) + + return sdfg, state, ac[0], ac[1] + + +def test_non_lin_delin_1(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((10, 10)) + state.add_nedge( + a, + b, + dace.Memlet("a[0:10, 0:10] -> [0:10, 0:10]"), + ) + _perform_non_lin_delin_test(sdfg) + +def test_non_lin_delin_2(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((10, 10), (100, 100)) + state.add_nedge( + a, + b, + dace.Memlet("a[0:10, 0:10] -> [50:60, 40:50]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_3(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((100, 100), (100, 100)) + state.add_nedge( + a, + b, + dace.Memlet("a[1:11, 20:30] -> [50:60, 40:50]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_4(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((100, 4, 100), (100, 100)) + state.add_nedge( + a, + b, + dace.Memlet("a[1:11, 2, 20:30] -> [50:60, 40:50]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_5(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((100, 4, 100), (100, 10, 100)) + state.add_nedge( + a, + b, + dace.Memlet("a[1:11, 2, 20:30] -> [50:60, 4, 40:50]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_6(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((100, 100), (100, 10, 100)) + state.add_nedge( + a, + b, + dace.Memlet("a[1:11, 20:30] -> [50:60, 4, 40:50]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_7(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((10, 10), (20, 20)) + state.add_nedge( + a, + b, + dace.Memlet("b[5:15, 6:16]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_8(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((20, 20), (10, 10)) + state.add_nedge( + a, + b, + dace.Memlet("a[5:15, 6:16]"), + ) + _perform_non_lin_delin_test(sdfg) + + if __name__ == '__main__': + test_non_lin_delin_1() + test_non_lin_delin_2() + test_non_lin_delin_3() + test_non_lin_delin_4() + test_non_lin_delin_5() + test_non_lin_delin_6() + test_non_lin_delin_7() + test_non_lin_delin_8() + test_copy_to_map() - test_copy_to_map_gpu() test_flatten_to_map() - test_flatten_to_map_gpu() - test_preprocess() + try: + import cupy + test_copy_to_map_gpu() + test_flatten_to_map_gpu() + test_preprocess() + except ModuleNotFoundError as E: + if "'cupy'" not in str(E): 
+ raise From 2811e40486f8ed6c21f348abfa93747b8edd6215 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 30 Oct 2024 12:12:38 -0700 Subject: [PATCH 16/43] More NumPy operation implementations (#1498) * Concatenation and stacking (numpy.concatenate, numpy.stack, and their variants) * numpy.linspace * Fix nested attribute parsing (Fixes #1295) * numpy.clip * numpy.split and its variants * numpy.full variants (zeros, ones, etc.) with a single value for shape (`np.zeros(N)`) * NumPy-compatible numpy.arange dtype inference * `numpy.fft.{fft, ifft}` --- dace/codegen/cppunparse.py | 8 +- dace/distr_types.py | 4 + dace/frontend/common/distr.py | 32 +- dace/frontend/python/newast.py | 57 +- dace/frontend/python/preprocessing.py | 2 + dace/frontend/python/replacements.py | 641 ++++++++++++++++++-- dace/libraries/blas/nodes/gemv.py | 16 +- dace/libraries/fft/__init__.py | 6 + dace/libraries/fft/algorithms/__init__.py | 0 dace/libraries/fft/algorithms/dft.py | 45 ++ dace/libraries/fft/environments/__init__.py | 2 + dace/libraries/fft/environments/cufft.py | 21 + dace/libraries/fft/nodes/__init__.py | 2 + dace/libraries/fft/nodes/fft.py | 204 +++++++ dace/libraries/standard/nodes/transpose.py | 31 +- tests/library/fft_test.py | 101 +++ tests/numpy/array_creation_test.py | 42 ++ tests/numpy/attention_simple_test.py | 2 +- tests/numpy/attribute_test.py | 43 ++ tests/numpy/concat_test.py | 133 ++++ tests/numpy/nested_call_subarray_test.py | 4 +- tests/numpy/split_test.py | 142 +++++ tests/numpy/ufunc_test.py | 6 + 23 files changed, 1458 insertions(+), 86 deletions(-) create mode 100644 dace/libraries/fft/__init__.py create mode 100644 dace/libraries/fft/algorithms/__init__.py create mode 100644 dace/libraries/fft/algorithms/dft.py create mode 100644 dace/libraries/fft/environments/__init__.py create mode 100644 dace/libraries/fft/environments/cufft.py create mode 100644 dace/libraries/fft/nodes/__init__.py create mode 100644 dace/libraries/fft/nodes/fft.py create mode 100644 tests/library/fft_test.py create mode 100644 tests/numpy/concat_test.py create mode 100644 tests/numpy/split_test.py diff --git a/dace/codegen/cppunparse.py b/dace/codegen/cppunparse.py index 18ee00721b..edeb5270ca 100644 --- a/dace/codegen/cppunparse.py +++ b/dace/codegen/cppunparse.py @@ -749,6 +749,8 @@ def _Num(self, t): # For complex values, use ``dtype_to_typeclass`` if isinstance(t_n, complex): dtype = dtypes.dtype_to_typeclass(complex) + repr_n = f'{dtype}({t_n.real}, {t_n.imag})' + # Handle large integer values if isinstance(t_n, int): @@ -765,10 +767,8 @@ def _Num(self, t): elif bits >= 64: warnings.warn(f'Value wider than 64 bits encountered in expression ({t_n}), emitting as-is') - if repr_n.endswith("j"): - self.write("%s(0, %s)" % (dtype, repr_n.replace("inf", INFSTR)[:-1])) - else: - self.write(repr_n.replace("inf", INFSTR)) + repr_n = repr_n.replace("inf", INFSTR) + self.write(repr_n) def _List(self, t): raise NotImplementedError('Invalid C++') diff --git a/dace/distr_types.py b/dace/distr_types.py index 1b595a1b84..b60eb4925e 100644 --- a/dace/distr_types.py +++ b/dace/distr_types.py @@ -96,6 +96,10 @@ def _validate(self): raise ValueError('Color must have only logical true (1) or false (0) values.') return True + @property + def dtype(self): + return type(self) + def to_json(self): attrs = serialize.all_properties_to_json(self) retdict = {"type": type(self).__name__, "attributes": attrs} diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 88a6b0c54a..c517028d53 100644 --- 
a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -50,14 +50,14 @@ def _cart_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, dims: Shape @oprepo.replaces_method('Intracomm', 'Create_cart') -def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', dims: ShapeType): +def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, dims: ShapeType): """ Equivalent to `dace.comm.Cart_create(dims). :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. :return: Name of the new process-grid descriptor. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _cart_create(pv, sdfg, state, dims) @@ -186,13 +186,13 @@ def _bcast(pv: ProgramVisitor, def _intracomm_bcast(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, - comm: Tuple[str, 'Comm'], + comm: str, buffer: str, root: Union[str, sp.Expr, Number] = 0): """ Equivalent to `dace.comm.Bcast(buffer, root)`. """ from mpi4py import MPI - comm_name, comm_obj = comm + comm_name, comm_obj = comm, pv.globals[comm] if comm_obj == MPI.COMM_WORLD: return _bcast(pv, sdfg, state, buffer, root) # NOTE: Highly experimental @@ -267,12 +267,12 @@ def _alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, inbuffer: str, @oprepo.replaces_method('Intracomm', 'Alltoall') -def _intracomm_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', inp_buffer: str, +def _intracomm_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, inp_buffer: str, out_buffer: str): """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _alltoall(pv, sdfg, state, inp_buffer, out_buffer) @@ -303,12 +303,12 @@ def _allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op @oprepo.replaces_method('Intracomm', 'Allreduce') -def _intracomm_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', inp_buffer: 'InPlace', +def _intracomm_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, inp_buffer: 'InPlace', out_buffer: str, op: str): """ Equivalent to `dace.comm.Allreduce(out_buffer, op)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') if inp_buffer != MPI.IN_PLACE: @@ -470,12 +470,12 @@ def _send(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Send') -def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.end(buffer, dst, tag)`. 
""" from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _send(pv, sdfg, state, buffer, dst, tag) @@ -592,12 +592,12 @@ def _isend(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Isend') -def _intracomm_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') req, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) @@ -690,12 +690,12 @@ def _recv(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Recv') -def _intracomm_Recv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_Recv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Recv(buffer, src, tagq)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _recv(pv, sdfg, state, buffer, src, tag) @@ -810,12 +810,12 @@ def _irecv(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Irecv') -def _intracomm_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Irecv(buffer, src, tag, req)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') req, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 78890c9cdd..b4e83cc1e7 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1342,7 +1342,7 @@ def defined(self): # MPI-related stuff result.update({ - k: self.sdfg.process_grids[v] + v: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids }) try: @@ -4461,7 +4461,14 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): func = node.func.value if func is None: - funcname = rname(node) + func_result = self.visit(node.func) + if isinstance(func_result, str): + if isinstance(node.func, ast.Attribute): + funcname = f'{func_result}.{node.func.attr}' + else: + funcname = func_result + else: + funcname = rname(node) # Check if the function exists as an SDFG in a different module modname = until(funcname, '.') if ('.' 
in funcname and len(modname) > 0 and modname in self.globals @@ -4576,7 +4583,7 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): arg = self.scope_vars[modname] else: # Fallback to (name, object) - arg = (modname, self.defined[modname]) + arg = modname args.append(arg) # Otherwise, try to find a default implementation for the SDFG elif not found_ufunc: @@ -4795,12 +4802,18 @@ def _visitname(self, name: str, node: ast.AST): self.sdfg.add_symbol(result.name, result.dtype) return result + if name in self.closure.callbacks: + return name + if name in self.sdfg.arrays: return name if name in self.sdfg.symbols: return name + if name in __builtins__: + return name + if name not in self.scope_vars: raise DaceSyntaxError(self, node, 'Use of undefined variable "%s"' % name) rname = self.scope_vars[name] @@ -4845,33 +4858,43 @@ def visit_NameConstant(self, node: NameConstant): return self.visit_Constant(node) def visit_Attribute(self, node: ast.Attribute): - # If visiting an attribute, return attribute value if it's of an array or global - name = until(astutils.unparse(node), '.') - result = self._visitname(name, node) + result = self.visit(node.value) + if isinstance(result, (tuple, list, dict)): + if len(result) > 1: + raise DaceSyntaxError( + self, node.value, f'{type(result)} object cannot use attributes. Try storing the ' + 'object to a different variable first (e.g., ``a = result; a.attribute``') + else: + result = result[0] + tmpname = f"{result}.{astutils.unparse(node.attr)}" if tmpname in self.sdfg.arrays: return tmpname + if isinstance(result, str) and result in self.sdfg.arrays: arr = self.sdfg.arrays[result] elif isinstance(result, str) and result in self.scope_arrays: arr = self.scope_arrays[result] else: - return result + arr = None # Try to find sub-SDFG attribute - func = oprepo.Replacements.get_attribute(type(arr), node.attr) - if func is not None: - # A new state is likely needed here, e.g., for transposition (ndarray.T) - self._add_state('%s_%d' % (type(node).__name__, node.lineno)) - self.last_block.set_default_lineinfo(self.current_lineinfo) - result = func(self, self.sdfg, self.last_block, result) - self.last_block.set_default_lineinfo(None) - return result + if arr is not None: + func = oprepo.Replacements.get_attribute(type(arr), node.attr) + if func is not None: + # A new state is likely needed here, e.g., for transposition (ndarray.T) + self._add_state('%s_%d' % (type(node).__name__, node.lineno)) + self.last_block.set_default_lineinfo(self.current_lineinfo) + result = func(self, self.sdfg, self.last_block, result) + self.last_block.set_default_lineinfo(None) + return result # Otherwise, try to find compile-time attribute (such as shape) try: - return getattr(arr, node.attr) - except KeyError: + if arr is not None: + return getattr(arr, node.attr) + return getattr(result, node.attr) + except (AttributeError, KeyError): return result def visit_List(self, node: ast.List): diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index eca07a4930..f51b67ddb2 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -527,6 +527,8 @@ def global_value_to_node(self, elif isinstance(value, symbolic.symbol): # Symbols resolve to the symbol name newnode = ast.Name(id=value.name, ctx=ast.Load()) + elif isinstance(value, sympy.Basic): # Symbolic or constant expression + newnode = ast.parse(symbolic.symstr(value)).body[0].value elif isinstance(value, ast.Name): newnode = ast.Name(id=value.id, 
ctx=ast.Load()) elif (dtypes.isconstant(value) or isinstance(value, (StringLiteral, SDFG)) or hasattr(value, '__sdfg__')): diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 537fef97bf..c5b3e3b2a2 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -313,6 +313,9 @@ def _numpy_full(pv: ProgramVisitor, """ Creates and array of the specified shape and initializes it with the fill value. """ + if isinstance(shape, Number) or symbolic.issymbolic(shape): + shape = [shape] + is_data = False if isinstance(fill_value, (Number, np.bool_)): vtype = dtypes.dtype_to_typeclass(type(fill_value)) @@ -548,8 +551,13 @@ def _numpy_rot90(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, k=1 @oprepo.replaces('numpy.arange') @oprepo.replaces('dace.arange') -def _arange(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, *args, **kwargs): - """ Implementes numpy.arange """ +def _arange(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + *args, + dtype: dtypes.typeclass = None, + like: Optional[str] = None): + """ Implements numpy.arange """ start = 0 stop = None @@ -563,35 +571,42 @@ def _arange(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, *args, **kwargs): else: start, stop, step = args + if isinstance(start, str): + raise TypeError(f'Cannot compile numpy.arange with a scalar start value "{start}" (only constants and symbolic ' + 'expressions are supported). Please use numpy.linspace instead.') + if isinstance(stop, str): + raise TypeError(f'Cannot compile numpy.arange with a scalar stop value "{stop}" (only constants and symbolic ' + 'expressions are supported). Please use numpy.linspace instead.') + if isinstance(step, str): + raise TypeError(f'Cannot compile numpy.arange with a scalar step value "{step}" (only constants and symbolic ' + 'expressions are supported). 
Please use numpy.linspace instead.') + actual_step = step if isinstance(start, Number) and isinstance(stop, Number): actual_step = type(start + step)(start + step) - start if any(not isinstance(s, Number) for s in [start, stop, step]): - shape = (symbolic.int_ceil(stop - start, step), ) + if step == 1: # Common case where ceiling is not necessary + shape = (stop - start,) + else: + shape = (symbolic.int_ceil(stop - start, step), ) else: shape = (np.int64(np.ceil((stop - start) / step)), ) - if not isinstance(shape[0], Number) and ('dtype' not in kwargs or kwargs['dtype'] == None): - raise NotImplementedError("The current implementation of numpy.arange requires that the output dtype is given " - "when at least one of (start, stop, step) is symbolic.") + # Infer dtype from input arguments + if dtype is None: + dtype, _ = _result_type(args) + # TODO: Unclear what 'like' does - # if 'like' in kwargs and kwargs['like'] != None: - # outname, outarr = sdfg.add_temp_transient_like(sdfg.arrays[kwargs['like']]) + # if like is not None: + # outname, outarr = sdfg.add_temp_transient_like(sdfg.arrays[like]) # outarr.shape = shape - if 'dtype' in kwargs and kwargs['dtype'] != None: - dtype = kwargs['dtype'] - if not isinstance(dtype, dtypes.typeclass): - dtype = dtypes.dtype_to_typeclass(dtype) - outname, outarr = sdfg.add_temp_transient(shape, dtype) - else: - # infer dtype based on args's dtype - # (since the `dtype` keyword argument isn't given, none of the arguments can be symbolic) - if any(isinstance(arg, (float, np.float32, np.float64)) for arg in args): - dtype = dtypes.float64 - else: - dtype = dtypes.int64 - outname, outarr = sdfg.add_temp_transient(shape, dtype) + if not isinstance(dtype, dtypes.typeclass): + dtype = dtypes.dtype_to_typeclass(dtype) + outname, outarr = sdfg.add_temp_transient(shape, dtype) + + start = f'decltype(__out)({start})' + actual_step = f'decltype(__out)({actual_step})' state.add_mapped_tasklet(name="_numpy_arange_", map_ranges={'__i': f"0:{shape[0]}"}, @@ -603,6 +618,131 @@ def _arange(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, *args, **kwargs): return outname +def _add_axis_to_shape(shape: Sequence[symbolic.SymbolicType], axis: int, + axis_value: Any) -> List[symbolic.SymbolicType]: + if axis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + if axis < 0: + naxis = len(shape) + 1 + axis + if naxis < 0 or naxis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + axis = naxis + + # Make a new shape list with the inserted dimension + new_shape = [None] * (len(shape) + 1) + for i in range(len(shape) + 1): + if i == axis: + new_shape[i] = axis_value + elif i < axis: + new_shape[i] = shape[i] + else: + new_shape[i] = shape[i - 1] + + return new_shape + + +@oprepo.replaces('numpy.linspace') +def _linspace(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + start: Union[Number, symbolic.SymbolicType, str], + stop: Union[Number, symbolic.SymbolicType, str], + num: Union[Integral, symbolic.SymbolicType] = 50, + endpoint: bool = True, + retstep: bool = False, + dtype: dtypes.typeclass = None, + axis: int = 0): + """ Implements numpy.linspace """ + # Argument checks + if not isinstance(num, (Integral, sp.Basic)): + raise TypeError('numpy.linspace can only be compiled when the ``num`` argument is symbolic or constant.') + if not isinstance(axis, Integral): + raise TypeError('numpy.linspace can only be compiled when the ``axis`` argument is constant.') + + # Start 
and stop are broadcast together, then, a new dimension is added to axis (taken from ``ndim + 1``), + # along which the numbers are filled. + start_shape = sdfg.arrays[start].shape if (isinstance(start, str) and start in sdfg.arrays) else [] + stop_shape = sdfg.arrays[stop].shape if (isinstance(stop, str) and stop in sdfg.arrays) else [] + + shape, ranges, outind, ind1, ind2 = _broadcast_together(start_shape, stop_shape) + shape_with_axis = _add_axis_to_shape(shape, axis, num) + ranges_with_axis = _add_axis_to_shape(ranges, axis, ('__sind', f'0:{symbolic.symstr(num)}')) + if outind: + outind_with_axis = _add_axis_to_shape(outind.split(', '), axis, '__sind') + else: + outind_with_axis = ['__sind'] + + if dtype is None: + # Infer output type from start and stop + start_type = sdfg.arrays[start] if (isinstance(start, str) and start in sdfg.arrays) else start + stop_type = sdfg.arrays[stop] if (isinstance(stop, str) and stop in sdfg.arrays) else stop + + dtype, _ = _result_type((start_type, stop_type), 'Add') + + # From the NumPy documentation: The inferred dtype will never be an integer; float is chosen even if the + # arguments would produce an array of integers. + if dtype in (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16, dtypes.uint32, + dtypes.uint64): + dtype = dtypes.dtype_to_typeclass(float) + + outname, _ = sdfg.add_temp_transient(shape_with_axis, dtype) + + if endpoint == True: + num -= 1 + + # Fill in input memlets as necessary + inputs = {} + if isinstance(start, str) and start in sdfg.arrays: + index = f'[{ind1}]' if ind1 else '' + inputs['__start'] = Memlet(f'{start}{index}') + startcode = '__start' + else: + startcode = symbolic.symstr(start) + + if isinstance(stop, str) and stop in sdfg.arrays: + index = f'[{ind2}]' if ind2 else '' + inputs['__stop'] = Memlet(f'{stop}{index}') + stopcode = '__stop' + else: + stopcode = symbolic.symstr(stop) + + # Create tasklet code based on inputs + code = f'__out = {startcode} + __sind * decltype(__out)({stopcode} - {startcode}) / decltype(__out)({symbolic.symstr(num)})' + + state.add_mapped_tasklet(name="linspace", + map_ranges=ranges_with_axis, + inputs=inputs, + code=code, + outputs={'__out': Memlet(f"{outname}[{','.join(outind_with_axis)}]")}, + external_edges=True) + + if retstep == False: + return outname + + # Return step if requested + + # Handle scalar outputs + if not ranges: + ranges = [('__unused', '0:1')] + out_index = f'[{outind}]' + + if len(shape) > 0: + stepname, _ = sdfg.add_temp_transient(shape, dtype) + else: + stepname, _ = sdfg.add_scalar(sdfg.temp_data_name(), dtype, transient=True) + out_index = '[0]' + + state.add_mapped_tasklet( + 'retstep', + ranges, + copy.deepcopy(inputs), + f'__out = decltype(__out)({stopcode} - {startcode}) / decltype(__out)({symbolic.symstr(num)})', + {'__out': Memlet(f"{stepname}{out_index}")}, + external_edges=True) + + return outname, stepname + + @oprepo.replaces('elementwise') @oprepo.replaces('dace.elementwise') def _elementwise(pv: 'ProgramVisitor', @@ -708,9 +848,9 @@ def _simple_call(sdfg: SDFG, state: SDFGState, inpname: str, func: str, restype: def _complex_to_scalar(complex_type: dace.typeclass): - if complex_type is dace.complex64: + if complex_type == dace.complex64: return dace.float32 - elif complex_type is dace.complex128: + elif complex_type == dace.complex128: return dace.float64 else: return complex_type @@ -814,7 +954,8 @@ def _len_array(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, a: str): return sdfg.arrays[a].shape[0] if a in 
@@ -814,7 +954,8 @@ def _len_array(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, a: str):
         return sdfg.arrays[a].shape[0]
     if a in sdfg.constants_prop:
         return len(sdfg.constants[a])
-    raise TypeError(f'`len` is not supported for input "{a}" (type {type(a)})')
+    else:
+        return len(a)
 
 
 @oprepo.replaces('transpose')
@@ -1632,8 +1773,17 @@ def _result_type(arguments: Sequence[Union[str, Number, symbolic.symbol, sp.Basi
         else:  # Operators with 3 or more arguments
             result_type = _np_result_type(dtypes_for_result)
+            coarse_result_type = None
+            if result_type in complex_types:
+                coarse_result_type = 3  # complex
+            elif result_type in float_types:
+                coarse_result_type = 2  # float
+            elif result_type in signed_types:
+                coarse_result_type = 1  # signed integer, bool
+            else:
+                coarse_result_type = 0  # unsigned integer
             for i, t in enumerate(coarse_types):
-                if t != result_type:
+                if t != coarse_result_type:
                     casting[i] = _cast_str(result_type)
 
     return result_type, casting
@@ -2512,6 +2662,13 @@ def _matmult(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, op1: str, op
               code="__out = log1p(__in1)",
               reduce=None,
               initial=np.log1p.identity),
+    clip=dict(name="_numpy_clip_",
+              operator=None,
+              inputs=["__in_a", "__in_amin", "__in_amax"],
+              outputs=["__out"],
+              code="__out = min(max(__in_a, __in_amin), __in_amax)",
+              reduce=None,
+              initial=np.inf),
     sqrt=dict(name="_numpy_sqrt_",
               operator="Sqrt",
               inputs=["__in1"],
@@ -4087,14 +4244,13 @@ def implement_ufunc_outer(visitor: ProgramVisitor, ast_node: ast.Call, sdfg: SDF
 
 
 @oprepo.replaces('numpy.reshape')
-def reshape(
-    pv: ProgramVisitor,
-    sdfg: SDFG,
-    state: SDFGState,
-    arr: str,
-    newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]],
-    order: StringLiteral = StringLiteral('C')
-) -> str:
+def reshape(pv: ProgramVisitor,
+            sdfg: SDFG,
+            state: SDFGState,
+            arr: str,
+            newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]],
+            order: StringLiteral = StringLiteral('C'),
+            strides: Optional[Any] = None) -> str:
     if isinstance(arr, (list, tuple)) and len(arr) == 1:
         arr = arr[0]
     desc = sdfg.arrays[arr]
@@ -4108,10 +4264,11 @@ def reshape(
 
     # New shape and strides as symbolic expressions
     newshape = [symbolic.pystr_to_symbolic(s) for s in newshape]
-    if fortran_strides:
-        strides = [data._prod(newshape[:i]) for i in range(len(newshape))]
-    else:
-        strides = [data._prod(newshape[i + 1:]) for i in range(len(newshape))]
+    if strides is None:
+        if fortran_strides:
+            strides = [data._prod(newshape[:i]) for i in range(len(newshape))]
+        else:
+            strides = [data._prod(newshape[i + 1:]) for i in range(len(newshape))]
 
     newarr, newdesc = sdfg.add_view(arr,
                                     newshape,
@@ -4326,9 +4483,13 @@ def _ndarray_reshape(
     sdfg: SDFG,
     state: SDFGState,
     arr: str,
-    newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]],
+    *newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]],
     order: StringLiteral = StringLiteral('C')
 ) -> str:
+    if len(newshape) == 0:
+        raise TypeError('reshape() takes at least 1 argument (0 given)')
+    if len(newshape) == 1 and isinstance(newshape[0], (list, tuple)):
+        newshape = newshape[0]
     return reshape(pv, sdfg, state, arr, newshape, order)
 
 
@@ -4833,3 +4994,407 @@ def _op(visitor: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: StringLite
 
 for op, method in _boolop_to_method.items():
     _makeboolop(op, method)
+
+
+@oprepo.replaces('numpy.concatenate')
+def _concat(visitor: ProgramVisitor,
+            sdfg: SDFG,
+            state: SDFGState,
+            arrays: Tuple[Any],
+            axis: Optional[int] = 0,
+            out: Optional[Any] = None,
+            *,
+            dtype=None,
+            casting: str = 'same_kind'):
+    if dtype is not None and out is
not None: + raise ValueError('Arguments dtype and out cannot be given together') + if casting != 'same_kind': + raise NotImplementedError('The casting argument is currently unsupported') + if not isinstance(arrays, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile concatenation') + if axis is not None and not isinstance(axis, Integral): + raise ValueError('Axis is not a compile-time evaluatable integer, cannot compile concatenation') + if len(arrays) == 1: + return arrays[0] + for i in range(len(arrays)): + if arrays[i] not in sdfg.arrays: + raise TypeError(f'Index {i} is not an array') + if out is not None: + if out not in sdfg.arrays: + raise TypeError('Output is not an array') + dtype = sdfg.arrays[out].dtype + + descs = [sdfg.arrays[arr] for arr in arrays] + shape = list(descs[0].shape) + + if axis is None: # Flatten arrays, then concatenate + arrays = [flat(visitor, sdfg, state, arr) for arr in arrays] + descs = [sdfg.arrays[arr] for arr in arrays] + shape = list(descs[0].shape) + axis = 0 + else: + # Check shapes for validity + first_shape = copy.copy(shape) + first_shape[axis] = 0 + for i, d in enumerate(descs[1:]): + other_shape = list(d.shape) + other_shape[axis] = 0 + if other_shape != first_shape: + raise ValueError(f'Array shapes do not match at index {i}') + + shape[axis] = sum(desc.shape[axis] for desc in descs) + if out is None: + if dtype is None: + dtype = descs[0].dtype + name, odesc = sdfg.add_temp_transient(shape, dtype, storage=descs[0].storage, lifetime=descs[0].lifetime) + else: + name = out + odesc = sdfg.arrays[out] + + # Make copies + w = state.add_write(name) + offset = 0 + subset = subsets.Range.from_array(odesc) + for arr, desc in zip(arrays, descs): + r = state.add_read(arr) + subset = copy.deepcopy(subset) + subset[axis] = (offset, offset + desc.shape[axis] - 1, 1) + state.add_edge(r, None, w, None, Memlet(data=name, subset=subset)) + offset += desc.shape[axis] + + return name + + +@oprepo.replaces('numpy.stack') +def _stack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + arrays: Tuple[Any], + axis: int = 0, + out: Any = None, + *, + dtype=None, + casting: str = 'same_kind'): + if dtype is not None and out is not None: + raise ValueError('Arguments dtype and out cannot be given together') + if casting != 'same_kind': + raise NotImplementedError('The casting argument is currently unsupported') + if not isinstance(arrays, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile stack call') + if not isinstance(axis, Integral): + raise ValueError('Axis is not a compile-time evaluatable integer, cannot compile stack call') + + for i in range(len(arrays)): + if arrays[i] not in sdfg.arrays: + raise TypeError(f'Index {i} is not an array') + + descs = [sdfg.arrays[a] for a in arrays] + shape = descs[0].shape + for i, d in enumerate(descs[1:]): + if d.shape != shape: + raise ValueError(f'Array shapes are not equal ({shape} != {d.shape} at index {i})') + + if axis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + if axis < 0: + naxis = len(shape) + 1 + axis + if naxis < 0 or naxis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + axis = naxis + + # Stacking is implemented as a reshape followed by concatenation + reshaped = [] + for arr, desc in zip(arrays, descs): + # Make a reshaped view with the inserted dimension + new_shape = [0] * (len(shape) + 1) + new_strides = [0] * (len(shape) + 1) 
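+        # A worked sketch with hypothetical sizes: stacking arrays of shape
+        # (3, 4) with axis=1 should yield new_shape == [3, 1, 4]. The inserted
+        # unit dimension is the one that _concat below grows to len(arrays);
+        # its stride is borrowed from a neighboring dimension, which is safe
+        # because a length-1 dimension is never advanced.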
+ for i in range(len(shape) + 1): + if i == axis: + new_shape[i] = 1 + new_strides[i] = desc.strides[i - 1] if i != 0 else desc.strides[i] + elif i < axis: + new_shape[i] = shape[i] + new_strides[i] = desc.strides[i] + else: + new_shape[i] = shape[i - 1] + new_strides[i] = desc.strides[i - 1] + + rname = reshape(visitor, sdfg, state, arr, new_shape, strides=new_strides) + reshaped.append(rname) + + return _concat(visitor, sdfg, state, reshaped, axis, out, dtype=dtype, casting=casting) + + +@oprepo.replaces('numpy.vstack') +@oprepo.replaces('numpy.row_stack') +def _vstack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + tup: Tuple[Any], + *, + dtype=None, + casting: str = 'same_kind'): + if not isinstance(tup, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile stack call') + if tup[0] not in sdfg.arrays: + raise TypeError(f'Index 0 is not an array') + + # In the 1-D case, stacking is performed along the first axis + if len(sdfg.arrays[tup[0]].shape) == 1: + return _stack(visitor, sdfg, state, tup, axis=0, out=None, dtype=dtype, casting=casting) + # Otherwise, concatenation is performed + return _concat(visitor, sdfg, state, tup, axis=0, out=None, dtype=dtype, casting=casting) + + +@oprepo.replaces('numpy.hstack') +@oprepo.replaces('numpy.column_stack') +def _hstack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + tup: Tuple[Any], + *, + dtype=None, + casting: str = 'same_kind'): + if not isinstance(tup, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile stack call') + if tup[0] not in sdfg.arrays: + raise TypeError(f'Index 0 is not an array') + + # In the 1-D case, concatenation is performed along the first axis + if len(sdfg.arrays[tup[0]].shape) == 1: + return _concat(visitor, sdfg, state, tup, axis=0, out=None, dtype=dtype, casting=casting) + + return _concat(visitor, sdfg, state, tup, axis=1, out=None, dtype=dtype, casting=casting) + + +@oprepo.replaces('numpy.dstack') +def _dstack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + tup: Tuple[Any], + *, + dtype=None, + casting: str = 'same_kind'): + if not isinstance(tup, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile a stack call') + if tup[0] not in sdfg.arrays: + raise TypeError(f'Index 0 is not an array') + if len(sdfg.arrays[tup[0]].shape) < 3: + raise NotImplementedError('dstack is not implemented for arrays that are smaller than 3D') + + return _concat(visitor, sdfg, state, tup, axis=2, out=None, dtype=dtype, casting=casting) + + +def _split_core(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[int, Sequence[symbolic.SymbolicType], str], axis: int, allow_uneven: bool): + # Argument checks + if not isinstance(ary, str) or ary not in sdfg.arrays: + raise TypeError('Split object must be an array') + if not isinstance(axis, Integral): + raise ValueError('Cannot determine split dimension, axis is not a compile-time evaluatable integer') + + desc = sdfg.arrays[ary] + + # Test validity of axis + orig_axis = axis + if axis < 0: + axis = len(desc.shape) + axis + if axis < 0 or axis >= len(desc.shape): + raise ValueError(f'axis {orig_axis} is out of bounds for array of dimension {len(desc.shape)}') + + # indices_or_sections may only be an integer (not symbolic), list of integers, list of symbols, or an array + if isinstance(indices_or_sections, str): + raise ValueError('Array-indexed split cannot be compiled due to data-dependent sizes. 
' + 'Consider using numpy.reshape instead.') + elif isinstance(indices_or_sections, (list, tuple)): + if any(isinstance(i, str) for i in indices_or_sections): + raise ValueError('Array-indexed split cannot be compiled due to data-dependent sizes. ' + 'Use symbolic values as an argument instead.') + # Sequence is given + sections = indices_or_sections + elif isinstance(indices_or_sections, Integral): # Constant integer given + if indices_or_sections <= 0: + raise ValueError('Number of sections must be larger than zero.') + + # If uneven sizes are not allowed and ary shape is numeric, check evenness + if not allow_uneven and not symbolic.issymbolic(desc.shape[axis]): + if desc.shape[axis] % indices_or_sections != 0: + raise ValueError('Array split does not result in an equal division. Consider using numpy.array_split ' + 'instead.') + if indices_or_sections > desc.shape[axis]: + raise ValueError('Cannot compile array split as it will result in empty arrays.') + + # Sequence is not given, compute sections + # Mimic behavior of array_split in numpy: Sections are [s+1 x N%s], s, ..., s + size = desc.shape[axis] // indices_or_sections + remainder = desc.shape[axis] % indices_or_sections + sections = [] + offset = 0 + for _ in range(min(remainder, indices_or_sections)): + offset += size + 1 + sections.append(offset) + for _ in range(remainder, indices_or_sections - 1): + offset += size + sections.append(offset) + + elif symbolic.issymbolic(indices_or_sections): + raise ValueError('Symbolic split cannot be compiled due to output tuple size being unknown. ' + 'Consider using numpy.reshape instead.') + else: + raise TypeError(f'Unsupported type {type(indices_or_sections)} for indices_or_sections in numpy.split') + + # Split according to sections + r = state.add_read(ary) + result = [] + offset = 0 + for section in sections: + shape = list(desc.shape) + shape[axis] = section - offset + name, _ = sdfg.add_temp_transient(shape, desc.dtype, storage=desc.storage, lifetime=desc.lifetime) + # Add copy + w = state.add_write(name) + subset = subsets.Range.from_array(desc) + subset[axis] = (offset, offset + shape[axis] - 1, 1) + offset += shape[axis] + state.add_nedge(r, w, Memlet(data=ary, subset=subset)) + result.append(name) + + # Add final section + shape = list(desc.shape) + shape[axis] -= offset + name, _ = sdfg.add_temp_transient(shape, desc.dtype, storage=desc.storage, lifetime=desc.lifetime) + w = state.add_write(name) + subset = subsets.Range.from_array(desc) + subset[axis] = (offset, offset + shape[axis] - 1, 1) + state.add_nedge(r, w, Memlet(data=ary, subset=subset)) + result.append(name) + + # Always return a list of results, even if the size is 1 + return result + + +@oprepo.replaces('numpy.split') +def _split(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str], + axis: int = 0): + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis, allow_uneven=False) + + +@oprepo.replaces('numpy.array_split') +def _array_split(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str], + axis: int = 0): + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis, allow_uneven=True) + + +@oprepo.replaces('numpy.dsplit') +def _dsplit(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], 
str]): + if isinstance(ary, str) and ary in sdfg.arrays: + if len(sdfg.arrays[ary].shape) < 3: + raise ValueError('Array dimensionality must be 3 or above for dsplit') + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=2, allow_uneven=False) + + +@oprepo.replaces('numpy.hsplit') +def _hsplit(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str]): + if isinstance(ary, str) and ary in sdfg.arrays: + # In case of a 1D array, split with axis=0 + if len(sdfg.arrays[ary].shape) <= 1: + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=0, allow_uneven=False) + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=1, allow_uneven=False) + + +@oprepo.replaces('numpy.vsplit') +def _vsplit(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str]): + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=0, allow_uneven=False) + + +############################################################################################################ +# Fast Fourier Transform numpy package (numpy.fft) + +def _real_to_complex(real_type: dace.typeclass): + if real_type == dace.float32: + return dace.complex64 + elif real_type == dace.float64: + return dace.complex128 + else: + return real_type + + +def _fft_core(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + a: str, + n: Optional[dace.symbolic.SymbolicType] = None, + axis=-1, + norm: StringLiteral = StringLiteral('backward'), + is_inverse: bool = False): + from dace.libraries.fft.nodes import FFT, IFFT # Avoid import loops + if axis != 0 and axis != -1: + raise NotImplementedError('Only one dimensional arrays are supported at the moment') + if not isinstance(a, str) or a not in sdfg.arrays: + raise ValueError('Input must be a valid array') + + libnode = FFT('fft') if not is_inverse else IFFT('ifft') + + desc = sdfg.arrays[a] + N = desc.shape[axis] + + # If n is not None, either pad input or slice and add a view + if n is not None: + raise NotImplementedError + + # Compute factor + if norm == 'forward': + factor = (1 / N) if not is_inverse else 1 + elif norm == 'backward': + factor = 1 if not is_inverse else (1 / N) + elif norm == 'ortho': + factor = sp.sqrt(1 / N) + else: + raise ValueError('norm argument can only be one of "forward", "backward", or "ortho".') + libnode.factor = factor + + # Compute output type from input type + if is_inverse and desc.dtype not in (dace.complex64, dace.complex128): + raise TypeError(f'Inverse FFT only accepts complex inputs, got {desc.dtype}') + dtype = _real_to_complex(desc.dtype) + + name, odesc = sdfg.add_temp_transient_like(desc, dtype) + r = state.add_read(a) + w = state.add_write(name) + state.add_edge(r, None, libnode, '_inp', Memlet.from_array(a, desc)) + state.add_edge(libnode, '_out', w, None, Memlet.from_array(name, odesc)) + + return name + + +@oprepo.replaces('numpy.fft.fft') +def _fft(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + a: str, + n: Optional[dace.symbolic.SymbolicType] = None, + axis=-1, + norm: StringLiteral = StringLiteral('backward')): + return _fft_core(pv, sdfg, state, a, n, axis, norm, False) + + +@oprepo.replaces('numpy.fft.ifft') +def _ifft(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + a, + n=None, + axis=-1, + norm: StringLiteral = StringLiteral('backward')): + return _fft_core(pv, sdfg, state, a, 
n, axis, norm, True) diff --git a/dace/libraries/blas/nodes/gemv.py b/dace/libraries/blas/nodes/gemv.py index baf6fb415d..52091c6864 100644 --- a/dace/libraries/blas/nodes/gemv.py +++ b/dace/libraries/blas/nodes/gemv.py @@ -730,6 +730,9 @@ def expansion(node: 'Gemv', state, sdfg, m=None, n=None, **kwargs): dtype_a = outer_array_a.dtype.type dtype = outer_array_x.dtype.base_type veclen = outer_array_x.dtype.veclen + alpha = f'{dtype.ctype}({node.alpha})' + beta = f'{dtype.ctype}({node.beta})' + m = m or node.m n = n or node.n if m is None: @@ -765,8 +768,17 @@ def expansion(node: 'Gemv', state, sdfg, m=None, n=None, **kwargs): func = func.lower() + 'gemv' - code = f"""cblas_{func}({layout}, {trans}, {m}, {n}, {node.alpha}, _A, {lda}, - _x, {strides_x[0]}, {node.beta}, _y, {strides_y[0]});""" + code = '' + if dtype in (dace.complex64, dace.complex128): + code = f''' + {dtype.ctype} __alpha = {alpha}; + {dtype.ctype} __beta = {beta}; + ''' + alpha = '&__alpha' + beta = '&__beta' + + code += f"""cblas_{func}({layout}, {trans}, {m}, {n}, {alpha}, _A, {lda}, + _x, {strides_x[0]}, {beta}, _y, {strides_y[0]});""" tasklet = dace.sdfg.nodes.Tasklet(node.name, node.in_connectors, diff --git a/dace/libraries/fft/__init__.py b/dace/libraries/fft/__init__.py new file mode 100644 index 0000000000..71fb014f32 --- /dev/null +++ b/dace/libraries/fft/__init__.py @@ -0,0 +1,6 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +from dace.library import register_library +from .nodes import * +from .environments import * + +register_library(__name__, "fft") diff --git a/dace/libraries/fft/algorithms/__init__.py b/dace/libraries/fft/algorithms/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dace/libraries/fft/algorithms/dft.py b/dace/libraries/fft/algorithms/dft.py new file mode 100644 index 0000000000..340dfed22d --- /dev/null +++ b/dace/libraries/fft/algorithms/dft.py @@ -0,0 +1,45 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +""" +One-dimensional Discrete Fourier Transform (DFT) native implementations. 
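+
+Up to the sign of the exponent, all variants below compute the reference sum
+(a sketch): out[k] = factor * sum_n exp(-2j * pi * k * n / N) * inp[n].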
+""" +import dace +import numpy as np +import math + + +# Native, naive version of the Discrete Fourier Transform +@dace.program +def dft(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + i = np.arange(N) + e = np.exp(-2j * np.pi * i * i[:, None] / N) + _out[:] = factor * (e @ _inp.astype(dace.complex128)) + + +@dace.program +def idft(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + i = np.arange(N) + e = np.exp(2j * np.pi * i * i[:, None] / N) + _out[:] = factor * (e @ _inp.astype(dace.complex128)) + + +# Single-map version of DFT, useful for integrating small Fourier transforms into other operations +@dace.program +def dft_explicit(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + _out[:] = 0 + for i, n in dace.map[0:N, 0:N]: + with dace.tasklet: + inp << _inp[n] + exponent = 2 * math.pi * i * n / N + b = decltype(b)(math.cos(exponent), -math.sin(exponent)) * inp * factor + b >> _out(1, lambda a, b: a + b)[i] + + +@dace.program +def idft_explicit(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + _out[:] = 0 + for i, n in dace.map[0:N, 0:N]: + with dace.tasklet: + inp << _inp[n] + exponent = 2 * math.pi * i * n / N + b = decltype(b)(math.cos(exponent), math.sin(exponent)) * inp * factor + b >> _out(1, lambda a, b: a + b)[i] diff --git a/dace/libraries/fft/environments/__init__.py b/dace/libraries/fft/environments/__init__.py new file mode 100644 index 0000000000..0900214e68 --- /dev/null +++ b/dace/libraries/fft/environments/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +from .cufft import * diff --git a/dace/libraries/fft/environments/cufft.py b/dace/libraries/fft/environments/cufft.py new file mode 100644 index 0000000000..dd243d376a --- /dev/null +++ b/dace/libraries/fft/environments/cufft.py @@ -0,0 +1,21 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library + + +@dace.library.environment +class cuFFT: + + cmake_minimum_version = None + cmake_packages = ["CUDA"] + cmake_variables = {} + cmake_includes = [] + cmake_libraries = ["cufft"] + cmake_compile_flags = [] + cmake_link_flags = [] + cmake_files = [] + + headers = {'frame': ["cufft.h", "cufftXt.h"], 'cuda': ["cufft.h", "cufftXt.h"]} + state_fields = [] + init_code = "" + finalize_code = "" + dependencies = [] diff --git a/dace/libraries/fft/nodes/__init__.py b/dace/libraries/fft/nodes/__init__.py new file mode 100644 index 0000000000..dd8f132aa4 --- /dev/null +++ b/dace/libraries/fft/nodes/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +from .fft import FFT, IFFT diff --git a/dace/libraries/fft/nodes/fft.py b/dace/libraries/fft/nodes/fft.py new file mode 100644 index 0000000000..bc85f8785b --- /dev/null +++ b/dace/libraries/fft/nodes/fft.py @@ -0,0 +1,204 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +""" +Implements Forward and Inverse Fast Fourier Transform (FFT) library nodes +""" +import warnings + +from dace import data, dtypes, SDFG, SDFGState, symbolic, library, nodes, properties +from dace import transformation as xf +from dace.libraries.fft import environments as env + + +# Define the library nodes +@library.node +class FFT(nodes.LibraryNode): + implementations = {} + default_implementation = 'pure' + + factor = properties.SymbolicProperty(desc='Coefficient to multiply outputs. 
Used for normalization', default=1.0) + + def __init__(self, name, *args, schedule=None, **kwargs): + super().__init__(name, *args, schedule=schedule, inputs={'_inp'}, outputs={'_out'}, **kwargs) + + +@library.node +class IFFT(nodes.LibraryNode): + implementations = {} + default_implementation = 'pure' + + factor = properties.SymbolicProperty(desc='Coefficient to multiply outputs. Used for normalization', default=1.0) + + def __init__(self, name, *args, schedule=None, **kwargs): + super().__init__(name, *args, schedule=schedule, inputs={'_inp'}, outputs={'_out'}, **kwargs) + + +################################################################################################## +# Native SDFG expansions +################################################################################################## + + +@library.register_expansion(FFT, 'pure') +class DFTExpansion(xf.ExpandTransformation): + environments = [] + + @staticmethod + def expansion(node: FFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + from dace.libraries.fft.algorithms import dft # Lazy import functions + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if len(indesc.shape) != 1: + raise NotImplementedError('Native SDFG expansion for FFT does not yet support N-dimensional inputs') + + warnings.warn('Performance Warning: No assumptions on FFT input size, falling back to DFT') + return dft.dft_explicit.to_sdfg(indesc, outdesc, N=indesc.shape[0], factor=node.factor) + + +@library.register_expansion(IFFT, 'pure') +class IDFTExpansion(xf.ExpandTransformation): + environments = [] + + @staticmethod + def expansion(node: IFFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + from dace.libraries.fft.algorithms import dft # Lazy import functions + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if len(indesc.shape) != 1: + raise NotImplementedError('Native SDFG expansion for IFFT does not yet support N-dimensional inputs') + + warnings.warn('Performance Warning: No assumptions on IFFT input size, falling back to DFT') + return dft.idft_explicit.to_sdfg(indesc, outdesc, N=indesc.shape[0], factor=node.factor) + + +################################################################################################## +# cuFFT expansions +################################################################################################## + + +@library.register_expansion(FFT, 'cuFFT') +class cuFFTFFTExpansion(xf.ExpandTransformation): + environments = [env.cuFFT] + plan_uid = 0 + + @staticmethod + def expansion(node: FFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if str(node.factor) != '1': + raise NotImplementedError('Multiplicative post-FFT factors are not yet implemented') + return _generate_cufft_code(indesc, outdesc, parent_sdfg, False) + + +@library.register_expansion(IFFT, 'cuFFT') +class cuFFTIFFTExpansion(xf.ExpandTransformation): + environments = [env.cuFFT] + plan_uid = 0 + + @staticmethod + def expansion(node: IFFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if str(node.factor) != '1': + raise NotImplementedError('Multiplicative post-FFT factors are not yet 
implemented') + return _generate_cufft_code(indesc, outdesc, parent_sdfg, True) + + +def _generate_cufft_code(indesc: data.Data, outdesc: data.Data, sdfg: SDFG, is_inverse: bool): + from dace.codegen.targets import cpp # Avoid import loops + if len(indesc.shape) not in (1, 2, 3): + raise ValueError('cuFFT only supports 1/2/3-dimensional FFT') + if indesc.storage != dtypes.StorageType.GPU_Global: + raise ValueError('cuFFT implementation requires input array to be on GPU') + if outdesc.storage != dtypes.StorageType.GPU_Global: + raise ValueError('cuFFT implementation requires output array to be on GPU') + + cufft_type = _types_to_cufft(indesc.dtype, outdesc.dtype) + init_code = '' + exit_code = '' + callsite_code = '' + + # Make a unique name for this plan + if not is_inverse: + plan_name = f'fwdplan{cuFFTFFTExpansion.plan_uid}' + cuFFTFFTExpansion.plan_uid += 1 + direction = 'CUFFT_FORWARD' + tasklet_prefix = '' + else: + plan_name = f'invplan{cuFFTIFFTExpansion.plan_uid}' + cuFFTIFFTExpansion.plan_uid += 1 + direction = 'CUFFT_INVERSE' + tasklet_prefix = 'i' + + fields = [ + f'cufftHandle {plan_name};', + ] + plan_name = f'__state->{plan_name}' + + init_code += f''' + cufftCreate(&{plan_name}); + ''' + exit_code += f''' + cufftDestroy({plan_name}); + ''' + + cdims = ', '.join([cpp.sym2cpp(s) for s in indesc.shape]) + make_plan = f''' + {{ + size_t __work_size = 0; + cufftMakePlan{len(indesc.shape)}d({plan_name}, {cdims}, {cufft_type}, /*batch=*/1, &__work_size); + }} + ''' + + # Make plan in init if not symbolic or not data-dependent, otherwise make at callsite. + symbols_that_change = set(s for ise in sdfg.edges() for s in ise.data.assignments.keys()) + symbols_that_change &= set(map(str, sdfg.symbols.keys())) + + def _fsyms(x): + if symbolic.issymbolic(x): + return set(map(str, x.free_symbols)) + return set() + + if symbols_that_change and any(_fsyms(s) & symbols_that_change for s in indesc.shape): + callsite_code += make_plan + else: + init_code += make_plan + + # Execute plan + callsite_code += f''' + cufftSetStream({plan_name}, __dace_current_stream); + cufftXtExec({plan_name}, _inp, _out, {direction}); + ''' + + return nodes.Tasklet(f'cufft_{tasklet_prefix}fft', {'_inp'}, {'_out'}, + callsite_code, + language=dtypes.Language.CPP, + state_fields=fields, + code_init=init_code, + code_exit=exit_code) + + +################################################################################################## +# Helper functions +################################################################################################## + + +def _get_input_and_output(state: SDFGState, node: nodes.LibraryNode): + """ + Helper function that returns the input and output arrays of the library node + """ + in_edge = next(e for e in state.in_edges(node) if e.dst_conn) + out_edge = next(e for e in state.out_edges(node) if e.src_conn) + return in_edge.data.data, out_edge.data.data + + +def _types_to_cufft(indtype: dtypes.typeclass, outdtype: dtypes.typeclass): + typedict = { + dtypes.float32: 'R', + dtypes.float64: 'D', + dtypes.complex64: 'C', + dtypes.complex128: 'Z', + } + return f'CUFFT_{typedict[indtype]}2{typedict[outdtype]}' diff --git a/dace/libraries/standard/nodes/transpose.py b/dace/libraries/standard/nodes/transpose.py index 58c6cfc33e..e2795ef951 100644 --- a/dace/libraries/standard/nodes/transpose.py +++ b/dace/libraries/standard/nodes/transpose.py @@ -100,6 +100,12 @@ class ExpandTransposeMKL(ExpandTransformation): @staticmethod def expansion(node, state, sdfg): node.validate(sdfg, state) + + # 
Fall back to native implementation if input and output types are not the same + if (sdfg.arrays[list(state.in_edges_by_connector(node, '_inp'))[0].data.data].dtype != sdfg.arrays[list( + state.out_edges_by_connector(node, '_out'))[0].data.data].dtype): + return ExpandTransposePure.make_sdfg(node, state, sdfg) + dtype = node.dtype if dtype == dace.float32: func = "somatcopy" @@ -141,22 +147,30 @@ class ExpandTransposeOpenBLAS(ExpandTransformation): @staticmethod def expansion(node, state, sdfg): node.validate(sdfg, state) + + # Fall back to native implementation if input and output types are not the same + if (sdfg.arrays[list(state.in_edges_by_connector(node, '_inp'))[0].data.data].dtype != sdfg.arrays[list( + state.out_edges_by_connector(node, '_out'))[0].data.data].dtype): + return ExpandTransposePure.make_sdfg(node, state, sdfg) + dtype = node.dtype cast = "" if dtype == dace.float32: func = "somatcopy" alpha = "1.0f" + cast = '' elif dtype == dace.float64: func = "domatcopy" alpha = "1.0" + cast = '' elif dtype == dace.complex64: func = "comatcopy" - cast = "(float*)" - alpha = f"{cast}dace::blas::BlasConstants::Get().Complex64Pone()" + alpha = "dace::blas::BlasConstants::Get().Complex64Pone()" + cast = '(float*)' elif dtype == dace.complex128: func = "zomatcopy" - cast = "(double*)" - alpha = f"{cast}dace::blas::BlasConstants::Get().Complex128Pone()" + alpha = "dace::blas::BlasConstants::Get().Complex128Pone()" + cast = '(double*)' else: raise ValueError("Unsupported type for OpenBLAS omatcopy extension: " + str(dtype)) # TODO: Add stride support @@ -164,8 +178,8 @@ def expansion(node, state, sdfg): # Adaptations for BLAS API order = 'CblasRowMajor' trans = 'CblasTrans' - code = ("cblas_{f}({o}, {t}, {m}, {n}, {a}, {c}_inp, " - "{n}, {c}_out, {m});").format(f=func, o=order, t=trans, m=m, n=n, a=alpha, c=cast) + code = ("cblas_{f}({o}, {t}, {m}, {n}, {cast}{a}, {cast}_inp, " + "{n}, {cast}_out, {m});").format(f=func, o=order, t=trans, m=m, n=n, a=alpha, cast=cast) tasklet = dace.sdfg.nodes.Tasklet(node.name, node.in_connectors, node.out_connectors, @@ -184,6 +198,11 @@ def expansion(node, state, sdfg, **kwargs): node.validate(sdfg, state) dtype = node.dtype + # Fall back to native implementation if input and output types are not the same + if (sdfg.arrays[list(state.in_edges_by_connector(node, '_inp'))[0].data.data].dtype != sdfg.arrays[list( + state.out_edges_by_connector(node, '_out'))[0].data.data].dtype): + return ExpandTransposePure.make_sdfg(node, state, sdfg) + try: func, cdtype, factort = blas_helpers.cublas_type_metadata(dtype) except TypeError as ex: diff --git a/tests/library/fft_test.py b/tests/library/fft_test.py new file mode 100644 index 0000000000..440d0a46cf --- /dev/null +++ b/tests/library/fft_test.py @@ -0,0 +1,101 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. 
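+# Normalization reference for these tests (a sketch of the factor selection in
+# _fft_core): norm='backward' scales only the inverse transform by 1/N,
+# norm='forward' scales only the forward transform by 1/N, and norm='ortho'
+# scales both directions by sqrt(1/N), so ifft(fft(x)) recovers x in all cases.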
+import pytest +import numpy as np + +import dace + + +@pytest.mark.parametrize('symbolic', (False, True)) +def test_fft(symbolic): + if symbolic: + N = dace.symbol('N') + else: + N = 21 + + @dace.program + def tester(x: dace.complex128[N]): + return np.fft.fft(x) + + a = np.random.rand(21) + 1j * np.random.rand(21) + b = tester(a) + assert np.allclose(b, np.fft.fft(a)) + + +def test_fft_r2c(): + """ + Tests implicit conversion to complex types + """ + + @dace.program + def tester(x: dace.float32[20]): + return np.fft.fft(x) + + a = np.random.rand(20).astype(np.float32) + b = tester(a) + assert b.dtype == np.complex64 + assert np.allclose(b, np.fft.fft(a)) + + +@pytest.mark.parametrize('norm', ('backward', 'forward', 'ortho')) +def test_ifft(norm): + + @dace.program + def tester(x: dace.complex128[21]): + return np.fft.ifft(x, norm=norm) + + a = np.random.rand(21) + 1j * np.random.rand(21) + b = tester(a) + assert np.allclose(b, np.fft.ifft(a, norm=norm)) + + +@pytest.mark.gpu +def test_cufft(): + import dace.libraries.fft as fftlib + + @dace.program + def tester(x: dace.complex128[210]): + return np.fft.fft(x) + + sdfg = tester.to_sdfg() + sdfg.apply_gpu_transformations() + fftlib.FFT.default_implementation = 'cuFFT' + sdfg.expand_library_nodes() + fftlib.FFT.default_implementation = 'pure' + + a = np.random.rand(210) + 1j * np.random.rand(210) + b = sdfg(a) + assert np.allclose(b, np.fft.fft(a)) + + +@pytest.mark.gpu +def test_cufft_twoplans(): + import dace.libraries.fft as fftlib + + @dace.program + def tester(x: dace.complex128[210], y: dace.complex64[19]): + return np.fft.fft(x), np.fft.ifft(y, norm='forward') + + sdfg = tester.to_sdfg() + sdfg.apply_gpu_transformations() + fftlib.FFT.default_implementation = 'cuFFT' + fftlib.IFFT.default_implementation = 'cuFFT' + sdfg.expand_library_nodes() + fftlib.FFT.default_implementation = 'pure' + fftlib.IFFT.default_implementation = 'pure' + + a = np.random.rand(210) + 1j * np.random.rand(210) + b = (np.random.rand(19) + 1j * np.random.rand(19)).astype(np.complex64) + c, d = sdfg(a, b) + assert np.allclose(c, np.fft.fft(a)) + assert np.allclose(d, np.fft.ifft(b, norm='forward')) + + +if __name__ == '__main__': + test_fft(False) + test_fft(True) + test_fft_r2c() + test_ifft('backward') + test_ifft('forward') + test_ifft('ortho') + test_cufft() + test_cufft_twoplans() diff --git a/tests/numpy/array_creation_test.py b/tests/numpy/array_creation_test.py index 7329b48b3f..a1f6d0329f 100644 --- a/tests/numpy/array_creation_test.py +++ b/tests/numpy/array_creation_test.py @@ -152,6 +152,42 @@ def test_arange_6(): return np.arange(2.5, 10, 3) +@compare_numpy_output() +def test_linspace_1(): + return np.linspace(2.5, 10, num=3) + + +@compare_numpy_output() +def test_linspace_2(): + space, step = np.linspace(2.5, 10, num=3, retstep=True) + return space, step + + +@compare_numpy_output() +def test_linspace_3(): + a = np.array([1, 2, 3]) + return np.linspace(a, 5, num=10) + + +@compare_numpy_output() +def test_linspace_4(): + a = np.array([[1, 2, 3], [4, 5, 6]]) + space, step = np.linspace(a, 10, endpoint=False, retstep=True) + return space, step + + +@compare_numpy_output() +def test_linspace_5(): + a = np.array([[1, 2, 3], [4, 5, 6]]) + b = np.array([[5], [10]]) + return np.linspace(a, b, endpoint=False, axis=1) + + +@compare_numpy_output() +def test_linspace_6(): + return np.linspace(-5, 5.5, dtype=np.float32) + + @dace.program def program_strides_0(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(2, 1)) @@ -267,6 +303,12 @@ def 
ones_scalar_size(k: dace.int32): test_arange_4() test_arange_5() test_arange_6() + test_linspace_1() + test_linspace_2() + test_linspace_3() + test_linspace_4() + test_linspace_5() + test_linspace_6() test_strides_0() test_strides_1() test_strides_2() diff --git a/tests/numpy/attention_simple_test.py b/tests/numpy/attention_simple_test.py index 49558a154b..2ce0205e3f 100644 --- a/tests/numpy/attention_simple_test.py +++ b/tests/numpy/attention_simple_test.py @@ -11,7 +11,7 @@ def dace_softmax(X_in: dace.float32[N], X_out: dace.float32[N]): tmp_max = dace.reduce(lambda a, b: max(a, b), X_in) - X_out[:] = exp(X_in - tmp_max) + X_out[:] = np.exp(X_in - tmp_max) tmp_sum = dace.reduce(lambda a, b: a + b, X_out, identity=0) X_out[:] /= tmp_sum diff --git a/tests/numpy/attribute_test.py b/tests/numpy/attribute_test.py index 2181883015..e011eafc89 100644 --- a/tests/numpy/attribute_test.py +++ b/tests/numpy/attribute_test.py @@ -54,7 +54,50 @@ def fn(a: dace.float64[N, F_in], b: dace.float64[N, heads, F_out], c: dace.float assert np.allclose(c, c_expected) +def test_nested_attribute(): + + @dace.program + def tester(a: dace.complex128[20, 10]): + return a.T.real + + r = np.random.rand(20, 10) + im = np.random.rand(20, 10) + a = r + 1j * im + res = tester(a) + assert np.allclose(res, r.T) + + +def test_attribute_of_expr(): + """ + Regression reported in Issue #1295. + """ + + @dace.program + def tester(a: dace.float64[20, 20], b: dace.float64[20, 20], c: dace.float64[20, 20]): + c[:, :] = (a @ b).T + + a = np.random.rand(20, 20) + b = np.random.rand(20, 20) + c = np.random.rand(20, 20) + ref = (a @ b).T + tester(a, b, c) + assert np.allclose(c, ref) + + +def test_attribute_function(): + + @dace.program + def tester(): + return np.arange(10).reshape(10, 1) + + a = tester() + assert np.allclose(a, np.arange(10).reshape(10, 1)) + + if __name__ == '__main__': test_attribute_in_ranged_loop() test_attribute_in_ranged_loop_symbolic() test_attribute_new_state() + test_nested_attribute() + test_attribute_of_expr() + test_attribute_function() diff --git a/tests/numpy/concat_test.py b/tests/numpy/concat_test.py new file mode 100644 index 0000000000..614258e34f --- /dev/null +++ b/tests/numpy/concat_test.py @@ -0,0 +1,133 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. 
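+# These tests exercise the copy-based lowering of np.concatenate: each input is
+# written into the output through a subset whose range along `axis` begins at a
+# running offset, roughly (a sketch):
+#     out[..., offset:offset + arr.shape[axis], ...] = arr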
+import dace +import numpy as np +from common import compare_numpy_output +import pytest + +M = 10 +N = 20 +K = 30 + + +@compare_numpy_output() +def test_concatenate(): + a = np.zeros([N, N], dtype=np.float32) + b = np.ones([N, 1], dtype=np.float32) + return np.concatenate((a, b), axis=-1) + + +@compare_numpy_output() +def test_concatenate_four(): + a = np.zeros([N, N], dtype=np.float32) + b = np.ones([N, 1], dtype=np.float32) + c = np.full([N, M], 2.0, dtype=np.float32) + return np.concatenate((a, b, c, a), axis=-1) + + +@compare_numpy_output() +def test_concatenate_out(): + a = np.zeros([N, N], dtype=np.float32) + b = np.ones([M, N], dtype=np.float32) + c = np.full([N + M, N], -1, dtype=np.float32) + np.concatenate([a, b], out=c) + return c + 1 + + +def test_concatenate_symbolic(): + n = dace.symbol('n') + m = dace.symbol('m') + k = dace.symbol('k') + + @dace.program + def tester(a: dace.float64[k, m], b: dace.float64[k, n]): + return np.concatenate((a, b), axis=1) + + aa = np.random.rand(10, 4) + bb = np.random.rand(10, 5) + cc = tester(aa, bb) + assert tuple(cc.shape) == (10, 9) + assert np.allclose(np.concatenate((aa, bb), axis=1), cc) + + +def test_concatenate_fail(): + with pytest.raises(ValueError): + + @dace.program + def tester(a: dace.float64[K, M], b: dace.float64[N, K]): + return np.concatenate((a, b), axis=1) + + aa = np.random.rand(K, M) + bb = np.random.rand(N, K) + tester(aa, bb) + + +@compare_numpy_output() +def test_concatenate_flatten(): + a = np.zeros([1, 2, 3], dtype=np.float32) + b = np.ones([4, 5, 6], dtype=np.float32) + return np.concatenate([a, b], axis=None) + + +@compare_numpy_output() +def test_stack(): + a = np.zeros([N, M, K], dtype=np.float32) + b = np.ones([N, M, K], dtype=np.float32) + return np.stack((a, b), axis=-1) + + +@compare_numpy_output() +def test_vstack(): + a = np.zeros([N, M], dtype=np.float32) + b = np.ones([N, M], dtype=np.float32) + return np.vstack((a, b)) + + +@compare_numpy_output() +def test_vstack_1d(): + a = np.zeros([N], dtype=np.float32) + b = np.ones([N], dtype=np.float32) + return np.vstack((a, b)) + + +@compare_numpy_output() +def test_hstack(): + a = np.zeros([N, M], dtype=np.float32) + b = np.ones([N, M], dtype=np.float32) + return np.hstack((a, b)) + + +@compare_numpy_output() +def test_hstack_1d(): + a = np.zeros([N], dtype=np.float32) + b = np.ones([N], dtype=np.float32) + return np.hstack((a, b)) + + +@compare_numpy_output() +def test_dstack(): + a = np.zeros([N, M, K], dtype=np.float32) + b = np.ones([N, M, K], dtype=np.float32) + return np.dstack((a, b)) + + +@compare_numpy_output() +def test_dstack_4d(): + a = np.zeros([N, M, K, K], dtype=np.float32) + b = np.ones([N, M, K, K], dtype=np.float32) + return np.dstack((a, b)) + + +if __name__ == "__main__": + test_concatenate() + test_concatenate_four() + test_concatenate_out() + test_concatenate_symbolic() + test_concatenate_fail() + test_concatenate_flatten() + test_stack() + test_vstack() + test_vstack_1d() + test_hstack() + test_hstack_1d() + test_dstack() + test_dstack_4d() diff --git a/tests/numpy/nested_call_subarray_test.py b/tests/numpy/nested_call_subarray_test.py index 6a92b004fa..7501652328 100644 --- a/tests/numpy/nested_call_subarray_test.py +++ b/tests/numpy/nested_call_subarray_test.py @@ -8,7 +8,7 @@ @dace.program def dace_softmax_ncs(X_in: dace.float32[N], X_out: dace.float32[N]): tmp_max = dace.reduce(lambda a, b: a + b, X_in, identity=0) - X_out[:] = exp(X_in - tmp_max) + X_out[:] = np.exp(X_in - tmp_max) tmp_sum = dace.reduce(lambda a, b: max(a, b), X_in) 
X_out[:] /= tmp_sum @@ -22,7 +22,7 @@ def test_ncs_local_program(): @dace.program def dace_softmax_localprog(X_in: dace.float32[N], X_out: dace.float32[N]): tmp_max = dace.reduce(lambda a, b: a + b, X_in, identity=0) - X_out[:] = exp(X_in - tmp_max) + X_out[:] = np.exp(X_in - tmp_max) tmp_sum = dace.reduce(lambda a, b: max(a, b), X_in) X_out[:] /= tmp_sum diff --git a/tests/numpy/split_test.py b/tests/numpy/split_test.py new file mode 100644 index 0000000000..e4088754e8 --- /dev/null +++ b/tests/numpy/split_test.py @@ -0,0 +1,142 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +""" +Tests variants of the numpy split array manipulation. +""" +import dace +import numpy as np +from common import compare_numpy_output +import pytest + +M = 9 +N = 20 +K = 30 + + +@compare_numpy_output() +def test_split(): + arr = np.arange(M) + a, b, c = np.split(arr, 3) + return a + b + c + + +def test_uneven_split_fail(): + with pytest.raises(ValueError): + + @dace.program + def tester(): + arr = np.arange(N) + a, b, c = np.split(arr, 3) + return a + b + c + + tester() + + +def test_symbolic_split_fail(): + with pytest.raises(ValueError): + n = dace.symbol('n') + + @dace.program + def tester(): + arr = np.arange(N) + a, b, c = np.split(arr, n) + return a + b + c + + tester() + + +def test_array_split_fail(): + with pytest.raises(ValueError): + + @dace.program + def tester(): + arr = np.arange(N) + split = np.arange(N) + a, b, c = np.split(arr, split) + return a + b + c + + tester() + + +@compare_numpy_output() +def test_array_split(): + arr = np.arange(N) + a, b, c = np.array_split(arr, 3) + return a, b, c + + +@compare_numpy_output() +def test_array_split_multidim(): + arr = np.ones((N, N)) + a, b, c = np.array_split(arr, 3, axis=1) + return a, b, c + + +@compare_numpy_output() +def test_split_sequence(): + arr = np.arange(N) + a, b = np.split(arr, [3]) + return a, b + + +@compare_numpy_output() +def test_split_sequence_2(): + arr = np.arange(M) + a, b, c = np.split(arr, [3, 6]) + return a + b + c + + +def test_split_sequence_symbolic(): + n = dace.symbol('n') + + @dace.program + def tester(arr: dace.float64[3 * n]): + a, b, c = np.split(arr, [n, n + 2]) + return a, b, c + + nval = K // 3 + a = np.random.rand(K) + ra, rb, rc = tester(a) + assert ra.shape[0] == nval + assert rb.shape[0] == 2 + assert rc.shape[0] == K - nval - 2 + ref = np.split(a, [nval, nval + 2]) + assert len(ref) == 3 + assert np.allclose(ra, ref[0]) + assert np.allclose(rb, ref[1]) + assert np.allclose(rc, ref[2]) + + +@compare_numpy_output() +def test_vsplit(): + arr = np.ones((N, M)) + a, b = np.vsplit(arr, 2) + return a, b + + +@compare_numpy_output() +def test_hsplit(): + arr = np.ones((M, N)) + a, b = np.hsplit(arr, 2) + return a, b + + +@compare_numpy_output() +def test_dsplit_4d(): + arr = np.ones([N, M, K, K], dtype=np.float32) + a, b, c = np.dsplit(arr, 3) + return a, b, c + + +if __name__ == "__main__": + test_split() + test_uneven_split_fail() + test_symbolic_split_fail() + test_array_split_fail() + test_array_split() + test_array_split_multidim() + test_split_sequence() + test_split_sequence_2() + test_split_sequence_symbolic() + test_vsplit() + test_hsplit() + test_dsplit_4d() diff --git a/tests/numpy/ufunc_test.py b/tests/numpy/ufunc_test.py index 06bd4c3189..b769ab1082 100644 --- a/tests/numpy/ufunc_test.py +++ b/tests/numpy/ufunc_test.py @@ -1304,6 +1304,11 @@ def test_ufunc_trunc_u(A: dace.uint32[10]): return np.trunc(A) +@compare_numpy_output() +def test_ufunc_clip(A: dace.float32[10]): + 
return np.clip(A, 0.2, 0.5) + + if __name__ == "__main__": test_ufunc_add_ff() test_ufunc_subtract_ff() @@ -1542,3 +1547,4 @@ def test_ufunc_trunc_u(A: dace.uint32[10]): test_ufunc_trunc_c() test_ufunc_trunc_f() test_ufunc_trunc_u() + test_ufunc_clip() From 945b5ce4ad26ba9047467d95e75e9aec48792506 Mon Sep 17 00:00:00 2001 From: Philipp Schaad Date: Thu, 31 Oct 2024 12:51:56 +0100 Subject: [PATCH 17/43] Fix jupyter's version of SDFV (#1714) This requires https://github.com/spcl/dace-webclient/pull/179 to be merged before being ready. --- dace/sdfg/sdfg.py | 8 +- dace/viewer/webclient | 2 +- tutorials/explicit.ipynb | 35 ++--- tutorials/getting_started.ipynb | 112 ++++++-------- tutorials/numpy_frontend.ipynb | 40 ++--- tutorials/sdfg_api.ipynb | 82 ++++++----- tutorials/transformations.ipynb | 251 +++++++++++++++----------------- 7 files changed, 255 insertions(+), 275 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index f25a6e24d5..cb8a7d5c2d 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -1464,9 +1464,11 @@ def _repr_html_(self): var sdfg_{uid} = {sdfg}; """.format( # Dumping to a string so that Jupyter Javascript can parse it # recursively diff --git a/dace/viewer/webclient b/dace/viewer/webclient index c6b8fe4fd2..64861bbc05 160000 --- a/dace/viewer/webclient +++ b/dace/viewer/webclient @@ -1 +1 @@ -Subproject commit c6b8fe4fd2c3616b0480ead4c24d8012b91a31fd +Subproject commit 64861bbc054c62bc6cb3f8525bfc4703d6c5e364 diff --git a/tutorials/explicit.ipynb b/tutorials/explicit.ipynb index 45d172cf35..de718ffc4a 100644 --- a/tutorials/explicit.ipynb +++ b/tutorials/explicit.ipynb @@ -123,15 +123,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -159,16 +161,7 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING: Casting scalar argument \"M\" from int to \n", - "WARNING: Casting scalar argument \"N\" from int to \n" - ] - } - ], + "outputs": [], "source": [ "sdfg(A=A, B=B, M=A.shape[0], N=A.shape[1])" ] @@ -201,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -225,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -310,13 +303,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "WARNING: Casting scalar argument \"threshold\" from int to \n" + "WARNING: Passing uint32 array argument \"outsz\" to a int32 array\n" ] }, { "data": { "text/plain": [ - "array([121], dtype=uint32)" + "array([114], dtype=uint32)" ] }, "execution_count": 13, @@ -351,7 +344,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "dace_dev", "language": "python", "name": "python3" }, @@ -365,7 +358,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/tutorials/getting_started.ipynb b/tutorials/getting_started.ipynb index 4405c28d56..266d207abc 100644 --- a/tutorials/getting_started.ipynb +++ b/tutorials/getting_started.ipynb @@ -13,22 +13,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 16, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import dace" ] @@ -42,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -66,8 +53,8 @@ { "data": { "text/plain": [ - "array([[0.74867876, 0.85403223, 0.16573784],\n", - " [0.71994615, 0.29855314, 0.21483992]])" + "array([[0.02638476, 0.15801766, 0.60640768],\n", + " [0.75281897, 0.02027034, 0.92066681]])" ] }, "execution_count": 3, @@ -89,8 +76,8 @@ { "data": { "text/plain": [ - "array([[1.49735752, 1.70806445, 0.33147568],\n", - " [1.4398923 , 0.59710627, 0.42967985]])" + "array([[0.05276951, 0.31603533, 1.21281536],\n", + " [1.50563794, 0.04054068, 1.84133362]])" ] }, "execution_count": 4, @@ -113,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -121,22 +108,21 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ "SDFG (getstarted)" ] }, - "execution_count": 5, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -174,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -190,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -201,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -209,22 +195,21 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ "SDFG (getstarted_sym)" ] }, - "execution_count": 8, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -248,19 +233,19 @@ { "data": { "text/plain": [ - "array([[1.63216549, 1.26522381, 0.21606686, ..., 0.56988572, 1.12572538,\n", - " 1.72701877],\n", - " [0.3829452 , 1.52386969, 0.82165197, ..., 1.3105662 , 1.19336786,\n", - " 1.43671993],\n", - " [1.55277426, 1.50918516, 1.30665626, ..., 1.06562809, 1.53069088,\n", - " 1.10071159],\n", + "array([[0.98818461, 1.27933885, 0.2033508 , ..., 0.547033 , 0.4299565 ,\n", + " 0.24654365],\n", + " [1.91945996, 0.8587834 , 1.6074685 , ..., 0.60969216, 1.7881462 ,\n", + " 1.6251679 ],\n", + " [0.09656663, 0.86573612, 0.79912191, ..., 1.50199177, 0.14342504,\n", + " 0.77152323],\n", " ...,\n", - " [0.60629736, 1.73240929, 1.26797782, ..., 1.72034476, 1.56691557,\n", - " 0.22283613],\n", - " [1.96245486, 1.60559508, 0.02009914, ..., 1.40944583, 1.44560312,\n", - " 0.37804927],\n", - " [1.17875002, 0.96963921, 0.28278902, ..., 1.56747976, 0.4616313 ,\n", - " 0.94999278]])" + " [1.86926975, 0.16524055, 0.57659078, ..., 0.06706506, 1.94858343,\n", + " 0.21332081],\n", + " [0.78987173, 0.32493361, 0.33111051, ..., 0.41438505, 1.6625166 ,\n", + " 1.4539469 ],\n", + " [0.32619914, 0.84155838, 0.85757214, ..., 0.93809 , 0.25236549,\n", + " 1.95588663]])" ] }, "execution_count": 9, @@ -283,7 +268,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -299,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -315,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -324,14 +309,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "12 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "12 ms ± 258 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -341,14 +326,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "3.86 ms ± 271 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "5.1 ms ± 470 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -367,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -375,22 +360,21 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ "SDFG (sse_sigma)" ] }, - "execution_count": 17, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -422,7 +406,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "dace_dev", "language": "python", "name": "python3" }, @@ -436,7 +420,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/tutorials/numpy_frontend.ipynb b/tutorials/numpy_frontend.ipynb index fafda2f1b1..83ca6875ba 100644 --- a/tutorials/numpy_frontend.ipynb +++ b/tutorials/numpy_frontend.ipynb @@ -107,15 +107,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -267,15 +269,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -328,15 +332,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -461,7 +467,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "dace_dev", "language": "python", "name": "python3" }, @@ -475,7 +481,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/tutorials/sdfg_api.ipynb b/tutorials/sdfg_api.ipynb index 645158ce88..beb4f4b6b0 100644 --- a/tutorials/sdfg_api.ipynb +++ b/tutorials/sdfg_api.ipynb @@ -112,15 +112,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -200,15 +202,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -240,15 +244,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -287,15 +293,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -347,15 +355,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -395,15 +405,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -477,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -486,14 +498,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Difference: 1.6358224e-06\n" + "Difference: 7.1136246\n" ] } ], @@ -504,7 +516,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "dace_dev", "language": "python", "name": "python3" }, @@ -518,7 +530,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/tutorials/transformations.ipynb b/tutorials/transformations.ipynb index d54b294e6e..931df79e18 100644 --- a/tutorials/transformations.ipynb +++ b/tutorials/transformations.ipynb @@ -70,15 +70,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -112,15 +114,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -155,15 +159,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -197,15 +203,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -348,15 +356,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -399,15 +409,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -456,30 +468,22 @@ "4. Transformation GPUTransformLocalStorage in outer_fused[__i0=0:1000, __i1=0:1000]\n", "5. Transformation GPUTransformMap in outer_fused[__i0=0:1000, __i1=0:1000]\n", "6. Transformation GPUTransformSDFG in []\n", - "7. Transformation MapDimShuffle in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", - "8. Transformation MapExpansion in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", - "9. Transformation MapFission in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", - "10. Transformation MapTiling in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", - "11. Transformation MapTilingWithOverlap in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", - "12. Transformation MapUnroll in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", - "13. Transformation NestSDFG in []\n", - "14. Transformation ReductionNOperation in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", - "15. Transformation StripMining in outer_fused: ['__i0', '__i1']\n", - "16. Transformation TaskletFusion in [Tasklet (_Mult_), AccessNode (__tmp0), Tasklet (_Add_)]\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Select the pattern to apply (0 - 16 or name$id): MapExpansion$0\n" + "7. Transformation MapExpansion in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", + "8. Transformation MapFission in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", + "9. Transformation MapTiling in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", + "10. Transformation MapTilingWithOverlap in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", + "11. Transformation MapUnroll in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", + "12. Transformation NestSDFG in []\n", + "13. Transformation ReductionNOperation in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])]\n", + "14. Transformation StripMining in outer_fused: ['__i0', '__i1']\n", + "15. Transformation TaskletFusion in [Tasklet (_Mult_), AccessNode (__tmp0), Tasklet (_Add_)]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "You selected (MapExpansion$0) pattern MapExpansion in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])] with parameters {}\n", + "You selected (7) pattern MapExpansion in [MapEntry (outer_fused[__i0=0:1000, __i1=0:1000])] with parameters {}\n", "0. Transformation ElementWiseArrayOperation in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", "1. Transformation FPGATransformSDFG in []\n", "2. Transformation FPGATransformState in [SDFGState (BinOp_5)]\n", @@ -491,88 +495,63 @@ "8. Transformation InLocalStorage in outer_fused[__i0=0:1000] -> outer_fused___i1[__i1=0:1000]\n", "9. Transformation MPITransformMap in [MapEntry (outer_fused[__i0=0:1000])]\n", "10. Transformation MPITransformMap in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "11. Transformation MapDimShuffle in [MapEntry (outer_fused[__i0=0:1000])]\n", - "12. Transformation MapDimShuffle in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "13. Transformation MapFission in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "14. Transformation MapInterchange in [MapEntry (outer_fused[__i0=0:1000]), MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "15. Transformation MapTiling in [MapEntry (outer_fused[__i0=0:1000])]\n", - "16. Transformation MapTiling in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "17. Transformation MapTilingWithOverlap in [MapEntry (outer_fused[__i0=0:1000])]\n", - "18. 
Transformation MapTilingWithOverlap in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "19. Transformation MapToForLoop in [MapEntry (outer_fused[__i0=0:1000])]\n", - "20. Transformation MapToForLoop in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", + "11. Transformation MapFission in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", + "12. Transformation MapInterchange in [MapEntry (outer_fused[__i0=0:1000]), MapEntry (outer_fused___i1[__i1=0:1000])]\n", + "13. Transformation MapTiling in [MapEntry (outer_fused[__i0=0:1000])]\n", + "14. Transformation MapTiling in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", + "15. Transformation MapTilingWithOverlap in [MapEntry (outer_fused[__i0=0:1000])]\n", + "16. Transformation MapTilingWithOverlap in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", + "17. Transformation MapToForLoop in [MapEntry (outer_fused[__i0=0:1000])]\n", + "18. Transformation MapToForLoop in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", + "19. Transformation MapToForLoopRegion in [MapEntry (outer_fused[__i0=0:1000])]\n", + "20. Transformation MapToForLoopRegion in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", "21. Transformation MapUnroll in [MapEntry (outer_fused[__i0=0:1000])]\n", "22. Transformation NestSDFG in []\n", "23. Transformation OutLocalStorage in outer_fused___i1[__i1=0:1000] -> outer_fused[__i0=0:1000]\n", "24. Transformation ReductionNOperation in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", "25. Transformation StripMining in outer_fused: ['__i0']\n", "26. Transformation StripMining in outer_fused___i1: ['__i1']\n", - "27. Transformation TaskletFusion in [Tasklet (_Mult_), AccessNode (__tmp0), Tasklet (_Add_)]\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Select the pattern to apply (0 - 27 or name$id): MapTiling$0(tile_sizes=(128,))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "You selected (MapTiling$0) pattern MapTiling in [MapEntry (outer_fused[__i0=0:1000])] with parameters {'tile_sizes': (128,)}\n", - "0. Transformation ElementWiseArrayOperation in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "1. Transformation FPGATransformSDFG in []\n", - "2. Transformation FPGATransformState in [SDFGState (BinOp_5)]\n", - "3. Transformation GPUGridStridedTiling in [MapEntry (outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1]), MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "4. Transformation GPUGridStridedTiling in [MapEntry (outer_fused[tile___i0=0:1000:128]), MapEntry (outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1])]\n", - "5. Transformation GPUTransformLocalStorage in outer_fused[tile___i0=0:1000:128]\n", - "6. Transformation GPUTransformMap in outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1]\n", - "7. Transformation GPUTransformMap in outer_fused___i1[__i1=0:1000]\n", - "8. Transformation GPUTransformMap in outer_fused[tile___i0=0:1000:128]\n", + "27. Transformation TaskletFusion in [Tasklet (_Mult_), AccessNode (__tmp0), Tasklet (_Add_)]\n", + "You selected (11) pattern MapFission in [MapEntry (outer_fused___i1[__i1=0:1000])] with parameters {}\n", + "0. Transformation BufferTiling in [MapExit (outer_fused___i1_fission[__i1=0:1000]), AccessNode (__tmp0), MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "1. Transformation ElementWiseArrayOperation in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "2. Transformation ElementWiseArrayOperation in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "3. Transformation FPGATransformSDFG in []\n", + "4. 
Transformation FPGATransformState in [SDFGState (BinOp_5)]\n", + "5. Transformation GPUTransformLocalStorage in outer_fused[__i0=0:1000]\n", + "6. Transformation GPUTransformMap in outer_fused[__i0=0:1000]\n", + "7. Transformation GPUTransformMap in outer_fused___i1_fission[__i1=0:1000]\n", + "8. Transformation GPUTransformMap in outer_fused___i1_fission[__i1=0:1000]\n", "9. Transformation GPUTransformSDFG in []\n", - "10. Transformation InLocalStorage in outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1] -> outer_fused___i1[__i1=0:1000]\n", - "11. Transformation InLocalStorage in outer_fused[tile___i0=0:1000:128] -> outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1]\n", - "12. Transformation MPITransformMap in [MapEntry (outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1])]\n", - "13. Transformation MPITransformMap in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "14. Transformation MPITransformMap in [MapEntry (outer_fused[tile___i0=0:1000:128])]\n", - "15. Transformation MapDimShuffle in [MapEntry (outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1])]\n", - "16. Transformation MapDimShuffle in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "17. Transformation MapDimShuffle in [MapEntry (outer_fused[tile___i0=0:1000:128])]\n", - "18. Transformation MapFission in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "19. Transformation MapInterchange in [MapEntry (outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1]), MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "20. Transformation MapTiling in [MapEntry (outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1])]\n", - "21. Transformation MapTiling in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "22. Transformation MapTiling in [MapEntry (outer_fused[tile___i0=0:1000:128])]\n", - "23. Transformation MapTilingWithOverlap in [MapEntry (outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1])]\n", - "24. Transformation MapTilingWithOverlap in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "25. Transformation MapTilingWithOverlap in [MapEntry (outer_fused[tile___i0=0:1000:128])]\n", - "26. Transformation MapToForLoop in [MapEntry (outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1])]\n", - "27. Transformation MapToForLoop in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", - "28. Transformation MapToForLoop in [MapEntry (outer_fused[tile___i0=0:1000:128])]\n", - "29. Transformation MapUnroll in [MapEntry (outer_fused[tile___i0=0:1000:128])]\n", + "10. Transformation InLocalStorage in outer_fused[__i0=0:1000] -> outer_fused___i1_fission[__i1=0:1000]\n", + "11. Transformation InLocalStorage in outer_fused[__i0=0:1000] -> outer_fused___i1_fission[__i1=0:1000]\n", + "12. Transformation MPITransformMap in [MapEntry (outer_fused[__i0=0:1000])]\n", + "13. Transformation MPITransformMap in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "14. Transformation MPITransformMap in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "15. Transformation MapFission in [MapEntry (outer_fused[__i0=0:1000])]\n", + "16. Transformation MapFusion in [MapExit (outer_fused___i1_fission[__i1=0:1000]), AccessNode (__tmp0), MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "17. Transformation MapTiling in [MapEntry (outer_fused[__i0=0:1000])]\n", + "18. Transformation MapTiling in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "19. Transformation MapTiling in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "20. 
Transformation MapTilingWithOverlap in [MapEntry (outer_fused[__i0=0:1000])]\n", + "21. Transformation MapTilingWithOverlap in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "22. Transformation MapTilingWithOverlap in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "23. Transformation MapToForLoop in [MapEntry (outer_fused[__i0=0:1000])]\n", + "24. Transformation MapToForLoop in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "25. Transformation MapToForLoop in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "26. Transformation MapToForLoopRegion in [MapEntry (outer_fused[__i0=0:1000])]\n", + "27. Transformation MapToForLoopRegion in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "28. Transformation MapToForLoopRegion in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "29. Transformation MapUnroll in [MapEntry (outer_fused[__i0=0:1000])]\n", "30. Transformation NestSDFG in []\n", - "31. Transformation OutLocalStorage in outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1] -> outer_fused[tile___i0=0:1000:128]\n", - "32. Transformation OutLocalStorage in outer_fused___i1[__i1=0:1000] -> outer_fused[__i0=tile___i0:Min(999, tile___i0 + 127) + 1]\n", - "33. Transformation ReductionNOperation in [MapEntry (outer_fused___i1[__i1=0:1000])]\n", + "31. Transformation OTFMapFusion in [MapExit (outer_fused___i1_fission[__i1=0:1000]), AccessNode (__tmp0), MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "32. Transformation OutLocalStorage in outer_fused___i1_fission[__i1=0:1000] -> outer_fused[__i0=0:1000]\n", + "33. Transformation ReductionNOperation in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", "34. Transformation StripMining in outer_fused: ['__i0']\n", - "35. Transformation StripMining in outer_fused___i1: ['__i1']\n", - "36. Transformation StripMining in outer_fused: ['tile___i0']\n", - "37. Transformation TaskletFusion in [Tasklet (_Mult_), AccessNode (__tmp0), Tasklet (_Add_)]\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Select the pattern to apply (0 - 37 or name$id): \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "35. Transformation StripMining in outer_fused___i1_fission: ['__i1']\n", + "36. Transformation StripMining in outer_fused___i1_fission: ['__i1']\n", + "37. Transformation Vectorization in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", + "38. Transformation Vectorization in [MapEntry (outer_fused___i1_fission[__i1=0:1000])]\n", "You did not select a valid option. Quitting optimization ...\n" ] } @@ -654,15 +633,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -786,7 +767,7 @@ { "data": { "text/plain": [ - "1" + "0" ] }, "execution_count": 14, @@ -815,15 +796,17 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" ], "text/plain": [ @@ -864,7 +847,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "dace_dev", "language": "python", "name": "python3" }, @@ -878,7 +861,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.12.0" } }, "nbformat": 4, From 9dd70bb02f676acefd8ebec7f136abb6f0009d25 Mon Sep 17 00:00:00 2001 From: Roman Cattaneo Date: Fri, 1 Nov 2024 17:01:00 +0100 Subject: [PATCH 18/43] Fix broken codegen tutorial (#1720) Quick fix follow-up from https://github.com/spcl/dace/pull/1706 which left a broken notebook. Co-authored-by: Roman Cattaneo <> --- tutorials/codegen.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/codegen.ipynb b/tutorials/codegen.ipynb index 2c79f1a2e0..84b2cf7f01 100644 --- a/tutorials/codegen.ipynb +++ b/tutorials/codegen.ipynb @@ -497,8 +497,8 @@ " function_stream: CodeIOStream, callsite_stream: CodeIOStream):\n", " # The parameters here are:\n", " # sdfg: The SDFG we are currently generating.\n", - " # cfg: The current control flow graph (CFG) we are currently generating. For example, - " it can be the SDFG or a loop region. + " # cfg: The current control flow graph (CFG) we are currently generating. For example\n", + " # it can be the SDFG or a loop region.\n", " # scope: The subgraph of the state containing only the scope (map contents)\n", " # we want to generate the code for.\n", " # state_id: The state in the SDFG the subgraph is taken from (i.e.,\n", From 2c414919ad31f486e86fe9ac685035c4c3c04fc9 Mon Sep 17 00:00:00 2001 From: Roman Cattaneo Date: Fri, 1 Nov 2024 18:51:20 +0100 Subject: [PATCH 19/43] CI: Update checkout and setup-python actions (#1718) GitHub Actions workflows use outdated versions of - actions/checkout - actions/setup-python These actions are built for specific node versions, which are now end of life. While the workflows continue to run, GitHub issues a warning (visible in the online interface) and runs them with newer versions of node. ![image](https://github.com/user-attachments/assets/159f4d86-33f5-4d9c-ad45-a5657ad51a57) Since both, checkout and setup-python, are basic actions, none of the features that DaCe workflows are using changed. We might see slight speedup from out of the box caching added to setup-python in recent versions. 
Parent issue: https://github.com/GEOS-ESM/SMT-Nebulae/issues/89 Co-authored-by: Roman Cattaneo <> --- .github/workflows/fpga-ci.yml | 2 +- .github/workflows/general-ci.yml | 4 ++-- .github/workflows/gpu-ci.yml | 2 +- .github/workflows/hardware_test.yml | 2 +- .github/workflows/heterogeneous-ci.yml | 2 +- .github/workflows/pyFV3-ci.yml | 6 +++--- .github/workflows/verilator_compatibility.yml | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/fpga-ci.yml b/.github/workflows/fpga-ci.yml index ef8e5348da..2d6d42514f 100644 --- a/.github/workflows/fpga-ci.yml +++ b/.github/workflows/fpga-ci.yml @@ -16,7 +16,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'no-ci') }} runs-on: [self-hosted, linux, intel-fpga, xilinx-fpga] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: 'recursive' - name: Install dependencies diff --git a/.github/workflows/general-ci.yml b/.github/workflows/general-ci.yml index faf0a727be..2044639e5f 100644 --- a/.github/workflows/general-ci.yml +++ b/.github/workflows/general-ci.yml @@ -18,11 +18,11 @@ jobs: simplify: [0,1,autoopt] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: 'recursive' - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 527e004478..b3af9c8c05 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -19,7 +19,7 @@ jobs: if: "!contains(github.event.pull_request.labels.*.name, 'no-ci')" runs-on: [self-hosted, gpu] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: 'recursive' - name: Install dependencies diff --git a/.github/workflows/hardware_test.yml b/.github/workflows/hardware_test.yml index 3fe32aaab7..e319c72587 100644 --- a/.github/workflows/hardware_test.yml +++ b/.github/workflows/hardware_test.yml @@ -4,7 +4,7 @@ jobs: test-rtl: runs-on: [self-hosted, linux, xilinx-fpga] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: 'recursive' - name: Install dependencies diff --git a/.github/workflows/heterogeneous-ci.yml b/.github/workflows/heterogeneous-ci.yml index 99b566e21f..62887ad208 100644 --- a/.github/workflows/heterogeneous-ci.yml +++ b/.github/workflows/heterogeneous-ci.yml @@ -19,7 +19,7 @@ jobs: if: "!contains(github.event.pull_request.labels.*.name, 'no-ci')" runs-on: [self-hosted, linux] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: 'recursive' - name: Install dependencies diff --git a/.github/workflows/pyFV3-ci.yml b/.github/workflows/pyFV3-ci.yml index f58fdf85ac..85c864e475 100644 --- a/.github/workflows/pyFV3-ci.yml +++ b/.github/workflows/pyFV3-ci.yml @@ -21,18 +21,18 @@ jobs: python-version: [3.11.7] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: repository: 'NOAA-GFDL/PyFV3' ref: 'ci/DaCe' submodules: 'recursive' path: 'pyFV3' - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: path: 'dace' submodules: 'recursive' - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install library dependencies diff --git a/.github/workflows/verilator_compatibility.yml b/.github/workflows/verilator_compatibility.yml 
index 7f43565812..dce0c9b1fb 100644 --- a/.github/workflows/verilator_compatibility.yml +++ b/.github/workflows/verilator_compatibility.yml @@ -17,14 +17,14 @@ jobs: steps: - name: trigger reason run: echo "Trigger Reason:" ${{ github.event.inputs.reason }} - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: checkout submodules run: git submodule update --init --recursive - name: install apt packages run: sudo apt-get update && sudo apt-get -y install git make autoconf g++ flex bison libfl2 libfl-dev - name: compile verilator run: git clone https://github.com/verilator/verilator.git && cd verilator && git fetch origin && if [ ! "${{ matrix.verilator_version }}" == "master" ]; then git checkout v${{ matrix.verilator_version }}; fi && autoconf && ./configure && make -j2 && sudo make install - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 with: python-version: '3.8' architecture: 'x64' From 636811dcacf768b4b8817c55f5e7f0eabb87973c Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Fri, 1 Nov 2024 21:03:52 -0700 Subject: [PATCH 20/43] Bump version and update dependencies (#1722) Removes websockets dependency and makes jinja2 dependency optional --- dace/cli/dacelab.py | 5 ----- dace/cli/sdfg_diff.py | 6 +++++- dace/cli/sdfv.py | 6 +++++- dace/version.py | 2 +- requirements.txt | 25 ++++++++----------------- setup.py | 2 +- 6 files changed, 20 insertions(+), 26 deletions(-) diff --git a/dace/cli/dacelab.py b/dace/cli/dacelab.py index 27a3215e09..647ec31a3d 100644 --- a/dace/cli/dacelab.py +++ b/dace/cli/dacelab.py @@ -2,11 +2,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import argparse -import numpy -import pickle -import json - -import dace from dace.frontend.octave import parse from dace.sdfg.nodes import AccessNode diff --git a/dace/cli/sdfg_diff.py b/dace/cli/sdfg_diff.py index 9c40e59f10..2ec0a3adf4 100644 --- a/dace/cli/sdfg_diff.py +++ b/dace/cli/sdfg_diff.py @@ -9,7 +9,6 @@ import tempfile from typing import Dict, Literal, Set, Tuple, Union -import jinja2 import dace from dace import memlet as mlt from dace.sdfg import nodes as nd @@ -179,6 +178,11 @@ def main(): diff_sets = _sdfg_diff(sdfg_A, sdfg_B, eq_strategy) if args.graphical: + try: + import jinja2 + except (ImportError, ModuleNotFoundError): + raise ImportError('Graphical SDFG diff requires jinja2, please install by running `pip install jinja2`') + basepath = os.path.join(os.path.dirname(os.path.realpath(dace.__file__)), 'viewer') template_loader = jinja2.FileSystemLoader(searchpath=os.path.join(basepath, 'templates')) template_env = jinja2.Environment(loader=template_loader) diff --git a/dace/cli/sdfv.py b/dace/cli/sdfv.py index 49255a1e7e..2012debe82 100644 --- a/dace/cli/sdfv.py +++ b/dace/cli/sdfv.py @@ -13,7 +13,6 @@ import dace import tempfile -import jinja2 def partialclass(cls, *args, **kwds): @@ -48,6 +47,11 @@ def view(sdfg: dace.SDFG, filename: Optional[Union[str, int]] = None, verbose: b os.close(fd) return + try: + import jinja2 + except (ImportError, ModuleNotFoundError): + raise ImportError('SDFG.view() requires jinja2, please install by running `pip install jinja2`') + if type(sdfg) is dace.SDFG: sdfg = dace.serialize.dumps(sdfg.to_json()) diff --git a/dace/version.py b/dace/version.py index 9513287c94..1f356cc57b 100644 --- a/dace/version.py +++ b/dace/version.py @@ -1 +1 @@ -__version__ = '0.16.1' +__version__ = '1.0.0' diff --git a/requirements.txt b/requirements.txt index 3cc37cc468..b902968b73 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -1,21 +1,12 @@ -aenum==3.1.12 +aenum==3.1.15 astunparse==1.6.3 -certifi==2024.7.4 -charset-normalizer==3.1.0 -click==8.1.3 -dill==0.3.6 -fparser==0.1.3 -idna==3.7 -importlib-metadata==6.6.0 -Jinja2==3.1.4 -MarkupSafe==2.1.3 +dill==0.3.9 +fparser==0.1.4 mpmath==1.3.0 -networkx==3.1 -numpy==1.26.1 +networkx==3.4.2 +numpy==1.26.4 +packaging==24.1 ply==3.11 -PyYAML==6.0.1 +PyYAML==6.0.2 six==1.16.0 -sympy==1.9 -urllib3==2.2.2 -websockets==11.0.3 -zipp==3.15.0 +sympy==1.13.3 diff --git a/setup.py b/setup.py index 6e8635bdf6..c228ae4558 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ }, include_package_data=True, install_requires=[ - 'numpy < 2.0', 'networkx >= 2.5', 'astunparse', 'sympy >= 1.9', 'pyyaml', 'ply', 'websockets', 'jinja2', + 'numpy < 2.0', 'networkx >= 2.5', 'astunparse', 'sympy >= 1.9', 'pyyaml', 'ply', 'fparser >= 0.1.3', 'aenum >= 3.1', 'dataclasses; python_version < "3.7"', 'dill', 'pyreadline;platform_system=="Windows"', 'typing-compat; python_version < "3.8"', 'packaging' ] + cmake_requires, From b27024b57eeb679fe1326b00525228d581ca369e Mon Sep 17 00:00:00 2001 From: Philipp Schaad Date: Mon, 4 Nov 2024 17:44:37 +0100 Subject: [PATCH 21/43] Various Cutout Fixes (#1662) - [x] Fix cutouts w.r.t. the use of UIDs, allowing them to be preserved or re-generated depending on an input parameter - [x] Fix singlestate cutout extraction when memlets access struct members. --- dace/sdfg/analysis/cutout.py | 56 ++++++++++++++++++++++++++++++------ dace/sdfg/nodes.py | 10 +++++++ dace/sdfg/sdfg.py | 10 +++++++ 3 files changed, 67 insertions(+), 9 deletions(-) diff --git a/dace/sdfg/analysis/cutout.py b/dace/sdfg/analysis/cutout.py index 5d2eae7c6f..ec95157989 100644 --- a/dace/sdfg/analysis/cutout.py +++ b/dace/sdfg/analysis/cutout.py @@ -118,7 +118,7 @@ def from_json(cls, json_obj, context=None): def from_transformation( cls, sdfg: SDFG, transformation: Union[PatternTransformation, SubgraphTransformation], make_side_effects_global = True, use_alibi_nodes: bool = True, reduce_input_config = True, - symbols_map: Optional[Dict[str, Any]] = None + symbols_map: Optional[Dict[str, Any]] = None, preserve_guids: bool = False ) -> Union['SDFGCutout', SDFG]: """ Create a cutout from a transformation's set of affected graph elements. @@ -130,6 +130,9 @@ def from_transformation( :param reduce_input_config: Whether to reduce the input configuration where possible in singlestate cutouts. :param symbols_map: A mapping of symbols to values to use for the cutout. Optional, only used when reducing the input configuration. + :param preserve_guids: If True, ensures that the GUIDs of graph elements contained in the cutout remain + identical to the ones in their original graph. If False, new GUIDs will be generated. + False by default. :return: The cutout. 
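 
         A minimal call sketch, assuming a hypothetical transformation instance
         ``xform`` (an illustrative name, not taken from this patch)::
 
             cutout = SDFGCutout.from_transformation(sdfg, xform, preserve_guids=True)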
""" affected_nodes = _transformation_determine_affected_nodes(sdfg, transformation) @@ -150,11 +153,12 @@ def from_transformation( state = target_sdfg.node(transformation.state_id) cutout = cls.singlestate_cutout(state, *affected_nodes, make_side_effects_global=make_side_effects_global, use_alibi_nodes=use_alibi_nodes, reduce_input_config=reduce_input_config, - symbols_map=symbols_map) + symbols_map=symbols_map, preserve_guids=preserve_guids) cutout.translate_transformation_into(transformation) return cutout elif isinstance(transformation, MultiStateTransformation): - cutout = cls.multistate_cutout(*affected_nodes, make_side_effects_global=make_side_effects_global) + cutout = cls.multistate_cutout(*affected_nodes, make_side_effects_global=make_side_effects_global, + preserve_guids=preserve_guids) # If the cutout is an SDFG, there's no need to translate the transformation. if isinstance(cutout, SDFGCutout): cutout.translate_transformation_into(transformation) @@ -169,14 +173,15 @@ def singlestate_cutout(cls, make_side_effects_global: bool = True, use_alibi_nodes: bool = True, reduce_input_config: bool = False, - symbols_map: Optional[Dict[str, Any]] = None) -> 'SDFGCutout': + symbols_map: Optional[Dict[str, Any]] = None, + preserve_guids: bool = False) -> 'SDFGCutout': """ Cut out a subgraph of a state from an SDFG to run separately for localized testing or optimization. The subgraph defined by the list of nodes will be extended to include access nodes of data containers necessary to run the graph separately. In addition, all transient data containers that may contain data when the cutout is executed are made global, as well as any transient data containers which are written to inside the cutout but may be read after the cutout. - + :param state: The SDFG state in which the subgraph resides. :param nodes: The nodes in the subgraph to cut out. :param make_copy: If True, deep-copies every SDFG element in the copy. Otherwise, original references are kept. @@ -188,17 +193,29 @@ def singlestate_cutout(cls, :param reduce_input_config: Whether to reduce the input configuration where possible in singlestate cutouts. :param symbols_map: A mapping of symbols to values to use for the cutout. Optional, only used when reducing the input configuration. + :param preserve_guids: If True, ensures that the GUIDs of graph elements contained in the cutout remain + identical to the ones in their original graph. If False, new GUIDs will be generated. + False by default - if make_copy is False, this has no effect by extension. :return: The created SDFGCutout. """ if reduce_input_config: nodes = _reduce_in_configuration(state, nodes, use_alibi_nodes, symbols_map) - create_element = copy.deepcopy if make_copy else (lambda x: x) + + def clone_f(x: Union[Memlet, InterstateEdge, nd.Node, ControlFlowBlock]): + ret = copy.deepcopy(x) + if preserve_guids: + ret.guid = x.guid + return ret + + create_element = clone_f if make_copy else (lambda x: x) sdfg = state.parent subgraph: StateSubgraphView = StateSubgraphView(state, nodes) subgraph = _extend_subgraph_with_access_nodes(state, subgraph, use_alibi_nodes) # Make a new SDFG with the included constants, used symbols, and data containers. 
         cutout = SDFGCutout(sdfg.name + '_cutout', sdfg.constants_prop)
+        if preserve_guids:
+            cutout.guid = sdfg.guid
         cutout._base_sdfg = sdfg
         defined_syms = subgraph.defined_symbols()
         freesyms = subgraph.free_symbols
@@ -213,11 +230,24 @@ def singlestate_cutout(cls,
             memlet = edge.data
             if memlet.data in cutout.arrays:
                 continue
-            new_desc = sdfg.arrays[memlet.data].clone()
-            cutout.add_datadesc(memlet.data, new_desc)
+            dataname = memlet.data
+            if '.' in dataname:
+                # This is an access to a struct member, which typically happens for the memlets between an access node
+                # pointing to a struct (or view thereof), and a view pointing to the member. Assert that this is indeed
+                # the case (i.e., only one '.' is found in the name of the data being accessed), and if so, clone the
+                # struct (or struct view) data descriptor instead.
+                parts = dataname.split('.')
+                if len(parts) == 2:
+                    dataname = parts[0]
+                else:
+                    raise RuntimeError('Attempting to add invalid multi-nested data ' + memlet.data + ' to a cutout')
+            new_desc = sdfg.arrays[dataname].clone()
+            cutout.add_datadesc(dataname, new_desc)
 
         # Add a single state with the extended subgraph
         new_state = cutout.add_state(state.label, is_start_state=True)
+        if preserve_guids:
+            new_state.guid = state.guid
         in_translation = dict()
         out_translation = dict()
         for e in sg_edges:
@@ -322,6 +352,7 @@ def singlestate_cutout(cls,
     def multistate_cutout(cls,
                           *states: SDFGState,
                           make_side_effects_global: bool = True,
+                          preserve_guids: bool = False,
                           override_start_block: Optional[ControlFlowBlock] = None) -> Union['SDFGCutout', SDFG]:
         """
         Cut out a multi-state subgraph from an SDFG to run separately for localized testing or optimization.
         :param make_side_effects_global: If True, all transient data containers which are read inside the cutout but
                                          may be written to _before_ the cutout, or any data containers which are
                                          written to inside the cutout but may be read _after_ the cutout, are made
                                          global.
+        :param preserve_guids: If True, ensures that the GUIDs of graph elements contained in the cutout remain
+                               identical to the ones in their original graph. If False, new GUIDs will be generated.
+                               False by default - if make_copy is False, this has no effect by extension.
         :param override_start_block: If set, explicitly force a given control flow block to be the start block. If
                                      left None (default), the start block is automatically determined based on
                                      domination relationships in the original graph.
         :return: The created SDFGCutout or the original SDFG where no smaller cutout could be obtained.
         """
-        create_element = copy.deepcopy
+        def create_element(x: Union[ControlFlowBlock, InterstateEdge]) -> Union[ControlFlowBlock, InterstateEdge]:
+            ret = copy.deepcopy(x)
+            if preserve_guids:
+                ret.guid = x.guid
+            return ret
 
         # Check that all states are inside the same SDFG.
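         # A minimal usage sketch of the new parameter, assuming `s1` and `s2` are
         # two states of one SDFG (hypothetical names, for illustration only):
         #
         #     from dace.sdfg.analysis.cutout import SDFGCutout
         #     cut = SDFGCutout.multistate_cutout(s1, s2, preserve_guids=True)
         #
         # With preserve_guids=True, every copied element keeps the GUID of its
         # original, so cutout contents can be matched back to the source graph.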
sdfg = list(states)[0].parent diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 4ae91d5ea0..d29b1a22e4 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -55,6 +55,16 @@ def __str__(self): else: return type(self).__name__ + def __deepcopy__(self, memo): + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + if k == 'guid': # Skip ID + continue + setattr(result, k, dcpy(v, memo)) + return result + def validate(self, sdfg, state): pass diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index cb8a7d5c2d..19d2a47295 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -205,6 +205,16 @@ def __setattr__(self, name: str, value: Any) -> None: super().__setattr__('_uncond', None) return super().__setattr__(name, value) + def __deepcopy__(self, memo): + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + if k == 'guid': # Skip ID + continue + setattr(result, k, copy.deepcopy(v, memo)) + return result + @staticmethod def _convert_assignment(assignment) -> str: if isinstance(assignment, ast.AST): From 64d76799bf4f27f128b5301d97aecb78b634df2f Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 4 Nov 2024 08:44:54 -0800 Subject: [PATCH 22/43] Various stability improvements and convenience APIs (#1724) * Various minor code generation and runtime fixes * Minor API improvements to CompiledSDFG, sdfg.view(), and subset offsetting * Minor memlet propagation fix * Various simplify pass fixes that pertain to use of views, references, and tasklets with side effects * Symbolic support for shift and ternary expressions (fixes #1315) * Pass permissiveness into transformations --- dace/cli/sdfgcc.py | 2 +- dace/cli/sdfv.py | 6 +- dace/codegen/compiled_sdfg.py | 5 + dace/codegen/compiler.py | 2 +- dace/codegen/cppunparse.py | 8 +- dace/codegen/tools/type_inference.py | 2 + dace/dtypes.py | 2 + dace/memlet.py | 4 +- dace/properties.py | 2 +- dace/runtime/include/dace/stream.h | 8 +- dace/sdfg/graph.py | 2 +- dace/sdfg/propagation.py | 6 +- dace/sdfg/sdfg.py | 9 +- dace/sdfg/utils.py | 67 ++++++++-- dace/sdfg/validation.py | 4 +- dace/subsets.py | 26 ++-- dace/symbolic.py | 117 ++++++++++++++---- .../dataflow/redundant_array.py | 10 ++ .../transformation/dataflow/tasklet_fusion.py | 102 +++++++++++++-- dace/transformation/helpers.py | 3 + dace/transformation/interstate/loop_to_map.py | 5 +- .../interstate/multistate_inline.py | 3 + .../transformation/interstate/sdfg_nesting.py | 3 + .../transformation/interstate/state_fusion.py | 9 ++ .../passes/analysis/analysis.py | 55 +++++++- .../transformation/passes/pattern_matching.py | 2 + .../transformation/passes/scalar_to_symbol.py | 25 ++-- tests/transformations/tasklet_fusion_test.py | 35 +++++- 28 files changed, 440 insertions(+), 84 deletions(-) diff --git a/dace/cli/sdfgcc.py b/dace/cli/sdfgcc.py index 1df7604b4b..0d04950be7 100644 --- a/dace/cli/sdfgcc.py +++ b/dace/cli/sdfgcc.py @@ -48,7 +48,7 @@ def main(): sdfg = SDFGOptimizer(sdfg).optimize() # Compile SDFG - sdfg.compile(outpath) + sdfg.compile(outpath, return_program_handle=False) # Copying header file to optional path if outpath is not None: diff --git a/dace/cli/sdfv.py b/dace/cli/sdfv.py index 2012debe82..d14059468f 100644 --- a/dace/cli/sdfv.py +++ b/dace/cli/sdfv.py @@ -43,7 +43,11 @@ def view(sdfg: dace.SDFG, filename: Optional[Union[str, int]] = None, verbose: b ): fd, filename = tempfile.mkstemp(suffix='.sdfg') sdfg.save(filename) - os.system(f'code 
{filename}') + if platform.system() == 'Darwin': + # Special case for MacOS + os.system(f'open {filename}') + else: + os.system(f'code {filename}') os.close(fd) return diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 9bfcc439e0..332db028ae 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -518,6 +518,9 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: # Otherwise, None values are passed as null pointers below elif isinstance(arg, ctypes._Pointer): pass + elif isinstance(arg, str): + # Cast to bytes + arglist[i] = ctypes.c_char_p(arg.encode('utf-8')) else: raise TypeError(f'Passing an object (type {type(arg).__name__}) to an array in argument "{a}"') elif is_array and not is_dtArray: @@ -550,6 +553,8 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: pass elif isinstance(arg, float) and atype.dtype.type == np.float64: pass + elif isinstance(arg, bool) and atype.dtype.type == np.bool_: + pass elif (isinstance(arg, str) or arg is None) and atype.dtype == dtypes.string: if arg is None: arglist[i] = ctypes.c_char_p(None) diff --git a/dace/codegen/compiler.py b/dace/codegen/compiler.py index 350e141606..236f832cac 100644 --- a/dace/codegen/compiler.py +++ b/dace/codegen/compiler.py @@ -213,7 +213,7 @@ def configure_and_compile(program_folder, program_name=None, output_stream=None) # Clean CMake directory and try once more if Config.get_bool('debugprint'): print('Cleaning CMake build folder and retrying...') - shutil.rmtree(build_folder) + shutil.rmtree(build_folder, ignore_errors=True) os.makedirs(build_folder) try: _run_liveoutput(cmake_command, shell=True, cwd=build_folder, output_stream=output_stream) diff --git a/dace/codegen/cppunparse.py b/dace/codegen/cppunparse.py index edeb5270ca..c375147930 100644 --- a/dace/codegen/cppunparse.py +++ b/dace/codegen/cppunparse.py @@ -555,7 +555,11 @@ def _write_constant(self, value): if result.find("b'") >= 0: self.write(result) else: - self.write(result.replace('\'', '\"')) + towrite = result + if result.startswith("'"): + towrite = result[1:-1].replace('"', '\\"') + towrite = f'"{towrite}"' + self.write(towrite) def _Constant(self, t): value = t.value @@ -1187,6 +1191,8 @@ def py2cpp(code, expr_semicolon=True, defined_symbols=None): return cppunparse(ast.parse(symbolic.symstr(code, cpp_mode=True)), expr_semicolon, defined_symbols=defined_symbols) + elif isinstance(code, int): + return str(code) elif code.__class__.__name__ == 'function': try: code_str = inspect.getsource(code) diff --git a/dace/codegen/tools/type_inference.py b/dace/codegen/tools/type_inference.py index 8f8dd84151..26b369fa9d 100644 --- a/dace/codegen/tools/type_inference.py +++ b/dace/codegen/tools/type_inference.py @@ -375,6 +375,8 @@ def _Compare(t, symbols, inferred_symbols): for o, e in zip(t.ops, t.comparators): if o.__class__.__name__ not in cppunparse.CPPUnparser.cmpops: continue + if isinstance(e, ast.Constant) and e.value is None: + continue inf_type = _dispatch(e, symbols, inferred_symbols) if isinstance(inf_type, dtypes.vector): # Make sure all occuring vectors are of same size diff --git a/dace/dtypes.py b/dace/dtypes.py index a016ac60e2..d0c6f23e03 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -404,6 +404,8 @@ def __init__(self, wrapped_type, typename=None): wrapped_type = numpy.bool_ elif getattr(wrapped_type, '__name__', '') == 'bool_' and typename is None: typename = 'bool' + elif wrapped_type is type(None): + wrapped_type = None self.type = wrapped_type # 
Type in Python self.ctype = _CTYPES[wrapped_type] # Type in C diff --git a/dace/memlet.py b/dace/memlet.py index f78da3a6b7..85bd0a348d 100644 --- a/dace/memlet.py +++ b/dace/memlet.py @@ -555,9 +555,9 @@ def used_symbols(self, all_symbols: bool, edge=None) -> Set[str]: from dace.sdfg import nodes if isinstance(edge.dst, nodes.CodeNode) or isinstance(edge.src, nodes.CodeNode): view_edge = True - elif edge.dst_conn == 'views' and isinstance(edge.dst, nodes.AccessNode): + elif edge.dst_conn and isinstance(edge.dst, nodes.AccessNode): view_edge = True - elif edge.src_conn == 'views' and isinstance(edge.src, nodes.AccessNode): + elif edge.src_conn and isinstance(edge.src, nodes.AccessNode): view_edge = True if not view_edge: diff --git a/dace/properties.py b/dace/properties.py index 09439ce4f8..82be72f9fd 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -329,7 +329,7 @@ def initialize_properties(obj, *args, **kwargs): for name, prop in own_properties.items(): # Only assign our own properties, so we don't overwrite what's been # set by the base class - if hasattr(obj, name): + if hasattr(obj, '_' + name): raise PropertyError("Property {} already assigned in {}".format(name, type(obj).__name__)) if not prop.indirected: if prop.allow_none or prop.default is not None: diff --git a/dace/runtime/include/dace/stream.h b/dace/runtime/include/dace/stream.h index 255e16ec2b..1f8134fae6 100644 --- a/dace/runtime/include/dace/stream.h +++ b/dace/runtime/include/dace/stream.h @@ -338,7 +338,7 @@ namespace dace { template struct Consume { - template