From 78a1a03052aeb0b95422fa96be2db251b30464da Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 26 Jul 2022 17:17:54 +0200 Subject: [PATCH 001/127] Added module, method, and attribute replacements to allow parsing of mpi4py compatible cartesian comm methods, bcast, and allreduce. --- dace/frontend/common/distr.py | 293 +++++++++++++++++++++++++--------- 1 file changed, 221 insertions(+), 72 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 98a8f23e87..c201e7ce14 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -16,6 +16,150 @@ RankType = Union[Integral, str, symbolic.symbol, symbolic.SymExpr, symbolic.sympy.Basic] +##### MPI Cartesian Communicators + + +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Create_cart') +@oprepo.replaces('dace.comm.Cart_create') +def _cart_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, dims: ShapeType): + """ Creates a process-grid and adds it to the DaCe program. The process-grid is implemented with [MPI_Cart_create](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_create.html). + :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. + :return: Name of the new process-grid descriptor. + """ + pgrid_name = sdfg.add_pgrid(dims) + + # Dummy tasklet adds MPI variables to the program's state. + from dace.libraries.mpi import Dummy + tasklet = Dummy(pgrid_name, [ + f'MPI_Comm {pgrid_name}_comm;', + f'MPI_Group {pgrid_name}_group;', + f'int {pgrid_name}_coords[{len(dims)}];', + f'int {pgrid_name}_dims[{len(dims)}];', + f'int {pgrid_name}_rank;', + f'int {pgrid_name}_size;', + f'bool {pgrid_name}_valid;', + ]) + + state.add_node(tasklet) + + # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. 
+ _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) + wnode = state.add_write(pgrid_name) + state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) + + return pgrid_name + + +@oprepo.replaces_method('Intracomm', 'Create_cart') +def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', dims: ShapeType): + """ Equivalent to `dace.comm.Cart_create(dims). + :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. + :return: Name of the new process-grid descriptor. + """ + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + return _cart_create(pv, sdfg, state, dims) + + + +@oprepo.replaces('dace.comm.Cart_sub') +def _cart_sub(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + parent_grid: str, + color: Sequence[Union[Integral, bool]], + exact_grid: RankType = None): + """ Partitions the `parent_grid` to lower-dimensional sub-grids and adds them to the DaCe program. + The sub-grids are implemented with [MPI_Cart_sub](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_sub.html). + :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`). + :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped (see `remain_dims` input of `MPI_Cart_sub`). + :param exact_grid: [DEVELOPER] If set then, out of all the sub-grids created, only the one that contains the rank with id `exact_grid` will be utilized for collective communication. + :return: Name of the new sub-grid descriptor. + """ + pgrid_name = sdfg.add_pgrid(parent_grid=parent_grid, color=color, exact_grid=exact_grid) + + # Count sub-grid dimensions. + pgrid_ndims = sum([bool(c) for c in color]) + + # Dummy tasklet adds MPI variables to the program's state. 
+ from dace.libraries.mpi import Dummy + tasklet = Dummy(pgrid_name, [ + f'MPI_Comm {pgrid_name}_comm;', + f'MPI_Group {pgrid_name}_group;', + f'int {pgrid_name}_coords[{pgrid_ndims}];', + f'int {pgrid_name}_dims[{pgrid_ndims}];', + f'int {pgrid_name}_rank;', + f'int {pgrid_name}_size;', + f'bool {pgrid_name}_valid;', + ]) + + state.add_node(tasklet) + + # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. + _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) + wnode = state.add_write(pgrid_name) + state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) + + return pgrid_name + + +@oprepo.replaces_method('ProcessGrid', 'Sub') +def _pgrid_sub(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + parent_grid: str, + color: Sequence[Union[Integral, bool]]): + """ Equivalent to `dace.comm.Cart_sub(parent_grid, color). + :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`). + :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped (see `remain_dims` input of `MPI_Cart_sub`). + :return: Name of the new sub-grid descriptor. 
+ """ + + return _cart_sub(pv, sdfg, state, parent_grid, color) + + +@oprepo.replaces_operator('ProcessGrid', 'Eq', otherclass='Comm') +@oprepo.replaces_operator('ProcessGrid', 'Is', otherclass='Comm') +def _pgrid_eq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: str, op2: 'Comm'): + from mpi4py import MPI + if op2 is MPI.COMM_WORLD or op2 is MPI.COMM_NULL: + return False + return True + + +@oprepo.replaces_operator('Comm', 'Eq', otherclass='ProcessGrid') +@oprepo.replaces_operator('Comm', 'Is', otherclass='ProcessGrid') +def _comm_eq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'str'): + from mpi4py import MPI + if op1 is MPI.COMM_WORLD or op1 is MPI.COMM_NULL: + return False + return True + + +@oprepo.replaces_operator('ProcessGrid', 'NotEq', otherclass='Comm') +@oprepo.replaces_operator('ProcessGrid', 'IsNot', otherclass='Comm') +def _pgrid_neq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: str, op2: 'Comm'): + from mpi4py import MPI + if op2 is MPI.COMM_WORLD or op2 is MPI.COMM_NULL: + return True + return False + + +@oprepo.replaces_operator('Comm', 'NotEq', otherclass='ProcessGrid') +@oprepo.replaces_operator('Comm', 'IsNot', otherclass='ProcessGrid') +def _comm_neq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'str'): + from mpi4py import MPI + if op1 is MPI.COMM_WORLD or op1 is MPI.COMM_NULL: + return True + return False + + +##### MPI Collectives + + +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Bcast') @oprepo.replaces('dace.comm.Bcast') def _bcast(pv: 'ProgramVisitor', sdfg: SDFG, @@ -45,6 +189,41 @@ def _bcast(pv: 'ProgramVisitor', return None +@oprepo.replaces_method('Intracomm', 'Bcast') +def _intracomm_bcast(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + icomm: 'Intracomm', + buffer: str, + root: Union[str, sp.Expr, Number] = 0): + + """ Equivalent to `dace.comm.Bcast(buffer, root)`. 
""" + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + return _bcast(pv, sdfg, state, buffer, root) + + +@oprepo.replaces_method('ProcessGrid', 'Bcast') +def _pgrid_bcast(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + pgrid: str, + buffer: str, + root: Union[str, sp.Expr, Number] = 0): + + """ Equivalent to `dace.comm.Bcast(buffer, root, grid=pgrid)`. """ + + return _bcast(pv, sdfg, state, buffer, root, grid=pgrid) + + +def _mpi4py_to_MPI(MPI, op): + if op is MPI.SUM: + return 'MPI_SUM' + raise NotImplementedError + + @oprepo.replaces('dace.comm.Reduce') def _Reduce(pv: 'ProgramVisitor', sdfg: SDFG, @@ -75,8 +254,9 @@ def _Reduce(pv: 'ProgramVisitor', return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') -def _Allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): +def _allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): from dace.libraries.mpi.nodes.allreduce import Allreduce @@ -90,6 +270,46 @@ def _Allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, return None +@oprepo.replaces_method('Intracomm', 'Allreduce') +def _intracomm_allreduce(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + icomm: 'Intracomm', + inp_buffer: 'InPlace', + out_buffer: str, + op: str): + + """ Equivalent to `dace.comm.Allreduce(out_buffer, op)`. 
""" + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + if inp_buffer != MPI.IN_PLACE: + raise ValueError('DaCe currently supports in-place Allreduce only.') + if isinstance(op, MPI.Op): + op = _mpi4py_to_MPI(MPI, op) + return _allreduce(pv, sdfg, state, out_buffer, op) + + +@oprepo.replaces_method('ProcessGrid', 'Allreduce') +def _pgrid_allreduce(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + pgrid: str, + inp_buffer: 'InPlace', + out_buffer: str, + op: str): + + """ Equivalent to `dace.comm.Allreduce(out_buffer, op, grid=pgrid)`. """ + + from mpi4py import MPI + if inp_buffer != MPI.IN_PLACE: + raise ValueError('DaCe currently supports in-place Allreduce only.') + if isinstance(op, MPI.Op): + op = _mpi4py_to_MPI(MPI, op) + return _allreduce(pv, sdfg, state, out_buffer, op, grid=pgrid) + + @oprepo.replaces('dace.comm.Scatter') def _scatter(pv: 'ProgramVisitor', sdfg: SDFG, @@ -519,77 +739,6 @@ def _wait(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, request: str): return None -@oprepo.replaces('dace.comm.Cart_create') -def _cart_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, dims: ShapeType): - """ Creates a process-grid and adds it to the DaCe program. The process-grid is implemented with [MPI_Cart_create](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_create.html). - :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. - :return: Name of the new process-grid descriptor. - """ - pgrid_name = sdfg.add_pgrid(dims) - - # Dummy tasklet adds MPI variables to the program's state. 
- from dace.libraries.mpi import Dummy - tasklet = Dummy(pgrid_name, [ - f'MPI_Comm {pgrid_name}_comm;', - f'MPI_Group {pgrid_name}_group;', - f'int {pgrid_name}_coords[{len(dims)}];', - f'int {pgrid_name}_dims[{len(dims)}];', - f'int {pgrid_name}_rank;', - f'int {pgrid_name}_size;', - f'bool {pgrid_name}_valid;', - ]) - - state.add_node(tasklet) - - # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. - _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) - wnode = state.add_write(pgrid_name) - state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) - - return pgrid_name - - -@oprepo.replaces('dace.comm.Cart_sub') -def _cart_sub(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - parent_grid: str, - color: Sequence[Union[Integral, bool]], - exact_grid: RankType = None): - """ Partitions the `parent_grid` to lower-dimensional sub-grids and adds them to the DaCe program. - The sub-grids are implemented with [MPI_Cart_sub](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_sub.html). - :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`). - :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped (see `remain_dims` input of `MPI_Cart_sub`). - :param exact_grid: [DEVELOPER] If set then, out of all the sub-grids created, only the one that contains the rank with id `exact_grid` will be utilized for collective communication. - :return: Name of the new sub-grid descriptor. - """ - pgrid_name = sdfg.add_pgrid(parent_grid=parent_grid, color=color, exact_grid=exact_grid) - - # Count sub-grid dimensions. - pgrid_ndims = sum([bool(c) for c in color]) - - # Dummy tasklet adds MPI variables to the program's state. 
- from dace.libraries.mpi import Dummy - tasklet = Dummy(pgrid_name, [ - f'MPI_Comm {pgrid_name}_comm;', - f'MPI_Group {pgrid_name}_group;', - f'int {pgrid_name}_coords[{pgrid_ndims}];', - f'int {pgrid_name}_dims[{pgrid_ndims}];', - f'int {pgrid_name}_rank;', - f'int {pgrid_name}_size;', - f'bool {pgrid_name}_valid;', - ]) - - state.add_node(tasklet) - - # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. - _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) - wnode = state.add_write(pgrid_name) - state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) - - return pgrid_name - - @oprepo.replaces('dace.comm.Subarray') def _subarray(pv: 'ProgramVisitor', sdfg: SDFG, From a13a81d139515511405dfff05303d2dbd77a9982 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 26 Jul 2022 17:18:51 +0200 Subject: [PATCH 002/127] ProcessGrids now appear in defined variables and are explicitely returned by the gettype method. --- dace/frontend/python/newast.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index b41745ffaa..f1bdcbd97b 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1117,6 +1117,13 @@ def __init__(self, # Indirections self.indirections = dict() + # Add mpi4py.MPI.COMM_WORLD aliases to variables + # try: + # from mpi4py import MPI + # self.variables.update({k: "MPI_COMM_WORLD" for k, v in self.globals.items() if v is MPI.COMM_WORLD}) + # except: + # pass + @classmethod def progress_count(cls) -> int: """ Returns the number of parsed SDFGs so far within this run. """ @@ -1267,6 +1274,14 @@ def defined(self): # TODO: Is there a case of a variable-symbol? 
result.update({k: self.sdfg.symbols[v] for k, v in self.variables.items() if v in self.sdfg.symbols}) + # MPI-related stuff + result.update({k: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids}) + # try: + # from mpi4py import MPI + # result.update({k: v for k, v in self.globals.items() if v is MPI.COMM_WORLD}) + # except: + # pass + return result def _add_state(self, label=None): @@ -4453,7 +4468,9 @@ def _gettype(self, opnode: ast.AST) -> List[Tuple[str, str]]: result = [] for operand in operands: - if isinstance(operand, str) and operand in self.sdfg.arrays: + if isinstance(operand, str) and operand in self.sdfg.process_grids: + result.append((operand, type(self.sdfg.process_grids[operand]).__name__)) + elif isinstance(operand, str) and operand in self.sdfg.arrays: result.append((operand, type(self.sdfg.arrays[operand]).__name__)) elif isinstance(operand, str) and operand in self.scope_arrays: result.append((operand, type(self.scope_arrays[operand]).__name__)) From a8d56901b7dc8372131951e3a026fb11df4052e1 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 26 Jul 2022 17:19:30 +0200 Subject: [PATCH 003/127] Added MPIResolver to resolve mpi4py-related constants during preprocessing. --- dace/frontend/python/preprocessing.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 9f39648f09..9b6d3650c1 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -1316,6 +1316,28 @@ def find_disallowed_statements(node: ast.AST): return None +class MPIResolver(ast.NodeTransformer): + """ Resolves mpi4py-related constants, e.g., mpi4py.MPI.COMM_WORLD. 
""" + def __init__(self, globals: Dict[str, Any]): + from mpi4py import MPI + self.globals = globals + self.MPI = MPI + + def visit_Name(self, node: ast.Name) -> Union[ast.Name, ast.Attribute]: + if node.id in self.globals: + obj = self.globals[node.id] + if isinstance(obj, self.MPI.Comm): + lattr = ast.Attribute(ast.Name(id='mpi4py', ctx=ast.Load), attr='MPI') + if obj is self.MPI.COMM_WORLD: + return ast.copy_location(ast.Attribute(value=lattr, attr='COMM_WORLD'), node) + elif obj is self.MPI.COMM_NULL: + return ast.copy_location(ast.Attribute(value=lattr, attr='COMM_NULL'), node) + else: + raise DaceSyntaxError('Only the COMM_WORLD and COMM_NULL mpi4py.MPI communicators can be used ' + 'directly inside a DaCe Python program.') + return node + + def preprocess_dace_program(f: Callable[..., Any], argtypes: Dict[str, data.Data], global_vars: Dict[str, Any], @@ -1356,6 +1378,11 @@ def preprocess_dace_program(f: Callable[..., Any], newmod = global_vars[mod] #del global_vars[mod] global_vars[modval] = newmod + + try: + src_ast = MPIResolver(global_vars).visit(src_ast) + except ModuleNotFoundError: + pass # Resolve constants to their values (if they are not already defined in this scope) # and symbols to their names From ac480bd2ec75cf0587be113707f2ac570a8c44da Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 26 Jul 2022 17:20:09 +0200 Subject: [PATCH 004/127] Added mpi4py compatiblity tests. --- tests/library/mpi/mpi4py_test.py | 181 +++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 tests/library/mpi/mpi4py_test.py diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py new file mode 100644 index 0000000000..cc9968e4f8 --- /dev/null +++ b/tests/library/mpi/mpi4py_test.py @@ -0,0 +1,181 @@ +# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. 
+import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +import numpy as np +import pytest + + + +@pytest.mark.mpi +def test_process_grid_bcast(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def pgrid_bcast(A: dace.int32[10]): + pgrid = MPI.COMM_WORLD.Create_cart([1, size]) + if pgrid != MPI.COMM_NULL: + pgrid.Bcast(A) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = pgrid_bcast.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + if rank == 0: + A = np.arange(10, dtype=np.int32) + A_ref = A.copy() + else: + A = np.zeros((10, ), dtype=np.int32) + A_ref = A.copy() + + func(A=A) + pgrid_bcast.f(A_ref) + + assert(np.array_equal(A, A_ref)) + + +@pytest.mark.mpi +def test_sub_grid_bcast(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def subgrid_bcast(A: dace.int32[10], rank: dace.int32): + pgrid = commworld.Create_cart([2, size // 2]) + if pgrid != MPI.COMM_NULL: + sgrid = pgrid.Sub([False, True]) + pgrid.Bcast(A) + B = np.empty_like(A) + B[:] = rank % 10 + if pgrid != MPI.COMM_NULL: + sgrid.Bcast(B) + A[:] = B + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = subgrid_bcast.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + if rank == 0: + A = np.arange(10, dtype=np.int32) + else: + A = np.ones((10, ), dtype=np.int32) + A_ref = A.copy() + + func(A=A, rank=rank) + subgrid_bcast.f(A_ref, rank) + + assert(np.array_equal(A, A_ref)) + + +def initialize_3mm(b_NI: int, b_NJ: int, b_NK: int, b_NL: int, b_NM: int, + ts_NI: int, ts_NJ: int, ts_NK, ts_NL: int, ts_NM: int, + NI: int, NJ: int, NK: int, NL: int, NM: int, + datatype: type = np.float64): + + A = np.fromfunction(lambda i, k: b_NK + k + 1, 
(ts_NI, ts_NK), dtype=datatype) + B = np.eye(ts_NK, ts_NJ, b_NK - b_NJ) + C = np.fromfunction(lambda j, m: b_NJ + j + 1, (ts_NJ, ts_NM), dtype=datatype) + D = np.eye(ts_NM, ts_NL, b_NM - b_NL) + + if b_NI + ts_NI > NI: + A[NI - b_NI:] = 0 + if b_NJ + ts_NJ > NJ: + B[:, NJ - b_NJ:] = 0 + C[NJ - b_NJ:] = 0 + if b_NK + ts_NJ > NK: + A[:, NK - b_NK:] = 0 + B[NK - b_NK:] = 0 + if b_NL + ts_NL > NL: + D[:NL - b_NL] = 0 + if b_NM + ts_NM > NM: + C[:NM - b_NM] = 0 + D[NM - b_NM:] = 0 + + return A, B, C, D + + +@pytest.mark.mpi +def test_3mm(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def k3mm(A, B, C, D): + cart_comm = commworld.Create_cart([1, size, 1]) + if cart_comm != MPI.COMM_NULL: + + ab_reduce_comm = cart_comm.Sub([False, False, True]) + cd_reduce_comm = cart_comm.Sub([True, False, False]) + abcd_reduce_comm = cart_comm.Sub([False, True, False]) + + ab = A @ B + ab_reduce_comm.Allreduce(MPI.IN_PLACE, ab, op=MPI.SUM) + cd = C @ D + cd_reduce_comm.Allreduce(MPI.IN_PLACE, cd, op=MPI.SUM) + E = ab @ cd + abcd_reduce_comm.Allreduce(MPI.IN_PLACE, E, op=MPI.SUM) + + return E + + N = 128 + assert(size <= 128) + + NI, NJ, NK, NL, NM = (N,) * 5 + PNI, PNJ, PNK, PNL, PNM = 1, 2, 1, 1, 1 + + cart_comm = commworld.Create_cart([1, size, 1]) + cart_rank = cart_comm.Get_rank() + cart_size = cart_comm.Get_size() + cart_coords = cart_comm.Get_coords(cart_rank) + + ts_NI = int(np.ceil(NI / PNI)) + ts_NJ = int(np.ceil(NJ / PNJ)) + ts_NK = int(np.ceil(NJ / PNK)) + ts_NL = int(np.ceil(NL / PNL)) + ts_NM = int(np.ceil(NM / PNM)) + + b_NI = cart_coords[0] * ts_NI + b_NJ = cart_coords[1] * ts_NJ + b_NK = cart_coords[2] * ts_NK + b_NL = cart_coords[2] * ts_NL + b_NM = cart_coords[0] * ts_NM + A, B, C, D = initialize_3mm(b_NI, b_NJ, b_NK, b_NL, b_NM, ts_NI, ts_NJ, ts_NK, ts_NL, ts_NM, NI, NJ, NK, NL, NM) + + sdfg = None + if rank == 0: + sdfg = k3mm.to_sdfg(A=A, B=B, C=C, D=D) + func = 
utils.distributed_compile(sdfg, commworld) + + E = func(A=A, B=B, C=C, D=D) + commworld.Barrier() + E_ref = k3mm.f(A, B, C, D) + commworld.Barrier() + + if E_ref is not None: + assert(np.array_equal(E, E_ref)) + + + +if __name__ == "__main__": + + test_process_grid_bcast() + test_sub_grid_bcast() + test_3mm() From d2292669d7b4b88fc2987239a3805ed67b60164a Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:54:26 +0200 Subject: [PATCH 005/127] Made opaque type for MPI_Request a basic dace type. --- dace/dtypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dace/dtypes.py b/dace/dtypes.py index 0055eef837..a622b697c2 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1115,6 +1115,7 @@ def isconstant(var): float64 = typeclass(numpy.float64) complex64 = typeclass(numpy.complex64) complex128 = typeclass(numpy.complex128) +MPI_Request = opaque('MPI_Request') @undefined_safe_enum From f933974fe9d378716c16b499e76f3e805d1b2fba Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:55:30 +0200 Subject: [PATCH 006/127] Adjusted existing Isend/Irecv replacements and added new ones for mpi4py compatibility. 
--- dace/frontend/common/distr.py | 99 +++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 4 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index c201e7ce14..c34fe54f41 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -370,6 +370,8 @@ def _gather(pv: 'ProgramVisitor', return None +##### Point-To-Point Communication + @oprepo.replaces('dace.comm.Send') def _send(pv: 'ProgramVisitor', sdfg: SDFG, @@ -442,13 +444,19 @@ def _send(pv: 'ProgramVisitor', return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Isend') @oprepo.replaces('dace.comm.Isend') def _isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, dst: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number], request: str): + tag: Union[str, sp.Expr, Number], request: str = None, grid: str = None): from dace.libraries.mpi.nodes.isend import Isend - libnode = Isend('_Isend_') + ret_req = False + if not request: + ret_req = True + request, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + + libnode = Isend('_Isend_', grid=grid) buf_range = None if isinstance(buffer, tuple): @@ -523,9 +531,47 @@ def _isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, dst: state.add_edge(tag_node, None, libnode, '_tag', tag_mem) state.add_edge(libnode, '_request', req_node, None, req_mem) + if ret_req: + return request return None +@oprepo.replaces_method('Intracomm', 'Isend') +def _intracomm_isend(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + icomm: 'Intracomm', + buffer: str, + dst: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number]): + + """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req)`. 
""" + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + req, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + _isend(pv, sdfg, state, buffer, dst, tag, req) + return req + + +@oprepo.replaces_method('ProcessGrid', 'Isend') +def _pgrid_isend(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + pgrid: str, + buffer: str, + dst: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number]): + + """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req, grid=pgrid)`. """ + + from mpi4py import MPI + req, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + _isend(pv, sdfg, state, buffer, dst, tag, req, grid=pgrid) + return req + + @oprepo.replaces('dace.comm.Recv') def _recv(pv: 'ProgramVisitor', sdfg: SDFG, @@ -598,13 +644,19 @@ def _recv(pv: 'ProgramVisitor', return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Irecv') @oprepo.replaces('dace.comm.Irecv') def _irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, src: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number], request: str): + tag: Union[str, sp.Expr, Number], request: str = None, grid: str = None): from dace.libraries.mpi.nodes.irecv import Irecv - libnode = Irecv('_Irecv_') + ret_req = False + if not request: + ret_req = True + request, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + + libnode = Irecv('_Irecv_', grid=grid) buf_range = None if isinstance(buffer, tuple): @@ -677,9 +729,47 @@ def _irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, src: state.add_edge(tag_node, None, libnode, '_tag', tag_mem) state.add_edge(libnode, '_request', req_node, None, req_mem) + if ret_req: + return request return None +@oprepo.replaces_method('Intracomm', 'Irecv') +def 
_intracomm_irecv(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + icomm: 'Intracomm', + buffer: str, + src: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number]): + + """ Equivalent to `dace.comm.Irecv(buffer, src, tag, req)`. """ + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + req, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + _irecv(pv, sdfg, state, buffer, src, tag, req) + return req + + +@oprepo.replaces_method('ProcessGrid', 'Irecv') +def _pgrid_irecv(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + pgrid: str, + buffer: str, + src: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number]): + + """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req, grid=pgrid)`. """ + + from mpi4py import MPI + req, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + _irecv(pv, sdfg, state, buffer, src, tag, req, grid=pgrid) + return req + + @oprepo.replaces('dace.comm.Wait') def _wait(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, request: str): @@ -713,6 +803,7 @@ def _wait(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, request: str): return None +@oprepo.replaces('mpi4py.MPI.Request.Waitall') @oprepo.replaces('dace.comm.Waitall') def _wait(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, request: str): From c2240131bb32005dc81746e963d4bd20e94fc2a8 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:57:04 +0200 Subject: [PATCH 007/127] Adjusted visit_Attribute of MPI_Resolver to not trigger to calls of MPI.Request. Added preprocessor class for converting modulo expressions for C/C++ compatibility. 
--- dace/frontend/python/preprocessing.py | 46 +++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 9b6d3650c1..f465ae8e02 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -1322,20 +1322,61 @@ def __init__(self, globals: Dict[str, Any]): from mpi4py import MPI self.globals = globals self.MPI = MPI + self.parent = None + + def visit(self, node): + node.parent = self.parent + self.parent = node + node = super().visit(node) + if isinstance(node, ast.AST): + self.parent = node.parent + return node def visit_Name(self, node: ast.Name) -> Union[ast.Name, ast.Attribute]: + self.generic_visit(node) if node.id in self.globals: obj = self.globals[node.id] if isinstance(obj, self.MPI.Comm): lattr = ast.Attribute(ast.Name(id='mpi4py', ctx=ast.Load), attr='MPI') if obj is self.MPI.COMM_WORLD: - return ast.copy_location(ast.Attribute(value=lattr, attr='COMM_WORLD'), node) + newnode = ast.copy_location(ast.Attribute(value=lattr, attr='COMM_WORLD'), node) + newnode.parent = node.parent + return newnode elif obj is self.MPI.COMM_NULL: - return ast.copy_location(ast.Attribute(value=lattr, attr='COMM_NULL'), node) + newnode = ast.copy_location(ast.Attribute(value=lattr, attr='COMM_NULL'), node) + newnode.parent = node.parent + return newnode else: raise DaceSyntaxError('Only the COMM_WORLD and COMM_NULL mpi4py.MPI communicators can be used ' 'directly inside a DaCe Python program.') return node + + def visit_Attribute(self, node: ast.Attribute) -> ast.Attribute: + self.generic_visit(node) + if isinstance(node.attr, str) and node.attr == 'Request': + try: + val = astutils.evalnode(node, self.globals) + if val is self.MPI.Request and not isinstance(node.parent, ast.Attribute): + newnode = ast.copy_location( + ast.Attribute(value=ast.Name(id='dace', ctx=ast.Load), attr='MPI_Request'), node) + newnode.parent = 
node.parent + return newnode + except SyntaxError: + pass + return node + + +class ModuloConverter(ast.NodeTransformer): + """ Converts a % b expressions to (a + b) % b for C/C++ compatibility. """ + + def visit_BinOp(self, node: ast.BinOp) -> ast.BinOp: + if isinstance(node.op, ast.Mod): + left = self.generic_visit(node.left) + right = self.generic_visit(node.right) + newleft = ast.copy_location(ast.BinOp(left=left, op=ast.Add(), right=copy.deepcopy(right)), left) + node.left = newleft + return node + return self.generic_visit(node) def preprocess_dace_program(f: Callable[..., Any], @@ -1383,6 +1424,7 @@ def preprocess_dace_program(f: Callable[..., Any], src_ast = MPIResolver(global_vars).visit(src_ast) except ModuleNotFoundError: pass + src_ast = ModuloConverter().visit(src_ast) # Resolve constants to their values (if they are not already defined in this scope) # and symbols to their names From ebe22ed82b20bc7105c020b135e0e07a4416cbd2 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:57:35 +0200 Subject: [PATCH 008/127] Replacement for numpy full now also works with (scalar) data. --- dace/frontend/python/replacements.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 4a673f1179..411f8e551c 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -281,26 +281,38 @@ def _numpy_full(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, shape: Shape, - fill_value: Union[sp.Expr, Number], + fill_value: Union[sp.Expr, Number, data.Scalar], dtype: dace.typeclass = None): """ Creates and array of the specified shape and initializes it with the fill value. 
""" + is_data = False if isinstance(fill_value, (Number, np.bool_)): vtype = dtypes.DTYPE_TO_TYPECLASS[type(fill_value)] elif isinstance(fill_value, sp.Expr): vtype = _sym_type(fill_value) else: - raise mem_parser.DaceSyntaxError(pv, None, "Fill value {f} must be a number!".format(f=fill_value)) + is_data = True + vtype = sdfg.arrays[fill_value].dtype + # raise mem_parser.DaceSyntaxError(pv, None, "Fill value {f} must be a number!".format(f=fill_value)) dtype = dtype or vtype name, _ = sdfg.add_temp_transient(shape, dtype) - state.add_mapped_tasklet( - '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape)}, {}, - "__out = {}".format(fill_value), - dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), - external_edges=True) + if is_data: + state.add_mapped_tasklet( + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, + dict(__inp=dace.Memlet(data=fill_value, subset='0')), + "__out = __inp", + dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), + external_edges=True) + else: + state.add_mapped_tasklet( + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, {}, + "__out = {}".format(fill_value), + dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), + external_edges=True) return name From 2bfcea90c615ef4443cb14a43b69541952fdd184 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:58:09 +0200 Subject: [PATCH 009/127] Isend/Irecv can now use communicators other than COMM_WORLD. 
--- dace/libraries/mpi/nodes/irecv.py | 12 ++++++++++-- dace/libraries/mpi/nodes/isend.py | 21 ++++++++++++--------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/dace/libraries/mpi/nodes/irecv.py b/dace/libraries/mpi/nodes/irecv.py index 903bed7543..ad43cb4103 100644 --- a/dace/libraries/mpi/nodes/irecv.py +++ b/dace/libraries/mpi/nodes/irecv.py @@ -20,6 +20,11 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): if buffer.dtype.veclen > 1: raise NotImplementedError + + comm = "MPI_COMM_WORLD" + if node.grid: + comm = f"__state->{node.grid}_comm" + code = "" if ddt is not None: code = f"""static MPI_Datatype newtype; @@ -33,7 +38,7 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): mpi_dtype_str = "newtype" count_str = "1" buffer_offset = 0 #this is here because the frontend already changes the pointer - code += f"MPI_Irecv(_buffer, {count_str}, {mpi_dtype_str}, _src, _tag, MPI_COMM_WORLD, _request);" + code += f"MPI_Irecv(_buffer, {count_str}, {mpi_dtype_str}, int(_src), int(_tag), {comm}, _request);" if ddt is not None: code += f"""// MPI_Type_free(&newtype); """ @@ -58,8 +63,11 @@ class Irecv(MPINode): } default_implementation = "MPI" - def __init__(self, name, *args, **kwargs): + grid = dace.properties.Property(dtype=str, allow_none=True, default=None) + + def __init__(self, name, grid=None, *args, **kwargs): super().__init__(name, *args, inputs={"_src", "_tag"}, outputs={"_buffer", "_request"}, **kwargs) + self.grid = grid def validate(self, sdfg, state): """ diff --git a/dace/libraries/mpi/nodes/isend.py b/dace/libraries/mpi/nodes/isend.py index 342bf2b420..cfd69e46ab 100644 --- a/dace/libraries/mpi/nodes/isend.py +++ b/dace/libraries/mpi/nodes/isend.py @@ -20,6 +20,10 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): if buffer.dtype.veclen > 1: raise NotImplementedError + + comm = "MPI_COMM_WORLD" + if node.grid: + comm = f"__state->{node.grid}_comm" code = "" @@ -40,7 +44,7 @@ def 
expansion(node, parent_state, parent_sdfg, n=None, **kwargs): mpi_dtype_str = "newtype" count_str = "1" buffer_offset = 0 - code += f"MPI_Isend(&(_buffer[{buffer_offset}]), {count_str}, {mpi_dtype_str}, _dest, _tag, MPI_COMM_WORLD, _request);" + code += f"MPI_Isend(&(_buffer[{buffer_offset}]), {count_str}, {mpi_dtype_str}, int(_dest), int(_tag), {comm}, _request);" if ddt is not None: code += f"""// MPI_Type_free(&newtype); """ @@ -68,13 +72,12 @@ class Isend(MPINode): } default_implementation = "MPI" - # Object fields - n = dace.properties.SymbolicProperty(allow_none=True, default=None) - + grid = dace.properties.Property(dtype=str, allow_none=True, default=None) nosync = dace.properties.Property(dtype=bool, default=False, desc="Do not sync if memory is on GPU") - def __init__(self, name, *args, **kwargs): + def __init__(self, name, grid=None, *args, **kwargs): super().__init__(name, *args, inputs={"_buffer", "_dest", "_tag"}, outputs={"_request"}, **kwargs) + self.grid = grid def validate(self, sdfg, state): """ @@ -93,10 +96,10 @@ def validate(self, sdfg, state): if e.src_conn == "_request": req = sdfg.arrays[e.data.data] - if dest.dtype.base_type != dace.dtypes.int32: - raise ValueError("Source must be an integer!") - if tag.dtype.base_type != dace.dtypes.int32: - raise ValueError("Tag must be an integer!") + # if dest.dtype.base_type != dace.dtypes.int32: + # raise ValueError("Destination must be an integer!") + # if tag.dtype.base_type != dace.dtypes.int32: + # raise ValueError("Tag must be an integer!") count_str = "XXX" for _, _, _, dst_conn, data in state.in_edges(self): From fd0dc4076f4032abb1f39df7fafb153714152c6c Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:58:58 +0200 Subject: [PATCH 010/127] Added mpi4py-compatible Isend/Irecv test. 
--- tests/library/mpi/mpi4py_test.py | 37 +++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index cc9968e4f8..7c314b7516 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -173,9 +173,40 @@ def k3mm(A, B, C, D): assert(np.array_equal(E, E_ref)) +@pytest.mark.mpi +def test_isend_irecv(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def chain(rank: dace.int32, size: dace.int32): + src = (rank - 1) % size + dst = (rank + 1) % size + req = np.empty((2, ), dtype=MPI.Request) + sbuf = np.full((1,), rank, dtype=np.int32) + req[0] = commworld.Isend(sbuf, dst, tag=0) + rbuf = np.empty((1, ), dtype=np.int32) + req[1] = commworld.Irecv(rbuf, src, tag=0) + MPI.Request.Waitall(req) + return rbuf + + sdfg = None + if rank == 0: + sdfg = chain.to_sdfg(simplify=True) + func = utils.distributed_compile(sdfg, commworld) + + val = func(rank=rank, size=size) + ref = chain.f(rank, size) + + assert(val[0] == ref[0]) + if __name__ == "__main__": - test_process_grid_bcast() - test_sub_grid_bcast() - test_3mm() + # test_process_grid_bcast() + # test_sub_grid_bcast() + # test_3mm() + test_isend_irecv() From 7ca527ca13d7e9fe62b6d73d1e002f19d402d157 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 19 Jun 2023 22:37:00 +0800 Subject: [PATCH 011/127] Updated mpi_allgather_test.py for coding style consistency --- tests/library/mpi/mpi_allgather_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/library/mpi/mpi_allgather_test.py b/tests/library/mpi/mpi_allgather_test.py index 1f0a30a4d1..1eebcd5676 100644 --- a/tests/library/mpi/mpi_allgather_test.py +++ b/tests/library/mpi/mpi_allgather_test.py @@ -22,7 +22,10 @@ def make_sdfg(dtype): outA = state.add_access("outA") allgather_node = 
mpi.nodes.allgather.Allgather("allgather") - state.add_memlet_path(inA, allgather_node, dst_conn="_inbuffer", memlet=Memlet.simple(inA, "0:n", num_accesses=n)) + state.add_memlet_path(inA, + allgather_node, + dst_conn="_inbuffer", + memlet=Memlet.simple(inA, "0:n", num_accesses=n)) state.add_memlet_path(allgather_node, outA, src_conn="_outbuffer", From 38be7493036c6401bba1d39c5bcba0e8b76af6ee Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 19 Jun 2023 22:55:59 +0800 Subject: [PATCH 012/127] Added alltoall node basic version based on other collectives --- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/alltoall.py | 84 ++++++++++++++++++++++++++ tests/library/mpi/mpi_alltoall_test.py | 78 ++++++++++++++++++++++++ 3 files changed, 163 insertions(+) create mode 100644 dace/libraries/mpi/nodes/alltoall.py create mode 100644 tests/library/mpi/mpi_alltoall_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index b4789d952e..0cd36cc82f 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -10,5 +10,6 @@ from .reduce import Reduce from .allreduce import Allreduce from .allgather import Allgather +from .alltoall import Alltoall from .dummy import Dummy from .redistribute import Redistribute diff --git a/dace/libraries/mpi/nodes/alltoall.py b/dace/libraries/mpi/nodes/alltoall.py new file mode 100644 index 0000000000..b0accfb52d --- /dev/null +++ b/dace/libraries/mpi/nodes/alltoall.py @@ -0,0 +1,84 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. 
import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandAlltoallMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): + (inbuffer, in_count_str), (outbuffer, out_count_str) = node.validate(parent_sdfg, parent_state) + in_mpi_dtype_str = dace.libraries.mpi.utils.MPI_DDT(inbuffer.dtype.base_type) + out_mpi_dtype_str = dace.libraries.mpi.utils.MPI_DDT(outbuffer.dtype.base_type) + + if inbuffer.dtype.veclen > 1: + raise (NotImplementedError) + + comm = "MPI_COMM_WORLD" + if node.grid: + comm = f"__state->{node.grid}_comm" + + # code = f""" + # MPI_Alltoall({buffer}, {count_str}, {mpi_dtype_str}, _outbuffer, {count_str}, {mpi_dtype_str}, {comm}); + # """ + code = f""" + MPI_Alltoall(_inbuffer, {in_count_str}, {in_mpi_dtype_str}, \ + _outbuffer, {out_count_str}, {out_mpi_dtype_str}, \ + {comm}); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP) + return tasklet + + +@dace.library.node +class Alltoall(MPINode): + + # Global properties + implementations = { + "MPI": ExpandAlltoallMPI, + } + default_implementation = "MPI" + + grid = dace.properties.Property(dtype=str, allow_none=True, default=None) + + def __init__(self, name, grid=None, *args, **kwargs): + super().__init__(name, *args, inputs={"_inbuffer"}, outputs={"_outbuffer"}, **kwargs) + self.grid = grid + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer, root) of the three data descriptors in the + parent SDFG. 
+ """ + + inbuffer, outbuffer = None, None + for e in state.out_edges(self): + if e.src_conn == "_outbuffer": + outbuffer = sdfg.arrays[e.data.data] + for e in state.in_edges(self): + if e.dst_conn == "_inbuffer": + inbuffer = sdfg.arrays[e.data.data] + + in_count_str = "XXX" + out_count_str = "XXX" + for _, src_conn, _, _, data in state.out_edges(self): + if src_conn == '_outbuffer': + dims = [str(e) for e in data.subset.size_exact()] + out_count_str = "*".join(dims) + for _, _, _, dst_conn, data in state.in_edges(self): + if dst_conn == '_inbuffer': + dims = [str(e) for e in data.subset.size_exact()] + in_count_str = "*".join(dims) + + return (inbuffer, in_count_str), (outbuffer, out_count_str) diff --git a/tests/library/mpi/mpi_alltoall_test.py b/tests/library/mpi/mpi_alltoall_test.py new file mode 100644 index 0000000000..cf155fc640 --- /dev/null +++ b/tests/library/mpi/mpi_alltoall_test.py @@ -0,0 +1,78 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+import dace +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import numpy as np +import pytest + +############################################################################### + + +def make_sdfg(dtype): + + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_alltoall") + state = sdfg.add_state("dataflow") + + sdfg.add_array("inbuf", [n], dtype, transient=False) + sdfg.add_array("outbuf", [n], dtype, transient=False) + inbuf = state.add_access("inbuf") + outbuf = state.add_access("outbuf") + alltoall_node = mpi.nodes.alltoall.Alltoall("alltoall") + + state.add_memlet_path(inbuf, + alltoall_node, + dst_conn="_inbuffer", + memlet=Memlet.simple(inbuf, "0:n", num_accesses=n)) + state.add_memlet_path(alltoall_node, + outbuf, + src_conn="_outbuffer", + memlet=Memlet.simple(outbuf, "0:n", num_accesses=n)) + + return sdfg + + +############################################################################### + + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.float64, marks=pytest.mark.mpi) +]) +def test_mpi(implementation, dtype): + from mpi4py import MPI as MPI4PY + np_dtype = getattr(np, dtype.to_string()) + comm = MPI4PY.COMM_WORLD + rank = comm.Get_rank() + commsize = comm.Get_size() + mpi_sdfg = None + if commsize < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + for r in range(0, commsize): + if r == rank: + sdfg = make_sdfg(dtype) + mpi_sdfg = sdfg.compile() + comm.Barrier() + + size = 128 + size_per_proc = int(size/commsize) + A = np.arange(0, size, dtype=np_dtype) + B = np.full(size, 0, dtype=np_dtype) + mpi_sdfg(inbuf=A, outbuf=B, n=size_per_proc) + + # now B should be an array of size, + # containing (size / size_per_proc) repeated chunked_data + chunked_data = A[rank * size_per_proc: (rank + 1) * size_per_proc] + correct_data = np.tile(chunked_data, int(size / size_per_proc)) + if (not np.allclose(B, correct_data)): + 
raise (ValueError("The received values are not what I expected on root.")) + + +############################################################################### + +if __name__ == "__main__": + test_mpi("MPI", dace.float32) + test_mpi("MPI", dace.float64) + +############################################################################### From e5085aeecdb6dba457053358e898bc75edc81378 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 22 Jun 2023 21:17:44 +0800 Subject: [PATCH 013/127] Fixed mpi_send_recv_test.py --- tests/library/mpi/mpi_send_recv_test.py | 35 +++++++++++-------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/tests/library/mpi/mpi_send_recv_test.py b/tests/library/mpi/mpi_send_recv_test.py index 52034111a5..48c8170949 100644 --- a/tests/library/mpi/mpi_send_recv_test.py +++ b/tests/library/mpi/mpi_send_recv_test.py @@ -1,5 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import dace +from dace.sdfg import utils from dace.memlet import Memlet import dace.libraries.mpi as mpi import numpy as np @@ -75,21 +76,15 @@ def test_mpi(): ############################################################################### -myrank = dace.symbol('myrank', dtype=dace.int32) -mysize = dace.symbol('mysize', dtype=dace.int32) - - @dace.program -def dace_send_recv(): - tmp1 = np.full([1], myrank, dtype=np.int32) - tmp2 = np.zeros([1], dtype=np.int32) - if myrank == 0: - dace.comm.Send(tmp1, 1, tag=42) - dace.comm.Recv(tmp2, mysize - 1, tag=42) - else: - dace.comm.Recv(tmp2, (myrank - 1) % mysize, tag=42) - dace.comm.Send(tmp1, (myrank + 1) % mysize, tag=42) - return tmp2 +def dace_send_recv(rank: dace.int32, size: dace.int32): + src = np.full([1], (rank - 1) % size, dtype=np.int32) + dst = np.full([1], (rank + 1) % size, dtype=np.int32) + sbuf = np.full([1], rank, dtype=np.int32) + rbuf = np.zeros([1], dtype=np.int32) + dace.comm.Recv(rbuf, src, tag=42) + dace.comm.Send(sbuf, dst, tag=42) + return rbuf 
@pytest.mark.mpi @@ -101,14 +96,14 @@ def test_dace_send_recv(): mpi_sdfg = None if commsize < 2: raise ValueError("This test is supposed to be run with at least two processes!") - for r in range(0, commsize): - if r == rank: - mpi_sdfg = dace_send_recv.compile() - comm.Barrier() + sdfg = None + if rank == 0: + sdfg = dace_send_recv.to_sdfg(simplify=True) + mpi_sdfg = utils.distributed_compile(sdfg, comm) - prv_rank = mpi_sdfg(myrank=rank, mysize=commsize) + val = mpi_sdfg(rank=rank, size=commsize) - assert (prv_rank[0] == (rank - 1) % commsize) + assert (val[0] == (rank - 1) % commsize) ############################################################################### From f8c9550db8fcfa28ae9c8a54663fd9da08f1b0aa Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 22 Jun 2023 21:26:06 +0800 Subject: [PATCH 014/127] Added mpi4py replacement for send/recv --- dace/frontend/common/distr.py | 2 ++ tests/library/mpi/mpi4py_test.py | 41 ++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index c34fe54f41..c47040728f 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -372,6 +372,7 @@ def _gather(pv: 'ProgramVisitor', ##### Point-To-Point Communication +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Send') @oprepo.replaces('dace.comm.Send') def _send(pv: 'ProgramVisitor', sdfg: SDFG, @@ -572,6 +573,7 @@ def _pgrid_isend(pv: 'ProgramVisitor', return req +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Recv') @oprepo.replaces('dace.comm.Recv') def _recv(pv: 'ProgramVisitor', sdfg: SDFG, diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 7c314b7516..603a6786cb 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -175,14 +175,13 @@ def k3mm(A, B, C, D): @pytest.mark.mpi def test_isend_irecv(): - from mpi4py import MPI commworld = MPI.COMM_WORLD rank = commworld.Get_rank() size = 
commworld.Get_size() @dace.program - def chain(rank: dace.int32, size: dace.int32): + def mpi4py_isend_irecv(rank: dace.int32, size: dace.int32): src = (rank - 1) % size dst = (rank + 1) % size req = np.empty((2, ), dtype=MPI.Request) @@ -192,21 +191,49 @@ def chain(rank: dace.int32, size: dace.int32): req[1] = commworld.Irecv(rbuf, src, tag=0) MPI.Request.Waitall(req) return rbuf - + sdfg = None if rank == 0: - sdfg = chain.to_sdfg(simplify=True) + sdfg = mpi4py_isend_irecv.to_sdfg(simplify=True) func = utils.distributed_compile(sdfg, commworld) val = func(rank=rank, size=size) - ref = chain.f(rank, size) + ref = mpi4py_isend_irecv.f(rank, size) - assert(val[0] == ref[0]) + assert (val[0] == ref[0]) -if __name__ == "__main__": +@pytest.mark.mpi +def test_send_recv(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + @dace.program + def mpi4py_send_recv(rank: dace.int32, size: dace.int32): + src = np.full([1], (rank - 1) % size, dtype=np.int32) + dst = np.full([1], (rank + 1) % size, dtype=np.int32) + sbuf = np.full((1,), rank, dtype=np.int32) + commworld.Send(sbuf, dst, tag=0) + rbuf = np.empty((1, ), dtype=np.int32) + commworld.Recv(rbuf, src, tag=0) + return rbuf + + sdfg = None + if rank == 0: + sdfg = mpi4py_send_recv.to_sdfg(simplify=True) + func = utils.distributed_compile(sdfg, commworld) + + val = func(rank=rank, size=size) + ref = mpi4py_send_recv.f(rank, size) + + assert (val[0] == ref[0]) + + +if __name__ == "__main__": # test_process_grid_bcast() # test_sub_grid_bcast() # test_3mm() test_isend_irecv() + test_send_recv() From 345c36b01820ecc12f0b8c43197fd1d060a76401 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 6 Jul 2023 14:24:48 +0800 Subject: [PATCH 015/127] Updated mpi_send_recv_test.py for correctness of blocking comm --- tests/library/mpi/mpi_send_recv_test.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git 
a/tests/library/mpi/mpi_send_recv_test.py b/tests/library/mpi/mpi_send_recv_test.py index 48c8170949..ec094e7cf5 100644 --- a/tests/library/mpi/mpi_send_recv_test.py +++ b/tests/library/mpi/mpi_send_recv_test.py @@ -82,8 +82,12 @@ def dace_send_recv(rank: dace.int32, size: dace.int32): dst = np.full([1], (rank + 1) % size, dtype=np.int32) sbuf = np.full([1], rank, dtype=np.int32) rbuf = np.zeros([1], dtype=np.int32) - dace.comm.Recv(rbuf, src, tag=42) - dace.comm.Send(sbuf, dst, tag=42) + if rank % 2 == 0: + dace.comm.Recv(rbuf, src, tag=42) + dace.comm.Send(sbuf, dst, tag=42) + else: + dace.comm.Send(sbuf, dst, tag=42) + dace.comm.Recv(rbuf, src, tag=42) return rbuf @@ -99,10 +103,11 @@ def test_dace_send_recv(): sdfg = None if rank == 0: sdfg = dace_send_recv.to_sdfg(simplify=True) + # disable openMP section for blocking + sdfg.openmp_sections = False mpi_sdfg = utils.distributed_compile(sdfg, comm) val = mpi_sdfg(rank=rank, size=commsize) - assert (val[0] == (rank - 1) % commsize) From 1cea59ec62a3ecaaa224d0dbe9949b6db6bd4329 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 6 Jul 2023 16:15:40 +0800 Subject: [PATCH 016/127] Updated Isend/Irecv test --- tests/library/mpi/mpi_isend_irecv_test.py | 65 ++++++++++------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/tests/library/mpi/mpi_isend_irecv_test.py b/tests/library/mpi/mpi_isend_irecv_test.py index 0c9a1ef0a9..9fab8c0158 100644 --- a/tests/library/mpi/mpi_isend_irecv_test.py +++ b/tests/library/mpi/mpi_isend_irecv_test.py @@ -1,5 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
import dace +from dace.sdfg import utils from dace.memlet import Memlet import dace.libraries.mpi as mpi import numpy as np @@ -104,55 +105,45 @@ def _test_mpi(info, sdfg, dtype): raise (ValueError("The received values are not what I expected.")) -# TODO: The test deadlocks in the CI (Ubuntu 18.04, MPICH 3.3a2) -# but works fine in up-to-date systems, including when using pytest. -@pytest.mark.skip +@pytest.mark.mpi def test_mpi(): - _test_mpi("MPI Send/Recv", make_sdfg(np.float64), np.float64) - + _test_mpi("MPI Isend/Irecv", make_sdfg(np.float64), np.float64) ############################################################################### -myrank = dace.symbol('myrank', dtype=dace.int32) -mysize = dace.symbol('mysize', dtype=dace.int32) - +@pytest.mark.mpi +def test_isend_irecv(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() -@dace.program -def dace_send_recv(): - tmp1 = np.full([1], myrank, dtype=np.int32) - tmp2 = np.zeros([1], dtype=np.int32) - if myrank == 0: - dace.comm.Send(tmp1, 1, tag=42) - dace.comm.Recv(tmp2, mysize - 1, tag=42) - else: - dace.comm.Recv(tmp2, (myrank - 1) % mysize, tag=42) - dace.comm.Send(tmp1, (myrank + 1) % mysize, tag=42) - return tmp2 + @dace.program + def mpi4py_isend_irecv(rank: dace.int32, size: dace.int32): + src = (rank - 1) % size + dst = (rank + 1) % size + req = np.empty((2, ), dtype=MPI.Request) + sbuf = np.full((1,), rank, dtype=np.int32) + req[0] = commworld.Isend(sbuf, dst, tag=0) + rbuf = np.empty((1, ), dtype=np.int32) + req[1] = commworld.Irecv(rbuf, src, tag=0) + MPI.Request.Waitall(req) + return rbuf + sdfg = None + if rank == 0: + sdfg = mpi4py_isend_irecv.to_sdfg(simplify=True) + func = utils.distributed_compile(sdfg, commworld) -# TODO: The test is redundant. It must be updated to use Isend/Irecv. 
-@pytest.mark.skip -def test_dace_send_recv(): - from mpi4py import MPI as MPI4PY - comm = MPI4PY.COMM_WORLD - rank = comm.Get_rank() - commsize = comm.Get_size() - mpi_sdfg = None - if commsize < 2: - raise ValueError("This test is supposed to be run with at least two processes!") - for r in range(0, commsize): - if r == rank: - mpi_sdfg = dace_send_recv.compile() - comm.Barrier() - - prv_rank = mpi_sdfg(myrank=rank, mysize=commsize) + val = func(rank=rank, size=size) + ref = mpi4py_isend_irecv.f(rank, size) - assert (prv_rank[0] == (rank - 1) % commsize) + assert (val[0] == ref[0]) ############################################################################### if __name__ == "__main__": test_mpi() - test_dace_send_recv() + test_isend_irecv() ############################################################################### From b82b06ac2ca122789e1d4c2bbbc2a45b277faf36 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 7 Jul 2023 00:01:49 +0800 Subject: [PATCH 017/127] Updated alltoall library node for logical correctness --- dace/libraries/mpi/nodes/alltoall.py | 10 +++++----- tests/library/mpi/mpi_alltoall_test.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dace/libraries/mpi/nodes/alltoall.py b/dace/libraries/mpi/nodes/alltoall.py index b0accfb52d..92be24ce45 100644 --- a/dace/libraries/mpi/nodes/alltoall.py +++ b/dace/libraries/mpi/nodes/alltoall.py @@ -25,12 +25,12 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): if node.grid: comm = f"__state->{node.grid}_comm" - # code = f""" - # MPI_Alltoall({buffer}, {count_str}, {mpi_dtype_str}, _outbuffer, {count_str}, {mpi_dtype_str}, {comm}); - # """ code = f""" - MPI_Alltoall(_inbuffer, {in_count_str}, {in_mpi_dtype_str}, \ - _outbuffer, {out_count_str}, {out_mpi_dtype_str}, \ + int size; + MPI_Comm_size({comm}, &size); + int sendrecv_amt = {in_count_str} / size; + MPI_Alltoall(_inbuffer, sendrecv_amt, {in_mpi_dtype_str}, \ + _outbuffer, sendrecv_amt, 
{out_mpi_dtype_str}, \ {comm}); """ tasklet = dace.sdfg.nodes.Tasklet(node.name, diff --git a/tests/library/mpi/mpi_alltoall_test.py b/tests/library/mpi/mpi_alltoall_test.py index cf155fc640..e1eb4fe5f1 100644 --- a/tests/library/mpi/mpi_alltoall_test.py +++ b/tests/library/mpi/mpi_alltoall_test.py @@ -59,7 +59,7 @@ def test_mpi(implementation, dtype): size_per_proc = int(size/commsize) A = np.arange(0, size, dtype=np_dtype) B = np.full(size, 0, dtype=np_dtype) - mpi_sdfg(inbuf=A, outbuf=B, n=size_per_proc) + mpi_sdfg(inbuf=A, outbuf=B, n=size) # now B should be an array of size, # containing (size / size_per_proc) repeated chunked_data From a115db68447e4f6dfc6f599757de12d5c2f4e86a Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 7 Jul 2023 00:14:09 +0800 Subject: [PATCH 018/127] Added replacement and test for mpi4py alltoall --- dace/frontend/common/distr.py | 30 +++++++++++++++++++++++++++++- tests/library/mpi/mpi4py_test.py | 27 +++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index c47040728f..4200ad9024 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -254,9 +254,37 @@ def _Reduce(pv: 'ProgramVisitor', return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Alltoall') +@oprepo.replaces('dace.comm.Alltoall') +def _allreduce(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + inbuffer: str, + outbuffer: str, + grid: str = None): + + from dace.libraries.mpi.nodes.alltoall import Alltoall + + + libnode = Alltoall('_Alltoall_', grid) + in_desc = sdfg.arrays[inbuffer] + in_buffer = state.add_read(inbuffer) + out_desc = sdfg.arrays[inbuffer] + out_buffer = state.add_write(outbuffer) + state.add_edge(in_buffer, None, libnode, '_inbuffer', Memlet.from_array(in_buffer, in_desc)) + state.add_edge(libnode, '_outbuffer', out_buffer, None, Memlet.from_array(out_buffer, out_desc)) + + return None + + 
@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') -def _allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): +def _allreduce(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + buffer: str, + op: str, + grid: str = None): from dace.libraries.mpi.nodes.allreduce import Allreduce diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 603a6786cb..a9c94ea4a0 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -231,9 +231,36 @@ def mpi4py_send_recv(rank: dace.int32, size: dace.int32): assert (val[0] == ref[0]) +@pytest.mark.mpi +def test_alltoall(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def mpi4py_alltoall(rank: dace.int32, size: dace.int32): + sbuf = np.full((128,), rank, dtype=np.int32) + rbuf = np.zeros((128, ), dtype=np.int32) + commworld.Alltoall(sbuf, rbuf) + return rbuf + + sdfg = None + if rank == 0: + sdfg = mpi4py_alltoall.to_sdfg(simplify=True) + func = utils.distributed_compile(sdfg, commworld) + + val = func(rank=rank, size=size) + ref = mpi4py_alltoall.f(rank, size) + + if (not np.allclose(val, ref)): + raise (ValueError("The received values are not what I expected.")) + + if __name__ == "__main__": # test_process_grid_bcast() # test_sub_grid_bcast() # test_3mm() test_isend_irecv() test_send_recv() + test_alltoall() From eebefe4618a715c02099da3375245bb4c737c7b9 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 7 Jul 2023 13:38:08 +0800 Subject: [PATCH 019/127] Corrected the out_desc in alltoall replacement --- dace/frontend/common/distr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 4200ad9024..dd20a7b6fe 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -269,7 +269,7 
@@ def _allreduce(pv: 'ProgramVisitor', libnode = Alltoall('_Alltoall_', grid) in_desc = sdfg.arrays[inbuffer] in_buffer = state.add_read(inbuffer) - out_desc = sdfg.arrays[inbuffer] + out_desc = sdfg.arrays[outbuffer] out_buffer = state.add_write(outbuffer) state.add_edge(in_buffer, None, libnode, '_inbuffer', Memlet.from_array(in_buffer, in_desc)) state.add_edge(libnode, '_outbuffer', out_buffer, None, Memlet.from_array(out_buffer, out_desc)) From 110d0f2a334bca1ac96a0cae1c7ba6d82f990d11 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 7 Jul 2023 14:05:17 +0800 Subject: [PATCH 020/127] Added alltoall replacement for ProcessGrid and Intracomm --- dace/frontend/common/distr.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index dd20a7b6fe..f20e6f6729 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -256,7 +256,7 @@ def _Reduce(pv: 'ProgramVisitor', @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Alltoall') @oprepo.replaces('dace.comm.Alltoall') -def _allreduce(pv: 'ProgramVisitor', +def _alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, inbuffer: str, @@ -265,7 +265,6 @@ def _allreduce(pv: 'ProgramVisitor', from dace.libraries.mpi.nodes.alltoall import Alltoall - libnode = Alltoall('_Alltoall_', grid) in_desc = sdfg.arrays[inbuffer] in_buffer = state.add_read(inbuffer) @@ -277,6 +276,36 @@ def _allreduce(pv: 'ProgramVisitor', return None +@oprepo.replaces_method('Intracomm', 'Alltoall') +def _intracomm_alltoall(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + icomm: 'Intracomm', + inp_buffer: str, + out_buffer: str): + + """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer)`. 
""" + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + return _alltoall(pv, sdfg, state, inp_buffer, out_buffer) + + +@oprepo.replaces_method('ProcessGrid', 'Alltoall') +def _pgrid_alltoall(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + pgrid: str, + inp_buffer: str, + out_buffer: str): + + """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer, grid=pgrid)`. """ + + from mpi4py import MPI + return _alltoall(pv, sdfg, state, inp_buffer, out_buffer, grid=pgrid) + + @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') def _allreduce(pv: 'ProgramVisitor', From 8626b9a8f7e74c805430adf5e3649f09e31ee718 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 17:15:44 +0200 Subject: [PATCH 021/127] Fixed bad merge. --- dace/frontend/common/distr.py | 128 ++++++++++++---------------------- 1 file changed, 44 insertions(+), 84 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 653ca38337..af08623083 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -16,7 +16,6 @@ RankType = Union[Integral, str, symbolic.symbol, symbolic.SymExpr, symbolic.sympy.Basic] ProgramVisitor = 'dace.frontend.python.newast.ProgramVisitor' - ##### MPI Cartesian Communicators @@ -64,7 +63,6 @@ def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: return _cart_create(pv, sdfg, state, dims) - @oprepo.replaces('dace.comm.Cart_sub') def _cart_sub(pv: 'ProgramVisitor', sdfg: SDFG, @@ -107,11 +105,8 @@ def _cart_sub(pv: 'ProgramVisitor', @oprepo.replaces_method('ProcessGrid', 'Sub') -def _pgrid_sub(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - parent_grid: str, - color: Sequence[Union[Integral, bool]]): +def _pgrid_sub(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, parent_grid: str, color: 
Sequence[Union[Integral, + bool]]): """ Equivalent to `dace.comm.Cart_sub(parent_grid, color). :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`). :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped (see `remain_dims` input of `MPI_Cart_sub`). @@ -197,7 +192,6 @@ def _intracomm_bcast(pv: 'ProgramVisitor', icomm: 'Intracomm', buffer: str, root: Union[str, sp.Expr, Number] = 0): - """ Equivalent to `dace.comm.Bcast(buffer, root)`. """ from mpi4py import MPI @@ -213,7 +207,6 @@ def _pgrid_bcast(pv: 'ProgramVisitor', pgrid: str, buffer: str, root: Union[str, sp.Expr, Number] = 0): - """ Equivalent to `dace.comm.Bcast(buffer, root, grid=pgrid)`. """ return _bcast(pv, sdfg, state, buffer, root, grid=pgrid) @@ -257,12 +250,7 @@ def _Reduce(pv: ProgramVisitor, @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Alltoall') @oprepo.replaces('dace.comm.Alltoall') -def _alltoall(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - inbuffer: str, - outbuffer: str, - grid: str = None): +def _alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, inbuffer: str, outbuffer: str, grid: str = None): from dace.libraries.mpi.nodes.alltoall import Alltoall @@ -278,13 +266,8 @@ def _alltoall(pv: 'ProgramVisitor', @oprepo.replaces_method('Intracomm', 'Alltoall') -def _intracomm_alltoall(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - icomm: 'Intracomm', - inp_buffer: str, - out_buffer: str): - +def _intracomm_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', inp_buffer: str, + out_buffer: str): """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer)`. 
""" from mpi4py import MPI @@ -294,13 +277,7 @@ def _intracomm_alltoall(pv: 'ProgramVisitor', @oprepo.replaces_method('ProcessGrid', 'Alltoall') -def _pgrid_alltoall(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - pgrid: str, - inp_buffer: str, - out_buffer: str): - +def _pgrid_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, inp_buffer: str, out_buffer: str): """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer, grid=pgrid)`. """ from mpi4py import MPI @@ -309,7 +286,7 @@ def _pgrid_alltoall(pv: 'ProgramVisitor', @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') -def _Allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): +def _allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): from dace.libraries.mpi.nodes.allreduce import Allreduce @@ -324,14 +301,8 @@ def _Allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op @oprepo.replaces_method('Intracomm', 'Allreduce') -def _intracomm_allreduce(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - icomm: 'Intracomm', - inp_buffer: 'InPlace', - out_buffer: str, - op: str): - +def _intracomm_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', inp_buffer: 'InPlace', + out_buffer: str, op: str): """ Equivalent to `dace.comm.Allreduce(out_buffer, op)`. """ from mpi4py import MPI @@ -345,14 +316,8 @@ def _intracomm_allreduce(pv: 'ProgramVisitor', @oprepo.replaces_method('ProcessGrid', 'Allreduce') -def _pgrid_allreduce(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - pgrid: str, - inp_buffer: 'InPlace', - out_buffer: str, - op: str): - +def _pgrid_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, inp_buffer: 'InPlace', + out_buffer: str, op: str): """ Equivalent to `dace.comm.Allreduce(out_buffer, op, grid=pgrid)`. 
""" from mpi4py import MPI @@ -425,6 +390,7 @@ def _gather(pv: ProgramVisitor, ##### Point-To-Point Communication + @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Send') @oprepo.replaces('dace.comm.Send') def _send(pv: ProgramVisitor, @@ -500,15 +466,24 @@ def _send(pv: ProgramVisitor, @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Isend') @oprepo.replaces('dace.comm.Isend') -def _isend(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, dst: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number], request: str): +def _isend(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + buffer: str, + dst: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number], + request: str = None, + grid: str = None): from dace.libraries.mpi.nodes.isend import Isend ret_req = False if not request: ret_req = True - request, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + request, _ = sdfg.add_array("isend_req", [1], + dace.dtypes.opaque("MPI_Request"), + transient=True, + find_new_name=True) libnode = Isend('_Isend_', grid=grid) @@ -591,14 +566,8 @@ def _isend(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, dst: U @oprepo.replaces_method('Intracomm', 'Isend') -def _intracomm_isend(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - icomm: 'Intracomm', - buffer: str, - dst: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number]): - +def _intracomm_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req)`. 
""" from mpi4py import MPI @@ -610,14 +579,8 @@ def _intracomm_isend(pv: 'ProgramVisitor', @oprepo.replaces_method('ProcessGrid', 'Isend') -def _pgrid_isend(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - pgrid: str, - buffer: str, - dst: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number]): - +def _pgrid_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, buffer: str, + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req, grid=pgrid)`. """ from mpi4py import MPI @@ -701,15 +664,24 @@ def _recv(pv: ProgramVisitor, @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Irecv') @oprepo.replaces('dace.comm.Irecv') -def _irecv(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, src: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number], request: str): +def _irecv(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + buffer: str, + src: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number], + request: str = None, + grid: str = None): from dace.libraries.mpi.nodes.irecv import Irecv ret_req = False if not request: ret_req = True - request, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + request, _ = sdfg.add_array("irecv_req", [1], + dace.dtypes.opaque("MPI_Request"), + transient=True, + find_new_name=True) libnode = Irecv('_Irecv_', grid=grid) @@ -790,14 +762,8 @@ def _irecv(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, src: U @oprepo.replaces_method('Intracomm', 'Irecv') -def _intracomm_irecv(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - icomm: 'Intracomm', - buffer: str, - src: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number]): - +def _intracomm_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, + src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to 
`dace.comm.Irecv(buffer, src, tag, req)`. """ from mpi4py import MPI @@ -809,14 +775,8 @@ def _intracomm_irecv(pv: 'ProgramVisitor', @oprepo.replaces_method('ProcessGrid', 'Irecv') -def _pgrid_irecv(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - pgrid: str, - buffer: str, - src: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number]): - +def _pgrid_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, buffer: str, + src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req, grid=pgrid)`. """ from mpi4py import MPI From 442a8734419393f2797e82328829eb74a4ce8377 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 17:15:57 +0200 Subject: [PATCH 022/127] Updated tests. --- tests/library/mpi/mpi4py_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index a9c94ea4a0..bbc72bc6c4 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -239,18 +239,18 @@ def test_alltoall(): size = commworld.Get_size() @dace.program - def mpi4py_alltoall(rank: dace.int32, size: dace.int32): - sbuf = np.full((128,), rank, dtype=np.int32) - rbuf = np.zeros((128, ), dtype=np.int32) + def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): + sbuf = np.full((size,), rank, dtype=np.int32) + rbuf = np.zeros((size, ), dtype=np.int32) commworld.Alltoall(sbuf, rbuf) return rbuf sdfg = None if rank == 0: - sdfg = mpi4py_alltoall.to_sdfg(simplify=True) + sdfg = mpi4py_alltoall.to_sdfg(simplify=True, size=size) func = utils.distributed_compile(sdfg, commworld) - val = func(rank=rank, size=size) + val = func(rank=rank) ref = mpi4py_alltoall.f(rank, size) if (not np.allclose(val, ref)): @@ -261,6 +261,6 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.int32): # test_process_grid_bcast() # test_sub_grid_bcast() # test_3mm() - 
test_isend_irecv() - test_send_recv() + # test_isend_irecv() + # test_send_recv() test_alltoall() From 832c203598dba64abc093b39743e1b153954ed9e Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 17:52:57 +0200 Subject: [PATCH 023/127] uncommented out tests. --- tests/library/mpi/mpi4py_test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index bbc72bc6c4..a81294c47f 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -258,9 +258,9 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): if __name__ == "__main__": - # test_process_grid_bcast() - # test_sub_grid_bcast() - # test_3mm() - # test_isend_irecv() - # test_send_recv() + test_process_grid_bcast() + test_sub_grid_bcast() + test_3mm() + test_isend_irecv() + test_send_recv() test_alltoall() From e67aa8eb56d055967ce8f4dd74dfffc893db304b Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:17:36 +0200 Subject: [PATCH 024/127] The COMM_WORLD communicator object does not have its name changed to mpi4py.MPI.COMM_WORLD any longer. All (mpi4py) communicators are now allowed to pass as-is through preprocessing.
--- dace/frontend/python/preprocessing.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 03f07d0050..ea312a18c0 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -1525,17 +1525,10 @@ def visit_Name(self, node: ast.Name) -> Union[ast.Name, ast.Attribute]: obj = self.globals[node.id] if isinstance(obj, self.MPI.Comm): lattr = ast.Attribute(ast.Name(id='mpi4py', ctx=ast.Load), attr='MPI') - if obj is self.MPI.COMM_WORLD: - newnode = ast.copy_location(ast.Attribute(value=lattr, attr='COMM_WORLD'), node) - newnode.parent = node.parent - return newnode - elif obj is self.MPI.COMM_NULL: + if obj is self.MPI.COMM_NULL: newnode = ast.copy_location(ast.Attribute(value=lattr, attr='COMM_NULL'), node) newnode.parent = node.parent return newnode - else: - raise DaceSyntaxError('Only the COMM_WORLD and COMM_NULL mpi4py.MPI communicators can be used ' - 'directly inside a DaCe Python program.') return node def visit_Attribute(self, node: ast.Attribute) -> ast.Attribute: From 2741579000d87a2429ae4c7b805398fb5590c411 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:21:22 +0200 Subject: [PATCH 025/127] All (mpi4py) communicators in the global context are now registered in the ProgramVisitor's defined variables. When calling a method on an object, if the object is not in the ProgramVisitor's current/outer scope variables, pass to the method a tuple with the object's name and the object itself. 
--- dace/frontend/python/newast.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 5147dc01fe..ce2a9e06e1 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1312,11 +1312,11 @@ def defined(self): # MPI-related stuff result.update({k: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids}) - # try: - # from mpi4py import MPI - # result.update({k: v for k, v in self.globals.items() if v is MPI.COMM_WORLD}) - # except: - # pass + try: + from mpi4py import MPI + result.update({k: v for k, v in self.globals.items() if isinstance(v, MPI.Comm)}) + except: + pass return result @@ -4369,8 +4369,11 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): # Add object as first argument if modname in self.variables.keys(): arg = self.variables[modname] - else: + elif modname in self.scope_vars.keys(): arg = self.scope_vars[modname] + else: + # Fallback to (name, object) + arg = (modname, self.defined[modname]) args.append(arg) # Otherwise, try to find a default implementation for the SDFG elif not found_ufunc: From 6fe26c46a00f438d0ce7c8430d313c4e8b9280c6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:23:14 +0200 Subject: [PATCH 026/127] The Bcast LibraryNode can now accept as a string the name of a variable that holds the Fortran int handle of a communicator. 
--- dace/libraries/mpi/nodes/bcast.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dace/libraries/mpi/nodes/bcast.py b/dace/libraries/mpi/nodes/bcast.py index c39ef91980..bf3729ff38 100644 --- a/dace/libraries/mpi/nodes/bcast.py +++ b/dace/libraries/mpi/nodes/bcast.py @@ -42,11 +42,16 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): if isinstance(buffer, dace.data.Scalar): ref = "&" + init = "" comm = "MPI_COMM_WORLD" if node.grid: comm = f"__state->{node.grid}_comm" + elif node.fcomm: + init = f"MPI_Comm __comm = MPI_Comm_f2c({node.fcomm});" + comm = "__comm" code = f""" + {init} MPI_Bcast({ref}_inbuffer, {count_str}, {mpi_dtype_str}, _root, {comm}); _outbuffer = _inbuffer;""" tasklet = dace.sdfg.nodes.Tasklet(node.name, @@ -67,10 +72,12 @@ class Bcast(MPINode): default_implementation = "MPI" grid = dace.properties.Property(dtype=str, allow_none=True, default=None) + fcomm = dace.properties.Property(dtype=str, allow_none=True, default=None) - def __init__(self, name, grid=None, *args, **kwargs): + def __init__(self, name, grid=None, fcomm=None, *args, **kwargs): super().__init__(name, *args, inputs={"_inbuffer", "_root"}, outputs={"_outbuffer"}, **kwargs) self.grid = grid + self.fcomm = fcomm def validate(self, sdfg, state): """ From af624c3770bcec7d6efbaf64863f7ee0d9db4129 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:25:32 +0200 Subject: [PATCH 027/127] Replacements for COMM_WORLD were removed. Instead, the Intracomm's class method replacements should now trigger. Added (experimental) support for calling Bcast from a Cart/Intracomm object defined in CPython. 
--- dace/frontend/common/distr.py | 84 +++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index af08623083..72fe176ac0 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1,6 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. from numbers import Integral, Number -from typing import Sequence, Union +from typing import Sequence, Tuple, Union import dace from dace import dtypes, symbolic @@ -58,7 +58,8 @@ def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: """ from mpi4py import MPI - if icomm != MPI.COMM_WORLD: + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _cart_create(pv, sdfg, state, dims) @@ -155,18 +156,19 @@ def _comm_neq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Co ##### MPI Collectives -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Bcast') +# @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Bcast') @oprepo.replaces('dace.comm.Bcast') def _bcast(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, root: Union[str, sp.Expr, Number] = 0, - grid: str = None): + grid: str = None, + fcomm: str = None): from dace.libraries.mpi.nodes.bcast import Bcast - libnode = Bcast('_Bcast_', grid) + libnode = Bcast('_Bcast_', grid, fcomm) desc = sdfg.arrays[buffer] in_buffer = state.add_read(buffer) out_buffer = state.add_write(buffer) @@ -185,19 +187,23 @@ def _bcast(pv: ProgramVisitor, return None +@oprepo.replaces_method('Cartcomm', 'Bcast') @oprepo.replaces_method('Intracomm', 'Bcast') def _intracomm_bcast(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, - icomm: 'Intracomm', + comm: Tuple[str, 'Comm'], buffer: str, root: Union[str, sp.Expr, Number] = 0): """ Equivalent to `dace.comm.Bcast(buffer, root)`. 
""" from mpi4py import MPI - if icomm != MPI.COMM_WORLD: - raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') - return _bcast(pv, sdfg, state, buffer, root) + comm_name, comm_obj = comm + if comm_obj == MPI.COMM_WORLD: + return _bcast(pv, sdfg, state, buffer, root) + # NOTE: Highly experimental + sdfg.add_scalar(comm_name, dace.int32) + return _bcast(pv, sdfg, state, buffer, root, fcomm=comm_name) @oprepo.replaces_method('ProcessGrid', 'Bcast') @@ -248,7 +254,6 @@ def _Reduce(pv: ProgramVisitor, return None -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Alltoall') @oprepo.replaces('dace.comm.Alltoall') def _alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, inbuffer: str, outbuffer: str, grid: str = None): @@ -271,7 +276,8 @@ def _intracomm_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icom """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer)`. """ from mpi4py import MPI - if icomm != MPI.COMM_WORLD: + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _alltoall(pv, sdfg, state, inp_buffer, out_buffer) @@ -284,7 +290,6 @@ def _pgrid_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: s return _alltoall(pv, sdfg, state, inp_buffer, out_buffer, grid=pgrid) -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') def _allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): @@ -306,7 +311,8 @@ def _intracomm_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, ico """ Equivalent to `dace.comm.Allreduce(out_buffer, op)`. 
""" from mpi4py import MPI - if icomm != MPI.COMM_WORLD: + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') if inp_buffer != MPI.IN_PLACE: raise ValueError('DaCe currently supports in-place Allreduce only.') @@ -391,7 +397,6 @@ def _gather(pv: ProgramVisitor, ##### Point-To-Point Communication -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Send') @oprepo.replaces('dace.comm.Send') def _send(pv: ProgramVisitor, sdfg: SDFG, @@ -464,7 +469,27 @@ def _send(pv: ProgramVisitor, return None -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Isend') +@oprepo.replaces_method('Intracomm', 'Send') +def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + """ Equivalent to `dace.comm.end(buffer, dst, tag)`. """ + + from mpi4py import MPI + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + return _send(pv, sdfg, state, buffer, dst, tag) + + +@oprepo.replaces_method('ProcessGrid', 'Send') +def _pgrid_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, buffer: str, + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + """ Equivalent to `dace.comm.Send(buffer, dst, tag, grid=pgrid)`. """ + + raise NotImplementedError('ProcessGrid.Send is not supported yet.') + # return _send(pv, sdfg, state, buffer, dst, tag, grid=pgrid) + + @oprepo.replaces('dace.comm.Isend') def _isend(pv: ProgramVisitor, sdfg: SDFG, @@ -571,7 +596,8 @@ def _intracomm_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req)`. 
""" from mpi4py import MPI - if icomm != MPI.COMM_WORLD: + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') req, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) _isend(pv, sdfg, state, buffer, dst, tag, req) @@ -589,7 +615,6 @@ def _pgrid_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, return req -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Recv') @oprepo.replaces('dace.comm.Recv') def _recv(pv: ProgramVisitor, sdfg: SDFG, @@ -662,7 +687,27 @@ def _recv(pv: ProgramVisitor, return None -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Irecv') +@oprepo.replaces_method('Intracomm', 'Recv') +def _intracomm_Recv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, + src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + """ Equivalent to `dace.comm.Recv(buffer, src, tagq)`. """ + + from mpi4py import MPI + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + return _recv(pv, sdfg, state, buffer, src, tag) + + +@oprepo.replaces_method('ProcessGrid', 'Recv') +def _pgrid_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, buffer: str, + src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + """ Equivalent to `dace.comm.Recv(buffer, dst, tag, grid=pgrid)`. """ + + raise NotImplementedError('ProcessGrid.Recv is not supported yet.') + # return _recv(pv, sdfg, state, buffer, src, tag, req, grid=pgrid) + + @oprepo.replaces('dace.comm.Irecv') def _irecv(pv: ProgramVisitor, sdfg: SDFG, @@ -767,7 +812,8 @@ def _intracomm_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: """ Equivalent to `dace.comm.Irecv(buffer, src, tag, req)`. 
""" from mpi4py import MPI - if icomm != MPI.COMM_WORLD: + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') req, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) _irecv(pv, sdfg, state, buffer, src, tag, req) From fe22182ceccb4ae286802480fdba8f5b6d598bb6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:26:19 +0200 Subject: [PATCH 028/127] Added two new Bcast tests for COMM_WORLD and Intracomm object. --- tests/library/mpi/mpi4py_test.py | 72 ++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index a81294c47f..e99768be5c 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -6,6 +6,76 @@ import pytest +@pytest.mark.mpi +def test_comm_world_bcast(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def comm_world_bcast(A: dace.int32[10]): + commworld.Bcast(A) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = comm_world_bcast.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + if rank == 0: + A = np.arange(10, dtype=np.int32) + A_ref = A.copy() + else: + A = np.zeros((10, ), dtype=np.int32) + A_ref = A.copy() + + func(A=A) + comm_world_bcast.f(A_ref) + + assert(np.array_equal(A, A_ref)) + + +@pytest.mark.mpi +def test_external_comm_bcast(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + new_comm = commworld.Split(rank % 2, 0) + + @dace.program + def external_comm_bcast(A: dace.int32[10]): + new_comm.Bcast(A) + + if size < 2: + raise ValueError("Please run this test with at least two 
processes.") + + sdfg = None + if rank == 0: + sdfg = external_comm_bcast.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + if rank == 0: + A = np.arange(10, dtype=np.int32) + A_ref = A.copy() + elif rank == 1: + A = np.arange(10, 20, dtype=np.int32) + A_ref = A.copy() + else: + A = np.zeros((10, ), dtype=np.int32) + A_ref = A.copy() + + func(A=A, new_comm=new_comm.py2f()) + external_comm_bcast.f(A_ref) + + assert(np.array_equal(A, A_ref)) + @pytest.mark.mpi def test_process_grid_bcast(): @@ -258,6 +328,8 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): if __name__ == "__main__": + test_comm_world_bcast() + test_external_comm_bcast() test_process_grid_bcast() test_sub_grid_bcast() test_3mm() From 6c5ffa1d77fdabac4adc80b817e4e9a3e5000050 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:49:10 +0200 Subject: [PATCH 029/127] Restored replacements needed for full name of COMM_WORLD. Cleaned up duplicate methods. --- dace/frontend/common/distr.py | 84 +++++------------------------------ 1 file changed, 10 insertions(+), 74 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 72fe176ac0..68b6f120d8 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -156,7 +156,7 @@ def _comm_neq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Co ##### MPI Collectives -# @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Bcast') +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Bcast') @oprepo.replaces('dace.comm.Bcast') def _bcast(pv: ProgramVisitor, sdfg: SDFG, @@ -224,6 +224,7 @@ def _mpi4py_to_MPI(MPI, op): raise NotImplementedError +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Reduce') @oprepo.replaces('dace.comm.Reduce') def _Reduce(pv: ProgramVisitor, sdfg: SDFG, @@ -254,6 +255,7 @@ def _Reduce(pv: ProgramVisitor, return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Alltoall') @oprepo.replaces('dace.comm.Alltoall') def _alltoall(pv: 'ProgramVisitor', 
sdfg: SDFG, state: SDFGState, inbuffer: str, outbuffer: str, grid: str = None): @@ -290,6 +292,7 @@ def _pgrid_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: s return _alltoall(pv, sdfg, state, inp_buffer, out_buffer, grid=pgrid) +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') def _allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): @@ -334,6 +337,7 @@ def _pgrid_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: return _allreduce(pv, sdfg, state, out_buffer, op, grid=pgrid) +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Scatter') @oprepo.replaces('dace.comm.Scatter') def _scatter(pv: ProgramVisitor, sdfg: SDFG, @@ -364,6 +368,7 @@ def _scatter(pv: ProgramVisitor, return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Gather') @oprepo.replaces('dace.comm.Gather') def _gather(pv: ProgramVisitor, sdfg: SDFG, @@ -397,6 +402,7 @@ def _gather(pv: ProgramVisitor, ##### Point-To-Point Communication +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Send') @oprepo.replaces('dace.comm.Send') def _send(pv: ProgramVisitor, sdfg: SDFG, @@ -490,6 +496,7 @@ def _pgrid_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, # return _send(pv, sdfg, state, buffer, dst, tag, grid=pgrid) +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Isend') @oprepo.replaces('dace.comm.Isend') def _isend(pv: ProgramVisitor, sdfg: SDFG, @@ -615,6 +622,7 @@ def _pgrid_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, return req +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Recv') @oprepo.replaces('dace.comm.Recv') def _recv(pv: ProgramVisitor, sdfg: SDFG, @@ -708,6 +716,7 @@ def _pgrid_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, # return _recv(pv, sdfg, state, buffer, src, tag, req, grid=pgrid) +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Irecv') @oprepo.replaces('dace.comm.Irecv') def _irecv(pv: ProgramVisitor, sdfg: SDFG, @@ -891,79 +900,6 @@ 
def _wait(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, request: str): return None -@oprepo.replaces('dace.comm.Cart_create') -def _cart_create(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, dims: ShapeType): - """ Creates a process-grid and adds it to the DaCe program. The process-grid is implemented with [MPI_Cart_create](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_create.html). - - :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. - :return: Name of the new process-grid descriptor. - """ - pgrid_name = sdfg.add_pgrid(dims) - - # Dummy tasklet adds MPI variables to the program's state. - from dace.libraries.mpi import Dummy - tasklet = Dummy(pgrid_name, [ - f'MPI_Comm {pgrid_name}_comm;', - f'MPI_Group {pgrid_name}_group;', - f'int {pgrid_name}_coords[{len(dims)}];', - f'int {pgrid_name}_dims[{len(dims)}];', - f'int {pgrid_name}_rank;', - f'int {pgrid_name}_size;', - f'bool {pgrid_name}_valid;', - ]) - - state.add_node(tasklet) - - # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. - _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) - wnode = state.add_write(pgrid_name) - state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) - - return pgrid_name - - -@oprepo.replaces('dace.comm.Cart_sub') -def _cart_sub(pv: ProgramVisitor, - sdfg: SDFG, - state: SDFGState, - parent_grid: str, - color: Sequence[Union[Integral, bool]], - exact_grid: RankType = None): - """ Partitions the `parent_grid` to lower-dimensional sub-grids and adds them to the DaCe program. - The sub-grids are implemented with [MPI_Cart_sub](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_sub.html). - - :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`). - :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped (see `remain_dims` input of `MPI_Cart_sub`). 
- :param exact_grid: [DEVELOPER] If set then, out of all the sub-grids created, only the one that contains the rank with id `exact_grid` will be utilized for collective communication. - :return: Name of the new sub-grid descriptor. - """ - pgrid_name = sdfg.add_pgrid(parent_grid=parent_grid, color=color, exact_grid=exact_grid) - - # Count sub-grid dimensions. - pgrid_ndims = sum([bool(c) for c in color]) - - # Dummy tasklet adds MPI variables to the program's state. - from dace.libraries.mpi import Dummy - tasklet = Dummy(pgrid_name, [ - f'MPI_Comm {pgrid_name}_comm;', - f'MPI_Group {pgrid_name}_group;', - f'int {pgrid_name}_coords[{pgrid_ndims}];', - f'int {pgrid_name}_dims[{pgrid_ndims}];', - f'int {pgrid_name}_rank;', - f'int {pgrid_name}_size;', - f'bool {pgrid_name}_valid;', - ]) - - state.add_node(tasklet) - - # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. - _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) - wnode = state.add_write(pgrid_name) - state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) - - return pgrid_name - - @oprepo.replaces('dace.comm.Subarray') def _subarray(pv: ProgramVisitor, sdfg: SDFG, From c3b1a4b0c4f09d1c894540c02ef7d2d51cde4fff Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:58:19 +0200 Subject: [PATCH 030/127] Further clean up --- dace/frontend/python/newast.py | 7 ------- dace/frontend/python/replacements.py | 1 - dace/libraries/mpi/nodes/isend.py | 1 + tests/library/mpi/mpi4py_test.py | 4 ++-- tests/library/mpi/mpi_send_recv_test.py | 2 +- 5 files changed, 4 insertions(+), 11 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index ce2a9e06e1..fef2d989d5 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1150,13 +1150,6 @@ def __init__(self, # Indirections self.indirections = dict() - # Add mpi4py.MPI.COMM_WORLD aliases to variables - # 
try: - # from mpi4py import MPI - # self.variables.update({k: "MPI_COMM_WORLD" for k, v in self.globals.items() if v is MPI.COMM_WORLD}) - # except: - # pass - @classmethod def progress_count(cls) -> int: """ Returns the number of parsed SDFGs so far within this run. """ diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 30c92be81f..a681f48ba6 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -295,7 +295,6 @@ def _numpy_full(pv: ProgramVisitor, else: is_data = True vtype = sdfg.arrays[fill_value].dtype - # raise mem_parser.DaceSyntaxError(pv, None, "Fill value {f} must be a number!".format(f=fill_value)) dtype = dtype or vtype name, _ = sdfg.add_temp_transient(shape, dtype) diff --git a/dace/libraries/mpi/nodes/isend.py b/dace/libraries/mpi/nodes/isend.py index 95b3de3ae7..8de4035515 100644 --- a/dace/libraries/mpi/nodes/isend.py +++ b/dace/libraries/mpi/nodes/isend.py @@ -97,6 +97,7 @@ def validate(self, sdfg, state): if e.src_conn == "_request": req = sdfg.arrays[e.data.data] + # TODO: Should we expect any integer type here and cast to int32 later?. Investigate further in the future. 
# if dest.dtype.base_type != dace.dtypes.int32: # raise ValueError("Destination must be an integer!") # if tag.dtype.base_type != dace.dtypes.int32: diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index e99768be5c..1bbeae627f 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -91,8 +91,8 @@ def pgrid_bcast(A: dace.int32[10]): if pgrid != MPI.COMM_NULL: pgrid.Bcast(A) - if size < 2: - raise ValueError("Please run this test with at least two processes.") + # if size < 2: + # raise ValueError("Please run this test with at least two processes.") sdfg = None if rank == 0: diff --git a/tests/library/mpi/mpi_send_recv_test.py b/tests/library/mpi/mpi_send_recv_test.py index ec094e7cf5..bf39c955d3 100644 --- a/tests/library/mpi/mpi_send_recv_test.py +++ b/tests/library/mpi/mpi_send_recv_test.py @@ -103,7 +103,7 @@ def test_dace_send_recv(): sdfg = None if rank == 0: sdfg = dace_send_recv.to_sdfg(simplify=True) - # disable openMP section for blocking + # Disable OpenMP section to allow blocking sdfg.openmp_sections = False mpi_sdfg = utils.distributed_compile(sdfg, comm) From c3f953522d6d5acf969d55dba729a937b412a08e Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 21:36:28 +0200 Subject: [PATCH 031/127] Drop connectors/arguments from (nested) Program/SDFG call, if the connector is not in the SDFG's arrays. 
--- dace/frontend/python/newast.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 52a6862083..e1629e20c6 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -3740,6 +3740,15 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no for arg in args_to_remove: args.remove(arg) + # Drop args that are not in the SDFG + filtered_args = [] + for conn, arg in args: + if conn not in sdfg.arrays: + warnings.warn(f'Connector {conn} not found in SDFG; dropping it') + else: + filtered_args.append((conn, arg)) + args = filtered_args + # Change connector names updated_args = [] arrays_before = list(sdfg.arrays.items()) @@ -3829,6 +3838,12 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no for k, v in argdict.items() if self._is_outputnode(sdfg, k) } + # If an argument does not register as input nor as output, put it in the inputs. + # This may happen with input arguments that are used to set a promoted scalar. + for k, v in argdict.items(): + if k not in inputs.keys() and k not in outputs.keys(): + inputs[k] = v + # Add closure to global inputs/outputs (e.g., if processed as part of a map) for arrname in closure_arrays.keys(): if arrname not in names_to_replace: @@ -3840,13 +3855,6 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no if narrname in outputs: self.outputs[arrname] = (state, outputs[narrname], []) - # If an argument does not register as input nor as output, - # put it in the inputs. - # This may happen with input argument that are used to set - # a promoted scalar. - for k, v in argdict.items(): - if k not in inputs.keys() and k not in outputs.keys(): - inputs[k] = v # Unset parent inputs/read accesses that # turn out to be outputs/write accesses. 
for memlet in outputs.values(): From 866e38f605cde529acbb598d496c8271e2cdc463 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 13 Jul 2023 13:39:24 +0200 Subject: [PATCH 032/127] Ensure that the access node exists in the SDFGState. --- dace/transformation/passes/array_elimination.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dace/transformation/passes/array_elimination.py b/dace/transformation/passes/array_elimination.py index e313f7bf66..d1b80c2327 100644 --- a/dace/transformation/passes/array_elimination.py +++ b/dace/transformation/passes/array_elimination.py @@ -170,6 +170,9 @@ def remove_redundant_copies(self, sdfg: SDFG, state: SDFGState, removable_data: for anode in access_nodes[aname]: if anode in removed_nodes: continue + if anode not in state.nodes(): + removed_nodes.add(anode) + continue if state.out_degree(anode) == 1: succ = state.successors(anode)[0] From 7ae787d7db85b6bbf04ae0be62d3c644b59b9dde Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 14 Jul 2023 14:27:29 +0200 Subject: [PATCH 033/127] Don't add child SDFG's closure arrays to parent SDFG's arrays and to child SDFG's arguments if the array is not actually in the child SDFG. --- dace/frontend/python/newast.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index e1629e20c6..23e1bd9134 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -3653,6 +3653,11 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no # If the symbol is a callback, but is not used in the nested SDFG, skip it continue + # NOTE: Is it possible that an array in the SDFG's closure is not in the SDFG? + # NOTE: Perhaps its use was simplified/optimized away? + if aname not in sdfg.arrays: + continue + # First, we do an inverse lookup on the already added closure arrays for `arr`. 
is_new_arr = True for k, v in self.nested_closure_arrays.items(): @@ -3740,15 +3745,6 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no for arg in args_to_remove: args.remove(arg) - # Drop args that are not in the SDFG - filtered_args = [] - for conn, arg in args: - if conn not in sdfg.arrays: - warnings.warn(f'Connector {conn} not found in SDFG; dropping it') - else: - filtered_args.append((conn, arg)) - args = filtered_args - # Change connector names updated_args = [] arrays_before = list(sdfg.arrays.items()) From adf32d76ca2ea136e4f450533b01fd34c86891cf Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 07:56:22 -0700 Subject: [PATCH 034/127] Support None sets in SetProperty --- dace/properties.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dace/properties.py b/dace/properties.py index 0e8a010d71..6e883f8549 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -894,11 +894,18 @@ def from_json(self, l, sdfg=None): return set(l) def __get__(self, obj, objtype=None): + val = super(SetProperty, self).__get__(obj, objtype) + if val is None: + return val + # Copy to avoid changes in the set at callee to be reflected in # the node directly - return set(super(SetProperty, self).__get__(obj, objtype)) + return set(val) def __set__(self, obj, val): + if val is None: + return super(SetProperty, self).__set__(obj, val) + # Check for uniqueness if len(val) != len(set(val)): dups = set([x for x in val if val.count(x) > 1]) From 5d7c67b0e13e73452e19845725c922fbadf88a39 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 07:56:37 -0700 Subject: [PATCH 035/127] Filter symbols in PruneSymbols --- dace/transformation/passes/prune_symbols.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dace/transformation/passes/prune_symbols.py b/dace/transformation/passes/prune_symbols.py index 26530b87dc..05220763a9 100644 --- a/dace/transformation/passes/prune_symbols.py 
+++ b/dace/transformation/passes/prune_symbols.py @@ -23,6 +23,7 @@ class RemoveUnusedSymbols(ppl.Pass): CATEGORY: str = 'Simplification' recursive = properties.Property(dtype=bool, default=True, desc='Prune nested SDFGs recursively') + symbols = properties.SetProperty(element_type=str, allow_none=True, desc='Limit considered symbols to this set') def modifies(self) -> ppl.Modifies: return ppl.Modifies.Symbols @@ -43,11 +44,13 @@ def apply_pass(self, sdfg: SDFG, _) -> Optional[Set[Tuple[int, str]]]: """ result: Set[str] = set() + symbols_to_consider = self.symbols or set(sdfg.symbols.keys()) + # Compute used symbols used_symbols = self.used_symbols(sdfg) # Remove unused symbols - for sym in set(sdfg.symbols.keys()) - used_symbols: + for sym in symbols_to_consider - used_symbols: sdfg.remove_symbol(sym) result.add(sym) From 25839427a4e7415bd10bd37155aa364abc7d03d5 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 07:57:35 -0700 Subject: [PATCH 036/127] Persistent fusion: Remove now-unused symbols and do not remove used scalars --- .../subgraph/gpu_persistent_fusion.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/dace/transformation/subgraph/gpu_persistent_fusion.py b/dace/transformation/subgraph/gpu_persistent_fusion.py index 1cf93469bb..df8511a288 100644 --- a/dace/transformation/subgraph/gpu_persistent_fusion.py +++ b/dace/transformation/subgraph/gpu_persistent_fusion.py @@ -45,7 +45,7 @@ class GPUPersistentKernel(SubgraphTransformation): validate = Property( desc="Validate the sdfg and the nested sdfg", dtype=bool, - default=True, + default=False, ) include_in_assignment = Property( @@ -172,10 +172,15 @@ def apply(self, sdfg: SDFG): # create sdfg for kernel and fill it with states and edges from # ssubgraph dfg will be nested at the end kernel_sdfg = SDFG('{}kernel'.format(self.kernel_prefix + '_' if self.kernel_prefix != '' else '')) + new_symbols = set() edges = subgraph.edges() for edge in edges: 
kernel_sdfg.add_edge(edge.src, edge.dst, edge.data) + for k in entry_edge.data.assignments: + new_symbols.add(k) + if k in sdfg.symbols and k not in kernel_sdfg.symbols: + kernel_sdfg.add_symbol(k, sdfg.symbols[k]) # Setting entry node in nested SDFG if no entry guard was created if entry_guard_state is None: @@ -187,6 +192,7 @@ def apply(self, sdfg: SDFG): # remove the now nested nodes from the outer sdfg and make sure the # launch state is properly connected to remaining states sdfg.remove_nodes_from(subgraph.nodes()) + other_states = sdfg.nodes() if entry_state_out is not None \ and len(sdfg.edges_between(entry_state_out, launch_state)) == 0: @@ -199,13 +205,16 @@ def apply(self, sdfg: SDFG): # Handle data for kernel kernel_data = set(node.data for state in kernel_sdfg for node in state.nodes() if isinstance(node, nodes.AccessNode)) + other_data = set(node.data for state in other_states for node in state.nodes() + if isinstance(node, nodes.AccessNode)) # move Streams and Register data into the nested SDFG # normal data will be added as kernel argument kernel_args = [] for data in kernel_data: - if (isinstance(sdfg.arrays[data], dace.data.Stream) or - (isinstance(sdfg.arrays[data], dace.data.Array) and sdfg.arrays[data].storage == StorageType.Register)): + if data not in other_data and (isinstance(sdfg.arrays[data], dace.data.Stream) or + (isinstance(sdfg.arrays[data], dace.data.Array) + and sdfg.arrays[data].storage == StorageType.Register)): kernel_sdfg.add_datadesc(data, sdfg.arrays[data]) del sdfg.arrays[data] else: @@ -266,6 +275,12 @@ def apply(self, sdfg: SDFG): src_conn=arg, memlet=Memlet.from_array(arg, sdfg.arrays[arg])) + # Remove no-longer-used symbols in parent SDFG + from dace.transformation.passes.prune_symbols import RemoveUnusedSymbols + p = RemoveUnusedSymbols() + p.symbols = new_symbols + p.apply_pass(sdfg, {}) + # Transformation is done if self.validate: sdfg.validate() From 174eb8ed8f64f6a6a7fa54b4893a0d20ff96b933 Mon Sep 17 00:00:00 2001 
From: Tal Ben-Nun Date: Mon, 17 Jul 2023 09:07:59 -0700 Subject: [PATCH 037/127] CUDA codegen: persistent free tasklet write changes generated scope --- dace/codegen/targets/cuda.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 8f0139f8fb..ee49f04d03 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1306,10 +1306,31 @@ def generate_devicelevel_state(self, sdfg, state, function_stream, callsite_stre for c in components: has_map = any(isinstance(node, dace.nodes.MapEntry) for node in c.nodes()) + # If a global is modified, execute once per global state, + # if a shared memory element is modified, execute once per block, + # if a local scalar is modified, execute in every thread. if not has_map: - callsite_stream.write("if (blockIdx.x == 0 " - "&& threadIdx.x == 0) " - "{ // sub-graph begin", sdfg, state.node_id) + written_nodes = [n for n in c if state.in_degree(n) > 0 and isinstance(n, dace.nodes.AccessNode)] + + # The order of the branching below matters - it reduces the scope with every detected write + write_scope = 'thread' # General case acts in every thread + if any(sdfg.arrays[n.data].storage in (dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned) + for n in written_nodes): + write_scope = 'grid' + if any(sdfg.arrays[n.data].storage == dtypes.StorageType.GPU_Shared for n in written_nodes): + write_scope = 'block' + if any(sdfg.arrays[n.data].storage == dtypes.StorageType.Register for n in written_nodes): + write_scope = 'thread' + + if write_scope == 'grid': + callsite_stream.write("if (blockIdx.x == 0 " + "&& threadIdx.x == 0) " + "{ // sub-graph begin", sdfg, state.node_id) + elif write_scope == 'block': + callsite_stream.write("if (threadIdx.x == 0) " + "{ // sub-graph begin", sdfg, state.node_id) + else: + callsite_stream.write("{ // subgraph begin", sdfg, state.node_id) else: callsite_stream.write("{ 
// subgraph begin", sdfg, state.node_id) From aae25a772793e2e9ddbe538b1d4325090c39c90e Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 09:08:17 -0700 Subject: [PATCH 038/127] Handle empty inputs/outputs --- .../subgraph/gpu_persistent_fusion.py | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/dace/transformation/subgraph/gpu_persistent_fusion.py b/dace/transformation/subgraph/gpu_persistent_fusion.py index df8511a288..ff4812d0af 100644 --- a/dace/transformation/subgraph/gpu_persistent_fusion.py +++ b/dace/transformation/subgraph/gpu_persistent_fusion.py @@ -213,8 +213,8 @@ def apply(self, sdfg: SDFG): kernel_args = [] for data in kernel_data: if data not in other_data and (isinstance(sdfg.arrays[data], dace.data.Stream) or - (isinstance(sdfg.arrays[data], dace.data.Array) - and sdfg.arrays[data].storage == StorageType.Register)): + (isinstance(sdfg.arrays[data], dace.data.Array) and sdfg.arrays[data].storage + in (StorageType.Register, StorageType.GPU_Shared))): kernel_sdfg.add_datadesc(data, sdfg.arrays[data]) del sdfg.arrays[data] else: @@ -257,23 +257,29 @@ def apply(self, sdfg: SDFG): ) nested_sdfg.schedule = ScheduleType.GPU_Persistent + # If no inputs or outputs were given, connect with an empty memlet + if not kernel_args_read: + launch_state.add_nedge(map_entry, nested_sdfg, dace.Memlet()) + if not kernel_args_write: + launch_state.add_nedge(nested_sdfg, map_exit, dace.Memlet()) + # Create and connect read only data access nodes for arg in kernel_args_read: read_node = launch_state.add_read(arg) - launch_state.add_memlet_path(read_node, - map_entry, - nested_sdfg, - dst_conn=arg, - memlet=Memlet.from_array(arg, sdfg.arrays[arg])) + launch_state.add_edge_pair(map_entry, + nested_sdfg, + read_node, + internal_connector=arg, + internal_memlet=Memlet.from_array(arg, sdfg.arrays[arg])) # Create and connect writable data access nodes for arg in kernel_args_write: write_node = launch_state.add_write(arg) - 
launch_state.add_memlet_path(nested_sdfg, - map_exit, - write_node, - src_conn=arg, - memlet=Memlet.from_array(arg, sdfg.arrays[arg])) + launch_state.add_edge_pair(map_exit, + nested_sdfg, + write_node, + internal_connector=arg, + internal_memlet=Memlet.from_array(arg, sdfg.arrays[arg])) # Remove no-longer-used symbols in parent SDFG from dace.transformation.passes.prune_symbols import RemoveUnusedSymbols @@ -318,6 +324,12 @@ def is_gpu_state(sdfg: SDFG, state: SDFGState) -> bool: @staticmethod def get_entry_states(sdfg: SDFG, subgraph): + """ + Returns a 2-tuple of the (internal, external) states inside and outside of the SDFG, + around which the new nested SDFG will be created. The first element will be a set + of source nodes in the internal SDFG; and the second element will be a set of + predecessor nodes to the nested SDFG. + """ entry_states_in = set() entry_states_out = set() @@ -333,6 +345,12 @@ def get_entry_states(sdfg: SDFG, subgraph): @staticmethod def get_exit_states(sdfg: SDFG, subgraph): + """ + Returns a 2-tuple of the (internal, external) states inside and outside of the SDFG, + around which the new nested SDFG will be created. The first element will be a set + of sink nodes in the internal SDFG; and the second element will be a set of + successor nodes to the nested SDFG. 
+ """ exit_states_in = set() exit_states_out = set() From f11e5c44e0a796e5af1702dd08cfe32bb787af9b Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 09:08:28 -0700 Subject: [PATCH 039/127] Add test --- tests/persistent_fusion_cudatest.py | 266 ++++++++++++++++------------ 1 file changed, 152 insertions(+), 114 deletions(-) diff --git a/tests/persistent_fusion_cudatest.py b/tests/persistent_fusion_cudatest.py index ac05761bee..415162e4f8 100644 --- a/tests/persistent_fusion_cudatest.py +++ b/tests/persistent_fusion_cudatest.py @@ -10,118 +10,152 @@ N = dace.symbol('N') nnz = dace.symbol('nnz') -bfs = dace.SDFG('bfs') -# Inputs to the BFS SDFG -bfs.add_array('col_index', shape=[nnz], dtype=dace.int32) -bfs.add_array('row_index', shape=[N + 1], dtype=dace.int32) -bfs.add_scalar('root', dtype=dace.int32) -bfs.add_array('result', shape=[N], dtype=dace.int32) +def _make_sdfg(): + bfs = dace.SDFG('bfs') -# Transients fot interstate data transfers -# TODO: Replace may_alias with better code generation -bfs.add_transient('count1', shape=[1], dtype=dace.int32, may_alias=True) -bfs.add_transient('frontier1', shape=[N], dtype=dace.int32, may_alias=True) + # Inputs to the BFS SDFG + bfs.add_array('col_index', shape=[nnz], dtype=dace.int32) + bfs.add_array('row_index', shape=[N + 1], dtype=dace.int32) + bfs.add_scalar('root', dtype=dace.int32) + bfs.add_array('result', shape=[N], dtype=dace.int32) -bfs.add_transient('count2', shape=[1], dtype=dace.int32, may_alias=True) -bfs.add_transient('frontier2', shape=[N], dtype=dace.int32, may_alias=True) + # Transients fot interstate data transfers + # TODO: Replace may_alias with better code generation + bfs.add_transient('count1', shape=[1], dtype=dace.int32, may_alias=True) + bfs.add_transient('frontier1', shape=[N], dtype=dace.int32, may_alias=True) -# Transient streams to accommodate dynamic size of frontier arrays -bfs.add_stream('stream1', dtype=dace.int32, transient=True, buffer_size=N) -bfs.add_stream('stream2', 
dtype=dace.int32, transient=True, buffer_size=N) + bfs.add_transient('count2', shape=[1], dtype=dace.int32, may_alias=True) + bfs.add_transient('frontier2', shape=[N], dtype=dace.int32, may_alias=True) -# Transients needed for update states -bfs.add_transient('temp_ids1', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) -bfs.add_transient('temp_ide1', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) + # Transient streams to accommodate dynamic size of frontier arrays + bfs.add_stream('stream1', dtype=dace.int32, transient=True, buffer_size=N) + bfs.add_stream('stream2', dtype=dace.int32, transient=True, buffer_size=N) -bfs.add_transient('temp_ids2', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) -bfs.add_transient('temp_ide2', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) + # Transients needed for update states + bfs.add_transient('temp_ids1', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) + bfs.add_transient('temp_ide1', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) -# Adding states -# init data -s_init = bfs.add_state('init') + bfs.add_transient('temp_ids2', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) + bfs.add_transient('temp_ide2', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) -# copy of the states because we don't want to copy the data -s_reset1 = bfs.add_state('reset1') -s_update1 = bfs.add_state('update1') + # Adding states + # init data + s_init = bfs.add_state('init') -s_reset2 = bfs.add_state('reset2') -s_update2 = bfs.add_state('update2') + # copy of the states because we don't want to copy the data + s_reset1 = bfs.add_state('reset1') + s_update1 = bfs.add_state('update1') -# end state to make transformation work -s_end = bfs.add_state('end') + s_reset2 = bfs.add_state('reset2') + s_update2 = bfs.add_state('update2') -# Connecting states with appropriate conditions and depth updates -bfs.add_edge(s_init, s_reset1, 
dace.InterstateEdge(None, {'depth': '1'})) -bfs.add_edge(s_reset1, s_update1, dace.InterstateEdge(None)) -bfs.add_edge(s_update1, s_reset2, dace.InterstateEdge('count2[0] > 0', {'depth': 'depth + 1'})) -bfs.add_edge(s_update1, s_end, dace.InterstateEdge('count2[0] <= 0')) -bfs.add_edge(s_reset2, s_update2, dace.InterstateEdge(None)) -bfs.add_edge(s_update2, s_reset1, dace.InterstateEdge('count1[0] > 0', {'depth': 'depth + 1'})) -bfs.add_edge(s_update2, s_end, dace.InterstateEdge('count1[0] <= 0')) + # end state to make transformation work + s_end = bfs.add_state('end') -# ----------------------------- -# Helper functions to init data -# ----------------------------- + # Connecting states with appropriate conditions and depth updates + bfs.add_edge(s_init, s_reset1, dace.InterstateEdge(None, {'depth': '1'})) + bfs.add_edge(s_reset1, s_update1, dace.InterstateEdge(None)) + bfs.add_edge(s_update1, s_reset2, dace.InterstateEdge('count2[0] > 0', {'depth': 'depth + 1'})) + bfs.add_edge(s_update1, s_end, dace.InterstateEdge('count2[0] <= 0')) + bfs.add_edge(s_reset2, s_update2, dace.InterstateEdge(None)) + bfs.add_edge(s_update2, s_reset1, dace.InterstateEdge('count1[0] > 0', {'depth': 'depth + 1'})) + bfs.add_edge(s_update2, s_end, dace.InterstateEdge('count1[0] <= 0')) + # ============================================================= + # State: init + # Filling init state with init of result, frontier1, and count1 -def init_scalar(state, node, value): - tasklet = state.add_tasklet('set_%s' % node.data, {}, {'out'}, ''' -out = %d - ''' % value) + root_in = s_init.add_read('root') - state.add_memlet_path(tasklet, node, src_conn='out', memlet=dace.Memlet.simple(node.data, '0')) + count1_out = s_init.add_write('count1') + result_out = s_init.add_write('result') + frontier_out = s_init.add_write('frontier1') + + s_init.add_memlet_path(root_in, frontier_out, memlet=dace.Memlet.simple(root_in.data, '0', other_subset_str='0')) + + tasklet = s_init.add_tasklet( + 'set_count1', + 
{}, + {'out'}, + 'out = 1', + ) + + s_init.add_memlet_path(tasklet, count1_out, src_conn='out', memlet=dace.Memlet.simple(count1_out.data, '0')) + + map_entry, map_exit = s_init.add_map( + 'set_result_map', + dict(i='0:N'), + ) + + tasklet = s_init.add_tasklet('set_result', {'root_idx'}, {'result_out'}, 'result_out = 0 if i == root_idx else -1') + + s_init.add_memlet_path(root_in, map_entry, tasklet, dst_conn='root_idx', memlet=dace.Memlet.simple(root_in.data, '0')) + s_init.add_memlet_path(tasklet, + map_exit, + result_out, + src_conn='result_out', + memlet=dace.Memlet.simple(result_out.data, 'i')) -# ============================================================= -# State: init -# Filling init state with init of result, frontier1, and count1 + # ------------------------------------------------------------- -root_in = s_init.add_read('root') + # ============================================================= + # State: reset + # Filling reset states, respective count is reset to 0 -count1_out = s_init.add_write('count1') -result_out = s_init.add_write('result') -frontier_out = s_init.add_write('frontier1') + count2_out = s_reset1.add_write('count2') + init_scalar(s_reset1, count2_out, 0) -s_init.add_memlet_path(root_in, frontier_out, memlet=dace.Memlet.simple(root_in.data, '0', other_subset_str='0')) + count1_out = s_reset2.add_write('count1') + init_scalar(s_reset2, count1_out, 0) -tasklet = s_init.add_tasklet( - 'set_count1', - {}, - {'out'}, - 'out = 1', -) + # ------------------------------------------------------------- -s_init.add_memlet_path(tasklet, count1_out, src_conn='out', memlet=dace.Memlet.simple(count1_out.data, '0')) + # Filling update states, only difference is which frontier/count they read/write from/to -map_entry, map_exit = s_init.add_map( - 'set_result_map', - dict(i='0:N'), -) + front_in = s_update1.add_read('frontier1') + count_in = s_update1.add_read('count1') -tasklet = s_init.add_tasklet('set_result', {'root_idx'}, {'result_out'}, 
'result_out = 0 if i == root_idx else -1') + front_out = s_update1.add_write('frontier2') + count_out = s_update1.add_write('count2') -s_init.add_memlet_path(root_in, map_entry, tasklet, dst_conn='root_idx', memlet=dace.Memlet.simple(root_in.data, '0')) + stream2_io = s_update1.add_access('stream2') -s_init.add_memlet_path(tasklet, - map_exit, - result_out, - src_conn='result_out', - memlet=dace.Memlet.simple(result_out.data, 'i')) + temp_ids1_io = s_update1.add_access('temp_ids1') + temp_ide1_io = s_update1.add_access('temp_ide1') -# ------------------------------------------------------------- + fill_update_state(s_update1, front_in, count_in, front_out, count_out, stream2_io, temp_ids1_io, temp_ide1_io) -# ============================================================= -# State: reset -# Filling reset states, respective count is reset to 0 + front_in = s_update2.add_read('frontier2') + count_in = s_update2.add_read('count2') -count2_out = s_reset1.add_write('count2') -init_scalar(s_reset1, count2_out, 0) + front_out = s_update2.add_write('frontier1') + count_out = s_update2.add_write('count1') -count1_out = s_reset2.add_write('count1') -init_scalar(s_reset2, count1_out, 0) + stream1_io = s_update2.add_access('stream1') -# ------------------------------------------------------------- + temp_ids2_io = s_update2.add_access('temp_ids2') + temp_ide2_io = s_update2.add_access('temp_ide2') + + fill_update_state(s_update2, front_in, count_in, front_out, count_out, stream1_io, temp_ids2_io, temp_ide2_io) + + # validate and generate sdfg + bfs.fill_scope_connectors() + bfs.validate() + return bfs, s_init + +# ----------------------------- +# Helper functions to init data +# ----------------------------- + + +def init_scalar(state, node, value): + tasklet = state.add_tasklet('set_%s' % node.data, {}, {'out'}, ''' +out = %d + ''' % value) + + state.add_memlet_path(tasklet, node, src_conn='out', memlet=dace.Memlet.simple(node.data, '0')) # Here the state is duplicated so the 
memory doesn't have to be copied from one to another @@ -233,42 +267,10 @@ def fill_update_state(state, front_in, front_in_count, front_out, front_out_coun state.add_memlet_path(s_frontier_io, front_out, memlet=dace.Memlet.simple(front_out.data, '0')) -# Filling update states, only difference is which frontier/count they read/write from/to - -front_in = s_update1.add_read('frontier1') -count_in = s_update1.add_read('count1') - -front_out = s_update1.add_write('frontier2') -count_out = s_update1.add_write('count2') - -stream2_io = s_update1.add_access('stream2') - -temp_ids1_io = s_update1.add_access('temp_ids1') -temp_ide1_io = s_update1.add_access('temp_ide1') - -fill_update_state(s_update1, front_in, count_in, front_out, count_out, stream2_io, temp_ids1_io, temp_ide1_io) - -front_in = s_update2.add_read('frontier2') -count_in = s_update2.add_read('count2') - -front_out = s_update2.add_write('frontier1') -count_out = s_update2.add_write('count1') - -stream1_io = s_update2.add_access('stream1') - -temp_ids2_io = s_update2.add_access('temp_ids2') -temp_ide2_io = s_update2.add_access('temp_ide2') - -fill_update_state(s_update2, front_in, count_in, front_out, count_out, stream1_io, temp_ids2_io, temp_ide2_io) - -# validate and generate sdfg -bfs.fill_scope_connectors() -bfs.validate() - @pytest.mark.gpu def test_persistent_fusion(): - sdfg = bfs + sdfg, s_init = _make_sdfg() sdfg.apply_gpu_transformations(validate=False, simplify=False) # Only validate after fusion @@ -320,7 +322,43 @@ def test_persistent_fusion(): assert np.allclose(depth, reference), "Result doesn't match!" 
+def test_persistent_fusion_interstate(): + N = dace.symbol('N', dtype=dace.int64) + + + @dace.program(auto_optimize=False, device=dace.DeviceType.GPU) + def func(A: dace.float64[N], B: dace.float64[N]): + a = 10.2 + + for t in range(1, 10): + if t < N: + A[:] = (A + B + a) / 2 + a += 1 + + # Initialization + N = 100 + A = np.random.rand(N) + B = np.random.rand(N) + + sdfg = func.to_sdfg() + sdfg.apply_gpu_transformations() + content_nodes = set(sdfg.nodes()) - {sdfg.start_state, sdfg.sink_nodes()[0]} + subgraph = SubgraphView(sdfg, content_nodes) + + transform = GPUPersistentKernel() + transform.setup_match(subgraph) + transform.kernel_prefix = 'stuff' + transform.apply(sdfg) + + aref = np.copy(A) + func.f(aref, B) + + sdfg(A=A, B=B, N=N) + + assert np.allclose(A, aref) + # Actual execution if __name__ == "__main__": test_persistent_fusion() + test_persistent_fusion_interstate() From 37b65669927c3476ee3bfa2ca60bd44d4999a427 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 09:13:06 -0700 Subject: [PATCH 040/127] Add test --- .../gpu_scalar_execution_context_test.py | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 tests/codegen/gpu_scalar_execution_context_test.py diff --git a/tests/codegen/gpu_scalar_execution_context_test.py b/tests/codegen/gpu_scalar_execution_context_test.py new file mode 100644 index 0000000000..f738bfe26c --- /dev/null +++ b/tests/codegen/gpu_scalar_execution_context_test.py @@ -0,0 +1,91 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" +Tests how code is generated for free tasklets inside a GPU kernel nested SDFG. 
+""" + +import dace +from dace.sdfg.graph import SubgraphView +from dace.transformation.subgraph import GPUPersistentKernel +import numpy as np +import pytest + + +def _tester(A: dace.float64[64]): + t = 12.3 + for _ in range(5): + A += t + t += 1.01 + + +def _modify_array(sdfg: dace.SDFG, storage: dace.StorageType): + for nsdfg, aname, aval in sdfg.arrays_recursive(): + if aname == 't': + if storage == dace.StorageType.GPU_Shared: + aval = dace.data.Array(aval.dtype, [1], transient=aval.transient) + nsdfg.arrays[aname] = aval + aval.storage = storage + break + else: + raise ValueError('Array not found') + + +def _make_program(storage: dace.StorageType, persistent=False): + sdfg = dace.program(_tester).to_sdfg() + sdfg.apply_gpu_transformations(simplify=False) + _modify_array(sdfg, storage) + + if persistent: + content_nodes = set(sdfg.nodes()) - {sdfg.start_state, sdfg.sink_nodes()[0]} + subgraph = SubgraphView(sdfg, content_nodes) + transform = GPUPersistentKernel() + transform.setup_match(subgraph) + transform.apply(sdfg) + + return sdfg + + +@pytest.mark.gpu +def test_global_scalar_update(): + sdfg = _make_program(dace.StorageType.GPU_Global, True) + a = np.random.rand(64) + aref = np.copy(a) + _tester(aref) + sdfg(a) + assert np.allclose(a, aref) + + +@pytest.mark.gpu +def test_shared_scalar_update(): + sdfg = _make_program(dace.StorageType.GPU_Shared, persistent=True) + + a = np.random.rand(64) + aref = np.copy(a) + _tester(aref) + + # Ensure block size will create at least two thread-blocks + with dace.config.set_temporary('compiler', 'cuda', 'persistent_map_SM_fraction', value=0.0001): + with dace.config.set_temporary('compiler', 'cuda', 'persistent_map_occupancy', value=2): + with dace.config.set_temporary('compiler', 'cuda', 'default_block_size', value='32,1,1'): + sdfg(a) + + assert np.allclose(a, aref) + + +@pytest.mark.gpu +@pytest.mark.parametrize('persistent', (False, True)) +def test_register_scalar_update(persistent): + sdfg = 
_make_program(dace.StorageType.Register, persistent) + + a = np.random.rand(64) + aref = np.copy(a) + _tester(aref) + sdfg(a) + + assert np.allclose(a, aref) + + +if __name__ == '__main__': + test_global_scalar_update() + test_shared_scalar_update() + test_register_scalar_update(False) + test_register_scalar_update(True) From 6ae012ab285d92944bcd036b7edc97a758ce1952 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 11:01:21 -0700 Subject: [PATCH 041/127] Fix test condition --- tests/persistent_fusion_cudatest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/persistent_fusion_cudatest.py b/tests/persistent_fusion_cudatest.py index 415162e4f8..4d2f38ceb1 100644 --- a/tests/persistent_fusion_cudatest.py +++ b/tests/persistent_fusion_cudatest.py @@ -322,6 +322,8 @@ def test_persistent_fusion(): assert np.allclose(depth, reference), "Result doesn't match!" + +@pytest.mark.gpu def test_persistent_fusion_interstate(): N = dace.symbol('N', dtype=dace.int64) From 152b69c9121a2fdd277674fd9e678e413e1bb1f7 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 12:26:17 -0700 Subject: [PATCH 042/127] Handle missing symbols better --- dace/transformation/passes/prune_symbols.py | 8 ++++++-- tests/persistent_fusion_cudatest.py | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dace/transformation/passes/prune_symbols.py b/dace/transformation/passes/prune_symbols.py index 05220763a9..94fcbdbc58 100644 --- a/dace/transformation/passes/prune_symbols.py +++ b/dace/transformation/passes/prune_symbols.py @@ -51,8 +51,9 @@ def apply_pass(self, sdfg: SDFG, _) -> Optional[Set[Tuple[int, str]]]: # Remove unused symbols for sym in symbols_to_consider - used_symbols: - sdfg.remove_symbol(sym) - result.add(sym) + if sym in sdfg.symbols: + sdfg.remove_symbol(sym) + result.add(sym) if self.recursive: # Prune nested SDFGs recursively @@ -62,7 +63,10 @@ def apply_pass(self, sdfg: SDFG, _) -> Optional[Set[Tuple[int, str]]]: for state in 
sdfg.nodes(): for node in state.nodes(): if isinstance(node, nodes.NestedSDFG): + old_symbols = self.symbols + self.symbols = set() nres = self.apply_pass(node.sdfg, _) + self.symbols = old_symbols if nres: result.update(nres) diff --git a/tests/persistent_fusion_cudatest.py b/tests/persistent_fusion_cudatest.py index 4d2f38ceb1..e193b7431c 100644 --- a/tests/persistent_fusion_cudatest.py +++ b/tests/persistent_fusion_cudatest.py @@ -19,6 +19,7 @@ def _make_sdfg(): bfs.add_array('row_index', shape=[N + 1], dtype=dace.int32) bfs.add_scalar('root', dtype=dace.int32) bfs.add_array('result', shape=[N], dtype=dace.int32) + bfs.add_symbol('depth', dace.int32) # Transients fot interstate data transfers # TODO: Replace may_alias with better code generation From 57abd284500c1990ad2744160eb92aab5d08756d Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 18 Jul 2023 14:41:47 +0200 Subject: [PATCH 043/127] Added NestedDataClassProperty for nested data. --- dace/properties.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/dace/properties.py b/dace/properties.py index 6e883f8549..30a3e0913b 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -1381,6 +1381,45 @@ def from_json(obj, context=None): raise TypeError("Cannot parse type from: {}".format(obj)) +class NestedDataClassProperty(Property): + """ Custom property type for nested data. 
""" + + def __get__(self, obj, objtype=None) -> 'Data': + return super().__get__(obj, objtype) + + @property + def dtype(self): + return pydoc.locate("dace.data.Data") + + @staticmethod + def from_string(s): + dtype = pydoc.locate("dace.data.{}".format(s)) + if dtype is None or not isinstance(dtype, pydoc.locate("dace.data.Data")): + raise ValueError("Not a valid data type: {}".format(s)) + return dtype + + @staticmethod + def to_string(obj): + return obj.to_string() + + def to_json(self, obj): + if obj is None: + return None + return obj.dtype.to_json() + + @staticmethod + def from_json(obj, context=None): + if obj is None: + return None + elif isinstance(obj, str): + return NestedDataClassProperty.from_string(obj) + elif isinstance(obj, dict): + # Let the deserializer handle this + return dace.serialize.from_json(obj) + else: + raise TypeError("Cannot parse type from: {}".format(obj)) + + class LibraryImplementationProperty(Property): """ Property for choosing an implementation type for a library node. On the From 09465d242fbf33036ebf35e1c9b43357c60648ca Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 18 Jul 2023 14:42:33 +0200 Subject: [PATCH 044/127] Added Structures and StructArrays. --- dace/data.py | 121 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 6 deletions(-) diff --git a/dace/data.py b/dace/data.py index 2fc5f334c6..886fed75de 100644 --- a/dace/data.py +++ b/dace/data.py @@ -1,10 +1,10 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import copy as cp import ctypes import functools -import re + from numbers import Number -from typing import Any, Dict, Optional, Sequence, Set, Tuple +from typing import Any, Dict, Optional, Sequence, Set, Tuple, Union import numpy import sympy as sp @@ -17,9 +17,8 @@ import dace.dtypes as dtypes from dace import serialize, symbolic from dace.codegen import cppunparse -from dace.properties import (CodeProperty, DebugInfoProperty, DictProperty, EnumProperty, ListProperty, Property, - ReferenceProperty, ShapeProperty, SubsetProperty, SymbolicProperty, TypeClassProperty, - make_properties) +from dace.properties import (DebugInfoProperty, DictProperty, EnumProperty, ListProperty, NestedDataClassProperty, + Property, ShapeProperty, SymbolicProperty, TypeClassProperty, make_properties) def create_datadescriptor(obj, no_custom_desc=False): @@ -342,6 +341,86 @@ def add(X: dace.float32[10, 10] @ dace.StorageType.GPU_Global): return new_desc +class Structure(Data): + """ Base class for structures. 
""" + + def __init__(self, + shape: Sequence[Union[int, symbolic.SymbolicType]] = None, + transient: bool = False, + storage: dtypes.StorageType = dtypes.StorageType.Default, + location: Dict[str, str] = None, + lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, + debuginfo: dtypes.DebugInfo = None): + fields = { + attr: getattr(self, attr) + for attr in dir(self) if ( + not attr in dir(Data) and + not attr.startswith("_") and + not attr in ('total_size', 'offset', 'start_offset', 'strides'))} + fields_and_types = dict() + symbols = set() + for attr in dir(self): + if (attr in dir(Data) or attr.startswith("__") or + attr in ('total_size', 'offset', 'start_offset', 'strides')): + continue + value = getattr(self, attr) + if isinstance(value, Array): + symbols |= value.free_symbols + fields_and_types[attr] = (dtypes.pointer(value.dtype), str(_prod(value.shape))) + elif isinstance(value, Scalar): + symbols |= value.free_symbols + fields_and_types[attr] = value.dtype + elif isinstance(value, (sp.Basic, symbolic.SymExpr)): + symbols |= value.free_symbols + fields_and_types[attr] = symbolic.symtype(value) + elif isinstance(value, (int, numpy.integer)): + fields_and_types[attr] = dtypes.typeclass(type(value)) + else: + raise TypeError(f"Attribute {attr}'s value {value} has unsupported type: {type(value)}") + for s in symbols: + if str(s) in fields_and_types: + continue + if hasattr(s, "dtype"): + fields_and_types[str(s)] = s.dtype + else: + fields_and_types[str(s)] = dtypes.int32 + dtype = dtypes.struct(self.__class__.__name__, **fields_and_types) + shape = shape or (1,) + super(Structure, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) + + @property + def total_size(self): + return -1 + + @property + def offset(self): + return [0] + + @property + def start_offset(self): + return 0 + + @property + def strides(self): + return [1] + + def as_arg(self, with_types=True, for_call=False, name=None): + if self.storage is 
dtypes.StorageType.GPU_Global: + return Array(self.dtype, [1]).as_arg(with_types, for_call, name) + if not with_types or for_call: + return name + return self.dtype.as_arg(name) + + def __getitem__(self, s): + """ This is syntactic sugar that allows us to define an array type + with the following syntax: ``Structure[N,M]`` + :return: A ``data.Array`` data descriptor. + """ + if isinstance(s, list) or isinstance(s, tuple): + return StructArray(self, tuple(s)) + return StructArray(self, (s, )) + + @make_properties class Scalar(Data): """ Data descriptor of a scalar value. """ @@ -902,6 +981,36 @@ def free_symbols(self): return result +@make_properties +class StructArray(Array): + """ Array of Structures. """ + + stype = NestedDataClassProperty(allow_none=True, default=None) + + def __init__(self, + stype, + shape, + transient=False, + allow_conflicts=False, + storage=dtypes.StorageType.Default, + location=None, + strides=None, + offset=None, + may_alias=False, + lifetime=dtypes.AllocationLifetime.Scope, + alignment=0, + debuginfo=None, + total_size=-1, + start_offset=None, + optional=None, + pool=False): + + self.stype = stype + dtype = stype.dtype + super(StructArray, self).__init__(dtype, shape, transient, allow_conflicts, storage, location, strides, offset, + may_alias, lifetime, alignment, debuginfo, total_size, start_offset, optional, pool) + + @make_properties class View(Array): """ From 51776a1b746126194fc1eebcece20adbe88be302 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 18 Jul 2023 15:09:00 +0200 Subject: [PATCH 045/127] Break array lengths down to their symbolic tokents. 
--- dace/dtypes.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index dee2283f25..230197bc6f 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -791,6 +791,7 @@ def from_json(json_obj, context=None): return ret def _parse_field_and_types(self, **fields_and_types): + from dace.symbolic import pystr_to_symbolic self._data = dict() self._length = dict() self.bytes = 0 @@ -799,8 +800,12 @@ def _parse_field_and_types(self, **fields_and_types): t, l = v if not isinstance(t, pointer): raise TypeError("Only pointer types may have a length.") - if l not in fields_and_types.keys(): - raise ValueError("Length {} not a field of struct {}".format(l, self.name)) + sym_tokens = pystr_to_symbolic(l).free_symbols + for sym in sym_tokens: + if str(sym) not in fields_and_types.keys(): + raise ValueError(f"Symbol {sym} in {k}'s length {l} is not a field of struct {self.name}") + # if l not in fields_and_types.keys(): + # raise ValueError("Length {} not a field of struct {}".format(l, self.name)) self._data[k] = t self._length[k] = l self.bytes += t.bytes From b23ed86de823398321ef6f620e3db0d3fd7f857b Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 18 Jul 2023 15:11:09 +0200 Subject: [PATCH 046/127] Allow structures to have fields whose name doesn't start with underscore. --- dace/properties.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dace/properties.py b/dace/properties.py index 30a3e0913b..679c0b9596 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import ast from collections import OrderedDict import copy @@ -412,12 +412,12 @@ def initialize_properties(obj, *args, **kwargs): except AttributeError: if not prop.unmapped: raise PropertyError("Property {} is unassigned in __init__ for {}".format(name, cls.__name__)) - # Assert that there are no fields in the object not captured by - # properties, unless they are prefixed with "_" - for name, prop in obj.__dict__.items(): - if (name not in properties and not name.startswith("_") and name not in dir(type(obj))): - raise PropertyError("{} : Variable {} is neither a Property nor " - "an internal variable (prefixed with \"_\")".format(str(type(obj)), name)) + # Assert that there are no fields in the object not captured by properties, unless they are prefixed with "_" + if not isinstance(obj, dace.data.Structure): + for name, prop in obj.__dict__.items(): + if (name not in properties and not name.startswith("_") and name not in dir(type(obj))): + raise PropertyError("{} : Variable {} is neither a Property nor " + "an internal variable (prefixed with \"_\")".format(str(type(obj)), name)) # Replace the __init__ method cls.__init__ = initialize_properties From 777821f0a940bc2f981ef5c04749c0f49968e0d1 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:21:54 +0200 Subject: [PATCH 047/127] Structures now have a "members" dictionary. Their dtype is a pointer to the corresponding dtypes.struct typeclass. 
--- dace/data.py | 64 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/dace/data.py b/dace/data.py index 886fed75de..0f1ef1f266 100644 --- a/dace/data.py +++ b/dace/data.py @@ -341,42 +341,54 @@ def add(X: dace.float32[10, 10] @ dace.StorageType.GPU_Global): return new_desc +def _arrays_to_json(arrays): + if arrays is None: + return None + return {k: serialize.to_json(v) for k, v in arrays.items()} + + +def _arrays_from_json(obj, context=None): + if obj is None: + return {} + return {k: serialize.from_json(v, context) for k, v in obj.items()} + + +@make_properties class Structure(Data): """ Base class for structures. """ + members = Property(dtype=dict, + desc="Dictionary of structure members", + from_json=_arrays_from_json, + to_json=_arrays_to_json) + def __init__(self, - shape: Sequence[Union[int, symbolic.SymbolicType]] = None, + members: Dict[str, Any], transient: bool = False, storage: dtypes.StorageType = dtypes.StorageType.Default, location: Dict[str, str] = None, lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, debuginfo: dtypes.DebugInfo = None): - fields = { - attr: getattr(self, attr) - for attr in dir(self) if ( - not attr in dir(Data) and - not attr.startswith("_") and - not attr in ('total_size', 'offset', 'start_offset', 'strides'))} + self.members = members or {} fields_and_types = dict() symbols = set() - for attr in dir(self): - if (attr in dir(Data) or attr.startswith("__") or - attr in ('total_size', 'offset', 'start_offset', 'strides')): - continue - value = getattr(self, attr) - if isinstance(value, Array): - symbols |= value.free_symbols - fields_and_types[attr] = (dtypes.pointer(value.dtype), str(_prod(value.shape))) - elif isinstance(value, Scalar): - symbols |= value.free_symbols - fields_and_types[attr] = value.dtype - elif isinstance(value, (sp.Basic, symbolic.SymExpr)): - symbols |= value.free_symbols - fields_and_types[attr] = 
symbolic.symtype(value) - elif isinstance(value, (int, numpy.integer)): - fields_and_types[attr] = dtypes.typeclass(type(value)) + for k, v in members.items(): + if isinstance(v, Structure): + symbols |= v.free_symbols + fields_and_types[k] = (v.dtype, str(v.total_size)) + elif isinstance(v, Array): + symbols |= v.free_symbols + fields_and_types[k] = (dtypes.pointer(v.dtype), str(_prod(v.shape))) + elif isinstance(v, Scalar): + symbols |= v.free_symbols + fields_and_types[k] = v.dtype + elif isinstance(v, (sp.Basic, symbolic.SymExpr)): + symbols |= v.free_symbols + fields_and_types[k] = symbolic.symtype(v) + elif isinstance(v, (int, numpy.integer)): + fields_and_types[k] = dtypes.typeclass(type(v)) else: - raise TypeError(f"Attribute {attr}'s value {value} has unsupported type: {type(value)}") + raise TypeError(f"Attribute {k}'s value {v} has unsupported type: {type(v)}") for s in symbols: if str(s) in fields_and_types: continue @@ -384,8 +396,8 @@ def __init__(self, fields_and_types[str(s)] = s.dtype else: fields_and_types[str(s)] = dtypes.int32 - dtype = dtypes.struct(self.__class__.__name__, **fields_and_types) - shape = shape or (1,) + dtype = dtypes.pointer(dtypes.struct(self.__class__.__name__, **fields_and_types)) + shape = (1,) super(Structure, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) @property From ebf72068e4b27ed777fb835bc75c835980d502d6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:24:37 +0200 Subject: [PATCH 048/127] dtype.structs store their ctype in `_FFI_CTYPES`. --- dace/dtypes.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index 230197bc6f..d01209469f 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
""" A module that contains various DaCe type definitions. """ from __future__ import print_function import ctypes @@ -654,6 +654,8 @@ def from_json(json_obj, context=None): def as_ctypes(self): """ Returns the ctypes version of the typeclass. """ + if isinstance(self._typeclass, struct): + return ctypes.POINTER(self._typeclass.as_ctypes()) return ctypes.POINTER(_FFI_CTYPES[self.type]) def as_numpy_dtype(self): @@ -804,8 +806,6 @@ def _parse_field_and_types(self, **fields_and_types): for sym in sym_tokens: if str(sym) not in fields_and_types.keys(): raise ValueError(f"Symbol {sym} in {k}'s length {l} is not a field of struct {self.name}") - # if l not in fields_and_types.keys(): - # raise ValueError("Length {} not a field of struct {}".format(l, self.name)) self._data[k] = t self._length[k] = l self.bytes += t.bytes @@ -817,16 +817,24 @@ def _parse_field_and_types(self, **fields_and_types): def as_ctypes(self): """ Returns the ctypes version of the typeclass. """ + if self in _FFI_CTYPES: + return _FFI_CTYPES[self] # Populate the ctype fields for the struct class. fields = [] for k, v in self._data.items(): if isinstance(v, pointer): - fields.append((k, ctypes.c_void_p)) # ctypes.POINTER(_FFI_CTYPES[v.type]))) + if isinstance(v._typeclass, struct): + fields.append((k, ctypes.POINTER(v._typeclass.as_ctypes()))) + else: + fields.append((k, ctypes.c_void_p)) + elif isinstance(v, struct): + fields.append((k, v.as_ctypes())) else: fields.append((k, _FFI_CTYPES[v.type])) fields = sorted(fields, key=lambda f: f[0]) # Create new struct class. struct_class = type("NewStructClass", (ctypes.Structure, ), {"_fields_": fields}) + _FFI_CTYPES[self] = struct_class return struct_class def as_numpy_dtype(self): From c52a48257ffbb7933aec3b04fd7029cdafce77a8 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:26:03 +0200 Subject: [PATCH 049/127] Reverted underscore exception for Structures. 
--- dace/properties.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dace/properties.py b/dace/properties.py index 679c0b9596..2225b6d853 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -413,11 +413,10 @@ def initialize_properties(obj, *args, **kwargs): if not prop.unmapped: raise PropertyError("Property {} is unassigned in __init__ for {}".format(name, cls.__name__)) # Assert that there are no fields in the object not captured by properties, unless they are prefixed with "_" - if not isinstance(obj, dace.data.Structure): - for name, prop in obj.__dict__.items(): - if (name not in properties and not name.startswith("_") and name not in dir(type(obj))): - raise PropertyError("{} : Variable {} is neither a Property nor " - "an internal variable (prefixed with \"_\")".format(str(type(obj)), name)) + for name, prop in obj.__dict__.items(): + if (name not in properties and not name.startswith("_") and name not in dir(type(obj))): + raise PropertyError("{} : Variable {} is neither a Property nor " + "an internal variable (prefixed with \"_\")".format(str(type(obj)), name)) # Replace the __init__ method cls.__init__ = initialize_properties From 40cc858f992d71a49730d934268c31d380d8e82b Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:26:40 +0200 Subject: [PATCH 050/127] Small fixes. 
--- dace/codegen/compiled_sdfg.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index d0d29cfa1e..863e804802 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -452,9 +452,10 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: # GPU scalars are pointers, so this is fine if atype.storage != dtypes.StorageType.GPU_Global: raise TypeError('Passing an array to a scalar (type %s) in argument "%s"' % (atype.dtype.ctype, a)) - elif not isinstance(atype, dt.Array) and not isinstance(atype.dtype, dtypes.callback) and not isinstance( - arg, - (atype.dtype.type, sp.Basic)) and not (isinstance(arg, symbolic.symbol) and arg.dtype == atype.dtype): + elif (not isinstance(atype, (dt.Array, dt.Structure)) and + not isinstance(atype.dtype, dtypes.callback) and + not isinstance(arg, (atype.dtype.type, sp.Basic)) and + not (isinstance(arg, symbolic.symbol) and arg.dtype == atype.dtype)): if isinstance(arg, int) and atype.dtype.type == np.int64: pass elif isinstance(arg, float) and atype.dtype.type == np.float64: @@ -521,7 +522,7 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: # Construct init args, which only consist of the symbols symbols = self._free_symbols initargs = tuple( - actype(arg) if (not isinstance(arg, ctypes._SimpleCData)) else arg + actype(arg) if not isinstance(arg, ctypes._SimpleCData) else arg for arg, actype, atype, aname in callparams if aname in symbols) # Replace arrays with their base host/device pointers @@ -531,7 +532,8 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: try: newargs = tuple( - actype(arg) if (not isinstance(arg, ctypes._SimpleCData)) else arg for arg, actype, atype in newargs) + actype(arg) if not isinstance(arg, (ctypes._SimpleCData)) else arg + for arg, actype, atype in newargs) except TypeError: # Pinpoint bad argument for i, (arg, actype, _) in 
enumerate(newargs): From dd73aaa8816864958fc4fd547e16d5372519f167 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:27:17 +0200 Subject: [PATCH 051/127] WIP: Replace ',' with '->' to quickly support nested data. --- dace/codegen/targets/cpp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index afbc6fca12..7d54e985f5 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -370,6 +370,8 @@ def make_const(expr: str) -> str: # Register defined variable dispatcher.defined_vars.add(pointer_name, defined_type, typedef, allow_shadowing=True) + expr = expr.replace('.', '->') + return (typedef + ref, pointer_name, expr) From 623a7f88838f0a3bc033333bef28e4de03544d37 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:28:08 +0200 Subject: [PATCH 052/127] Recursively add to arglist nested data descriptors. --- dace/codegen/targets/cpu.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index eb7d232966..2759c9744c 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -55,10 +55,30 @@ def __init__(self, frame_codegen, sdfg): # Keep track of generated NestedSDG, and the name of the assigned function self._generated_nested_sdfg = dict() - # Keeps track of generated connectors, so we know how to access them in - # nested scopes + def _visit_structure(struct: data.Structure, args: dict, prefix: str = ''): + for k, v in struct.members.items(): + if isinstance(v, data.Structure): + _visit_structure(v, args, f'{prefix}.{k}') + elif isinstance(v, data.Data): + args[f'{prefix}.{k}'] = v + + # Keeps track of generated connectors, so we know how to access them in nested scopes + arglist = dict(self._frame.arglist) for name, arg_type in self._frame.arglist.items(): - if isinstance(arg_type, data.Scalar): + if 
isinstance(arg_type, data.Structure): + desc = sdfg.arrays[name] + _visit_structure(arg_type, arglist, name) + elif isinstance(arg_type, data.StructArray): + desc = sdfg.arrays[name] + desc = desc.stype + for attr in dir(desc): + value = getattr(desc, attr) + if isinstance(value, data.Data): + assert attr in sdfg.arrays + arglist[attr] = value + + for name, arg_type in arglist.items(): + if isinstance(arg_type, (data.Scalar, data.Structure)): # GPU global memory is only accessed via pointers # TODO(later): Fix workaround somehow if arg_type.storage is dtypes.StorageType.GPU_Global: From 1e5baddcbda6e0d78bd9526af7e1a0b78627a4e3 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:28:50 +0200 Subject: [PATCH 053/127] Recursively look into nested data to emit definitions. --- dace/codegen/targets/framecode.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 6f302c11ba..be6b85602a 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -150,15 +150,23 @@ def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend: for _, arrname, arr in sdfg.arrays_recursive(): if arr is not None: datatypes.add(arr.dtype) + + def _emit_definitions(dtype: dtypes.typeclass, wrote_something: bool) -> bool: + if isinstance(dtype, dtypes.pointer): + wrote_something = _emit_definitions(dtype._typeclass, wrote_something) + elif isinstance(dtype, dtypes.struct): + for field in dtype.fields.values(): + wrote_something = _emit_definitions(field, wrote_something) + if hasattr(dtype, 'emit_definition'): + if not wrote_something: + global_stream.write("", sdfg) + global_stream.write(dtype.emit_definition(), sdfg) + return wrote_something # Emit unique definitions wrote_something = False for typ in datatypes: - if hasattr(typ, 'emit_definition'): - if not wrote_something: - global_stream.write("", sdfg) - 
wrote_something = True - global_stream.write(typ.emit_definition(), sdfg) + wrote_something = _emit_definitions(typ, wrote_something) if wrote_something: global_stream.write("", sdfg) From 36d4e826ac769f1cb99ecc3c8fe8206c0690cdab Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:30:21 +0200 Subject: [PATCH 054/127] SDFG data (_arrays) are now stored in a NestedDict. --- dace/sdfg/sdfg.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 18763e385a..6e4c3587f4 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -48,6 +48,35 @@ from dace.codegen.compiled_sdfg import CompiledSDFG +class NestedDict(dict): + + def __init__(self): + super(NestedDict, self).__init__() + + def __getitem__(self, key): + tokens = key.split('.') + token = tokens.pop(0) + result = super(NestedDict, self).__getitem__(token) + while tokens: + token = tokens.pop(0) + result = result.members[token] + return result + + def __contains__(self, key): + tokens = key.split('.') + token = tokens.pop(0) + result = super(NestedDict, self).__contains__(token) + desc = None + while tokens and result: + if desc is None: + desc = super(NestedDict, self).__getitem__(token) + else: + desc = desc.members[token] + token = tokens.pop(0) + result = token in desc.members + return result + + def _arrays_to_json(arrays): if arrays is None: return None @@ -375,7 +404,7 @@ class SDFG(OrderedDiGraph[SDFGState, InterstateEdge]): name = Property(dtype=str, desc="Name of the SDFG") arg_names = ListProperty(element_type=str, desc='Ordered argument names (used for calling conventions).') constants_prop = Property(dtype=dict, default={}, desc="Compile-time constants") - _arrays = Property(dtype=dict, + _arrays = Property(dtype=NestedDict, desc="Data descriptors for this SDFG", to_json=_arrays_to_json, from_json=_arrays_from_json) @@ -456,7 +485,7 @@ def __init__(self, self._sdfg_list = [self] 
self._start_state: Optional[int] = None self._cached_start_state: Optional[SDFGState] = None - self._arrays = {} # type: Dict[str, dt.Array] + self._arrays = NestedDict() # type: Dict[str, dt.Array] self._labels: Set[str] = set() self.global_code = {'frame': CodeBlock("", dtypes.Language.CPP)} self.init_code = {'frame': CodeBlock("", dtypes.Language.CPP)} From 38a4265a29c64f6100e03f536aecdd09fd160dca Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:31:11 +0200 Subject: [PATCH 055/127] Adjusted the matching check for memlet data and src/dst nodes to not fail for Structures. --- dace/sdfg/validation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 3bac646479..c963df9d7e 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -587,9 +587,14 @@ def validate_state(state: 'dace.sdfg.SDFGState', break # Check if memlet data matches src or dst nodes - if (e.data.data is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode)) - and (not isinstance(src_node, nd.AccessNode) or e.data.data != src_node.data) - and (not isinstance(dst_node, nd.AccessNode) or e.data.data != dst_node.data)): + name = e.data.data + if isinstance(src_node, nd.AccessNode) and isinstance(sdfg.arrays[src_node.data], dt.Structure): + name = None + if isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Structure): + name = None + if (name is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode)) + and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn)) + and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn))): raise InvalidSDFGEdgeError( "Memlet data does not match source or destination " "data nodes)", From 479cb2ad240dd167a7b26d2665527e04727cffe6 Mon Sep 17 00:00:00 2001 From: Alexandros 
Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:32:51 +0200 Subject: [PATCH 056/127] Added tests. --- tests/sdfg/data/structure_test.py | 240 ++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 tests/sdfg/data/structure_test.py diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py new file mode 100644 index 0000000000..3783a98068 --- /dev/null +++ b/tests/sdfg/data/structure_test.py @@ -0,0 +1,240 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import numpy as np + +from scipy import sparse + + +def create_structure(name: str, **members) -> dace.data.Structure: + + StructureClass = type(name, (dace.data.Structure, ), {}) + return StructureClass(members) + + +def test_read_structure(): + + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix', + indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz) + + sdfg = dace.SDFG('csr_to_dense') + + sdfg.add_datadesc('A', CSR) + sdfg.add_array('B', [M, N], dace.float32) + + sdfg.add_view('vindptr', CSR.members['indptr'].shape, CSR.members['indptr'].dtype) + sdfg.add_view('vindices', CSR.members['indices'].shape, CSR.members['indices'].dtype) + sdfg.add_view('vdata', CSR.members['data'].shape, CSR.members['data'].dtype) + + state = sdfg.add_state() + + A = state.add_access('A') + B = state.add_access('B') + + indptr = state.add_access('vindptr') + indices = state.add_access('vindices') + data = state.add_access('vdata') + + state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.indptr', CSR.members['indptr'])) + state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.indices', CSR.members['indices'])) + state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.data', CSR.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) 
+ jme.add_in_connector('start') + jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='vindices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + B = np.zeros((20, 20), dtype=np.float32) + + inpA = CSR.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + N=A.shape[1], + nnz=A.nnz) + + func(A=inpA, B=B, M=20, N=20, nnz=A.nnz) + ref = A.toarray() + + assert np.allclose(B, ref) + + +def test_write_structure(): + + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix', + indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz) + + sdfg = dace.SDFG('dense_to_csr') + + sdfg.add_array('A', [M, N], dace.float32) + sdfg.add_datadesc('B', CSR) + + sdfg.add_view('vindptr', CSR.members['indptr'].shape, CSR.members['indptr'].dtype) + sdfg.add_view('vindices', CSR.members['indices'].shape, CSR.members['indices'].dtype) + sdfg.add_view('vdata', CSR.members['data'].shape, CSR.members['data'].dtype) + + # Make If + if_before = sdfg.add_state('if_before') + if_guard = sdfg.add_state('if_guard') + if_body = 
sdfg.add_state('if_body') + if_after = sdfg.add_state('if_after') + sdfg.add_edge(if_before, if_guard, dace.InterstateEdge()) + sdfg.add_edge(if_guard, if_body, dace.InterstateEdge(condition='A[i, j] != 0')) + sdfg.add_edge(if_body, if_after, dace.InterstateEdge(assignments={'idx': 'idx + 1'})) + sdfg.add_edge(if_guard, if_after, dace.InterstateEdge(condition='A[i, j] == 0')) + A = if_body.add_access('A') + B = if_body.add_access('B') + indices = if_body.add_access('vindices') + data = if_body.add_access('vdata') + if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) + if_body.add_edge(data, 'views', B, 'data', dace.Memlet(data='B.data', subset='0:nnz')) + t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') + if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='vindices', subset='idx')) + if_body.add_edge(indices, 'views', B, 'indices', dace.Memlet(data='B.indices', subset='0:nnz')) + # Make For Loop for j + j_before, j_guard, j_after = sdfg.add_loop(None, + if_before, + None, + 'j', + '0', + 'j < N', + 'j + 1', + loop_end_state=if_after) + # Make For Loop for i + i_before, i_guard, i_after = sdfg.add_loop(None, j_before, None, 'i', '0', 'i < M', 'i + 1', loop_end_state=j_after) + sdfg.start_state = sdfg.node_id(i_before) + i_before_guard = sdfg.edges_between(i_before, i_guard)[0] + i_before_guard.data.assignments['idx'] = '0' + B = i_guard.add_access('B') + indptr = i_guard.add_access('vindptr') + t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') + i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='i')) + i_guard.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) + B = i_after.add_access('B') + indptr = i_after.add_access('vindptr') + t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') + i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) + i_after.add_edge(indptr, 
'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) + + func = sdfg.compile() + + rng = np.random.default_rng(42) + tmp = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + A = tmp.toarray() + B = tmp.tocsr(copy=True) + B.indptr[:] = -1 + B.indices[:] = -1 + B.data[:] = -1 + + outB = CSR.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) + + func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) + + assert np.allclose(A, B.toarray()) + + +def test_read_nested_structure(): + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix', + indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz) + Wrapper = create_structure('WrapperClass', csr=CSR) + + sdfg = dace.SDFG('nested_csr_to_dense') + + sdfg.add_datadesc('A', Wrapper) + sdfg.add_array('B', [M, N], dace.float32) + + spmat = Wrapper.members['csr'] + sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) + sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) + sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) + + state = sdfg.add_state() + + A = state.add_access('A') + B = state.add_access('B') + + indptr = state.add_access('vindptr') + indices = state.add_access('vindices') + data = state.add_access('vdata') + + state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) + state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) + state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) + + ime, imx = 
state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='vindices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + B = np.zeros((20, 20), dtype=np.float32) + + structclass = CSR.dtype._typeclass.as_ctypes() + inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + K=A.shape[1], + nnz=A.nnz) + import ctypes + inpW = Wrapper.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) + + func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) + ref = A.toarray() + + assert np.allclose(B, ref) + + +if __name__ == "__main__": + test_read_structure() + test_write_structure() + test_read_nested_structure() From 8365ab34926a01d65a67d93d1b1bbaf2e67eac11 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 20:25:26 +0200 Subject: [PATCH 057/127] Serialization fixes. 
--- dace/sdfg/sdfg.py | 13 ++++++++++--- tests/sdfg/data/structure_test.py | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 6e4c3587f4..b5598870ec 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -50,8 +50,9 @@ class NestedDict(dict): - def __init__(self): - super(NestedDict, self).__init__() + def __init__(self, mapping=None): + mapping = mapping or {} + super(NestedDict, self).__init__(mapping) def __getitem__(self, key): tokens = key.split('.') @@ -89,6 +90,12 @@ def _arrays_from_json(obj, context=None): return {k: dace.serialize.from_json(v, context) for k, v in obj.items()} +def _nested_arrays_from_json(obj, context=None): + if obj is None: + return NestedDict({}) + return NestedDict({k: dace.serialize.from_json(v, context) for k, v in obj.items()}) + + def _replace_dict_keys(d, old, new): if old in d: if new in d: @@ -407,7 +414,7 @@ class SDFG(OrderedDiGraph[SDFGState, InterstateEdge]): _arrays = Property(dtype=NestedDict, desc="Data descriptors for this SDFG", to_json=_arrays_to_json, - from_json=_arrays_from_json) + from_json=_nested_arrays_from_json) symbols = DictProperty(str, dtypes.typeclass, desc="Global symbols for this SDFG") instrument = EnumProperty(dtype=dtypes.InstrumentationType, diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 3783a98068..5348ecaa5a 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -2,12 +2,29 @@ import dace import numpy as np +from dace import serialize +from dace.properties import make_properties from scipy import sparse def create_structure(name: str, **members) -> dace.data.Structure: StructureClass = type(name, (dace.data.Structure, ), {}) + + @staticmethod + def from_json(json_obj, context=None): + if json_obj['type'] != name: + raise TypeError("Invalid data type") + + # Create dummy object + ret = StructureClass({}) + 
serialize.set_properties_from_json(ret, json_obj, context=context) + + return ret + + setattr(StructureClass, 'from_json', from_json) + StructureClass = make_properties(StructureClass) + return StructureClass(members) From 14ba6655c883f2f0761ca4ccacfb722d82b7eac3 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 20:29:36 +0200 Subject: [PATCH 058/127] Fixed NestedDict for non-str keys. --- dace/sdfg/sdfg.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index b5598870ec..a4c29c2e89 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -55,16 +55,17 @@ def __init__(self, mapping=None): super(NestedDict, self).__init__(mapping) def __getitem__(self, key): - tokens = key.split('.') + tokens = key.split('.') if isinstance(key, str) else [key] token = tokens.pop(0) result = super(NestedDict, self).__getitem__(token) while tokens: token = tokens.pop(0) result = result.members[token] return result + def __contains__(self, key): - tokens = key.split('.') + tokens = key.split('.') if isinstance(key, str) else [key] token = tokens.pop(0) result = super(NestedDict, self).__contains__(token) desc = None From 7343f55e2dca5c06721c7b9bf3448b2ca0f1637e Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 20 Jul 2023 13:00:02 +0200 Subject: [PATCH 059/127] state.read_and_write_sets() take into account when read happens directly after write --- dace/sdfg/state.py | 16 +++++++++++++--- tests/sdfg/state_test.py | 23 +++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 tests/sdfg/state_test.py diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 0796bf00d0..0354dd107b 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -298,10 +298,11 @@ def scope_tree(self) -> 'dace.sdfg.scope.ScopeTree': # Get scopes for node, scopenodes in sdc.items(): + scope_exit_nodes = [v for v in scopenodes if isinstance(v, nd.ExitNode)] if node is None: 
exit_node = None else: - exit_node = next(v for v in scopenodes if isinstance(v, nd.ExitNode)) + exit_node = next(iter(scope_exit_nodes)) scope = ScopeTree(node, exit_node) result[node] = scope @@ -502,13 +503,22 @@ def _read_and_write_sets(self) -> Tuple[Dict[AnyStr, List[Subset]], Dict[AnyStr, # is read is not counted in the read set for n in utils.dfs_topological_sort(sg, sources=sg.source_nodes()): if isinstance(n, nd.AccessNode): - for e in sg.in_edges(n): + in_edges = sg.in_edges(n) + out_edges = sg.out_edges(n) + # Filter out memlets which go out but the same data is written to the AccessNode by another memlet + for out_edge in out_edges: + for in_edge in in_edges: + if in_edge.data.data == out_edge.data.data and \ + in_edge.data.dst_subset.covers(out_edge.data.src_subset): + out_edges.remove(out_edge) + + for e in in_edges: # skip empty memlets if e.data.is_empty(): continue # Store all subsets that have been written ws[n.data].append(e.data.subset) - for e in sg.out_edges(n): + for e in out_edges: # skip empty memlets if e.data.is_empty(): continue diff --git a/tests/sdfg/state_test.py b/tests/sdfg/state_test.py new file mode 100644 index 0000000000..07e2e8c4c7 --- /dev/null +++ b/tests/sdfg/state_test.py @@ -0,0 +1,23 @@ +import dace + + +def test_read_write_set(): + sdfg = dace.SDFG('graph') + A = sdfg.add_array('A', [10], dace.float64) + B = sdfg.add_array('B', [10], dace.float64) + C = sdfg.add_array('C', [10], dace.float64) + state = sdfg.add_state('state') + task1 = state.add_tasklet('work1', {'A'}, {'B'}, 'B = A + 1') + task2 = state.add_tasklet('work2', {'B'}, {'C'}, 'C = B + 1') + read_a = state.add_access('A') + rw_b = state.add_access('B') + write_c = state.add_access('C') + state.add_memlet_path(read_a, task1, dst_conn='A', memlet=dace.Memlet('A[2]')) + state.add_memlet_path(task1, rw_b, src_conn='B', memlet=dace.Memlet('B[2]')) + state.add_memlet_path(rw_b, task2, dst_conn='B', memlet=dace.Memlet('B[2]')) + state.add_memlet_path(task2, 
write_c, src_conn='C', memlet=dace.Memlet('C[2]')) + + assert 'B' not in state.read_and_write_sets()[0] + +if __name__ == '__main__': + test_read_write_set() From bb00fea35013b66c153d89c5ba31ede70c0120d2 Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 20 Jul 2023 14:07:48 +0200 Subject: [PATCH 060/127] Change RefineNestedAccess to only look at memlets which are in the read and write set --- dace/transformation/interstate/sdfg_nesting.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index 1b9324546a..71d9e22aca 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -925,7 +925,12 @@ def _candidates( continue # For now we only detect one element + read_set, write_set = nstate.read_and_write_sets() for e in nstate.in_edges(dnode): + if e.data.data not in write_set: + # Skip data which is not in the read and write set of the state -> there also won't be a + # connector + continue # If more than one unique element detected, remove from # candidates if e.data.data in out_candidates: @@ -941,6 +946,10 @@ def _candidates( continue out_candidates[e.data.data] = (e.data, nstate, set(range(len(e.data.subset)))) for e in nstate.out_edges(dnode): + if e.data.data not in read_set: + # Skip data which is not in the read and write set of the state -> there also won't be a + # connector + continue # If more than one unique element detected, remove from # candidates if e.data.data in in_candidates: From 80d6f10af1efe172560d64b976c451a91670b2fb Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 14:56:21 +0200 Subject: [PATCH 061/127] Added support for transient Structures. 
--- dace/codegen/targets/cpu.py | 16 ++++++++++++++-- dace/data.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 2759c9744c..7ff91cbc7b 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -286,16 +286,17 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d name = node.data alloc_name = cpp.ptr(name, nodedesc, sdfg, self._frame) name = alloc_name + alloc_name = alloc_name.replace('.', '->') if nodedesc.transient is False: return # Check if array is already allocated - if self._dispatcher.defined_vars.has(alloc_name): + if self._dispatcher.defined_vars.has(name): return # Check if array is already declared - declared = self._dispatcher.declared_arrays.has(alloc_name) + declared = self._dispatcher.declared_arrays.has(name) define_var = self._dispatcher.defined_vars.add if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): @@ -308,6 +309,17 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d if not isinstance(nodedesc.dtype, dtypes.opaque): arrsize_bytes = arrsize * nodedesc.dtype.bytes + if isinstance(nodedesc, data.Structure): + declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type}();\n") + define_var(name, DefinedType.Pointer, nodedesc.ctype) + for k, v in nodedesc.members.items(): + if isinstance(v, data.Data): + ctypedef = dtypes.pointer(v.dtype).ctype if isinstance(v, data.Array) else v.dtype.ctype + defined_type = DefinedType.Scalar if isinstance(v, data.Scalar) else DefinedType.Pointer + self._dispatcher.declared_arrays.add(f"{name}.{k}", defined_type, ctypedef) + self.allocate_array(sdfg, dfg, state_id, nodes.AccessNode(f"{name}.{k}"), v, function_stream, + declaration_stream, allocation_stream) + return if isinstance(nodedesc, data.View): return self.allocate_view(sdfg, dfg, state_id, 
node, function_stream, declaration_stream, allocation_stream) if isinstance(nodedesc, data.Reference): diff --git a/dace/data.py b/dace/data.py index 0f1ef1f266..838fc43542 100644 --- a/dace/data.py +++ b/dace/data.py @@ -369,7 +369,10 @@ def __init__(self, location: Dict[str, str] = None, lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, debuginfo: dtypes.DebugInfo = None): + # TODO: Should we make a deep-copy here? self.members = members or {} + for k, v in self.members.items(): + v.transient = transient fields_and_types = dict() symbols = set() for k, v in members.items(): @@ -433,6 +436,31 @@ def __getitem__(self, s): return StructArray(self, (s, )) +@make_properties +class StructureView(Structure): + """ + Data descriptor that acts as a reference (or view) of another structure. + """ + + @staticmethod + def from_json(json_obj, context=None): + if json_obj['type'] != 'StructureView': + raise TypeError("Invalid data type") + + # Create dummy object + ret = StructureView({}) + serialize.set_properties_from_json(ret, json_obj, context=context) + + return ret + + def validate(self): + super().validate() + + # We ensure that allocation lifetime is always set to Scope, since the + # view is generated upon "allocation" + if self.lifetime != dtypes.AllocationLifetime.Scope: + raise ValueError('Only Scope allocation lifetime is supported for Views') + @make_properties class Scalar(Data): """ Data descriptor of a scalar value. """ From 9658c2236b7ba154bccbbd3b839944f4f88c2668 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 14:56:40 +0200 Subject: [PATCH 062/127] Edited tests. 
--- tests/sdfg/data/structure_test.py | 346 +++++++++++++++++++++++++++--- 1 file changed, 321 insertions(+), 25 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 5348ecaa5a..462c6a8e7b 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -7,7 +7,7 @@ from scipy import sparse -def create_structure(name: str, **members) -> dace.data.Structure: +def create_structure(name: str) -> dace.data.Structure: StructureClass = type(name, (dace.data.Structure, ), {}) @@ -25,28 +25,28 @@ def from_json(json_obj, context=None): setattr(StructureClass, 'from_json', from_json) StructureClass = make_properties(StructureClass) - return StructureClass(members) + return StructureClass def test_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix', - indptr=dace.int32[M + 1], + CSR = create_structure('CSRMatrix') + csr_obj = CSR(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz], rows=M, cols=N, - nnz=nnz) + nnz=nnz)) sdfg = dace.SDFG('csr_to_dense') - sdfg.add_datadesc('A', CSR) + sdfg.add_datadesc('A', csr_obj) sdfg.add_array('B', [M, N], dace.float32) - sdfg.add_view('vindptr', CSR.members['indptr'].shape, CSR.members['indptr'].dtype) - sdfg.add_view('vindices', CSR.members['indices'].shape, CSR.members['indices'].dtype) - sdfg.add_view('vdata', CSR.members['data'].shape, CSR.members['data'].dtype) + sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) state = sdfg.add_state() @@ -57,9 +57,9 @@ def test_read_structure(): indices = state.add_access('vindices') data = state.add_access('vdata') - state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.indptr', CSR.members['indptr'])) - 
state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.indices', CSR.members['indices'])) - state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.data', CSR.members['data'])) + state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.indptr', csr_obj.members['indptr'])) + state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.indices', csr_obj.members['indices'])) + state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.data', csr_obj.members['data'])) ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) @@ -79,7 +79,7 @@ def test_read_structure(): A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) B = np.zeros((20, 20), dtype=np.float32) - inpA = CSR.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], + inpA = csr_obj.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], indices=A.indices.__array_interface__['data'][0], data=A.data.__array_interface__['data'][0], rows=A.shape[0], @@ -97,22 +97,22 @@ def test_read_structure(): def test_write_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix', - indptr=dace.int32[M + 1], + CSR = create_structure('CSRMatrix') + csr_obj = CSR(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz], rows=M, cols=N, - nnz=nnz) + nnz=nnz)) sdfg = dace.SDFG('dense_to_csr') sdfg.add_array('A', [M, N], dace.float32) - sdfg.add_datadesc('B', CSR) + sdfg.add_datadesc('B', csr_obj) - sdfg.add_view('vindptr', CSR.members['indptr'].shape, CSR.members['indptr'].dtype) - sdfg.add_view('vindices', CSR.members['indices'].shape, CSR.members['indices'].dtype) - sdfg.add_view('vdata', CSR.members['data'].shape, CSR.members['data'].dtype) + sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + sdfg.add_view('vindices', 
csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) # Make If if_before = sdfg.add_state('if_before') @@ -167,7 +167,7 @@ def test_write_structure(): B.indices[:] = -1 B.data[:] = -1 - outB = CSR.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], + outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], indices=B.indices.__array_interface__['data'][0], data=B.data.__array_interface__['data'][0], rows=tmp.shape[0], @@ -181,7 +181,204 @@ def test_write_structure(): assert np.allclose(A, B.toarray()) +def test_local_structure(): + + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix') + csr_obj = CSR(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz)) + tmp_obj = CSR(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), transient=True) + + sdfg = dace.SDFG('dense_to_csr') + + sdfg.add_array('A', [M, N], dace.float32) + sdfg.add_datadesc('B', csr_obj) + sdfg.add_datadesc('tmp', tmp_obj) + + sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) + + sdfg.add_view('tmp_vindptr', tmp_obj.members['indptr'].shape, tmp_obj.members['indptr'].dtype) + sdfg.add_view('tmp_vindices', tmp_obj.members['indices'].shape, tmp_obj.members['indices'].dtype) + sdfg.add_view('tmp_vdata', tmp_obj.members['data'].shape, tmp_obj.members['data'].dtype) + + # Make If + if_before = sdfg.add_state('if_before') + if_guard = sdfg.add_state('if_guard') + if_body = sdfg.add_state('if_body') + if_after = sdfg.add_state('if_after') + sdfg.add_edge(if_before, 
if_guard, dace.InterstateEdge()) + sdfg.add_edge(if_guard, if_body, dace.InterstateEdge(condition='A[i, j] != 0')) + sdfg.add_edge(if_body, if_after, dace.InterstateEdge(assignments={'idx': 'idx + 1'})) + sdfg.add_edge(if_guard, if_after, dace.InterstateEdge(condition='A[i, j] == 0')) + A = if_body.add_access('A') + tmp = if_body.add_access('tmp') + indices = if_body.add_access('tmp_vindices') + data = if_body.add_access('tmp_vdata') + if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) + if_body.add_edge(data, 'views', tmp, 'data', dace.Memlet(data='tmp.data', subset='0:nnz')) + t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') + if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='tmp_vindices', subset='idx')) + if_body.add_edge(indices, 'views', tmp, 'indices', dace.Memlet(data='tmp.indices', subset='0:nnz')) + # Make For Loop for j + j_before, j_guard, j_after = sdfg.add_loop(None, + if_before, + None, + 'j', + '0', + 'j < N', + 'j + 1', + loop_end_state=if_after) + # Make For Loop for i + i_before, i_guard, i_after = sdfg.add_loop(None, j_before, None, 'i', '0', 'i < M', 'i + 1', loop_end_state=j_after) + sdfg.start_state = sdfg.node_id(i_before) + i_before_guard = sdfg.edges_between(i_before, i_guard)[0] + i_before_guard.data.assignments['idx'] = '0' + tmp = i_guard.add_access('tmp') + indptr = i_guard.add_access('tmp_vindptr') + t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') + i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='tmp_vindptr', subset='i')) + i_guard.add_edge(indptr, 'views', tmp, 'indptr', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + tmp = i_after.add_access('tmp') + indptr = i_after.add_access('tmp_vindptr') + t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') + i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='tmp_vindptr', subset='M')) + i_after.add_edge(indptr, 'views', tmp, 'indptr', 
dace.Memlet(data='tmp.indptr', subset='0:M+1')) + + set_B = sdfg.add_state('set_B') + sdfg.add_edge(i_after, set_B, dace.InterstateEdge()) + tmp = set_B.add_access('tmp') + tmp_indptr = set_B.add_access('tmp_vindptr') + tmp_indices = set_B.add_access('tmp_vindices') + tmp_data = set_B.add_access('tmp_vdata') + set_B.add_edge(tmp, 'indptr', tmp_indptr, 'views', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + set_B.add_edge(tmp, 'indices', tmp_indices, 'views', dace.Memlet(data='tmp.indices', subset='0:nnz')) + set_B.add_edge(tmp, 'data', tmp_data, 'views', dace.Memlet(data='tmp.data', subset='0:nnz')) + B = set_B.add_access('B') + B_indptr = set_B.add_access('vindptr') + B_indices = set_B.add_access('vindices') + B_data = set_B.add_access('vdata') + set_B.add_edge(B_indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) + set_B.add_edge(B_indices, 'views', B, 'indices', dace.Memlet(data='B.indices', subset='0:nnz')) + set_B.add_edge(B_data, 'views', B, 'data', dace.Memlet(data='B.data', subset='0:nnz')) + set_B.add_edge(tmp_indptr, None, B_indptr, None, dace.Memlet(data='tmp_vindptr', subset='0:M+1')) + set_B.add_edge(tmp_indices, None, B_indices, None, dace.Memlet(data='tmp_vindices', subset='0:nnz')) + t, me, mx = set_B.add_mapped_tasklet('set_data', + {'idx': '0:nnz'}, + {'__inp': dace.Memlet(data='tmp_vdata', subset='idx')}, + '__out = 2 * __inp', + {'__out': dace.Memlet(data='vdata', subset='idx')}, + external_edges=True, + input_nodes={'tmp_vdata': tmp_data}, + output_nodes={'vdata': B_data}) + + + func = sdfg.compile() + + rng = np.random.default_rng(42) + tmp = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + A = tmp.toarray() + B = tmp.tocsr(copy=True) + B.indptr[:] = -1 + B.indices[:] = -1 + B.data[:] = -1 + + outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], 
+ rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) + + func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) + + assert np.allclose(A * 2, B.toarray()) + + def test_read_nested_structure(): + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix') + csr_obj = CSR(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz)) + Wrapper = create_structure('WrapperClass') + wrapper_obj = Wrapper(dict(csr=csr_obj)) + + sdfg = dace.SDFG('nested_csr_to_dense') + + sdfg.add_datadesc('A', wrapper_obj) + sdfg.add_array('B', [M, N], dace.float32) + + spmat = wrapper_obj.members['csr'] + sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) + sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) + sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) + + state = sdfg.add_state() + + A = state.add_access('A') + B = state.add_access('B') + + indptr = state.add_access('vindptr') + indices = state.add_access('vindices') + data = state.add_access('vdata') + + state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) + state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) + state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i+1'), 
dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='vindices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + B = np.zeros((20, 20), dtype=np.float32) + + structclass = csr_obj.dtype._typeclass.as_ctypes() + inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + K=A.shape[1], + nnz=A.nnz) + import ctypes + inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) + + func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) + ref = A.toarray() + + assert np.allclose(B, ref) + + +def test_read_nested_structure_2(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) CSR = create_structure('CSRMatrix', indptr=dace.int32[M + 1], @@ -190,14 +387,16 @@ def test_read_nested_structure(): rows=M, cols=N, nnz=nnz) + CSRView = dace.data.StructureView(CSR.members, transient=True) Wrapper = create_structure('WrapperClass', csr=CSR) - sdfg = dace.SDFG('nested_csr_to_dense') + sdfg = dace.SDFG('nested_csr_to_dense_2') sdfg.add_datadesc('A', Wrapper) sdfg.add_array('B', [M, N], dace.float32) spmat = Wrapper.members['csr'] + sdfg.add_datadesc('vcsr', CSRView) sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) @@ -207,13 +406,15 @@ def test_read_nested_structure(): A = state.add_access('A') B = state.add_access('B') + 
csr = state.add_access('vcsr') indptr = state.add_access('vindptr') indices = state.add_access('vindices') data = state.add_access('vdata') - state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) - state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) - state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) + state.add_edge(A, 'csr', csr, 'views', dace.Memlet.from_array('A.csr', spmat)) + state.add_edge(csr, 'indptr', indptr, 'views', dace.Memlet.from_array('vcsr.indptr', spmat.members['indptr'])) + state.add_edge(csr, 'indices', indices, 'views', dace.Memlet.from_array('vcsr.indices', spmat.members['indices'])) + state.add_edge(csr, 'data', data, 'views', dace.Memlet.from_array('vcsr.data', spmat.members['data'])) ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) @@ -251,7 +452,102 @@ def test_read_nested_structure(): assert np.allclose(B, ref) +def test_write_nested_structure(): + + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix') + csr_obj = CSR(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz)) + Wrapper = create_structure('WrapperClass') + wrapper_obj = Wrapper(dict(csr=csr_obj)) + + sdfg = dace.SDFG('dense_to_csr') + + sdfg.add_array('A', [M, N], dace.float32) + sdfg.add_datadesc('B', wrapper_obj) + + spmat = wrapper_obj.members['csr'] + sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) + sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) + sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) + + # Make If + if_before = sdfg.add_state('if_before') + if_guard = sdfg.add_state('if_guard') + if_body = sdfg.add_state('if_body') + if_after = 
sdfg.add_state('if_after') + sdfg.add_edge(if_before, if_guard, dace.InterstateEdge()) + sdfg.add_edge(if_guard, if_body, dace.InterstateEdge(condition='A[i, j] != 0')) + sdfg.add_edge(if_body, if_after, dace.InterstateEdge(assignments={'idx': 'idx + 1'})) + sdfg.add_edge(if_guard, if_after, dace.InterstateEdge(condition='A[i, j] == 0')) + A = if_body.add_access('A') + B = if_body.add_access('B') + indices = if_body.add_access('vindices') + data = if_body.add_access('vdata') + if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) + if_body.add_edge(data, 'views', B, 'data', dace.Memlet(data='B.csr.data', subset='0:nnz')) + t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') + if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='vindices', subset='idx')) + if_body.add_edge(indices, 'views', B, 'indices', dace.Memlet(data='B.csr.indices', subset='0:nnz')) + # Make For Loop for j + j_before, j_guard, j_after = sdfg.add_loop(None, + if_before, + None, + 'j', + '0', + 'j < N', + 'j + 1', + loop_end_state=if_after) + # Make For Loop for i + i_before, i_guard, i_after = sdfg.add_loop(None, j_before, None, 'i', '0', 'i < M', 'i + 1', loop_end_state=j_after) + sdfg.start_state = sdfg.node_id(i_before) + i_before_guard = sdfg.edges_between(i_before, i_guard)[0] + i_before_guard.data.assignments['idx'] = '0' + B = i_guard.add_access('B') + indptr = i_guard.add_access('vindptr') + t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') + i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='i')) + i_guard.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.csr.indptr', subset='0:M+1')) + B = i_after.add_access('B') + indptr = i_after.add_access('vindptr') + t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') + i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) + i_after.add_edge(indptr, 'views', B, 'indptr', 
dace.Memlet(data='B.csr.indptr', subset='0:M+1')) + + func = sdfg.compile() + + rng = np.random.default_rng(42) + tmp = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + A = tmp.toarray() + B = tmp.tocsr(copy=True) + B.indptr[:] = -1 + B.indices[:] = -1 + B.data[:] = -1 + + outCSR = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) + import ctypes + outW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(outCSR)) + + func(A=A, B=outW, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) + + assert np.allclose(A, B.toarray()) + + if __name__ == "__main__": test_read_structure() test_write_structure() + test_local_structure() test_read_nested_structure() + # test_read_nested_structure_2() + test_write_nested_structure() From b1dbb6b385c5186ac16b5be1ea3d394953c6bf17 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 15:32:40 +0200 Subject: [PATCH 063/127] Structures have name attribute (instead of subclassing). 
--- dace/data.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/dace/data.py b/dace/data.py index 838fc43542..e424aca66a 100644 --- a/dace/data.py +++ b/dace/data.py @@ -361,9 +361,11 @@ class Structure(Data): desc="Dictionary of structure members", from_json=_arrays_from_json, to_json=_arrays_to_json) + name = Property(dtype=str, desc="Structure name") def __init__(self, members: Dict[str, Any], + name: str = 'Structure', transient: bool = False, storage: dtypes.StorageType = dtypes.StorageType.Default, location: Dict[str, str] = None, @@ -373,6 +375,7 @@ def __init__(self, self.members = members or {} for k, v in self.members.items(): v.transient = transient + self.name = name fields_and_types = dict() symbols = set() for k, v in members.items(): @@ -399,9 +402,20 @@ def __init__(self, fields_and_types[str(s)] = s.dtype else: fields_and_types[str(s)] = dtypes.int32 - dtype = dtypes.pointer(dtypes.struct(self.__class__.__name__, **fields_and_types)) + dtype = dtypes.pointer(dtypes.struct(name, **fields_and_types)) shape = (1,) super(Structure, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) + + @staticmethod + def from_json(json_obj, context=None): + if json_obj['type'] != 'Structure': + raise TypeError("Invalid data type") + + # Create dummy object + ret = Structure({}) + serialize.set_properties_from_json(ret, json_obj, context=context) + + return ret @property def total_size(self): From 5de2ae35d25b9f78eeecb0080504be34b6577cec Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 15:32:59 +0200 Subject: [PATCH 064/127] Updated tests. 
--- tests/sdfg/data/structure_test.py | 192 +++++++++++++++--------------- 1 file changed, 96 insertions(+), 96 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 462c6a8e7b..b3d72b9d7a 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -1,6 +1,7 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace import numpy as np +import pytest from dace import serialize from dace.properties import make_properties @@ -21,7 +22,7 @@ def from_json(json_obj, context=None): serialize.set_properties_from_json(ret, json_obj, context=context) return ret - + setattr(StructureClass, 'from_json', from_json) StructureClass = make_properties(StructureClass) @@ -31,13 +32,13 @@ def from_json(json_obj, context=None): def test_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix') - csr_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz)) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense') @@ -80,13 +81,13 @@ def test_read_structure(): B = np.zeros((20, 20), dtype=np.float32) inpA = csr_obj.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], - indices=A.indices.__array_interface__['data'][0], - data=A.data.__array_interface__['data'][0], - rows=A.shape[0], - cols=A.shape[1], - M=A.shape[0], - N=A.shape[1], - nnz=A.nnz) + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + N=A.shape[1], + nnz=A.nnz) func(A=inpA, B=B, M=20, N=20, nnz=A.nnz) ref = A.toarray() @@ -97,13 +98,13 @@ def test_read_structure(): def test_write_structure(): M, N, nnz = (dace.symbol(s) for s in 
('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix') - csr_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz)) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') sdfg = dace.SDFG('dense_to_csr') @@ -168,13 +169,13 @@ def test_write_structure(): B.data[:] = -1 outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], - indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) @@ -182,23 +183,25 @@ def test_write_structure(): def test_local_structure(): - - M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix') - csr_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz)) - tmp_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), transient=True) - sdfg = dace.SDFG('dense_to_csr') + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + tmp_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix', + transient=True) + + sdfg = dace.SDFG('dense_to_csr_local') sdfg.add_array('A', [M, N], dace.float32) 
sdfg.add_datadesc('B', csr_obj) @@ -273,16 +276,13 @@ def test_local_structure(): set_B.add_edge(B_data, 'views', B, 'data', dace.Memlet(data='B.data', subset='0:nnz')) set_B.add_edge(tmp_indptr, None, B_indptr, None, dace.Memlet(data='tmp_vindptr', subset='0:M+1')) set_B.add_edge(tmp_indices, None, B_indices, None, dace.Memlet(data='tmp_vindices', subset='0:nnz')) - t, me, mx = set_B.add_mapped_tasklet('set_data', - {'idx': '0:nnz'}, + t, me, mx = set_B.add_mapped_tasklet('set_data', {'idx': '0:nnz'}, {'__inp': dace.Memlet(data='tmp_vdata', subset='idx')}, - '__out = 2 * __inp', - {'__out': dace.Memlet(data='vdata', subset='idx')}, + '__out = 2 * __inp', {'__out': dace.Memlet(data='vdata', subset='idx')}, external_edges=True, input_nodes={'tmp_vdata': tmp_data}, output_nodes={'vdata': B_data}) - func = sdfg.compile() rng = np.random.default_rng(42) @@ -294,13 +294,13 @@ def test_local_structure(): B.data[:] = -1 outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], - indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) @@ -309,15 +309,14 @@ def test_local_structure(): def test_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix') - csr_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz)) - Wrapper = create_structure('WrapperClass') - wrapper_obj = Wrapper(dict(csr=csr_obj)) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + 
nnz=nnz), + name='CSRMatrix') + wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') sdfg = dace.SDFG('nested_csr_to_dense') @@ -378,24 +377,25 @@ def test_read_nested_structure(): assert np.allclose(B, ref) +@pytest.mark.skip def test_read_nested_structure_2(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix', - indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz) - CSRView = dace.data.StructureView(CSR.members, transient=True) - Wrapper = create_structure('WrapperClass', csr=CSR) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + CSRView = dace.data.StructureView(csr_obj.members, transient=True) + wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') sdfg = dace.SDFG('nested_csr_to_dense_2') - sdfg.add_datadesc('A', Wrapper) + sdfg.add_datadesc('A', wrapper_obj) sdfg.add_array('B', [M, N], dace.float32) - spmat = Wrapper.members['csr'] + spmat = wrapper_obj.members['csr'] sdfg.add_datadesc('vcsr', CSRView) sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) @@ -428,13 +428,14 @@ def test_read_nested_structure_2(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) B = np.zeros((20, 20), dtype=np.float32) - structclass = CSR.dtype._typeclass.as_ctypes() + structclass = csr_obj.dtype._typeclass.as_ctypes() inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], 
indices=A.indices.__array_interface__['data'][0], data=A.data.__array_interface__['data'][0], @@ -444,7 +445,7 @@ def test_read_nested_structure_2(): K=A.shape[1], nnz=A.nnz) import ctypes - inpW = Wrapper.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) + inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) ref = A.toarray() @@ -455,15 +456,14 @@ def test_read_nested_structure_2(): def test_write_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix') - csr_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz)) - Wrapper = create_structure('WrapperClass') - wrapper_obj = Wrapper(dict(csr=csr_obj)) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') sdfg = dace.SDFG('dense_to_csr') @@ -529,13 +529,13 @@ def test_write_nested_structure(): B.data[:] = -1 outCSR = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], - indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) import ctypes outW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(outCSR)) @@ -549,5 +549,5 @@ def test_write_nested_structure(): test_write_structure() test_local_structure() test_read_nested_structure() - # test_read_nested_structure_2() + test_read_nested_structure_2() test_write_nested_structure() From 1fbc45f66ebcff4979f7cb05566de56b70e2b1b9 Mon Sep 
17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 20:08:26 +0200 Subject: [PATCH 065/127] Removed nested data connectors. --- tests/sdfg/data/structure_test.py | 56 +++++++++++++++++-------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index b3d72b9d7a..8636dc1602 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -58,9 +58,9 @@ def test_read_structure(): indices = state.add_access('vindices') data = state.add_access('vdata') - state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.indptr', csr_obj.members['indptr'])) - state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.indices', csr_obj.members['indices'])) - state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.data', csr_obj.members['data'])) + state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.indptr', csr_obj.members['indptr'])) + state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.indices', csr_obj.members['indices'])) + state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.data', csr_obj.members['data'])) ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) @@ -74,6 +74,7 @@ def test_read_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -129,10 +130,10 @@ def test_write_structure(): indices = if_body.add_access('vindices') data = if_body.add_access('vdata') if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) - if_body.add_edge(data, 'views', B, 'data', dace.Memlet(data='B.data', subset='0:nnz')) + if_body.add_edge(data, 
'views', B, None, dace.Memlet(data='B.data', subset='0:nnz')) t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='vindices', subset='idx')) - if_body.add_edge(indices, 'views', B, 'indices', dace.Memlet(data='B.indices', subset='0:nnz')) + if_body.add_edge(indices, 'views', B, None, dace.Memlet(data='B.indices', subset='0:nnz')) # Make For Loop for j j_before, j_guard, j_after = sdfg.add_loop(None, if_before, @@ -151,13 +152,14 @@ def test_write_structure(): indptr = i_guard.add_access('vindptr') t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='i')) - i_guard.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) + i_guard.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.indptr', subset='0:M+1')) B = i_after.add_access('B') indptr = i_after.add_access('vindptr') t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) - i_after.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) + i_after.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.indptr', subset='0:M+1')) + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -229,10 +231,10 @@ def test_local_structure(): indices = if_body.add_access('tmp_vindices') data = if_body.add_access('tmp_vdata') if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) - if_body.add_edge(data, 'views', tmp, 'data', dace.Memlet(data='tmp.data', subset='0:nnz')) + if_body.add_edge(data, 'views', tmp, None, dace.Memlet(data='tmp.data', subset='0:nnz')) t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='tmp_vindices', subset='idx')) - if_body.add_edge(indices, 'views', 
tmp, 'indices', dace.Memlet(data='tmp.indices', subset='0:nnz')) + if_body.add_edge(indices, 'views', tmp, None, dace.Memlet(data='tmp.indices', subset='0:nnz')) # Make For Loop for j j_before, j_guard, j_after = sdfg.add_loop(None, if_before, @@ -251,12 +253,12 @@ def test_local_structure(): indptr = i_guard.add_access('tmp_vindptr') t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='tmp_vindptr', subset='i')) - i_guard.add_edge(indptr, 'views', tmp, 'indptr', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + i_guard.add_edge(indptr, 'views', tmp, None, dace.Memlet(data='tmp.indptr', subset='0:M+1')) tmp = i_after.add_access('tmp') indptr = i_after.add_access('tmp_vindptr') t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='tmp_vindptr', subset='M')) - i_after.add_edge(indptr, 'views', tmp, 'indptr', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + i_after.add_edge(indptr, 'views', tmp, None, dace.Memlet(data='tmp.indptr', subset='0:M+1')) set_B = sdfg.add_state('set_B') sdfg.add_edge(i_after, set_B, dace.InterstateEdge()) @@ -264,16 +266,16 @@ def test_local_structure(): tmp_indptr = set_B.add_access('tmp_vindptr') tmp_indices = set_B.add_access('tmp_vindices') tmp_data = set_B.add_access('tmp_vdata') - set_B.add_edge(tmp, 'indptr', tmp_indptr, 'views', dace.Memlet(data='tmp.indptr', subset='0:M+1')) - set_B.add_edge(tmp, 'indices', tmp_indices, 'views', dace.Memlet(data='tmp.indices', subset='0:nnz')) - set_B.add_edge(tmp, 'data', tmp_data, 'views', dace.Memlet(data='tmp.data', subset='0:nnz')) + set_B.add_edge(tmp, None, tmp_indptr, 'views', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + set_B.add_edge(tmp, None, tmp_indices, 'views', dace.Memlet(data='tmp.indices', subset='0:nnz')) + set_B.add_edge(tmp, None, tmp_data, 'views', dace.Memlet(data='tmp.data', subset='0:nnz')) B = set_B.add_access('B') 
B_indptr = set_B.add_access('vindptr') B_indices = set_B.add_access('vindices') B_data = set_B.add_access('vdata') - set_B.add_edge(B_indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) - set_B.add_edge(B_indices, 'views', B, 'indices', dace.Memlet(data='B.indices', subset='0:nnz')) - set_B.add_edge(B_data, 'views', B, 'data', dace.Memlet(data='B.data', subset='0:nnz')) + set_B.add_edge(B_indptr, 'views', B, None, dace.Memlet(data='B.indptr', subset='0:M+1')) + set_B.add_edge(B_indices, 'views', B, None, dace.Memlet(data='B.indices', subset='0:nnz')) + set_B.add_edge(B_data, 'views', B, None, dace.Memlet(data='B.data', subset='0:nnz')) set_B.add_edge(tmp_indptr, None, B_indptr, None, dace.Memlet(data='tmp_vindptr', subset='0:M+1')) set_B.add_edge(tmp_indices, None, B_indices, None, dace.Memlet(data='tmp_vindices', subset='0:nnz')) t, me, mx = set_B.add_mapped_tasklet('set_data', {'idx': '0:nnz'}, @@ -283,6 +285,7 @@ def test_local_structure(): input_nodes={'tmp_vdata': tmp_data}, output_nodes={'vdata': B_data}) + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -337,9 +340,9 @@ def test_read_nested_structure(): indices = state.add_access('vindices') data = state.add_access('vdata') - state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) - state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) - state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) + state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) + state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) + state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) 
@@ -353,6 +356,7 @@ def test_read_nested_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -429,6 +433,7 @@ def test_read_nested_structure_2(): state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') sdfg.view() + return func = sdfg.compile() rng = np.random.default_rng(42) @@ -489,10 +494,10 @@ def test_write_nested_structure(): indices = if_body.add_access('vindices') data = if_body.add_access('vdata') if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) - if_body.add_edge(data, 'views', B, 'data', dace.Memlet(data='B.csr.data', subset='0:nnz')) + if_body.add_edge(data, 'views', B, None, dace.Memlet(data='B.csr.data', subset='0:nnz')) t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='vindices', subset='idx')) - if_body.add_edge(indices, 'views', B, 'indices', dace.Memlet(data='B.csr.indices', subset='0:nnz')) + if_body.add_edge(indices, 'views', B, None, dace.Memlet(data='B.csr.indices', subset='0:nnz')) # Make For Loop for j j_before, j_guard, j_after = sdfg.add_loop(None, if_before, @@ -511,13 +516,14 @@ def test_write_nested_structure(): indptr = i_guard.add_access('vindptr') t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='i')) - i_guard.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.csr.indptr', subset='0:M+1')) + i_guard.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.csr.indptr', subset='0:M+1')) B = i_after.add_access('B') indptr = i_after.add_access('vindptr') t = i_after.add_tasklet('set_indptr', {}, 
{'__out'}, '__out = nnz') i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) - i_after.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.csr.indptr', subset='0:M+1')) + i_after.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.csr.indptr', subset='0:M+1')) + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -549,5 +555,5 @@ def test_write_nested_structure(): test_write_structure() test_local_structure() test_read_nested_structure() - test_read_nested_structure_2() + # test_read_nested_structure_2() test_write_nested_structure() From 6fa7e53ea4c39752c60b386895a6c9ba4a542b80 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 20:27:41 +0200 Subject: [PATCH 066/127] Added support for direct access to nested data. --- dace/codegen/targets/cpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 7ff91cbc7b..137de75c55 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -1169,6 +1169,7 @@ def memlet_definition(self, if not types: types = self._dispatcher.defined_vars.get(ptr, is_global=True) var_type, ctypedef = types + ptr = ptr.replace('.', '->') if fpga.is_fpga_array(desc): decouple_array_interfaces = Config.get_bool("compiler", "xilinx", "decouple_array_interfaces") From 71d7c3db0f2391b79281a89732b64d0d4b861e14 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 20:28:20 +0200 Subject: [PATCH 067/127] WIP: Add nested data free symbols to SDFG. 
--- dace/sdfg/sdfg.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index a4c29c2e89..1f385a4b75 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2005,10 +2005,20 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str raise NameError(f'Array or Stream with name "{name}" already exists in SDFG') self._arrays[name] = datadesc + def _add_symbols(desc: dt.Data): + if isinstance(desc, dt.Structure): + for v in desc.members.values(): + if isinstance(v, dt.Data): + _add_symbols(v) + for sym in desc.free_symbols: + if sym.name not in self.symbols: + self.add_symbol(sym.name, sym.dtype) + # Add free symbols to the SDFG global symbol storage - for sym in datadesc.free_symbols: - if sym.name not in self.symbols: - self.add_symbol(sym.name, sym.dtype) + # for sym in datadesc.free_symbols: + # if sym.name not in self.symbols: + # self.add_symbol(sym.name, sym.dtype) + _add_symbols(datadesc) return name From e0a4409ff4a2b909f901f1a1592d3b9669387807 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 20:29:13 +0200 Subject: [PATCH 068/127] Added test for direct nested data access. 
--- tests/sdfg/data/structure_test.py | 82 ++++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 6 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 8636dc1602..3116a5764a 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -550,10 +550,80 @@ def test_write_nested_structure(): assert np.allclose(A, B.toarray()) +def test_direct_read_structure(): + + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + + sdfg = dace.SDFG('csr_to_dense_direct') + + sdfg.add_datadesc('A', csr_obj) + sdfg.add_array('B', [M, N], dace.float32) + + # sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + # sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + # sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) + + state = sdfg.add_state() + + # A = state.add_access('A') + indptr = state.add_access('A.indptr') + indices = state.add_access('A.indices') + data = state.add_access('A.data') + B = state.add_access('B') + + # indptr = state.add_access('vindptr') + # indices = state.add_access('vindices') + # data = state.add_access('vdata') + + # state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.indptr', csr_obj.members['indptr'])) + # state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.indices', csr_obj.members['indices'])) + # state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.data', csr_obj.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + 
+ state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='A.indptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='A.indptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='A.indices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='A.data', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + + sdfg.view() + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + B = np.zeros((20, 20), dtype=np.float32) + + inpA = csr_obj.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + N=A.shape[1], + nnz=A.nnz) + + func(A=inpA, B=B, M=20, N=20, nnz=A.nnz) + ref = A.toarray() + + assert np.allclose(B, ref) + + if __name__ == "__main__": - test_read_structure() - test_write_structure() - test_local_structure() - test_read_nested_structure() - # test_read_nested_structure_2() - test_write_nested_structure() + # test_read_structure() + # test_write_structure() + # test_local_structure() + # test_read_nested_structure() + # test_write_nested_structure() + test_direct_read_structure() From 0593ea4f1a86951b210c727c95931ca3664f7423 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 20:55:42 +0200 Subject: [PATCH 069/127] Added test for direct double-nested data accesses. 
--- tests/sdfg/data/structure_test.py | 75 +++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 3116a5764a..91429e8bbc 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -620,6 +620,80 @@ def test_direct_read_structure(): assert np.allclose(B, ref) +def test_direct_read_nested_structure(): + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') + + sdfg = dace.SDFG('nested_csr_to_dense_direct') + + sdfg.add_datadesc('A', wrapper_obj) + sdfg.add_array('B', [M, N], dace.float32) + + spmat = wrapper_obj.members['csr'] + sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) + sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) + sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) + + state = sdfg.add_state() + + # A = state.add_access('A') + indptr = state.add_access('A.csr.indptr') + indices = state.add_access('A.csr.indices') + data = state.add_access('A.csr.data') + B = state.add_access('B') + + # indptr = state.add_access('vindptr') + # indices = state.add_access('vindices') + # data = state.add_access('vdata') + + # state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) + # state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) + # state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + 
jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='A.csr.indptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='A.csr.indptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='A.csr.indices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='A.csr.data', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + + sdfg.view() + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + B = np.zeros((20, 20), dtype=np.float32) + + structclass = csr_obj.dtype._typeclass.as_ctypes() + inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + K=A.shape[1], + nnz=A.nnz) + import ctypes + inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) + + func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) + ref = A.toarray() + + assert np.allclose(B, ref) + + if __name__ == "__main__": # test_read_structure() # test_write_structure() @@ -627,3 +701,4 @@ def test_direct_read_structure(): # test_read_nested_structure() # test_write_nested_structure() test_direct_read_structure() + test_direct_read_nested_structure() From 7d29defb511ffc7ba5cb88d440b8e45bc7988e99 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 20 Jul 2023 15:28:42 -0700 Subject: [PATCH 070/127] Relax test for inter-state edges in default schedules --- dace/sdfg/validation.py | 17 ++++++++++++----- tests/sdfg/disallowed_access_test.py | 23 +++++++++++++++++++++++ 2 files 
changed, 35 insertions(+), 5 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 3bac646479..4fbc808fdd 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -42,7 +42,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context """ # Avoid import loop from dace.codegen.targets import fpga - from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga + from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga, is_in_scope references = references or set() @@ -171,10 +171,17 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context for memlet in ise_memlets: container = memlet.data if not _accessible(sdfg, container, context): - eid = sdfg.edge_id(edge) - raise InvalidSDFGInterstateEdgeError( - f'Trying to read an inaccessible data container "{container}" ' - f'(Storage: {sdfg.arrays[container].storage}) in host code interstate edge', sdfg, eid) + # Check context w.r.t. maps + in_default_scope = False + if sdfg.parent_nsdfg_node is not None: + if is_in_scope(sdfg.parent_sdfg, sdfg.parent, sdfg.parent_nsdfg_node, + [dtypes.ScheduleType.Default]): + in_default_scope = True + if not in_default_scope: + eid = sdfg.edge_id(edge) + raise InvalidSDFGInterstateEdgeError( + f'Trying to read an inaccessible data container "{container}" ' + f'(Storage: {sdfg.arrays[container].storage}) in host code interstate edge', sdfg, eid) # Add edge symbols into defined symbols symbols.update(issyms) diff --git a/tests/sdfg/disallowed_access_test.py b/tests/sdfg/disallowed_access_test.py index 8700e34db5..520481ea46 100644 --- a/tests/sdfg/disallowed_access_test.py +++ b/tests/sdfg/disallowed_access_test.py @@ -40,6 +40,7 @@ def test_gpu_access_on_host_interstate_invalid(): @pytest.mark.gpu def test_gpu_access_on_host_tasklet(): + @dace.program def tester(a: dace.float64[20] @ dace.StorageType.GPU_Global): for i in dace.map[0:20] @ dace.ScheduleType.CPU_Multicore: @@ -49,7 
+50,29 @@ def tester(a: dace.float64[20] @ dace.StorageType.GPU_Global): tester.to_sdfg(validate=True) +@pytest.mark.gpu +def test_gpu_access_on_device_interstate_edge_default(): + sdfg = dace.SDFG('tester') + sdfg.add_array('A', [20], dace.float64, storage=dace.StorageType.GPU_Global) + state = sdfg.add_state() + + me, mx = state.add_map('test', dict(i='0:20')) + + nsdfg = dace.SDFG('nester') + nsdfg.add_array('A', [20], dace.float64, storage=dace.StorageType.GPU_Global) + state1 = nsdfg.add_state() + state2 = nsdfg.add_state() + nsdfg.add_edge(state1, state2, dace.InterstateEdge(assignments=dict(s='A[4]'))) + + nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'A'}, {}) + state.add_memlet_path(state.add_read('A'), me, nsdfg_node, dst_conn='A', memlet=dace.Memlet('A[0:20]')) + state.add_nedge(nsdfg_node, mx, dace.Memlet()) + + sdfg.validate() + + if __name__ == '__main__': test_gpu_access_on_host_interstate_ok() test_gpu_access_on_host_interstate_invalid() test_gpu_access_on_host_tasklet() + test_gpu_access_on_device_interstate_edge_default() From cbe344c15dfd5668d0ddb5a419279fc12c0ef1da Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 20 Jul 2023 15:34:17 -0700 Subject: [PATCH 071/127] Lazy-evaluate in_default_scope --- dace/sdfg/validation.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 4fbc808fdd..aa7674ca45 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -111,6 +111,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context # Check if SDFG is located within a GPU kernel context['in_gpu'] = is_devicelevel_gpu(sdfg, None, None) context['in_fpga'] = is_devicelevel_fpga(sdfg, None, None) + in_default_scope = None # Check every state separately start_state = sdfg.start_state @@ -172,12 +173,13 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context container = memlet.data if not 
_accessible(sdfg, container, context): # Check context w.r.t. maps - in_default_scope = False - if sdfg.parent_nsdfg_node is not None: - if is_in_scope(sdfg.parent_sdfg, sdfg.parent, sdfg.parent_nsdfg_node, - [dtypes.ScheduleType.Default]): - in_default_scope = True - if not in_default_scope: + if in_default_scope is None: # Lazy-evaluate in_default_scope + in_default_scope = False + if sdfg.parent_nsdfg_node is not None: + if is_in_scope(sdfg.parent_sdfg, sdfg.parent, sdfg.parent_nsdfg_node, + [dtypes.ScheduleType.Default]): + in_default_scope = True + if in_default_scope is False: eid = sdfg.edge_id(edge) raise InvalidSDFGInterstateEdgeError( f'Trying to read an inaccessible data container "{container}" ' @@ -226,9 +228,17 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context for memlet in ise_memlets: container = memlet.data if not _accessible(sdfg, container, context): - raise InvalidSDFGInterstateEdgeError( - f'Trying to read an inaccessible data container "{container}" ' - f'(Storage: {sdfg.arrays[container].storage}) in host code interstate edge', sdfg, eid) + # Check context w.r.t. 
maps + if in_default_scope is None: # Lazy-evaluate in_default_scope + in_default_scope = False + if sdfg.parent_nsdfg_node is not None: + if is_in_scope(sdfg.parent_sdfg, sdfg.parent, sdfg.parent_nsdfg_node, + [dtypes.ScheduleType.Default]): + in_default_scope = True + if in_default_scope is False: + raise InvalidSDFGInterstateEdgeError( + f'Trying to read an inaccessible data container "{container}" ' + f'(Storage: {sdfg.arrays[container].storage}) in host code interstate edge', sdfg, eid) except InvalidSDFGError as ex: # If the SDFG is invalid, save it From 08aac34a8fdd6cd38860d5907e7bca925248a20a Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Fri, 21 Jul 2023 08:47:39 +0200 Subject: [PATCH 072/127] Add review changes --- dace/sdfg/state.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 0354dd107b..8059609c36 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -298,11 +298,10 @@ def scope_tree(self) -> 'dace.sdfg.scope.ScopeTree': # Get scopes for node, scopenodes in sdc.items(): - scope_exit_nodes = [v for v in scopenodes if isinstance(v, nd.ExitNode)] if node is None: exit_node = None else: - exit_node = next(iter(scope_exit_nodes)) + exit_node = next(v for v in scopenodes if isinstance(v, nd.ExitNode)) scope = ScopeTree(node, exit_node) result[node] = scope @@ -506,10 +505,10 @@ def _read_and_write_sets(self) -> Tuple[Dict[AnyStr, List[Subset]], Dict[AnyStr, in_edges = sg.in_edges(n) out_edges = sg.out_edges(n) # Filter out memlets which go out but the same data is written to the AccessNode by another memlet - for out_edge in out_edges: - for in_edge in in_edges: - if in_edge.data.data == out_edge.data.data and \ - in_edge.data.dst_subset.covers(out_edge.data.src_subset): + for out_edge in list(out_edges): + for in_edge in list(in_edges): + if (in_edge.data.data == out_edge.data.data and + in_edge.data.dst_subset.covers(out_edge.data.src_subset)): 
out_edges.remove(out_edge) for e in in_edges: From 851f1fa4f7041c4c3e1be564ae7c92929bb2dbc3 Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Fri, 21 Jul 2023 08:48:05 +0200 Subject: [PATCH 073/127] Apply suggestions from code review Co-authored-by: alexnick83 <31545860+alexnick83@users.noreply.github.com> --- tests/sdfg/state_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/sdfg/state_test.py b/tests/sdfg/state_test.py index 07e2e8c4c7..c5cb953c4d 100644 --- a/tests/sdfg/state_test.py +++ b/tests/sdfg/state_test.py @@ -1,3 +1,4 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace From 9a1db886905fb037e92b96b0fbefa5c5074ba73d Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Fri, 21 Jul 2023 08:49:45 +0200 Subject: [PATCH 074/127] Added myself to AUTHORS file --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 573f142cf9..48cb4c05ec 100644 --- a/AUTHORS +++ b/AUTHORS @@ -36,5 +36,6 @@ Reid Wahl Yihang Luo Alexandru Calotoiu Phillip Lane +Samuel Martin and other contributors listed in https://github.com/spcl/dace/graphs/contributors From 0df9c3518c6d1ff307314a39dcbc8621423e3af4 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:17:02 +0200 Subject: [PATCH 075/127] Added free-symbols and repr. 
--- dace/data.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/dace/data.py b/dace/data.py index e424aca66a..b54a4f9efb 100644 --- a/dace/data.py +++ b/dace/data.py @@ -364,7 +364,7 @@ class Structure(Data): name = Property(dtype=str, desc="Structure name") def __init__(self, - members: Dict[str, Any], + members: Dict[str, Data], name: str = 'Structure', transient: bool = False, storage: dtypes.StorageType = dtypes.StorageType.Default, @@ -432,6 +432,17 @@ def start_offset(self): @property def strides(self): return [1] + + @property + def free_symbols(self) -> Set[symbolic.SymbolicType]: + """ Returns a set of undefined symbols in this data descriptor. """ + result = set(self.symbols.keys()) + for k, v in self.members.items(): + result |= v.free_symbols + return result + + def __repr__(self): + return f"{self.name} ({', '.join([f'{k}: {v}' for k, v in self.members.items()])})" def as_arg(self, with_types=True, for_call=False, name=None): if self.storage is dtypes.StorageType.GPU_Global: From 909c1aaafd76622cecd4972cd2b3718caf2c261f Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:18:17 +0200 Subject: [PATCH 076/127] Recursively add free symbols from nested data. --- dace/sdfg/sdfg.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 1f385a4b75..ae85bff5d1 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2015,9 +2015,6 @@ def _add_symbols(desc: dt.Data): self.add_symbol(sym.name, sym.dtype) # Add free symbols to the SDFG global symbol storage - # for sym in datadesc.free_symbols: - # if sym.name not in self.symbols: - # self.add_symbol(sym.name, sym.dtype) _add_symbols(datadesc) return name From e2b0d8b410e699692c1bf4863ae36a0b6f932e27 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:23:48 +0200 Subject: [PATCH 077/127] Updated tests. 
--- tests/sdfg/data/structure_test.py | 234 +++--------------------------- 1 file changed, 22 insertions(+), 212 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 91429e8bbc..2646fe3d03 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -8,36 +8,10 @@ from scipy import sparse -def create_structure(name: str) -> dace.data.Structure: - - StructureClass = type(name, (dace.data.Structure, ), {}) - - @staticmethod - def from_json(json_obj, context=None): - if json_obj['type'] != name: - raise TypeError("Invalid data type") - - # Create dummy object - ret = StructureClass({}) - serialize.set_properties_from_json(ret, json_obj, context=context) - - return ret - - setattr(StructureClass, 'from_json', from_json) - StructureClass = make_properties(StructureClass) - - return StructureClass - - def test_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense') @@ -83,14 +57,9 @@ def test_read_structure(): inpA = csr_obj.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], indices=A.indices.__array_interface__['data'][0], - data=A.data.__array_interface__['data'][0], - rows=A.shape[0], - cols=A.shape[1], - M=A.shape[0], - N=A.shape[1], - nnz=A.nnz) + data=A.data.__array_interface__['data'][0]) - func(A=inpA, B=B, M=20, N=20, nnz=A.nnz) + func(A=inpA, B=B, M=A.shape[0], N=A.shape[1], nnz=A.nnz) ref = A.toarray() assert np.allclose(B, ref) @@ -99,12 +68,7 @@ def test_read_structure(): def test_write_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - 
indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') sdfg = dace.SDFG('dense_to_csr') @@ -172,12 +136,7 @@ def test_write_structure(): outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + data=B.data.__array_interface__['data'][0]) func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) @@ -187,19 +146,9 @@ def test_write_structure(): def test_local_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') - tmp_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + tmp_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix', transient=True) @@ -298,12 +247,7 @@ def test_local_structure(): outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + data=B.data.__array_interface__['data'][0]) func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) @@ -312,12 +256,7 @@ def test_local_structure(): def test_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = 
dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -366,93 +305,11 @@ def test_read_nested_structure(): structclass = csr_obj.dtype._typeclass.as_ctypes() inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], indices=A.indices.__array_interface__['data'][0], - data=A.data.__array_interface__['data'][0], - rows=A.shape[0], - cols=A.shape[1], - M=A.shape[0], - K=A.shape[1], - nnz=A.nnz) + data=A.data.__array_interface__['data'][0]) import ctypes inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) - func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) - ref = A.toarray() - - assert np.allclose(B, ref) - - -@pytest.mark.skip -def test_read_nested_structure_2(): - M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), - name='CSRMatrix') - CSRView = dace.data.StructureView(csr_obj.members, transient=True) - wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') - - sdfg = dace.SDFG('nested_csr_to_dense_2') - - sdfg.add_datadesc('A', wrapper_obj) - sdfg.add_array('B', [M, N], dace.float32) - - spmat = wrapper_obj.members['csr'] - sdfg.add_datadesc('vcsr', CSRView) - sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) - sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) - sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) - - state = sdfg.add_state() - - A = state.add_access('A') - B = state.add_access('B') - - csr = state.add_access('vcsr') - indptr = state.add_access('vindptr') - indices = 
state.add_access('vindices') - data = state.add_access('vdata') - - state.add_edge(A, 'csr', csr, 'views', dace.Memlet.from_array('A.csr', spmat)) - state.add_edge(csr, 'indptr', indptr, 'views', dace.Memlet.from_array('vcsr.indptr', spmat.members['indptr'])) - state.add_edge(csr, 'indices', indices, 'views', dace.Memlet.from_array('vcsr.indices', spmat.members['indices'])) - state.add_edge(csr, 'data', data, 'views', dace.Memlet.from_array('vcsr.data', spmat.members['data'])) - - ime, imx = state.add_map('i', dict(i='0:M')) - jme, jmx = state.add_map('idx', dict(idx='start:stop')) - jme.add_in_connector('start') - jme.add_in_connector('stop') - t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') - - state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i'), dst_conn='start') - state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i+1'), dst_conn='stop') - state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='vindices', subset='idx'), dst_conn='j') - state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') - state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') - - sdfg.view() - return - func = sdfg.compile() - - rng = np.random.default_rng(42) - A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) - B = np.zeros((20, 20), dtype=np.float32) - - structclass = csr_obj.dtype._typeclass.as_ctypes() - inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], - indices=A.indices.__array_interface__['data'][0], - data=A.data.__array_interface__['data'][0], - rows=A.shape[0], - cols=A.shape[1], - M=A.shape[0], - K=A.shape[1], - nnz=A.nnz) - import ctypes - inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) - - func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) + func(A=inpW, B=B, M=A.shape[0], 
N=A.shape[1], nnz=A.nnz) ref = A.toarray() assert np.allclose(B, ref) @@ -461,12 +318,7 @@ def test_read_nested_structure_2(): def test_write_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -536,12 +388,7 @@ def test_write_nested_structure(): outCSR = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + data=B.data.__array_interface__['data'][0]) import ctypes outW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(outCSR)) @@ -553,12 +400,7 @@ def test_write_nested_structure(): def test_direct_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense_direct') @@ -566,26 +408,13 @@ def test_direct_read_structure(): sdfg.add_datadesc('A', csr_obj) sdfg.add_array('B', [M, N], dace.float32) - # sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) - # sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) - # sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) - state = sdfg.add_state() - # A = state.add_access('A') indptr = 
state.add_access('A.indptr') indices = state.add_access('A.indices') data = state.add_access('A.data') B = state.add_access('B') - # indptr = state.add_access('vindptr') - # indices = state.add_access('vindices') - # data = state.add_access('vdata') - - # state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.indptr', csr_obj.members['indptr'])) - # state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.indices', csr_obj.members['indices'])) - # state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.data', csr_obj.members['data'])) - ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) jme.add_in_connector('start') @@ -622,12 +451,7 @@ def test_direct_read_structure(): def test_direct_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -643,20 +467,11 @@ def test_direct_read_nested_structure(): state = sdfg.add_state() - # A = state.add_access('A') indptr = state.add_access('A.csr.indptr') indices = state.add_access('A.csr.indices') data = state.add_access('A.csr.data') B = state.add_access('B') - # indptr = state.add_access('vindptr') - # indices = state.add_access('vindices') - # data = state.add_access('vdata') - - # state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) - # state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) - # state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) - ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = 
state.add_map('idx', dict(idx='start:stop')) jme.add_in_connector('start') @@ -679,26 +494,21 @@ def test_direct_read_nested_structure(): structclass = csr_obj.dtype._typeclass.as_ctypes() inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], indices=A.indices.__array_interface__['data'][0], - data=A.data.__array_interface__['data'][0], - rows=A.shape[0], - cols=A.shape[1], - M=A.shape[0], - K=A.shape[1], - nnz=A.nnz) + data=A.data.__array_interface__['data'][0]) import ctypes inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) - func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) + func(A=inpW, B=B, M=A.shape[0], N=A.shape[1], nnz=A.nnz) ref = A.toarray() assert np.allclose(B, ref) if __name__ == "__main__": - # test_read_structure() - # test_write_structure() - # test_local_structure() - # test_read_nested_structure() - # test_write_nested_structure() + test_read_structure() + test_write_structure() + test_local_structure() + test_read_nested_structure() + test_write_nested_structure() test_direct_read_structure() test_direct_read_nested_structure() From 52afc7250b02fb4b85eb3a62bf5104dce9a72995 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:24:14 +0200 Subject: [PATCH 078/127] Scrapped structure private symbols for now. --- dace/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/data.py b/dace/data.py index b54a4f9efb..9d3b6b86f3 100644 --- a/dace/data.py +++ b/dace/data.py @@ -436,7 +436,7 @@ def strides(self): @property def free_symbols(self) -> Set[symbolic.SymbolicType]: """ Returns a set of undefined symbols in this data descriptor. """ - result = set(self.symbols.keys()) + result = set() for k, v in self.members.items(): result |= v.free_symbols return result From 09246442f6e456b4b090651b895be53e3414a512 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:26:10 +0200 Subject: [PATCH 079/127] Updated tests. 
--- tests/sdfg/data/structure_test.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 2646fe3d03..02b8f0c174 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -48,7 +48,6 @@ def test_read_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -123,7 +122,6 @@ def test_write_structure(): i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) i_after.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.indptr', subset='0:M+1')) - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -234,7 +232,6 @@ def test_local_structure(): input_nodes={'tmp_vdata': tmp_data}, output_nodes={'vdata': B_data}) - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -295,7 +292,6 @@ def test_read_nested_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -375,7 +371,6 @@ def test_write_nested_structure(): i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) i_after.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.csr.indptr', subset='0:M+1')) - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -427,7 +422,6 @@ def test_direct_read_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='A.data', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), 
src_conn='__out') - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -484,7 +478,6 @@ def test_direct_read_nested_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='A.csr.data', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) From 8296a6de765b2209cbd644b6017d68304016ef3c Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:29:06 +0200 Subject: [PATCH 080/127] Added setitem. --- dace/sdfg/sdfg.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index ae85bff5d1..23964dbe41 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -62,8 +62,12 @@ def __getitem__(self, key): token = tokens.pop(0) result = result.members[token] return result - + def __setitem__(self, key, val): + if isinstance(key, str) and '.' 
in key: + raise KeyError('NestedDict does not support setting nested keys') + super(NestedDict, self).__setitem__(key, val) + def __contains__(self, key): tokens = key.split('.') if isinstance(key, str) else [key] token = tokens.pop(0) From e468a0ed965ae4d9b46a9b74d5f11fe66bf5406d Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Mon, 24 Jul 2023 12:05:06 +0200 Subject: [PATCH 081/127] Added testcase and fix --- dace/sdfg/state.py | 1 + tests/sdfg/state_test.py | 30 +++++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 8059609c36..c354cd9d1f 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -510,6 +510,7 @@ def _read_and_write_sets(self) -> Tuple[Dict[AnyStr, List[Subset]], Dict[AnyStr, if (in_edge.data.data == out_edge.data.data and in_edge.data.dst_subset.covers(out_edge.data.src_subset)): out_edges.remove(out_edge) + break for e in in_edges: # skip empty memlets diff --git a/tests/sdfg/state_test.py b/tests/sdfg/state_test.py index c5cb953c4d..48dea04d0b 100644 --- a/tests/sdfg/state_test.py +++ b/tests/sdfg/state_test.py @@ -4,9 +4,9 @@ def test_read_write_set(): sdfg = dace.SDFG('graph') - A = sdfg.add_array('A', [10], dace.float64) - B = sdfg.add_array('B', [10], dace.float64) - C = sdfg.add_array('C', [10], dace.float64) + sdfg.add_array('A', [10], dace.float64) + sdfg.add_array('B', [10], dace.float64) + sdfg.add_array('C', [10], dace.float64) state = sdfg.add_state('state') task1 = state.add_tasklet('work1', {'A'}, {'B'}, 'B = A + 1') task2 = state.add_tasklet('work2', {'B'}, {'C'}, 'C = B + 1') @@ -20,5 +20,29 @@ def test_read_write_set(): assert 'B' not in state.read_and_write_sets()[0] + +def test_read_write_set_y_formation(): + sdfg = dace.SDFG('graph') + state = sdfg.add_state('state') + sdfg.add_array('A', [2], dace.float64) + sdfg.add_array('B', [2], dace.float64) + sdfg.add_array('C', [2], dace.float64) + task1 = state.add_tasklet('work1', {'A'}, {'B'}, 
'B = A + 1') + task2 = state.add_tasklet('work2', {'B'}, {'C'}, 'C += B + 1') + task3 = state.add_tasklet('work3', {'A'}, {'B'}, 'B = A + 2') + read_a = state.add_access('A') + rw_b = state.add_access('B') + write_c = state.add_access('C') + state.add_memlet_path(read_a, task1, dst_conn='A', memlet=dace.Memlet(data='A', subset='0')) + state.add_memlet_path(read_a, task3, dst_conn='A', memlet=dace.Memlet(data='A', subset='1')) + state.add_memlet_path(task1, rw_b, src_conn='B', memlet=dace.Memlet(data='B', subset='0')) + state.add_memlet_path(task3, rw_b, src_conn='B', memlet=dace.Memlet(data='B', subset='0')) + state.add_memlet_path(rw_b, task2, dst_conn='B', memlet=dace.Memlet(data='B', subset='0')) + state.add_memlet_path(task2, write_c, src_conn='C', memlet=dace.Memlet(data='C', subset='0')) + + assert 'B' not in state.read_and_write_sets()[0] + + if __name__ == '__main__': test_read_write_set() + test_read_write_set_y_formation() From e68c3423834fb5479f22c039de7fe167e57d3f37 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 25 Jul 2023 23:25:36 +0000 Subject: [PATCH 082/127] Bump certifi from 2023.5.7 to 2023.7.22 Bumps [certifi](https://github.com/certifi/python-certifi) from 2023.5.7 to 2023.7.22. - [Commits](https://github.com/certifi/python-certifi/compare/2023.05.07...2023.07.22) --- updated-dependencies: - dependency-name: certifi dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index da67189b70..33cd58a0bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ aenum==3.1.12 astunparse==1.6.3 blinker==1.6.2 -certifi==2023.5.7 +certifi==2023.7.22 charset-normalizer==3.1.0 click==8.1.3 dill==0.3.6 From e150571e453b5a4e646941ce650afd73b7632a20 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 13:44:58 +0200 Subject: [PATCH 083/127] Added test. --- tests/sdfg/validation/nested_sdfg_test.py | 38 +++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/sdfg/validation/nested_sdfg_test.py diff --git a/tests/sdfg/validation/nested_sdfg_test.py b/tests/sdfg/validation/nested_sdfg_test.py new file mode 100644 index 0000000000..127543fc95 --- /dev/null +++ b/tests/sdfg/validation/nested_sdfg_test.py @@ -0,0 +1,38 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+import numpy as np +import dace + + +def test_inout_connector(): + + sdfg = dace.SDFG("test_inout_connector") + sdfg.add_array("A", [1], dace.int32) + sdfg.add_array("B", [1], dace.int32) + + nsdfg = dace.SDFG("nested_sdfg") + nsdfg.add_array("C", [1], dace.int32) + + nstate = nsdfg.add_state() + read_c = nstate.add_access("C") + write_c = nstate.add_access("C") + tasklet = nstate.add_tasklet("tasklet", {"__inp"}, {"__out"}, "__out = __inp + 5") + nstate.add_edge(read_c, None, tasklet, '__inp', dace.Memlet.from_array('C', nsdfg.arrays['C'])) + nstate.add_edge(tasklet, '__out', write_c, None, dace.Memlet.from_array('C', nsdfg.arrays['C'])) + + state = sdfg.add_state() + read_a = state.add_access("A") + write_b = state.add_access("B") + tasklet = state.add_nested_sdfg(nsdfg, sdfg, {"C"}, {"C"}) + state.add_edge(read_a, None, tasklet, 'C', dace.Memlet.from_array('A', sdfg.arrays['A'])) + state.add_edge(tasklet, 'C', write_b, None, dace.Memlet.from_array('B', sdfg.arrays['B'])) + + try: + sdfg.validate() + except dace.sdfg.InvalidSDFGError: + return + + assert False, "SDFG should not validate" + + +if __name__ == "__main__": + test_inout_connector() From 16b64ccba063066c74782598e98ea9e05fe077ea Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 13:45:26 +0200 Subject: [PATCH 084/127] Removed unneeded import. --- tests/sdfg/validation/nested_sdfg_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/sdfg/validation/nested_sdfg_test.py b/tests/sdfg/validation/nested_sdfg_test.py index 127543fc95..d4d34d0a15 100644 --- a/tests/sdfg/validation/nested_sdfg_test.py +++ b/tests/sdfg/validation/nested_sdfg_test.py @@ -1,5 +1,4 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -import numpy as np import dace From dda325a4284a01fb780112f4ac8a5d43eac191cf Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 13:55:38 +0200 Subject: [PATCH 085/127] Added inout connector validation. 
--- dace/sdfg/nodes.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 5c270153e1..f9ccda46e1 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -634,6 +634,23 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context raise NameError('Data descriptor "%s" not found in nested SDFG connectors' % dname) if dname in connectors and desc.transient: raise NameError('"%s" is a connector but its corresponding array is transient' % dname) + + # Validate inout connectors + inout_connectors = self.in_connectors.keys() & self.out_connectors.keys() + for conn in inout_connectors: + inputs = set() + outputs = set() + for edge in state.in_edges_by_connector(self, conn): + src = state.memlet_path(edge)[0].src + if isinstance(src, AccessNode): + inputs.add(src.data) + for edge in state.out_edges_by_connector(self, conn): + dst = state.memlet_path(edge)[-1].dst + if isinstance(dst, AccessNode): + outputs.add(dst.data) + if len(inputs - outputs) > 0: + raise ValueError(f"Inout connector {conn} is connected to different input ({inputs}) and " + f"output ({outputs}) arrays") # Validate undefined symbols symbols = set(k for k in self.sdfg.free_symbols if k not in connectors) From b72c72249361225aeb6cfd1565de16937c9497cf Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 13:56:00 +0200 Subject: [PATCH 086/127] Added test. 
--- tests/sdfg/validation/nested_sdfg_test.py | 40 ++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/tests/sdfg/validation/nested_sdfg_test.py b/tests/sdfg/validation/nested_sdfg_test.py index d4d34d0a15..398a1635ef 100644 --- a/tests/sdfg/validation/nested_sdfg_test.py +++ b/tests/sdfg/validation/nested_sdfg_test.py @@ -2,9 +2,40 @@ import dace -def test_inout_connector(): +def test_inout_connector_validation_success(): - sdfg = dace.SDFG("test_inout_connector") + sdfg = dace.SDFG("test_inout_connector_validation_success") + sdfg.add_array("A", [1], dace.int32) + sdfg.add_array("B", [1], dace.int32) + + nsdfg = dace.SDFG("nested_sdfg") + nsdfg.add_array("C", [1], dace.int32) + + nstate = nsdfg.add_state() + read_c = nstate.add_access("C") + write_c = nstate.add_access("C") + tasklet = nstate.add_tasklet("tasklet", {"__inp"}, {"__out"}, "__out = __inp + 5") + nstate.add_edge(read_c, None, tasklet, '__inp', dace.Memlet.from_array('C', nsdfg.arrays['C'])) + nstate.add_edge(tasklet, '__out', write_c, None, dace.Memlet.from_array('C', nsdfg.arrays['C'])) + + state = sdfg.add_state() + read_b = state.add_access("B") + write_b = state.add_access("B") + tasklet = state.add_nested_sdfg(nsdfg, sdfg, {"C"}, {"C"}) + state.add_edge(read_b, None, tasklet, 'C', dace.Memlet.from_array('B', sdfg.arrays['B'])) + state.add_edge(tasklet, 'C', write_b, None, dace.Memlet.from_array('B', sdfg.arrays['B'])) + + try: + sdfg.validate() + except dace.sdfg.InvalidSDFGError: + assert False, "SDFG should validate" + + return + + +def test_inout_connector_validation_fail(): + + sdfg = dace.SDFG("test_inout_connector_validation_fail") sdfg.add_array("A", [1], dace.int32) sdfg.add_array("B", [1], dace.int32) @@ -29,9 +60,10 @@ def test_inout_connector(): sdfg.validate() except dace.sdfg.InvalidSDFGError: return - + assert False, "SDFG should not validate" if __name__ == "__main__": - test_inout_connector() + test_inout_connector_validation_success() + 
+ test_inout_connector_validation_fail() From 438dafdc9325eccbcfff488be579f144ba2e07b0 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 15:32:50 +0200 Subject: [PATCH 087/127] SubgraphFusion doesn't remove intermediate nodes whose data have also output accesses. --- dace/transformation/subgraph/subgraph_fusion.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dace/transformation/subgraph/subgraph_fusion.py b/dace/transformation/subgraph/subgraph_fusion.py index a56336fa8d..1ff286b85c 100644 --- a/dace/transformation/subgraph/subgraph_fusion.py +++ b/dace/transformation/subgraph/subgraph_fusion.py @@ -1146,10 +1146,15 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s # by reconnecting their adjacent edges to nodes outside the subgraph. # NOTE: Currently limited to cases where there is a single source and sink # if there are multiple intermediate accesses for the same data. + # NOTE: Currently limited to intermediate data that do not have a separate output node + + # Filter out outputs + output_data = set([n.data for n in out_nodes]) + true_intermediate_nodes = set([n for n in intermediate_nodes if n.data not in output_data]) # Sort intermediate nodes by data name intermediate_data = dict() - for acc in intermediate_nodes: + for acc in true_intermediate_nodes: if acc.data in intermediate_data: intermediate_data[acc.data].append(acc) else: From 4b14a733d10e7a121bd4a02ab2810e353b403272 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 17:25:54 +0200 Subject: [PATCH 088/127] Added utility methods for finding (one of) the sources and destinations of a memlet path across nested SDFG levels.
--- dace/sdfg/utils.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 8d251efd89..7eef600180 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1797,3 +1797,29 @@ def get_thread_local_data(sdfg: SDFG) -> List[str]: if not sdfg.arrays[name].transient: warnings.warn(f'Found thread-local data "{name}" that is not transient.') return result + + +def get_global_memlet_path_src(sdfg: SDFG, state: SDFGState, edge: MultiConnectorEdge) -> nd.Node: + src = state.memlet_path(edge)[0].src + if isinstance(src, nd.AccessNode) and not sdfg.arrays[src.data].transient and sdfg.parent is not None: + psdfg = sdfg.parent_sdfg + pstate = sdfg.parent + pnode = sdfg.parent_nsdfg_node + pedges = list(pstate.in_edges_by_connector(pnode, src.data)) + if len(pedges) > 0: + pedge = pedges[0] + return get_global_memlet_path_src(psdfg, pstate, pedge) + return src + + +def get_global_memlet_path_dst(sdfg: SDFG, state: SDFGState, edge: MultiConnectorEdge) -> nd.Node: + dst = state.memlet_path(edge)[-1].dst + if isinstance(dst, nd.AccessNode) and not sdfg.arrays[dst.data].transient and sdfg.parent is not None: + psdfg = sdfg.parent_sdfg + pstate = sdfg.parent + pnode = sdfg.parent_nsdfg_node + pedges = list(pstate.out_edges_by_connector(pnode, dst.data)) + if len(pedges) > 0: + pedge = pedges[0] + return get_global_memlet_path_dst(psdfg, pstate, pedge) + return dst From 7fb6757ff3df535580158e5df9d04a6c4cf41c57 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 17:26:25 +0200 Subject: [PATCH 089/127] Amended validation to use new utility methods. 
--- dace/sdfg/nodes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index f9ccda46e1..6ba84d919e 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -636,16 +636,17 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context raise NameError('"%s" is a connector but its corresponding array is transient' % dname) # Validate inout connectors + from dace.sdfg import utils # Avoids circular import inout_connectors = self.in_connectors.keys() & self.out_connectors.keys() for conn in inout_connectors: inputs = set() outputs = set() for edge in state.in_edges_by_connector(self, conn): - src = state.memlet_path(edge)[0].src + src = utils.get_global_memlet_path_src(sdfg, state, edge) if isinstance(src, AccessNode): inputs.add(src.data) for edge in state.out_edges_by_connector(self, conn): - dst = state.memlet_path(edge)[-1].dst + dst = utils.get_global_memlet_path_dst(sdfg, state, edge) if isinstance(dst, AccessNode): outputs.add(dst.data) if len(inputs - outputs) > 0: From 70198d5fae51ff3884966a7c13ed475e0216ebc9 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 20:29:18 +0200 Subject: [PATCH 090/127] Added comm-comparison tests. --- .../mpi4py/comm_comparison_test.py | 200 ++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 tests/python_frontend/mpi4py/comm_comparison_test.py diff --git a/tests/python_frontend/mpi4py/comm_comparison_test.py b/tests/python_frontend/mpi4py/comm_comparison_test.py new file mode 100644 index 0000000000..e7d74e5981 --- /dev/null +++ b/tests/python_frontend/mpi4py/comm_comparison_test.py @@ -0,0 +1,200 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Tests comparison operators with communicator objects. 
""" +import dace +import numpy as np +import pytest + + +@pytest.mark.mpi +def test_eq_commworld_0(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + + @dace.program + def eq_commworld_0(out: dace.bool[1]): + out[0] = comm == MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + eq_commworld_0(res) + assert res[0] == (comm == MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_eq_commworld_1(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + comm2 = comm.Dup() + + @dace.program + def eq_commworld_1(out: dace.bool[1]): + out[0] = comm2 == MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + eq_commworld_1(res) + assert res[0] == (comm2 == MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_eq_commworld_2(): + + from mpi4py import MPI + + @dace.program + def eq_commworld_2(out: dace.bool[1]): + out[0] = MPI.COMM_NULL == MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + eq_commworld_2(res) + assert res[0] == (MPI.COMM_NULL == MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_noteq_commworld_0(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + + @dace.program + def noteq_commworld_0(out: dace.bool[1]): + out[0] = comm != MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + noteq_commworld_0(res) + assert res[0] == (comm != MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_noteq_commworld_1(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + comm2 = comm.Dup() + + @dace.program + def noteq_commworld_1(out: dace.bool[1]): + out[0] = comm2 != MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + noteq_commworld_1(res) + assert res[0] == (comm2 != MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_noteq_commworld_2(): + + from mpi4py import MPI + + @dace.program + def noteq_commworld_2(out: dace.bool[1]): + out[0] = MPI.COMM_NULL != MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + noteq_commworld_2(res) + assert res[0] == (MPI.COMM_NULL != MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_is_commworld_0(): + + from mpi4py import MPI + 
comm = MPI.COMM_WORLD + + @dace.program + def is_commworld_0(out: dace.bool[1]): + out[0] = comm is MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + is_commworld_0(res) + assert res[0] == (comm is MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_is_commworld_1(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + comm2 = comm.Dup() + + @dace.program + def is_commworld_1(out: dace.bool[1]): + out[0] = comm2 is MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + is_commworld_1(res) + assert res[0] == (comm2 is MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_is_commworld_2(): + + from mpi4py import MPI + + @dace.program + def is_commworld_2(out: dace.bool[1]): + out[0] = MPI.COMM_NULL is MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + is_commworld_2(res) + assert res[0] == (MPI.COMM_NULL is MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_isnot_commworld_0(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + + @dace.program + def isnot_commworld_0(out: dace.bool[1]): + out[0] = comm is MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + isnot_commworld_0(res) + assert res[0] == (comm is MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_isnot_commworld_1(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + comm2 = comm.Dup() + + @dace.program + def isnot_commworld_1(out: dace.bool[1]): + out[0] = comm2 is not MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + isnot_commworld_1(res) + assert res[0] == (comm2 is not MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_isnot_commworld_2(): + + from mpi4py import MPI + + @dace.program + def isnot_commworld_2(out: dace.bool[1]): + out[0] = MPI.COMM_NULL is not MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + isnot_commworld_2(res) + assert res[0] == (MPI.COMM_NULL is not MPI.COMM_WORLD) + + +if __name__ == "__main__": + test_eq_commworld_0() + test_eq_commworld_1() + test_eq_commworld_2() + test_noteq_commworld_0() + test_noteq_commworld_1() + test_noteq_commworld_2() + 
+ test_is_commworld_0() + test_is_commworld_1() + test_is_commworld_2() + test_isnot_commworld_0() + test_isnot_commworld_1() + test_isnot_commworld_2() From 727afa7d9a2dfcf0cd0ac3a81b1841a8a773fc54 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 20:30:47 +0200 Subject: [PATCH 091/127] Refactored communicator comparison replacements. --- dace/frontend/common/distr.py | 68 ++++++++++++++++------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 68b6f120d8..b6868d3289 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1,16 +1,15 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -from numbers import Integral, Number -from typing import Sequence, Tuple, Union - +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace +import itertools +import sympy as sp + from dace import dtypes, symbolic from dace.frontend.common import op_repository as oprepo +from dace.frontend.python.replacements import _define_local_scalar from dace.memlet import Memlet from dace.sdfg import SDFG, SDFGState - -import sympy as sp - -from dace.frontend.python.replacements import _define_local_scalar +from numbers import Integral, Number +from typing import Sequence, Tuple, Union ShapeType = Sequence[Union[Integral, str, symbolic.symbol, symbolic.SymExpr, symbolic.sympy.Basic]] RankType = Union[Integral, str, symbolic.symbol, symbolic.SymExpr, symbolic.sympy.Basic] @@ -117,40 +116,33 @@ def _pgrid_sub(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, parent_grid: return _cart_sub(pv, sdfg, state, parent_grid, color) -@oprepo.replaces_operator('ProcessGrid', 'Eq', otherclass='Comm') -@oprepo.replaces_operator('ProcessGrid', 'Is', otherclass='Comm') -def _pgrid_eq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: str, op2: 'Comm'): - from mpi4py import MPI - if op2 is
MPI.COMM_WORLD or op2 is MPI.COMM_NULL: - return False - return True - - -@oprepo.replaces_operator('Comm', 'Eq', otherclass='ProcessGrid') -@oprepo.replaces_operator('Comm', 'Is', otherclass='ProcessGrid') -def _comm_eq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'str'): - from mpi4py import MPI - if op1 is MPI.COMM_WORLD or op1 is MPI.COMM_NULL: - return False - return True - +# TODO: Revisit after discussing how "immutable" mpi4py communicators are during the program's execution. +for left_cls, right_cls in itertools.product(['Comm', 'Cartcomm', 'Intracomm'], repeat=2): -@oprepo.replaces_operator('ProcessGrid', 'NotEq', otherclass='Comm') -@oprepo.replaces_operator('ProcessGrid', 'IsNot', otherclass='Comm') -def _pgrid_neq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: str, op2: 'Comm'): - from mpi4py import MPI - if op2 is MPI.COMM_WORLD or op2 is MPI.COMM_NULL: - return True - return False + @oprepo.replaces_operator(left_cls, 'Eq', otherclass=right_cls) + def _eq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): + return op1 == op2 + + @oprepo.replaces_operator(left_cls, 'NotEq', otherclass=right_cls) + def _noteq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): + return op1 != op2 + + @oprepo.replaces_operator(left_cls, 'Is', otherclass=right_cls) + def _is_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): + return op1 is op2 + + @oprepo.replaces_operator(left_cls, 'IsNot', otherclass=right_cls) + def _isnot_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): + return op1 is not op2 -@oprepo.replaces_operator('Comm', 'NotEq', otherclass='ProcessGrid') -@oprepo.replaces_operator('Comm', 'IsNot', otherclass='ProcessGrid') -def _comm_neq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'str'): - from mpi4py import MPI - if op1 is MPI.COMM_WORLD or op1 is 
MPI.COMM_NULL: +for cls_a, cls_b, op in itertools.product(['ProcessGrid'], ['Comm', 'Cartcomm', 'Intracomm'], ['Eq', 'NotEq', 'Is', 'IsNot']): + @oprepo.replaces_operator(cls_a, op, otherclass=cls_b) + @oprepo.replaces_operator(cls_b, op, otherclass=cls_a) + def _op_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: Union[str, 'Comm'], op2: Union[str, 'Comm']): + if op in ('Eq', 'Is'): + return False return True - return False ##### MPI Collectives From ac177bdc9a63709366fd8c857cc5007a950dbbb5 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 20:31:43 +0200 Subject: [PATCH 092/127] Addressed review comments. --- dace/frontend/python/newast.py | 2 +- dace/frontend/python/preprocessing.py | 2 +- dace/libraries/mpi/nodes/alltoall.py | 2 +- dace/libraries/mpi/nodes/isend.py | 7 ++----- tests/library/mpi/mpi4py_test.py | 2 +- tests/library/mpi/mpi_alltoall_test.py | 2 +- 6 files changed, 7 insertions(+), 10 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 31cb8907c1..853316e097 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1308,7 +1308,7 @@ def defined(self): try: from mpi4py import MPI result.update({k: v for k, v in self.globals.items() if isinstance(v, MPI.Comm)}) - except: + except (ImportError, ModuleNotFoundError): pass return result diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index ea312a18c0..6a4ea89394 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -1553,7 +1553,7 @@ def visit_BinOp(self, node: ast.BinOp) -> ast.BinOp: if isinstance(node.op, ast.Mod): left = self.generic_visit(node.left) right = self.generic_visit(node.right) - newleft = ast.copy_location(ast.BinOp(left=left, op=ast.Add(), right=copy.deepcopy(right)), left) + newleft = ast.copy_location(ast.BinOp(left=left, op=ast.Add(), right=astutils.copy_tree(right)), left) node.left = 
newleft return node return self.generic_visit(node) diff --git a/dace/libraries/mpi/nodes/alltoall.py b/dace/libraries/mpi/nodes/alltoall.py index 92be24ce45..bb64740f50 100644 --- a/dace/libraries/mpi/nodes/alltoall.py +++ b/dace/libraries/mpi/nodes/alltoall.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace.library import dace.properties import dace.sdfg.nodes diff --git a/dace/libraries/mpi/nodes/isend.py b/dace/libraries/mpi/nodes/isend.py index 8de4035515..39951dd0d0 100644 --- a/dace/libraries/mpi/nodes/isend.py +++ b/dace/libraries/mpi/nodes/isend.py @@ -97,11 +97,8 @@ def validate(self, sdfg, state): if e.src_conn == "_request": req = sdfg.arrays[e.data.data] - # TODO: Should we expect any integer type here and cast to int32 later?. Investigate further in the future. - # if dest.dtype.base_type != dace.dtypes.int32: - # raise ValueError("Destination must be an integer!") - # if tag.dtype.base_type != dace.dtypes.int32: - # raise ValueError("Tag must be an integer!") + # TODO: Should we expect any integer type for dst/tag and cast to int32 later?. + # TODO: Investigate further in the future. count_str = "XXX" for _, _, _, dst_conn, data in state.in_edges(self): diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 1bbeae627f..2237ed8ba4 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -1,4 +1,4 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import dace from dace.sdfg import utils import dace.dtypes as dtypes diff --git a/tests/library/mpi/mpi_alltoall_test.py b/tests/library/mpi/mpi_alltoall_test.py index e1eb4fe5f1..66199d9aa5 100644 --- a/tests/library/mpi/mpi_alltoall_test.py +++ b/tests/library/mpi/mpi_alltoall_test.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace from dace.memlet import Memlet import dace.libraries.mpi as mpi From 01f82fac5486ca4faca9e0408a7b0cecdb8bc121 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 20:34:13 +0200 Subject: [PATCH 093/127] YAPF --- dace/frontend/common/distr.py | 16 +++--- dace/frontend/python/newast.py | 6 ++- tests/library/mpi/mpi4py_test.py | 50 ++++++++++++------- tests/library/mpi/mpi_allgather_test.py | 5 +- tests/library/mpi/mpi_alltoall_test.py | 4 +- tests/library/mpi/mpi_isend_irecv_test.py | 4 +- tests/library/mpi/mpi_send_recv_test.py | 1 + .../mpi4py/comm_comparison_test.py | 48 +++++++++--------- 8 files changed, 75 insertions(+), 59 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index b6868d3289..d6f22da358 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -122,21 +122,23 @@ def _pgrid_sub(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, parent_grid: @oprepo.replaces_operator(left_cls, 'Eq', otherclass=right_cls) def _eq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): return op1 == op2 - + @oprepo.replaces_operator(left_cls, 'NotEq', otherclass=right_cls) def _noteq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): return op1 != op2 - + @oprepo.replaces_operator(left_cls, 'Is', otherclass=right_cls) def _is_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): return op1 is op2 - + 
@oprepo.replaces_operator(left_cls, 'IsNot', otherclass=right_cls) def _isnot_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): return op1 is not op2 -for cls_a, cls_b, op in itertools.product(['ProcessGrid'], ['Comm', 'Cartcomm', 'Intracomm'], ['Eq', 'NotEq', 'Is', 'IsNot']): +for cls_a, cls_b, op in itertools.product(['ProcessGrid'], ['Comm', 'Cartcomm', 'Intracomm'], + ['Eq', 'NotEq', 'Is', 'IsNot']): + @oprepo.replaces_operator(cls_a, op, otherclass=cls_b) @oprepo.replaces_operator(cls_b, op, otherclass=cls_a) def _op_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: Union[str, 'Comm'], op2: Union[str, 'Comm']): @@ -469,7 +471,7 @@ def _send(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Send') def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, - dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.end(buffer, dst, tag)`. """ from mpi4py import MPI @@ -481,7 +483,7 @@ def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: ' @oprepo.replaces_method('ProcessGrid', 'Send') def _pgrid_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, buffer: str, - dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Send(buffer, dst, tag, grid=pgrid)`. 
""" raise NotImplementedError('ProcessGrid.Send is not supported yet.') @@ -689,7 +691,7 @@ def _recv(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Recv') def _intracomm_Recv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, - src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Recv(buffer, src, tagq)`. """ from mpi4py import MPI diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 853316e097..c9d92b7860 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1304,7 +1304,9 @@ def defined(self): result.update(self.sdfg.arrays) # MPI-related stuff - result.update({k: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids}) + result.update( + {k: self.sdfg.process_grids[v] + for k, v in self.variables.items() if v in self.sdfg.process_grids}) try: from mpi4py import MPI result.update({k: v for k, v in self.globals.items() if isinstance(v, MPI.Comm)}) @@ -5001,7 +5003,7 @@ def visit_Subscript(self, node: ast.Subscript, inference: bool = False): rng = expr.subset rng.offset(rng, True) return self.sdfg.arrays[array].dtype, rng.size() - + if is_read: return self._add_read_slice(array, node, expr) else: diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 2237ed8ba4..52b5deb7a8 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -20,7 +20,7 @@ def comm_world_bcast(A: dace.int32[10]): if size < 2: raise ValueError("Please run this test with at least two processes.") - + sdfg = None if rank == 0: sdfg = comm_world_bcast.to_sdfg() @@ -36,7 +36,7 @@ def comm_world_bcast(A: dace.int32[10]): func(A=A) comm_world_bcast.f(A_ref) - assert(np.array_equal(A, A_ref)) + assert (np.array_equal(A, A_ref)) @pytest.mark.mpi @@ -55,7 +55,7 @@ def 
external_comm_bcast(A: dace.int32[10]): if size < 2: raise ValueError("Please run this test with at least two processes.") - + sdfg = None if rank == 0: sdfg = external_comm_bcast.to_sdfg() @@ -74,7 +74,7 @@ def external_comm_bcast(A: dace.int32[10]): func(A=A, new_comm=new_comm.py2f()) external_comm_bcast.f(A_ref) - assert(np.array_equal(A, A_ref)) + assert (np.array_equal(A, A_ref)) @pytest.mark.mpi @@ -109,7 +109,7 @@ def pgrid_bcast(A: dace.int32[10]): func(A=A) pgrid_bcast.f(A_ref) - assert(np.array_equal(A, A_ref)) + assert (np.array_equal(A, A_ref)) @pytest.mark.mpi @@ -149,12 +149,24 @@ def subgrid_bcast(A: dace.int32[10], rank: dace.int32): func(A=A, rank=rank) subgrid_bcast.f(A_ref, rank) - assert(np.array_equal(A, A_ref)) - - -def initialize_3mm(b_NI: int, b_NJ: int, b_NK: int, b_NL: int, b_NM: int, - ts_NI: int, ts_NJ: int, ts_NK, ts_NL: int, ts_NM: int, - NI: int, NJ: int, NK: int, NL: int, NM: int, + assert (np.array_equal(A, A_ref)) + + +def initialize_3mm(b_NI: int, + b_NJ: int, + b_NK: int, + b_NL: int, + b_NM: int, + ts_NI: int, + ts_NJ: int, + ts_NK, + ts_NL: int, + ts_NM: int, + NI: int, + NJ: int, + NK: int, + NL: int, + NM: int, datatype: type = np.float64): A = np.fromfunction(lambda i, k: b_NK + k + 1, (ts_NI, ts_NK), dtype=datatype) @@ -206,16 +218,16 @@ def k3mm(A, B, C, D): return E N = 128 - assert(size <= 128) - - NI, NJ, NK, NL, NM = (N,) * 5 + assert (size <= 128) + + NI, NJ, NK, NL, NM = (N, ) * 5 PNI, PNJ, PNK, PNL, PNM = 1, 2, 1, 1, 1 cart_comm = commworld.Create_cart([1, size, 1]) cart_rank = cart_comm.Get_rank() cart_size = cart_comm.Get_size() cart_coords = cart_comm.Get_coords(cart_rank) - + ts_NI = int(np.ceil(NI / PNI)) ts_NJ = int(np.ceil(NJ / PNJ)) ts_NK = int(np.ceil(NJ / PNK)) @@ -240,7 +252,7 @@ def k3mm(A, B, C, D): commworld.Barrier() if E_ref is not None: - assert(np.array_equal(E, E_ref)) + assert (np.array_equal(E, E_ref)) @pytest.mark.mpi @@ -255,7 +267,7 @@ def mpi4py_isend_irecv(rank: dace.int32, size: 
dace.int32): src = (rank - 1) % size dst = (rank + 1) % size req = np.empty((2, ), dtype=MPI.Request) - sbuf = np.full((1,), rank, dtype=np.int32) + sbuf = np.full((1, ), rank, dtype=np.int32) req[0] = commworld.Isend(sbuf, dst, tag=0) rbuf = np.empty((1, ), dtype=np.int32) req[1] = commworld.Irecv(rbuf, src, tag=0) @@ -284,7 +296,7 @@ def test_send_recv(): def mpi4py_send_recv(rank: dace.int32, size: dace.int32): src = np.full([1], (rank - 1) % size, dtype=np.int32) dst = np.full([1], (rank + 1) % size, dtype=np.int32) - sbuf = np.full((1,), rank, dtype=np.int32) + sbuf = np.full((1, ), rank, dtype=np.int32) commworld.Send(sbuf, dst, tag=0) rbuf = np.empty((1, ), dtype=np.int32) commworld.Recv(rbuf, src, tag=0) @@ -310,7 +322,7 @@ def test_alltoall(): @dace.program def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): - sbuf = np.full((size,), rank, dtype=np.int32) + sbuf = np.full((size, ), rank, dtype=np.int32) rbuf = np.zeros((size, ), dtype=np.int32) commworld.Alltoall(sbuf, rbuf) return rbuf diff --git a/tests/library/mpi/mpi_allgather_test.py b/tests/library/mpi/mpi_allgather_test.py index 1eebcd5676..1f0a30a4d1 100644 --- a/tests/library/mpi/mpi_allgather_test.py +++ b/tests/library/mpi/mpi_allgather_test.py @@ -22,10 +22,7 @@ def make_sdfg(dtype): outA = state.add_access("outA") allgather_node = mpi.nodes.allgather.Allgather("allgather") - state.add_memlet_path(inA, - allgather_node, - dst_conn="_inbuffer", - memlet=Memlet.simple(inA, "0:n", num_accesses=n)) + state.add_memlet_path(inA, allgather_node, dst_conn="_inbuffer", memlet=Memlet.simple(inA, "0:n", num_accesses=n)) state.add_memlet_path(allgather_node, outA, src_conn="_outbuffer", diff --git a/tests/library/mpi/mpi_alltoall_test.py b/tests/library/mpi/mpi_alltoall_test.py index 66199d9aa5..b51289ddd0 100644 --- a/tests/library/mpi/mpi_alltoall_test.py +++ b/tests/library/mpi/mpi_alltoall_test.py @@ -56,14 +56,14 @@ def test_mpi(implementation, dtype): comm.Barrier() size = 128 - 
size_per_proc = int(size/commsize) + size_per_proc = int(size / commsize) A = np.arange(0, size, dtype=np_dtype) B = np.full(size, 0, dtype=np_dtype) mpi_sdfg(inbuf=A, outbuf=B, n=size) # now B should be an array of size, # containing (size / size_per_proc) repeated chunked_data - chunked_data = A[rank * size_per_proc: (rank + 1) * size_per_proc] + chunked_data = A[rank * size_per_proc:(rank + 1) * size_per_proc] correct_data = np.tile(chunked_data, int(size / size_per_proc)) if (not np.allclose(B, correct_data)): raise (ValueError("The received values are not what I expected on root.")) diff --git a/tests/library/mpi/mpi_isend_irecv_test.py b/tests/library/mpi/mpi_isend_irecv_test.py index 9fab8c0158..ed21ec3fa4 100644 --- a/tests/library/mpi/mpi_isend_irecv_test.py +++ b/tests/library/mpi/mpi_isend_irecv_test.py @@ -109,8 +109,10 @@ def _test_mpi(info, sdfg, dtype): def test_mpi(): _test_mpi("MPI Isend/Irecv", make_sdfg(np.float64), np.float64) + ############################################################################### + @pytest.mark.mpi def test_isend_irecv(): from mpi4py import MPI @@ -123,7 +125,7 @@ def mpi4py_isend_irecv(rank: dace.int32, size: dace.int32): src = (rank - 1) % size dst = (rank + 1) % size req = np.empty((2, ), dtype=MPI.Request) - sbuf = np.full((1,), rank, dtype=np.int32) + sbuf = np.full((1, ), rank, dtype=np.int32) req[0] = commworld.Isend(sbuf, dst, tag=0) rbuf = np.empty((1, ), dtype=np.int32) req[1] = commworld.Irecv(rbuf, src, tag=0) diff --git a/tests/library/mpi/mpi_send_recv_test.py b/tests/library/mpi/mpi_send_recv_test.py index bf39c955d3..9c8d78c042 100644 --- a/tests/library/mpi/mpi_send_recv_test.py +++ b/tests/library/mpi/mpi_send_recv_test.py @@ -76,6 +76,7 @@ def test_mpi(): ############################################################################### + @dace.program def dace_send_recv(rank: dace.int32, size: dace.int32): src = np.full([1], (rank - 1) % size, dtype=np.int32) diff --git 
a/tests/python_frontend/mpi4py/comm_comparison_test.py b/tests/python_frontend/mpi4py/comm_comparison_test.py index e7d74e5981..45bda19876 100644 --- a/tests/python_frontend/mpi4py/comm_comparison_test.py +++ b/tests/python_frontend/mpi4py/comm_comparison_test.py @@ -14,8 +14,8 @@ def test_eq_commworld_0(): @dace.program def eq_commworld_0(out: dace.bool[1]): out[0] = comm == MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) eq_commworld_0(res) assert res[0] == (comm == MPI.COMM_WORLD) @@ -30,8 +30,8 @@ def test_eq_commworld_1(): @dace.program def eq_commworld_1(out: dace.bool[1]): out[0] = comm2 == MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) eq_commworld_1(res) assert res[0] == (comm2 == MPI.COMM_WORLD) @@ -44,8 +44,8 @@ def test_eq_commworld_2(): @dace.program def eq_commworld_2(out: dace.bool[1]): out[0] = MPI.COMM_NULL == MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) eq_commworld_2(res) assert res[0] == (MPI.COMM_NULL == MPI.COMM_WORLD) @@ -59,8 +59,8 @@ def test_noteq_commworld_0(): @dace.program def noteq_commworld_0(out: dace.bool[1]): out[0] = comm != MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) noteq_commworld_0(res) assert res[0] == (comm != MPI.COMM_WORLD) @@ -75,8 +75,8 @@ def test_noteq_commworld_1(): @dace.program def noteq_commworld_1(out: dace.bool[1]): out[0] = comm2 != MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) noteq_commworld_1(res) assert res[0] == (comm2 != MPI.COMM_WORLD) @@ -89,8 +89,8 @@ def test_noteq_commworld_2(): @dace.program def noteq_commworld_2(out: dace.bool[1]): out[0] = MPI.COMM_NULL != MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) noteq_commworld_2(res) assert res[0] == (MPI.COMM_NULL != MPI.COMM_WORLD) @@ -104,8 +104,8 
@@ def test_is_commworld_0(): @dace.program def is_commworld_0(out: dace.bool[1]): out[0] = comm is MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) is_commworld_0(res) assert res[0] == (comm is MPI.COMM_WORLD) @@ -120,8 +120,8 @@ def test_is_commworld_1(): @dace.program def is_commworld_1(out: dace.bool[1]): out[0] = comm2 is MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) is_commworld_1(res) assert res[0] == (comm2 is MPI.COMM_WORLD) @@ -134,8 +134,8 @@ def test_is_commworld_2(): @dace.program def is_commworld_2(out: dace.bool[1]): out[0] = MPI.COMM_NULL is MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) is_commworld_2(res) assert res[0] == (MPI.COMM_NULL is MPI.COMM_WORLD) @@ -149,8 +149,8 @@ def test_isnot_commworld_0(): @dace.program def isnot_commworld_0(out: dace.bool[1]): out[0] = comm is MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) isnot_commworld_0(res) assert res[0] == (comm is MPI.COMM_WORLD) @@ -165,8 +165,8 @@ def test_isnot_commworld_1(): @dace.program def isnot_commworld_1(out: dace.bool[1]): out[0] = comm2 is not MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) isnot_commworld_1(res) assert res[0] == (comm2 is not MPI.COMM_WORLD) @@ -179,8 +179,8 @@ def test_isnot_commworld_2(): @dace.program def isnot_commworld_2(out: dace.bool[1]): out[0] = MPI.COMM_NULL is not MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) isnot_commworld_2(res) assert res[0] == (MPI.COMM_NULL is not MPI.COMM_WORLD) From 7bfd96036c379aa2bfaf7c0a0be21e3eb054b983 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 21:07:11 +0200 Subject: [PATCH 094/127] Added extra exception to catch. 
--- dace/frontend/python/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 6a4ea89394..10a1ab120e 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -1603,7 +1603,7 @@ def preprocess_dace_program(f: Callable[..., Any], try: src_ast = MPIResolver(global_vars).visit(src_ast) - except ModuleNotFoundError: + except (ImportError, ModuleNotFoundError): pass src_ast = ModuloConverter().visit(src_ast) From a98fce07b7e78b0bf1a0bc53d17e37e38c22b3dc Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 27 Jul 2023 20:58:42 +0200 Subject: [PATCH 095/127] Serialize Structure members and struct data/length as list of tuples. --- dace/data.py | 5 +++-- dace/dtypes.py | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dace/data.py b/dace/data.py index 9d3b6b86f3..fd7cdaf8e3 100644 --- a/dace/data.py +++ b/dace/data.py @@ -344,13 +344,14 @@ def add(X: dace.float32[10, 10] @ dace.StorageType.GPU_Global): def _arrays_to_json(arrays): if arrays is None: return None - return {k: serialize.to_json(v) for k, v in arrays.items()} + sorted_keys = sorted(arrays.keys()) + return [(k, serialize.to_json(arrays[k])) for k in sorted_keys] def _arrays_from_json(obj, context=None): if obj is None: return {} - return {k: serialize.from_json(v, context) for k, v in obj.items()} + return {k: serialize.from_json(v, context) for k, v in obj} @make_properties diff --git a/dace/dtypes.py b/dace/dtypes.py index d01209469f..9c483d5df1 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -768,13 +768,12 @@ def fields(self): return self._data def to_json(self): + sorted_keys = sorted(self._data.keys()) return { 'type': 'struct', 'name': self.name, - 'data': {k: v.to_json() - for k, v in self._data.items()}, - 'length': {k: v - for k, v in self._length.items()}, + 'data': [(k, self._data[k].to_json()) for k 
in sorted_keys], + 'length': [(k, self._length[k]) for k in sorted_keys if k in self._length], 'bytes': self.bytes } @@ -786,8 +785,8 @@ def from_json(json_obj, context=None): import dace.serialize # Avoid import loop ret = struct(json_obj['name']) - ret._data = {k: json_to_typeclass(v, context) for k, v in json_obj['data'].items()} - ret._length = {k: v for k, v in json_obj['length'].items()} + ret._data = {k: json_to_typeclass(v, context) for k, v in json_obj['data']} + ret._length = {k: v for k, v in json_obj['length']} ret.bytes = json_obj['bytes'] return ret From f431a8df0c99890d5dbeef48674157aa196d6a3e Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Jul 2023 10:29:11 +0200 Subject: [PATCH 096/127] Switched Structures and structs to OrderedDicts. --- dace/data.py | 40 ++++++++++++++++++++----------- dace/dtypes.py | 26 ++++++++++---------- tests/sdfg/data/structure_test.py | 8 +++++++ 3 files changed, 48 insertions(+), 26 deletions(-) diff --git a/dace/data.py b/dace/data.py index fd7cdaf8e3..b20f9f7db5 100644 --- a/dace/data.py +++ b/dace/data.py @@ -3,8 +3,9 @@ import ctypes import functools +from collections import OrderedDict from numbers import Number -from typing import Any, Dict, Optional, Sequence, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple import numpy import sympy as sp @@ -344,40 +345,47 @@ def add(X: dace.float32[10, 10] @ dace.StorageType.GPU_Global): def _arrays_to_json(arrays): if arrays is None: return None - sorted_keys = sorted(arrays.keys()) - return [(k, serialize.to_json(arrays[k])) for k in sorted_keys] + return [(k, serialize.to_json(v)) for k, v in arrays.items()] def _arrays_from_json(obj, context=None): if obj is None: return {} - return {k: serialize.from_json(v, context) for k, v in obj} + return OrderedDict((k, serialize.from_json(v, context)) for k, v in obj) @make_properties class Structure(Data): """ Base class for structures. 
""" - members = Property(dtype=dict, + members = Property(dtype=OrderedDict, desc="Dictionary of structure members", from_json=_arrays_from_json, to_json=_arrays_to_json) + order = ListProperty(element_type=str, desc="Order of structure members") name = Property(dtype=str, desc="Structure name") def __init__(self, members: Dict[str, Data], + order: List[str] = None, name: str = 'Structure', transient: bool = False, storage: dtypes.StorageType = dtypes.StorageType.Default, location: Dict[str, str] = None, lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, debuginfo: dtypes.DebugInfo = None): + + self.order = order or list(members.keys()) + if set(members.keys()) != set(self.order): + raise ValueError('Order must contain all members of the structure.') + # TODO: Should we make a deep-copy here? - self.members = members or {} + self.members = OrderedDict((k, members[k]) for k in self.order) + for k, v in self.members.items(): v.transient = transient self.name = name - fields_and_types = dict() + fields_and_types = OrderedDict() symbols = set() for k, v in members.items(): if isinstance(v, Structure): @@ -396,13 +404,17 @@ def __init__(self, fields_and_types[k] = dtypes.typeclass(type(v)) else: raise TypeError(f"Attribute {k}'s value {v} has unsupported type: {type(v)}") - for s in symbols: - if str(s) in fields_and_types: - continue - if hasattr(s, "dtype"): - fields_and_types[str(s)] = s.dtype - else: - fields_and_types[str(s)] = dtypes.int32 + + # NOTE: We will not store symbols in the dtype for now, but leaving it as a comment to investigate later. + # NOTE: See discussion about data/object symbols. 
+ # for s in symbols: + # if str(s) in fields_and_types: + # continue + # if hasattr(s, "dtype"): + # fields_and_types[str(s)] = s.dtype + # else: + # fields_and_types[str(s)] = dtypes.int32 + dtype = dtypes.pointer(dtypes.struct(name, **fields_and_types)) shape = (1,) super(Structure, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) diff --git a/dace/dtypes.py b/dace/dtypes.py index 9c483d5df1..678f2f59b0 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -7,6 +7,7 @@ import itertools import numpy import re +from collections import OrderedDict from functools import wraps from typing import Any from dace.config import Config @@ -768,12 +769,11 @@ def fields(self): return self._data def to_json(self): - sorted_keys = sorted(self._data.keys()) return { 'type': 'struct', 'name': self.name, - 'data': [(k, self._data[k].to_json()) for k in sorted_keys], - 'length': [(k, self._length[k]) for k in sorted_keys if k in self._length], + 'data': [(k, v.to_json()) for k, v in self._data.items()], + 'length': [(k, v) for k, v in self._length.items()], 'bytes': self.bytes } @@ -792,19 +792,21 @@ def from_json(json_obj, context=None): return ret def _parse_field_and_types(self, **fields_and_types): - from dace.symbolic import pystr_to_symbolic - self._data = dict() - self._length = dict() + # from dace.symbolic import pystr_to_symbolic + self._data = OrderedDict() + self._length = OrderedDict() self.bytes = 0 for k, v in fields_and_types.items(): if isinstance(v, tuple): t, l = v if not isinstance(t, pointer): raise TypeError("Only pointer types may have a length.") - sym_tokens = pystr_to_symbolic(l).free_symbols - for sym in sym_tokens: - if str(sym) not in fields_and_types.keys(): - raise ValueError(f"Symbol {sym} in {k}'s length {l} is not a field of struct {self.name}") + # TODO: Do we need the free symbols of the length in the struct? + # NOTE: It is needed for the old use of dtype.struct. Are we deprecating that? 
+ # sym_tokens = pystr_to_symbolic(l).free_symbols + # for sym in sym_tokens: + # if str(sym) not in fields_and_types.keys(): + # raise ValueError(f"Symbol {sym} in {k}'s length {l} is not a field of struct {self.name}") self._data[k] = t self._length[k] = l self.bytes += t.bytes @@ -830,7 +832,7 @@ def as_ctypes(self): fields.append((k, v.as_ctypes())) else: fields.append((k, _FFI_CTYPES[v.type])) - fields = sorted(fields, key=lambda f: f[0]) + # fields = sorted(fields, key=lambda f: f[0]) # Create new struct class. struct_class = type("NewStructClass", (ctypes.Structure, ), {"_fields_": fields}) _FFI_CTYPES[self] = struct_class @@ -844,7 +846,7 @@ def emit_definition(self): {typ} }};""".format( name=self.name, - typ='\n'.join([" %s %s;" % (t.ctype, tname) for tname, t in sorted(self._data.items())]), + typ='\n'.join([" %s %s;" % (t.ctype, tname) for tname, t in self._data.items()]), ) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 02b8f0c174..995aacb2fd 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -12,6 +12,7 @@ def test_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense') @@ -68,6 +69,7 @@ def test_write_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('dense_to_csr') @@ -145,8 +147,10 @@ def test_local_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') tmp_obj = 
dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix', transient=True) @@ -254,6 +258,7 @@ def test_local_structure(): def test_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -315,6 +320,7 @@ def test_write_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -396,6 +402,7 @@ def test_direct_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense_direct') @@ -446,6 +453,7 @@ def test_direct_read_structure(): def test_direct_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') From 86d9cf2180c0b599b0a025447f1a36b7f9a05ecf Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Jul 2023 10:31:32 +0200 Subject: [PATCH 097/127] Removed order from properties. 
--- dace/data.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dace/data.py b/dace/data.py index b20f9f7db5..d8f2d52998 100644 --- a/dace/data.py +++ b/dace/data.py @@ -362,7 +362,6 @@ class Structure(Data): desc="Dictionary of structure members", from_json=_arrays_from_json, to_json=_arrays_to_json) - order = ListProperty(element_type=str, desc="Order of structure members") name = Property(dtype=str, desc="Structure name") def __init__(self, @@ -375,12 +374,12 @@ def __init__(self, lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, debuginfo: dtypes.DebugInfo = None): - self.order = order or list(members.keys()) - if set(members.keys()) != set(self.order): + order = order or list(members.keys()) + if set(members.keys()) != set(order): raise ValueError('Order must contain all members of the structure.') # TODO: Should we make a deep-copy here? - self.members = OrderedDict((k, members[k]) for k in self.order) + self.members = OrderedDict((k, members[k]) for k in order) for k, v in self.members.items(): v.transient = transient From 76d6266cead9f7b3de58e8fc879a7d978ddbe757 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Jul 2023 12:05:50 +0200 Subject: [PATCH 098/127] `_argminmax` now creates a struct with the members ordered as accessed in the related tasklets. 
--- dace/frontend/python/replacements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 9eac240a87..b325a2ea7e 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -975,7 +975,7 @@ def _argminmax(pv: ProgramVisitor, reduced_shape = list(copy.deepcopy(a_arr.shape)) reduced_shape.pop(axis) - val_and_idx = dace.struct('_val_and_idx', val=a_arr.dtype, idx=result_type) + val_and_idx = dace.struct('_val_and_idx', idx=result_type, val=a_arr.dtype) # HACK: since identity cannot be specified for structs, we have to init the output array reduced_structs, reduced_struct_arr = sdfg.add_temp_transient(reduced_shape, val_and_idx) From 60b404515f732d81c96dcaed21b0d0c5d7632a18 Mon Sep 17 00:00:00 2001 From: Phillip Allen Lane Date: Sat, 29 Jul 2023 16:34:59 -0500 Subject: [PATCH 099/127] Fix some underlying issues with tensor core sample (#1336) Co-authored-by: Phillip Allen Lane --- samples/codegen/tensor_cores.py | 87 ++++++++++++++------------------- 1 file changed, 36 insertions(+), 51 deletions(-) diff --git a/samples/codegen/tensor_cores.py b/samples/codegen/tensor_cores.py index 52d906254b..92ea28eacf 100644 --- a/samples/codegen/tensor_cores.py +++ b/samples/codegen/tensor_cores.py @@ -27,6 +27,7 @@ from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import StateSubgraphView from dace.codegen.prettycode import CodeIOStream +from dace.codegen.dispatcher import DefinedType from typing import Any, List # Other imports @@ -76,6 +77,9 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Array, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + # Make sure the codegen includes the appropriate header files + _include_mma(sdfg) + name = 
node.data # Based on the hardware, the total size must be 16^2 @@ -85,14 +89,16 @@ def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, # Write a fragment based on the storage type if nodedesc.storage == dace.StorageType.TensorCore_Accumulator: - declaration_stream.write('wmma::fragment {};'.format(name), sdfg, state_id, node) + ctype = 'wmma::fragment' + declaration_stream.write(f'{ctype} {name};', sdfg, state_id, node) else: - declaration_stream.write( - 'wmma::fragment ' - '{name};'.format(mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj, name=name), sdfg, - state_id, node) + ctype = 'wmma::fragment'.format( + mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj) + declaration_stream.write(f'{ctype} {name};', sdfg, state_id, node) + + # Add the ctype to defined_vars so that the codegen can properly pass + # fragments to functions as an object reference. + self._dispatcher.defined_vars.add(name, DefinedType.Stream, ctype) def deallocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Array, function_stream: CodeIOStream, callsite_stream: CodeIOStream): @@ -187,50 +193,29 @@ def _include_mma(sdfg: dace.SDFG): sdfg.append_global_code(global_code, 'cuda') -@replaces('frag_fill') -def frag_fill(pv: ProgramVisitor, sdfg: dace.SDFG, state: dace.SDFGState, frag: str, fill: Any) -> List[str]: - # Replacement functions receive the SDFG and the current state as the first - # two arguments, followed by all the other arguments. Here we treat them as - # two strings representing the array name to fill and what to fill it with. - - # NOTE: If a slice is used in the `frag` argument, the Python frontend - # automatically creates a new array for it, and uses the correct string as - # the argument. 
- wnode = state.add_write(frag) - tasklet = state.add_tasklet('fill', - set(), {'out'}, - ''' - wmma::fill_fragment(out, %s);''' % fill, - language=dace.Language.CPP) - - state.add_edge(tasklet, 'out', wnode, None, dace.Memlet.from_array(frag, wnode.desc(sdfg))) - - _include_mma(sdfg) - - # Function has no return value - return [] - - -@replaces('wmma') -def wmma(pv: ProgramVisitor, sdfg: dace.SDFG, state: dace.SDFGState, a_frag: str, b_frag: str, - c_frag: str) -> List[str]: - # Implemented similarly to `frag_fill`, but with inputs and outputs. - anode = state.add_read(a_frag) - bnode = state.add_read(b_frag) - cnode = state.add_write(c_frag) - tasklet = state.add_tasklet('wmma', {'afrag', 'bfrag'}, {'cfrag'}, - ''' - wmma::mma_sync(cfrag, afrag, bfrag, cfrag);''', - language=dace.Language.CPP) - - state.add_edge(anode, None, tasklet, 'afrag', dace.Memlet.from_array(a_frag, anode.desc(sdfg))) - state.add_edge(bnode, None, tasklet, 'bfrag', dace.Memlet.from_array(b_frag, bnode.desc(sdfg))) - state.add_edge(tasklet, 'cfrag', cnode, None, dace.Memlet.from_array(c_frag, cnode.desc(sdfg))) - - _include_mma(sdfg) - - # Function has no return value - return [] +def frag_fill(frag, fill): + # Define a tasklet with the appropriate input and output connectors. + # Then we can directly emit CUDA for the tasklet. + with dace.tasklet(dace.Language.CPP): + val << fill + out >> frag + """ + wmma::fill_fragment(out, val); + """ + +def wmma(a_frag, b_frag, c_frag): + # We do the same here as we did with frag_fill. Since c_frag is used + # as both an input and an output, we specify two separate variables + # to be passed to mma_sync and declare c_frag as an input to one and + # an output to the other. This ensures proper dataflow. 
+ with dace.tasklet(dace.Language.CPP): + afrag << a_frag + bfrag << b_frag + cfrag << c_frag + dfrag >> c_frag + """ + wmma::mma_sync(dfrag, afrag, bfrag, cfrag); + """ ############################################################################ From b97443e2782a161cf8fa6afc03f707c6e8bc54c0 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Sun, 30 Jul 2023 15:15:40 -0700 Subject: [PATCH 100/127] Add CPU_Persistent map schedule (#1330) --- dace/cli/daceprof.py | 3 +- dace/codegen/instrumentation/likwid.py | 2 +- dace/codegen/instrumentation/papi.py | 6 +- dace/codegen/targets/cpu.py | 140 +++++++++++------- dace/dtypes.py | 13 +- dace/sdfg/nodes.py | 6 +- .../transformation/interstate/sdfg_nesting.py | 3 +- tests/openmp_test.py | 104 +++++++++++++ 8 files changed, 206 insertions(+), 71 deletions(-) diff --git a/dace/cli/daceprof.py b/dace/cli/daceprof.py index 8a2f894910..b201d40661 100644 --- a/dace/cli/daceprof.py +++ b/dace/cli/daceprof.py @@ -227,7 +227,8 @@ def make_sequential(sdfg: dace.SDFG): for n, _ in sdfg.all_nodes_recursive(): if isinstance(n, dace.nodes.EntryNode): sched = getattr(n, 'schedule', False) - if sched == dace.ScheduleType.CPU_Multicore or sched == dace.ScheduleType.Default: + if sched in (dace.ScheduleType.CPU_Multicore, dace.ScheduleType.CPU_Persistent, + dace.ScheduleType.Default): n.schedule = dace.ScheduleType.Sequential registered.append(dace.hooks.register_sdfg_call_hook(before_hook=make_sequential)) diff --git a/dace/codegen/instrumentation/likwid.py b/dace/codegen/instrumentation/likwid.py index b14a8166af..e4f9c3154e 100644 --- a/dace/codegen/instrumentation/likwid.py +++ b/dace/codegen/instrumentation/likwid.py @@ -69,7 +69,7 @@ class LIKWIDInstrumentationCPU(InstrumentationProvider): the Likwid tool. 
""" - perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential] + perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential] def __init__(self): self._likwid_used = False diff --git a/dace/codegen/instrumentation/papi.py b/dace/codegen/instrumentation/papi.py index ee7f17308a..bc7163ea9b 100644 --- a/dace/codegen/instrumentation/papi.py +++ b/dace/codegen/instrumentation/papi.py @@ -43,7 +43,7 @@ class PAPIInstrumentation(InstrumentationProvider): _counters: Optional[Set[str]] = None - perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential] + perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential] def __init__(self): self._papi_used = False @@ -350,7 +350,7 @@ def on_consume_entry(self, sdfg, state, node, outer_stream, inner_stream): @staticmethod def perf_get_supersection_start_string(node, dfg, unified_id): - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): # Nested SuperSections are not supported. Therefore, we mark the # outermost section and disallow internal scopes from creating it. 
if not hasattr(node.map, '_can_be_supersection_start'): @@ -360,7 +360,7 @@ def perf_get_supersection_start_string(node, dfg, unified_id): for x in children: if not hasattr(x.map, '_can_be_supersection_start'): x.map._can_be_supersection_start = True - if x.map.schedule == dtypes.ScheduleType.CPU_Multicore: + if x.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): x.map._can_be_supersection_start = False elif x.map.schedule == dtypes.ScheduleType.Sequential: diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index eb7d232966..3b7b592775 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -18,7 +18,7 @@ from dace.sdfg import nodes, utils as sdutils from dace.sdfg import (ScopeSubgraphView, SDFG, scope_contains_scope, is_array_stream_view, NodeNotExpandedError, dynamic_map_inputs, local_transients) -from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga +from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga, is_in_scope from typing import Union from dace.codegen.targets import fpga @@ -79,7 +79,9 @@ def __init__(self, frame_codegen, sdfg): # Register dispatchers dispatcher.register_node_dispatcher(self) - dispatcher.register_map_dispatcher([dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential], self) + dispatcher.register_map_dispatcher( + [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential], + self) cpu_storage = [dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.Register] dispatcher.register_array_dispatcher(cpu_storage, self) @@ -222,7 +224,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if # `nodedesc` is a View and `dfg` is None. 
if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): - raise NotImplementedError("The declare_array method should only be used for variables " + raise NotImplementedError("The declare_array method should only be used for variables " "that must have their declaration and allocation separate.") name = node.data @@ -1714,66 +1716,87 @@ def _generate_MapEntry( # TODO: Refactor to generate_scope_preamble once a general code # generator (that CPU inherits from) is implemented - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: - map_header += "#pragma omp parallel for" - if node.map.omp_schedule != dtypes.OMPScheduleType.Default: - schedule = " schedule(" - if node.map.omp_schedule == dtypes.OMPScheduleType.Static: - schedule += "static" - elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic: - schedule += "dynamic" - elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided: - schedule += "guided" + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): + # OpenMP header + in_persistent = False + if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: + in_persistent = is_in_scope(sdfg, state_dfg, node, [dtypes.ScheduleType.CPU_Persistent]) + if in_persistent: + # If already in a #pragma omp parallel, no need to use it twice + map_header += "#pragma omp for" + # TODO(later): barriers and map_header += " nowait" else: - raise ValueError("Unknown OpenMP schedule type") - if node.map.omp_chunk_size > 0: - schedule += f", {node.map.omp_chunk_size}" - schedule += ")" - map_header += schedule - if node.map.omp_num_threads > 0: - map_header += f" num_threads({node.map.omp_num_threads})" - if node.map.collapse > 1: + map_header += "#pragma omp parallel for" + + elif node.map.schedule == dtypes.ScheduleType.CPU_Persistent: + map_header += "#pragma omp parallel" + + # OpenMP schedule properties + if not in_persistent: + if node.map.omp_schedule != dtypes.OMPScheduleType.Default: + schedule 
= " schedule(" + if node.map.omp_schedule == dtypes.OMPScheduleType.Static: + schedule += "static" + elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic: + schedule += "dynamic" + elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided: + schedule += "guided" + else: + raise ValueError("Unknown OpenMP schedule type") + if node.map.omp_chunk_size > 0: + schedule += f", {node.map.omp_chunk_size}" + schedule += ")" + map_header += schedule + + if node.map.omp_num_threads > 0: + map_header += f" num_threads({node.map.omp_num_threads})" + + # OpenMP nested loop properties + if node.map.schedule == dtypes.ScheduleType.CPU_Multicore and node.map.collapse > 1: map_header += ' collapse(%d)' % node.map.collapse - # Loop over outputs, add OpenMP reduction clauses to detected cases - # TODO: set up register outside loop - # exit_node = dfg.exit_node(node) - reduction_stmts = [] - # for outedge in dfg.in_edges(exit_node): - # if (isinstance(outedge.src, nodes.CodeNode) - # and outedge.data.wcr is not None): - # redt = operations.detect_reduction_type(outedge.data.wcr) - # if redt != dtypes.ReductionType.Custom: - # reduction_stmts.append('reduction({typ}:{var})'.format( - # typ=_REDUCTION_TYPE_TO_OPENMP[redt], - # var=outedge.src_conn)) - # reduced_variables.append(outedge) - - map_header += " %s\n" % ", ".join(reduction_stmts) - - # TODO: Explicit map unroller - if node.map.unroll: - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: - raise ValueError("A Multicore CPU map cannot be unrolled (" + node.map.label + ")") - constsize = all([not symbolic.issymbolic(v, sdfg.constants) for r in node.map.range for v in r]) + if node.map.unroll: + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): + raise ValueError("An OpenMP map cannot be unrolled (" + node.map.label + ")") - # Nested loops result.write(map_header, sdfg, state_id, node) - for i, r in enumerate(node.map.range): - # var = '__DACEMAP_%s_%d' % 
(node.map.label, i) - var = map_params[i] - begin, end, skip = r - if node.map.unroll: - result.write("#pragma unroll", sdfg, state_id, node) + if node.map.schedule == dtypes.ScheduleType.CPU_Persistent: + result.write('{\n', sdfg, state_id, node) + + # Find if bounds are used within the scope + scope = state_dfg.scope_subgraph(node, False, False) + fsyms = scope.free_symbols + # Include external edges + for n in scope.nodes(): + for e in state_dfg.all_edges(n): + fsyms |= e.data.free_symbols + fsyms = set(map(str, fsyms)) + + ntid_is_used = '__omp_num_threads' in fsyms + tid_is_used = node.map.params[0] in fsyms + if tid_is_used or ntid_is_used: + function_stream.write('#include <omp.h>', sdfg, state_id, node) + if tid_is_used: + result.write(f'auto {node.map.params[0]} = omp_get_thread_num();', sdfg, state_id, node) + if ntid_is_used: + result.write(f'auto __omp_num_threads = omp_get_num_threads();', sdfg, state_id, node) + else: + # Emit nested loops + for i, r in enumerate(node.map.range): + var = map_params[i] + begin, end, skip = r - result.write( - "for (auto %s = %s; %s < %s; %s += %s) {\n" % - (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)), - sdfg, - state_id, - node, - ) + if node.map.unroll: + result.write("#pragma unroll", sdfg, state_id, node) + + result.write( + "for (auto %s = %s; %s < %s; %s += %s) {\n" % + (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)), + sdfg, + state_id, + node, + ) callsite_stream.write(inner_stream.getvalue()) @@ -1803,8 +1826,11 @@ def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite self.generate_scope_postamble(sdfg, dfg, state_id, function_stream, outer_stream, callsite_stream) - for _ in map_node.map.range: + if map_node.map.schedule == dtypes.ScheduleType.CPU_Persistent: result.write("}", sdfg, state_id, node) + else: + for _ in map_node.map.range: + result.write("}", sdfg, state_id, node) result.write(outer_stream.getvalue()) diff --git
a/dace/dtypes.py b/dace/dtypes.py index dee2283f25..88ce583d08 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -61,7 +61,8 @@ class ScheduleType(aenum.AutoNumberEnum): Default = () #: Scope-default parallel schedule Sequential = () #: Sequential code (single-thread) MPI = () #: MPI processes - CPU_Multicore = () #: OpenMP + CPU_Multicore = () #: OpenMP parallel for loop + CPU_Persistent = () #: OpenMP parallel region Unrolled = () #: Unrolled code SVE_Map = () #: Arm SVE @@ -188,6 +189,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Sequential: StorageType.Register, ScheduleType.MPI: StorageType.CPU_Heap, ScheduleType.CPU_Multicore: StorageType.Register, + ScheduleType.CPU_Persistent: StorageType.CPU_Heap, ScheduleType.GPU_Default: StorageType.GPU_Global, ScheduleType.GPU_Persistent: StorageType.GPU_Global, ScheduleType.GPU_Device: StorageType.GPU_Shared, @@ -205,6 +207,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Sequential: ScheduleType.Sequential, ScheduleType.MPI: ScheduleType.CPU_Multicore, ScheduleType.CPU_Multicore: ScheduleType.Sequential, + ScheduleType.CPU_Persistent: ScheduleType.CPU_Multicore, ScheduleType.Unrolled: ScheduleType.CPU_Multicore, ScheduleType.GPU_Default: ScheduleType.GPU_Device, ScheduleType.GPU_Persistent: ScheduleType.GPU_Device, @@ -1432,7 +1435,7 @@ def can_access(schedule: ScheduleType, storage: StorageType): ScheduleType.GPU_Default, ]: return storage in [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.CPU_Pinned] - elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore]: + elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent]: return storage in [ StorageType.Default, StorageType.CPU_Heap, StorageType.CPU_Pinned, StorageType.CPU_ThreadLocal ] @@ -1460,19 +1463,19 @@ def can_allocate(storage: StorageType, schedule: ScheduleType): # Host-only allocation if storage in [StorageType.CPU_Heap, StorageType.CPU_Pinned, StorageType.CPU_ThreadLocal]: 
return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default ] # GPU-global memory if storage is StorageType.GPU_Global: return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default ] # FPGA-global memory if storage is StorageType.FPGA_Global: return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.FPGA_Device, + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.FPGA_Device, ScheduleType.GPU_Default ] diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 5c270153e1..d82cd5607d 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -833,17 +833,17 @@ class Map(object): default=0, desc="Number of OpenMP threads executing the Map", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_schedule = EnumProperty(dtype=dtypes.OMPScheduleType, default=dtypes.OMPScheduleType.Default, desc="OpenMP schedule {static, dynamic, guided}", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_chunk_size = Property(dtype=int, default=0, desc="OpenMP schedule chunk size", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, 
dtypes.ScheduleType.CPU_Persistent)) gpu_block_size = ListProperty(element_type=int, default=None, diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index 71d9e22aca..fc3ebfbdca 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -814,7 +814,8 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi # Not every schedule is supported if not permissive: if nsdfg.schedule not in (None, dtypes.ScheduleType.Default, dtypes.ScheduleType.Sequential, - dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.GPU_Device): + dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, + dtypes.ScheduleType.GPU_Device): return False candidates = InlineTransients._candidates(sdfg, graph, nsdfg) diff --git a/tests/openmp_test.py b/tests/openmp_test.py index 9f4535dfe4..d842b407fb 100644 --- a/tests/openmp_test.py +++ b/tests/openmp_test.py @@ -2,6 +2,7 @@ import dace from dace import dtypes, nodes from typing import Any, Dict, List, Union +import numpy as np N = dace.symbol("N") @@ -73,6 +74,109 @@ def test_omp_props(): assert ("#pragma omp parallel for schedule(guided, 5) num_threads(10)" in code) +def test_omp_parallel(): + + @dace.program + def tester(A: dace.float64[1]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[0] += 1 + + sdfg = tester.to_sdfg() + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 2 + + code = sdfg.generate_code()[0].clean_code + assert ("#pragma omp parallel num_threads(2)" in code) + + a = np.random.rand(1) + ref = a + 2 + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_parallel_for_in_parallel(): + """ + Tests that an OpenMP map inside a parallel section ends up without an + extra (semantically-incorrect) ``parallel`` statement. 
+ """ + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + for i in dace.map[0:20] @ dace.ScheduleType.CPU_Multicore: + A[i] += 1 + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "#pragma omp parallel" in code + assert "#pragma omp for" in code + + a = np.random.rand(20) + ref = a + 1 + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_get_tid(): + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[t] += 1 + + sdfg = tester.to_sdfg() + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 2 + + code = sdfg.generate_code()[0].clean_code + assert "#pragma omp parallel num_threads(2)" in code + assert "omp_get_thread_num()" in code + + a = np.random.rand(20) + ref = np.copy(a) + ref[:2] += 1 + + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_get_tid_elision(): + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[0] += 1 + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "omp_get_thread_num()" not in code + + +def test_omp_get_ntid(): + __omp_num_threads = dace.symbol('__omp_num_threads') + + @dace.program + def tester(A: dace.int64[1]): + for _ in dace.map[0:__omp_num_threads] @ dace.ScheduleType.CPU_Persistent: + A[0] = __omp_num_threads + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "omp_get_num_threads()" in code + + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 3 + + a = np.zeros([1], dtype=np.int64) + sdfg(a, __omp_num_threads=1) # Feed in some other value + assert np.allclose(a, 3) + + if __name__ == "__main__": test_lack_of_omp_props() test_omp_props() + test_omp_parallel() + test_omp_parallel_for_in_parallel() + test_omp_get_tid() + 
test_omp_get_tid_elision() + test_omp_get_ntid() From 7e9d197a5d13ac9b4ba411f3fe7c5a04a20d7327 Mon Sep 17 00:00:00 2001 From: Carl Johnsen Date: Tue, 1 Aug 2023 17:57:01 +0200 Subject: [PATCH 101/127] Updated hlslib to support Xilinx Vitis >2022.2 (#1340) --- dace/external/hlslib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/external/hlslib b/dace/external/hlslib index 1403cd016c..1b5b3aee5d 160000 --- a/dace/external/hlslib +++ b/dace/external/hlslib @@ -1 +1 @@ -Subproject commit 1403cd016ce63a9961eeb3899bea70c873a929ce +Subproject commit 1b5b3aee5dab19adcc443fa9a7cd45244bd246b1 From f39762ff1397dc8eaa6f7db08acd025c264b55af Mon Sep 17 00:00:00 2001 From: Tiziano De Matteis <5871117+TizianoDeMatteis@users.noreply.github.com> Date: Wed, 2 Aug 2023 13:00:04 +0200 Subject: [PATCH 102/127] Docs: mention FPGA backend tested with Intel Quartus PRO (#1335) --- doc/setup/installation.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/setup/installation.rst b/doc/setup/installation.rst index 6eb266dc7c..893f4a1688 100644 --- a/doc/setup/installation.rst +++ b/doc/setup/installation.rst @@ -21,7 +21,9 @@ however, it requires two more runtime dependencies to be installed and available **GPU**: For NVIDIA GPUs, the CUDA toolkit is also required, and AMD GPUs require HIP. :ref:`See more information on how to configure DaCe to use AMD GPUs `. You may (optionally) want to install `CuPy `_ for easy integration of GPU arrays in Python. -**FPGA**: Xilinx FPGAs require the Vitis suite and Intel FPGAs require the Intel FPGA SDK to be installed. +**FPGA**: Xilinx FPGAs require the Vitis suite and Intel FPGAs require the Intel FPGA SDK to be installed. +DaCe has been tested with Intel FPGA SDK for OpenCL Pro edition v18.1 and v19.1, targeting Arria 10 and Stratix 10 devices, and Xilinx Vitis HLS v2020.x, v2021.x targeting u250 and u280 devices. 
+ **Distributed Computing**: If using multiple nodes, MPI has to be installed and available. @@ -136,6 +138,12 @@ Common issues with the DaCe Python module * **Bug in DaCe**: If you suspect an issue happens within DaCe, see :ref:`debugging` for ways to pinpoint the source of the issue. + * **Intel FPGA libraries not found**: when targeting Intel FPGAs, the compilation process may fail due to missing OpenCL headers (CMake returns + a ``Could NOT find IntelFPGAOpenCL`` error). This is usually the case when the Intel OpenCL compiler does not return the right path to OpenCL host headers. + DaCe relies on ``hlslib`` for compiling FPGA programs, which in turn relies on Intel's compiler to derive the right include path. Please verify that + the include path returned by the Intel compiler (using the ``aocl compile-config`` command) points to a directory that actually contains the OpenCL headers (namely ``cl.hpp`` and + ``cl2.hpp`` files). If this is not the case, please locate them under the Intel Quartus installation folder, and symlink (or copy) them in the ``aocl`` returned path. + ..
_qa_vscode: Common issues with the Visual Studio Code extension From 30af8dabca952d9c4307738c98bd0bb0669f6af9 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Sun, 30 Jul 2023 15:15:40 -0700 Subject: [PATCH 103/127] Add CPU_Persistent map schedule (#1330) --- dace/cli/daceprof.py | 3 +- dace/codegen/instrumentation/likwid.py | 2 +- dace/codegen/instrumentation/papi.py | 6 +- dace/codegen/targets/cpu.py | 140 +++++++++++------- dace/dtypes.py | 13 +- dace/sdfg/nodes.py | 6 +- .../transformation/interstate/sdfg_nesting.py | 3 +- tests/openmp_test.py | 104 +++++++++++++ 8 files changed, 206 insertions(+), 71 deletions(-) diff --git a/dace/cli/daceprof.py b/dace/cli/daceprof.py index 8a2f894910..b201d40661 100644 --- a/dace/cli/daceprof.py +++ b/dace/cli/daceprof.py @@ -227,7 +227,8 @@ def make_sequential(sdfg: dace.SDFG): for n, _ in sdfg.all_nodes_recursive(): if isinstance(n, dace.nodes.EntryNode): sched = getattr(n, 'schedule', False) - if sched == dace.ScheduleType.CPU_Multicore or sched == dace.ScheduleType.Default: + if sched in (dace.ScheduleType.CPU_Multicore, dace.ScheduleType.CPU_Persistent, + dace.ScheduleType.Default): n.schedule = dace.ScheduleType.Sequential registered.append(dace.hooks.register_sdfg_call_hook(before_hook=make_sequential)) diff --git a/dace/codegen/instrumentation/likwid.py b/dace/codegen/instrumentation/likwid.py index b14a8166af..e4f9c3154e 100644 --- a/dace/codegen/instrumentation/likwid.py +++ b/dace/codegen/instrumentation/likwid.py @@ -69,7 +69,7 @@ class LIKWIDInstrumentationCPU(InstrumentationProvider): the Likwid tool. 
""" - perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential] + perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential] def __init__(self): self._likwid_used = False diff --git a/dace/codegen/instrumentation/papi.py b/dace/codegen/instrumentation/papi.py index ee7f17308a..bc7163ea9b 100644 --- a/dace/codegen/instrumentation/papi.py +++ b/dace/codegen/instrumentation/papi.py @@ -43,7 +43,7 @@ class PAPIInstrumentation(InstrumentationProvider): _counters: Optional[Set[str]] = None - perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential] + perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential] def __init__(self): self._papi_used = False @@ -350,7 +350,7 @@ def on_consume_entry(self, sdfg, state, node, outer_stream, inner_stream): @staticmethod def perf_get_supersection_start_string(node, dfg, unified_id): - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): # Nested SuperSections are not supported. Therefore, we mark the # outermost section and disallow internal scopes from creating it. 
if not hasattr(node.map, '_can_be_supersection_start'): @@ -360,7 +360,7 @@ def perf_get_supersection_start_string(node, dfg, unified_id): for x in children: if not hasattr(x.map, '_can_be_supersection_start'): x.map._can_be_supersection_start = True - if x.map.schedule == dtypes.ScheduleType.CPU_Multicore: + if x.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): x.map._can_be_supersection_start = False elif x.map.schedule == dtypes.ScheduleType.Sequential: diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index eb7d232966..3b7b592775 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -18,7 +18,7 @@ from dace.sdfg import nodes, utils as sdutils from dace.sdfg import (ScopeSubgraphView, SDFG, scope_contains_scope, is_array_stream_view, NodeNotExpandedError, dynamic_map_inputs, local_transients) -from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga +from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga, is_in_scope from typing import Union from dace.codegen.targets import fpga @@ -79,7 +79,9 @@ def __init__(self, frame_codegen, sdfg): # Register dispatchers dispatcher.register_node_dispatcher(self) - dispatcher.register_map_dispatcher([dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential], self) + dispatcher.register_map_dispatcher( + [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential], + self) cpu_storage = [dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.Register] dispatcher.register_array_dispatcher(cpu_storage, self) @@ -222,7 +224,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if # `nodedesc` is a View and `dfg` is None. 
if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): - raise NotImplementedError("The declare_array method should only be used for variables " + raise NotImplementedError("The declare_array method should only be used for variables " "that must have their declaration and allocation separate.") name = node.data @@ -1714,66 +1716,87 @@ def _generate_MapEntry( # TODO: Refactor to generate_scope_preamble once a general code # generator (that CPU inherits from) is implemented - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: - map_header += "#pragma omp parallel for" - if node.map.omp_schedule != dtypes.OMPScheduleType.Default: - schedule = " schedule(" - if node.map.omp_schedule == dtypes.OMPScheduleType.Static: - schedule += "static" - elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic: - schedule += "dynamic" - elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided: - schedule += "guided" + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): + # OpenMP header + in_persistent = False + if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: + in_persistent = is_in_scope(sdfg, state_dfg, node, [dtypes.ScheduleType.CPU_Persistent]) + if in_persistent: + # If already in a #pragma omp parallel, no need to use it twice + map_header += "#pragma omp for" + # TODO(later): barriers and map_header += " nowait" else: - raise ValueError("Unknown OpenMP schedule type") - if node.map.omp_chunk_size > 0: - schedule += f", {node.map.omp_chunk_size}" - schedule += ")" - map_header += schedule - if node.map.omp_num_threads > 0: - map_header += f" num_threads({node.map.omp_num_threads})" - if node.map.collapse > 1: + map_header += "#pragma omp parallel for" + + elif node.map.schedule == dtypes.ScheduleType.CPU_Persistent: + map_header += "#pragma omp parallel" + + # OpenMP schedule properties + if not in_persistent: + if node.map.omp_schedule != dtypes.OMPScheduleType.Default: + schedule 
= " schedule(" + if node.map.omp_schedule == dtypes.OMPScheduleType.Static: + schedule += "static" + elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic: + schedule += "dynamic" + elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided: + schedule += "guided" + else: + raise ValueError("Unknown OpenMP schedule type") + if node.map.omp_chunk_size > 0: + schedule += f", {node.map.omp_chunk_size}" + schedule += ")" + map_header += schedule + + if node.map.omp_num_threads > 0: + map_header += f" num_threads({node.map.omp_num_threads})" + + # OpenMP nested loop properties + if node.map.schedule == dtypes.ScheduleType.CPU_Multicore and node.map.collapse > 1: map_header += ' collapse(%d)' % node.map.collapse - # Loop over outputs, add OpenMP reduction clauses to detected cases - # TODO: set up register outside loop - # exit_node = dfg.exit_node(node) - reduction_stmts = [] - # for outedge in dfg.in_edges(exit_node): - # if (isinstance(outedge.src, nodes.CodeNode) - # and outedge.data.wcr is not None): - # redt = operations.detect_reduction_type(outedge.data.wcr) - # if redt != dtypes.ReductionType.Custom: - # reduction_stmts.append('reduction({typ}:{var})'.format( - # typ=_REDUCTION_TYPE_TO_OPENMP[redt], - # var=outedge.src_conn)) - # reduced_variables.append(outedge) - - map_header += " %s\n" % ", ".join(reduction_stmts) - - # TODO: Explicit map unroller - if node.map.unroll: - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: - raise ValueError("A Multicore CPU map cannot be unrolled (" + node.map.label + ")") - constsize = all([not symbolic.issymbolic(v, sdfg.constants) for r in node.map.range for v in r]) + if node.map.unroll: + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): + raise ValueError("An OpenMP map cannot be unrolled (" + node.map.label + ")") - # Nested loops result.write(map_header, sdfg, state_id, node) - for i, r in enumerate(node.map.range): - # var = '__DACEMAP_%s_%d' % 
(node.map.label, i) - var = map_params[i] - begin, end, skip = r - if node.map.unroll: - result.write("#pragma unroll", sdfg, state_id, node) + if node.map.schedule == dtypes.ScheduleType.CPU_Persistent: + result.write('{\n', sdfg, state_id, node) + + # Find if bounds are used within the scope + scope = state_dfg.scope_subgraph(node, False, False) + fsyms = scope.free_symbols + # Include external edges + for n in scope.nodes(): + for e in state_dfg.all_edges(n): + fsyms |= e.data.free_symbols + fsyms = set(map(str, fsyms)) + + ntid_is_used = '__omp_num_threads' in fsyms + tid_is_used = node.map.params[0] in fsyms + if tid_is_used or ntid_is_used: + function_stream.write('#include <omp.h>', sdfg, state_id, node) + if tid_is_used: + result.write(f'auto {node.map.params[0]} = omp_get_thread_num();', sdfg, state_id, node) + if ntid_is_used: + result.write(f'auto __omp_num_threads = omp_get_num_threads();', sdfg, state_id, node) + else: + # Emit nested loops + for i, r in enumerate(node.map.range): + var = map_params[i] + begin, end, skip = r - result.write( - "for (auto %s = %s; %s < %s; %s += %s) {\n" % - (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)), - sdfg, - state_id, - node, - ) + if node.map.unroll: + result.write("#pragma unroll", sdfg, state_id, node) + + result.write( + "for (auto %s = %s; %s < %s; %s += %s) {\n" % + (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)), + sdfg, + state_id, + node, + ) callsite_stream.write(inner_stream.getvalue()) @@ -1803,8 +1826,11 @@ def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite self.generate_scope_postamble(sdfg, dfg, state_id, function_stream, outer_stream, callsite_stream) - for _ in map_node.map.range: + if map_node.map.schedule == dtypes.ScheduleType.CPU_Persistent: result.write("}", sdfg, state_id, node) + else: + for _ in map_node.map.range: + result.write("}", sdfg, state_id, node) result.write(outer_stream.getvalue()) diff --git
a/dace/dtypes.py b/dace/dtypes.py index dee2283f25..88ce583d08 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -61,7 +61,8 @@ class ScheduleType(aenum.AutoNumberEnum): Default = () #: Scope-default parallel schedule Sequential = () #: Sequential code (single-thread) MPI = () #: MPI processes - CPU_Multicore = () #: OpenMP + CPU_Multicore = () #: OpenMP parallel for loop + CPU_Persistent = () #: OpenMP parallel region Unrolled = () #: Unrolled code SVE_Map = () #: Arm SVE @@ -188,6 +189,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Sequential: StorageType.Register, ScheduleType.MPI: StorageType.CPU_Heap, ScheduleType.CPU_Multicore: StorageType.Register, + ScheduleType.CPU_Persistent: StorageType.CPU_Heap, ScheduleType.GPU_Default: StorageType.GPU_Global, ScheduleType.GPU_Persistent: StorageType.GPU_Global, ScheduleType.GPU_Device: StorageType.GPU_Shared, @@ -205,6 +207,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Sequential: ScheduleType.Sequential, ScheduleType.MPI: ScheduleType.CPU_Multicore, ScheduleType.CPU_Multicore: ScheduleType.Sequential, + ScheduleType.CPU_Persistent: ScheduleType.CPU_Multicore, ScheduleType.Unrolled: ScheduleType.CPU_Multicore, ScheduleType.GPU_Default: ScheduleType.GPU_Device, ScheduleType.GPU_Persistent: ScheduleType.GPU_Device, @@ -1432,7 +1435,7 @@ def can_access(schedule: ScheduleType, storage: StorageType): ScheduleType.GPU_Default, ]: return storage in [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.CPU_Pinned] - elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore]: + elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent]: return storage in [ StorageType.Default, StorageType.CPU_Heap, StorageType.CPU_Pinned, StorageType.CPU_ThreadLocal ] @@ -1460,19 +1463,19 @@ def can_allocate(storage: StorageType, schedule: ScheduleType): # Host-only allocation if storage in [StorageType.CPU_Heap, StorageType.CPU_Pinned, StorageType.CPU_ThreadLocal]: 
return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default ] # GPU-global memory if storage is StorageType.GPU_Global: return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default ] # FPGA-global memory if storage is StorageType.FPGA_Global: return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.FPGA_Device, + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.FPGA_Device, ScheduleType.GPU_Default ] diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 6ba84d919e..bd384b6736 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -851,17 +851,17 @@ class Map(object): default=0, desc="Number of OpenMP threads executing the Map", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_schedule = EnumProperty(dtype=dtypes.OMPScheduleType, default=dtypes.OMPScheduleType.Default, desc="OpenMP schedule {static, dynamic, guided}", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_chunk_size = Property(dtype=int, default=0, desc="OpenMP schedule chunk size", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, 
dtypes.ScheduleType.CPU_Persistent)) gpu_block_size = ListProperty(element_type=int, default=None, diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index 71d9e22aca..fc3ebfbdca 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -814,7 +814,8 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi # Not every schedule is supported if not permissive: if nsdfg.schedule not in (None, dtypes.ScheduleType.Default, dtypes.ScheduleType.Sequential, - dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.GPU_Device): + dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, + dtypes.ScheduleType.GPU_Device): return False candidates = InlineTransients._candidates(sdfg, graph, nsdfg) diff --git a/tests/openmp_test.py b/tests/openmp_test.py index 9f4535dfe4..d842b407fb 100644 --- a/tests/openmp_test.py +++ b/tests/openmp_test.py @@ -2,6 +2,7 @@ import dace from dace import dtypes, nodes from typing import Any, Dict, List, Union +import numpy as np N = dace.symbol("N") @@ -73,6 +74,109 @@ def test_omp_props(): assert ("#pragma omp parallel for schedule(guided, 5) num_threads(10)" in code) +def test_omp_parallel(): + + @dace.program + def tester(A: dace.float64[1]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[0] += 1 + + sdfg = tester.to_sdfg() + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 2 + + code = sdfg.generate_code()[0].clean_code + assert ("#pragma omp parallel num_threads(2)" in code) + + a = np.random.rand(1) + ref = a + 2 + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_parallel_for_in_parallel(): + """ + Tests that an OpenMP map inside a parallel section ends up without an + extra (semantically-incorrect) ``parallel`` statement. 
+ """ + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + for i in dace.map[0:20] @ dace.ScheduleType.CPU_Multicore: + A[i] += 1 + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "#pragma omp parallel" in code + assert "#pragma omp for" in code + + a = np.random.rand(20) + ref = a + 1 + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_get_tid(): + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[t] += 1 + + sdfg = tester.to_sdfg() + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 2 + + code = sdfg.generate_code()[0].clean_code + assert "#pragma omp parallel num_threads(2)" in code + assert "omp_get_thread_num()" in code + + a = np.random.rand(20) + ref = np.copy(a) + ref[:2] += 1 + + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_get_tid_elision(): + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[0] += 1 + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "omp_get_thread_num()" not in code + + +def test_omp_get_ntid(): + __omp_num_threads = dace.symbol('__omp_num_threads') + + @dace.program + def tester(A: dace.int64[1]): + for _ in dace.map[0:__omp_num_threads] @ dace.ScheduleType.CPU_Persistent: + A[0] = __omp_num_threads + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "omp_get_num_threads()" in code + + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 3 + + a = np.zeros([1], dtype=np.int64) + sdfg(a, __omp_num_threads=1) # Feed in some other value + assert np.allclose(a, 3) + + if __name__ == "__main__": test_lack_of_omp_props() test_omp_props() + test_omp_parallel() + test_omp_parallel_for_in_parallel() + test_omp_get_tid() + 
test_omp_get_tid_elision() + test_omp_get_ntid() From 8ace3676e326afd3b0081a032d6483d3f07f0982 Mon Sep 17 00:00:00 2001 From: Carl Johnsen Date: Tue, 1 Aug 2023 17:57:01 +0200 Subject: [PATCH 104/127] Updated hlslib to support Xilinx Vitis >2022.2 (#1340) --- dace/external/hlslib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/external/hlslib b/dace/external/hlslib index 1403cd016c..1b5b3aee5d 160000 --- a/dace/external/hlslib +++ b/dace/external/hlslib @@ -1 +1 @@ -Subproject commit 1403cd016ce63a9961eeb3899bea70c873a929ce +Subproject commit 1b5b3aee5dab19adcc443fa9a7cd45244bd246b1 From 2f5a00519e88fa6dc2705d1564dd85cbfea7d1ff Mon Sep 17 00:00:00 2001 From: Tiziano De Matteis <5871117+TizianoDeMatteis@users.noreply.github.com> Date: Wed, 2 Aug 2023 13:00:04 +0200 Subject: [PATCH 105/127] Docs: mention FPGA backend tested with Intel Quartus PRO (#1335) --- doc/setup/installation.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/setup/installation.rst b/doc/setup/installation.rst index 6eb266dc7c..893f4a1688 100644 --- a/doc/setup/installation.rst +++ b/doc/setup/installation.rst @@ -21,7 +21,9 @@ however, it requires two more runtime dependencies to be installed and available **GPU**: For NVIDIA GPUs, the CUDA toolkit is also required, and AMD GPUs require HIP. :ref:`See more information on how to configure DaCe to use AMD GPUs `. You may (optionally) want to install `CuPy `_ for easy integration of GPU arrays in Python. -**FPGA**: Xilinx FPGAs require the Vitis suite and Intel FPGAs require the Intel FPGA SDK to be installed. +**FPGA**: Xilinx FPGAs require the Vitis suite and Intel FPGAs require the Intel FPGA SDK to be installed. +DaCe has been tested with Intel FPGA SDK for OpenCL Pro edition v18.1 and v19.1, targeting Arria 10 and Stratix 10 devices, and Xilinx Vitis HLS v2020.x, v2021.x targeting u250 and u280 devices. 
+ **Distributed Computing**: If using multiple nodes, MPI has to be installed and available. @@ -136,6 +138,12 @@ Common issues with the DaCe Python module * **Bug in DaCe**: If you suspect an issue happens within DaCe, see :ref:`debugging` for ways to pinpoint the source of the issue. + * **Intel FPGA libraries not found**: when targeting Intel FPGAs, the compilation process may fail due to missing OpenCL headers (CMake returns + a ``Could NOT find IntelFPGAOpenCL`` error). This is usually the case when the Intel OpenCL compiler does not return the right path to OpenCL host headers. + DaCe relies on ``hlslib`` for compiling FPGA programs, which in turn relies on Intel's compiler to derive the right include path. Please verify that + the include path returned by the Intel compiler (using the ``aocl compile-config`` command) points to a directory that actually contains the OpenCL headers (namely the ``cl.hpp`` and + ``cl2.hpp`` files). If this is not the case, please locate them under the Intel Quartus installation folder, and symlink (or copy) them into the path returned by ``aocl``. + .. _qa_vscode: Common issues with the Visual Studio Code extension From d68c4ffe78c182f082dd04c70967b6f8d8ba345f Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 2 Aug 2023 16:12:13 +0200 Subject: [PATCH 106/127] Added docstrings. --- dace/sdfg/utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 7eef600180..d08518b10c 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1800,6 +1800,14 @@ def get_thread_local_data(sdfg: SDFG) -> List[str]: def get_global_memlet_path_src(sdfg: SDFG, state: SDFGState, edge: MultiConnectorEdge) -> nd.Node: + """ + Finds the global source node of an edge/memlet path, crossing nested SDFG scopes. + + :param sdfg: The SDFG containing the edge. + :param state: The state containing the edge. + :param edge: The edge to find the global source node for. 
+ :return: The global source node of the edge. + """ src = state.memlet_path(edge)[0].src if isinstance(src, nd.AccessNode) and not sdfg.arrays[src.data].transient and sdfg.parent is not None: psdfg = sdfg.parent_sdfg @@ -1813,6 +1821,14 @@ def get_global_memlet_path_src(sdfg: SDFG, state: SDFGState, edge: MultiConnecto def get_global_memlet_path_dst(sdfg: SDFG, state: SDFGState, edge: MultiConnectorEdge) -> nd.Node: + """ + Finds the global destination node of an edge/memlet path, crossing nested SDFG scopes. + + :param sdfg: The SDFG containing the edge. + :param state: The state containing the edge. + :param edge: The edge to find the global destination node for. + :return: The global destination node of the edge. + """ dst = state.memlet_path(edge)[-1].dst if isinstance(dst, nd.AccessNode) and not sdfg.arrays[dst.data].transient and sdfg.parent is not None: psdfg = sdfg.parent_sdfg From 9ff109293443416b7591165e5b4dd29ca0e8befa Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 2 Aug 2023 16:28:46 +0200 Subject: [PATCH 107/127] Disabled skip for Scalars. --- dace/sdfg/nodes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index d82cd5607d..1c5cdcc0af 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -628,8 +628,8 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context for dname, desc in self.sdfg.arrays.items(): # TODO(later): Disallow scalars without access nodes (so that this # check passes for them too). 
- if isinstance(desc, data.Scalar): - continue + # if isinstance(desc, data.Scalar): + # continue if not desc.transient and dname not in connectors: raise NameError('Data descriptor "%s" not found in nested SDFG connectors' % dname) if dname in connectors and desc.transient: From 67b839f80849b0231a7abbcf00190c95eb9a3a48 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 2 Aug 2023 18:00:52 +0200 Subject: [PATCH 108/127] Fixed typo. --- tests/transformations/refine_nested_access_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/transformations/refine_nested_access_test.py b/tests/transformations/refine_nested_access_test.py index d6d0921da4..5343240df5 100644 --- a/tests/transformations/refine_nested_access_test.py +++ b/tests/transformations/refine_nested_access_test.py @@ -98,7 +98,7 @@ def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5], select: dace.bool[5, 5] assert np.allclose(B, lower.T + lower - diag) -def test_free_sybmols_only_by_indices(): +def test_free_symbols_only_by_indices(): i = dace.symbol('i') idx_a = dace.symbol('idx_a') idx_b = dace.symbol('idx_b') @@ -132,4 +132,4 @@ def inner_sdfg(A: dace.int32[5], B: dace.int32[5, 5], idx_a: int, idx_b: int): if __name__ == '__main__': test_refine_dataflow() test_refine_interstate() - test_free_sybmols_only_by_indices() + test_free_symbols_only_by_indices() From f350b46b35e70cc406c7bccd40706d286ca71714 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 2 Aug 2023 18:15:10 +0200 Subject: [PATCH 109/127] Fixed test. 
--- .../refine_nested_access_test.py | 37 ++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/tests/transformations/refine_nested_access_test.py b/tests/transformations/refine_nested_access_test.py index 5343240df5..d9fb9a7392 100644 --- a/tests/transformations/refine_nested_access_test.py +++ b/tests/transformations/refine_nested_access_test.py @@ -100,11 +100,11 @@ def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5], select: dace.bool[5, 5] def test_free_symbols_only_by_indices(): i = dace.symbol('i') - idx_a = dace.symbol('idx_a') - idx_b = dace.symbol('idx_b') sdfg = dace.SDFG('refine_free_symbols_only_by_indices') sdfg.add_array('A', [5], dace.int32) sdfg.add_array('B', [5, 5], dace.int32) + sdfg.add_scalar('idx_a', dace.int64) + sdfg.add_scalar('idx_b', dace.int64) @dace.program def inner_sdfg(A: dace.int32[5], B: dace.int32[5, 5], idx_a: int, idx_b: int): @@ -116,10 +116,22 @@ def inner_sdfg(A: dace.int32[5], B: dace.int32[5, 5], idx_a: int, idx_b: int): state = sdfg.add_state() A = state.add_access('A') B = state.add_access('B') + ia = state.add_access('idx_a') + ib = state.add_access('idx_b') map_entry, map_exit = state.add_map('map', dict(i='0:5')) - nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(simplify=False), sdfg, {'A'}, {'B'}, {'i': 'i'}) - state.add_memlet_path(A, map_entry, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) - state.add_memlet_path(nsdfg, map_exit, B, src_conn='B', memlet=dace.Memlet.from_array('B', sdfg.arrays['B'])) + nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(simplify=False), sdfg, {'A', 'idx_a', 'idx_b'}, {'B'}, {'i': 'i'}) + state.add_memlet_path(A, map_entry, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) + state.add_memlet_path(nsdfg, map_exit, B, src_conn='B', memlet=dace.Memlet.from_array('B', sdfg.arrays['B'])) + state.add_memlet_path(ia, + map_entry, + nsdfg, + dst_conn='idx_a', + memlet=dace.Memlet.from_array('idx_a', 
sdfg.arrays['idx_a'])) + state.add_memlet_path(ib, + map_entry, + nsdfg, + dst_conn='idx_b', + memlet=dace.Memlet.from_array('idx_b', sdfg.arrays['idx_b'])) num = sdfg.apply_transformations_repeated(RefineNestedAccess) assert num == 1 @@ -128,6 +140,21 @@ def inner_sdfg(A: dace.int32[5], B: dace.int32[5, 5], idx_a: int, idx_b: int): edge = state.in_edges(map_exit)[0] assert edge.data.subset == dace.subsets.Range([(i, i, 1), (0, 4, 1)]) + A = np.array([0, 1, 0, 1, 0], dtype=np.int32) + ref = np.zeros((5, 5), dtype=np.int32) + val = np.zeros((5, 5), dtype=np.int32) + ia = 3 + ib = 2 + + for i in range(5): + if A[i] > 0.5: + ref[i, ia] = 1 + else: + ref[i, ib] = 0 + sdfg(A=A, B=val, idx_a=ia, idx_b=ib) + + assert np.allclose(ref, val) + if __name__ == '__main__': test_refine_dataflow() From 1d3db91f7104e51dd90ce41da3f84a0140ab69e4 Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 3 Aug 2023 08:38:15 +0200 Subject: [PATCH 110/127] Update dependency --- dace/external/hlslib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/external/hlslib b/dace/external/hlslib index 1b5b3aee5d..1403cd016c 160000 --- a/dace/external/hlslib +++ b/dace/external/hlslib @@ -1 +1 @@ -Subproject commit 1b5b3aee5dab19adcc443fa9a7cd45244bd246b1 +Subproject commit 1403cd016ce63a9961eeb3899bea70c873a929ce From 350cff4ef27bfdf9d859e1f4fbee46888dc34c61 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 3 Aug 2023 04:35:50 -0700 Subject: [PATCH 111/127] Remove unused global data descriptor shapes from arguments (#1338) --- dace/codegen/targets/framecode.py | 5 +- dace/data.py | 48 +- dace/memlet.py | 35 +- dace/sdfg/nodes.py | 35 +- dace/sdfg/sdfg.py | 60 +- dace/sdfg/state.py | 43 +- tests/codegen/symbol_arguments_test.py | 54 ++ tests/transformations/mapfission_test.py | 1023 +++++++++++----------- 8 files changed, 747 insertions(+), 556 deletions(-) create mode 100644 tests/codegen/symbol_arguments_test.py diff --git a/dace/codegen/targets/framecode.py 
b/dace/codegen/targets/framecode.py index 6f302c11ba..56419b9701 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -82,7 +82,10 @@ def free_symbols(self, obj: Any): k = id(obj) if k in self.fsyms: return self.fsyms[k] - result = obj.free_symbols + if hasattr(obj, 'used_symbols'): + result = obj.used_symbols(all_symbols=False) + else: + result = obj.free_symbols self.fsyms[k] = result return result diff --git a/dace/data.py b/dace/data.py index 2fc5f334c6..d492d06258 100644 --- a/dace/data.py +++ b/dace/data.py @@ -243,14 +243,26 @@ def as_arg(self, with_types=True, for_call=False, name=None): """Returns a string for a C++ function signature (e.g., `int *A`). """ raise NotImplementedError + def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]: + """ + Returns a set of symbols that are used by this data descriptor. + + :param all_symbols: Include not-strictly-free symbols that are used by this data descriptor, + e.g., shape and size of a global array. + :return: A set of symbols that are used by this data descriptor. NOTE: The results are symbolic + rather than a set of strings. + """ + result = set() + if self.transient or all_symbols: + for s in self.shape: + if isinstance(s, sp.Basic): + result |= set(s.free_symbols) + return result + @property def free_symbols(self) -> Set[symbolic.SymbolicType]: """ Returns a set of undefined symbols in this data descriptor. 
""" - result = set() - for s in self.shape: - if isinstance(s, sp.Basic): - result |= set(s.free_symbols) - return result + return self.used_symbols(all_symbols=True) def __repr__(self): return 'Abstract Data Container, DO NOT USE' @@ -689,20 +701,23 @@ def as_arg(self, with_types=True, for_call=False, name=None): def sizes(self): return [d.name if isinstance(d, symbolic.symbol) else str(d) for d in self.shape] - @property - def free_symbols(self): - result = super().free_symbols + def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]: + result = super().used_symbols(all_symbols) for s in self.strides: if isinstance(s, sp.Expr): result |= set(s.free_symbols) - if isinstance(self.total_size, sp.Expr): - result |= set(self.total_size.free_symbols) for o in self.offset: if isinstance(o, sp.Expr): result |= set(o.free_symbols) - + if self.transient or all_symbols: + if isinstance(self.total_size, sp.Expr): + result |= set(self.total_size.free_symbols) return result + @property + def free_symbols(self): + return self.used_symbols(all_symbols=True) + def _set_shape_dependent_properties(self, shape, strides, total_size, offset): """ Used to set properties which depend on the shape of the array @@ -890,10 +905,9 @@ def covers_range(self, rng): return True - @property - def free_symbols(self): - result = super().free_symbols - if isinstance(self.buffer_size, sp.Expr): + def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]: + result = super().used_symbols(all_symbols) + if (self.transient or all_symbols) and isinstance(self.buffer_size, sp.Expr): result |= set(self.buffer_size.free_symbols) for o in self.offset: if isinstance(o, sp.Expr): @@ -901,6 +915,10 @@ def free_symbols(self): return result + @property + def free_symbols(self): + return self.used_symbols(all_symbols=True) + @make_properties class View(Array): diff --git a/dace/memlet.py b/dace/memlet.py index 35b689381d..74a1320a3b 100644 --- a/dace/memlet.py +++ b/dace/memlet.py @@ 
-17,6 +17,7 @@ if TYPE_CHECKING: import dace.sdfg.graph + @make_properties class Memlet(object): """ Data movement object. Represents the data, the subset moved, and the @@ -176,15 +177,16 @@ def to_json(self): @staticmethod def from_json(json_obj, context=None): ret = Memlet() - dace.serialize.set_properties_from_json(ret, - json_obj, - context=context, - ignore_properties={'src_subset', 'dst_subset', 'num_accesses', 'is_data_src'}) - + dace.serialize.set_properties_from_json( + ret, + json_obj, + context=context, + ignore_properties={'src_subset', 'dst_subset', 'num_accesses', 'is_data_src'}) + # Allow serialized memlet to override src/dst_subset to disambiguate self-copies if 'is_data_src' in json_obj['attributes']: ret._is_data_src = json_obj['attributes']['is_data_src'] - + if context: ret._sdfg = context['sdfg'] ret._state = context['sdfg_state'] @@ -510,18 +512,30 @@ def validate(self, sdfg, state): if self.data is not None and self.data not in sdfg.arrays: raise KeyError('Array "%s" not found in SDFG' % self.data) - @property - def free_symbols(self) -> Set[str]: - """ Returns a set of symbols used in this edge's properties. """ + def used_symbols(self, all_symbols: bool) -> Set[str]: + """ + Returns a set of symbols used in this edge's properties. + + :param all_symbols: If False, only returns the set of symbols that will be used + in the generated code and are needed as arguments. + """ # Symbolic properties are in volume, and the two subsets result = set() - result |= set(map(str, self.volume.free_symbols)) + if all_symbols: + result |= set(map(str, self.volume.free_symbols)) if self.src_subset: result |= self.src_subset.free_symbols + if self.dst_subset: result |= self.dst_subset.free_symbols + return result + @property + def free_symbols(self) -> Set[str]: + """ Returns a set of symbols used in this edge's properties. 
""" + return self.used_symbols(all_symbols=True) + def get_free_symbols_by_indices(self, indices_src: List[int], indices_dst: List[int]) -> Set[str]: """ Returns set of free symbols used in this edges properties but only taking certain indices of the src and dst @@ -640,6 +654,7 @@ class MemletTree(object): all siblings of the same edge and their children, for instance if multiple inputs from the same access node are used. """ + def __init__(self, edge: 'dace.sdfg.graph.MultiConnectorEdge[Memlet]', downwards: bool = True, diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index bd384b6736..378ee7be3e 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -580,12 +580,22 @@ def from_json(json_obj, context=None): return ret + def used_symbols(self, all_symbols: bool) -> Set[str]: + free_syms = set().union(*(map(str, + pystr_to_symbolic(v).free_symbols) for v in self.symbol_mapping.values()), + *(map(str, + pystr_to_symbolic(v).free_symbols) for v in self.location.values())) + + # Filter out unused internal symbols from symbol mapping + if not all_symbols: + internally_used_symbols = self.sdfg.used_symbols(all_symbols=False) + free_syms &= internally_used_symbols + + return free_syms + @property def free_symbols(self) -> Set[str]: - return set().union(*(map(str, - pystr_to_symbolic(v).free_symbols) for v in self.symbol_mapping.values()), - *(map(str, - pystr_to_symbolic(v).free_symbols) for v in self.location.values())) + return self.used_symbols(all_symbols=True) def infer_connector_types(self, sdfg, state): # Avoid import loop @@ -673,6 +683,7 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context # Scope entry class class EntryNode(Node): """ A type of node that opens a scope (e.g., Map or Consume). """ + def validate(self, sdfg, state): self.map.validate(sdfg, state, self) @@ -683,6 +694,7 @@ def validate(self, sdfg, state): # Scope exit class class ExitNode(Node): """ A type of node that closes a scope (e.g., Map or Consume). 
""" + def validate(self, sdfg, state): self.map.validate(sdfg, state, self) @@ -696,6 +708,7 @@ class MapEntry(EntryNode): :see: Map """ + def __init__(self, map: 'Map', dynamic_inputs=None): super(MapEntry, self).__init__(dynamic_inputs or set()) if map is None: @@ -772,6 +785,7 @@ class MapExit(ExitNode): :see: Map """ + def __init__(self, map: 'Map'): super(MapExit, self).__init__() if map is None: @@ -851,17 +865,20 @@ class Map(object): default=0, desc="Number of OpenMP threads executing the Map", optional=True, - optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) + optional_condition=lambda m: m.schedule in + (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_schedule = EnumProperty(dtype=dtypes.OMPScheduleType, default=dtypes.OMPScheduleType.Default, desc="OpenMP schedule {static, dynamic, guided}", optional=True, - optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) + optional_condition=lambda m: m.schedule in + (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_chunk_size = Property(dtype=int, default=0, desc="OpenMP schedule chunk size", optional=True, - optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) + optional_condition=lambda m: m.schedule in + (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) gpu_block_size = ListProperty(element_type=int, default=None, @@ -928,6 +945,7 @@ class ConsumeEntry(EntryNode): :see: Consume """ + def __init__(self, consume: 'Consume', dynamic_inputs=None): super(ConsumeEntry, self).__init__(dynamic_inputs or set()) if consume is None: @@ -1006,6 +1024,7 @@ class ConsumeExit(ExitNode): :see: Consume """ + def __init__(self, consume: 'Consume'): super(ConsumeExit, self).__init__() if consume is None: @@ -1117,6 +1136,7 @@ def get_param_num(self): 
@dace.serialize.serializable class PipelineEntry(MapEntry): + @staticmethod def map_type(): return PipelineScope @@ -1149,6 +1169,7 @@ def new_symbols(self, sdfg, state, symbols) -> Dict[str, dtypes.typeclass]: @dace.serialize.serializable class PipelineExit(MapExit): + @staticmethod def map_type(): return PipelineScope diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 18763e385a..f3a37ef08c 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -212,8 +212,7 @@ def read_symbols(self) -> Set[str]: return result - @property - def free_symbols(self) -> Set[str]: + def used_symbols(self, all_symbols: bool) -> Set[str]: """ Returns a set of symbols used in this edge's properties. """ # NOTE: The former algorithm for computing an edge's free symbols was: # `self.read_symbols() - set(self.assignments.keys())` @@ -241,6 +240,11 @@ def free_symbols(self) -> Set[str]: # Return the set of candidate free symbols minus the set of candidate defined symbols return (cond_symbols | rhs_symbols) - lhs_symbols + @property + def free_symbols(self) -> Set[str]: + """ Returns a set of symbols used in this edge's properties. """ + return self.used_symbols(all_symbols=True) + def replace_dict(self, repl: Dict[str, str], replace_keys=True) -> None: """ Replaces all given keys with their corresponding values. 
@@ -293,7 +297,7 @@ def new_symbols(self, sdfg, symbols) -> Dict[str, dtypes.typeclass]: alltypes = symbols inferred_lhs_symbols = {k: infer_expr_type(v, alltypes) for k, v in self.assignments.items()} - + # Symbols in assignment keys are candidate newly defined symbols lhs_symbols = set() # Symbols already defined @@ -303,7 +307,7 @@ def new_symbols(self, sdfg, symbols) -> Dict[str, dtypes.typeclass]: # Only add LHS to the set of candidate newly defined symbols if it has not been defined yet if lhs not in rhs_symbols: lhs_symbols.add(lhs) - + return {k: v for k, v in inferred_lhs_symbols.items() if k in lhs_symbols} def get_read_memlets(self, arrays: Dict[str, dt.Data]) -> List[mm.Memlet]: @@ -593,6 +597,7 @@ def hash_sdfg(self, jsondict: Optional[Dict[str, Any]] = None) -> str: :param jsondict: If not None, uses given JSON dictionary as input. :return: The hash (in SHA-256 format). """ + def keyword_remover(json_obj: Any, last_keyword=""): # Makes non-unique in SDFG hierarchy v2 # Recursively remove attributes from the SDFG which are not used in @@ -1277,27 +1282,36 @@ def arrays_recursive(self): if isinstance(node, nd.NestedSDFG): yield from node.sdfg.arrays_recursive() - @property - def free_symbols(self) -> Set[str]: + def used_symbols(self, all_symbols: bool) -> Set[str]: """ Returns a set of symbol names that are used by the SDFG, but not defined within it. This property is used to determine the symbolic - parameters of the SDFG and verify that ``SDFG.symbols`` is complete. + parameters of the SDFG. - :note: Assumes that the graph is valid (i.e., without undefined or - overlapping symbols). + :param all_symbols: If False, only returns the set of symbols that will be used + in the generated code and are needed as arguments. 
""" defined_syms = set() free_syms = set() - # Start with the set of SDFG free symbols - free_syms |= set(self.symbols.keys()) - - # Exclude data descriptor names and constants + # Exclude data descriptor names, constants, and shapes of global data descriptors + not_strictly_necessary_global_symbols = set() for name, desc in self.arrays.items(): defined_syms.add(name) + + if not all_symbols: + used_desc_symbols = desc.used_symbols(all_symbols) + not_strictly_necessary = (desc.used_symbols(all_symbols=True) - used_desc_symbols) + not_strictly_necessary_global_symbols |= set(map(str, not_strictly_necessary)) + defined_syms |= set(self.constants_prop.keys()) + # Start with the set of SDFG free symbols + if all_symbols: + free_syms |= set(self.symbols.keys()) + else: + free_syms |= set(s for s in self.symbols.keys() if s not in not_strictly_necessary_global_symbols) + # Add free state symbols used_before_assignment = set() @@ -1307,14 +1321,14 @@ def free_symbols(self) -> Set[str]: ordered_states = self.nodes() for state in ordered_states: - free_syms |= state.free_symbols + free_syms |= state.used_symbols(all_symbols) # Add free inter-state symbols for e in self.out_edges(state): # NOTE: First we get the true InterstateEdge free symbols, then we compute the newly defined symbols by # subracting the (true) free symbols from the edge's assignment keys. This way we can correctly # compute the symbols that are used before being assigned. - efsyms = e.data.free_symbols + efsyms = e.data.used_symbols(all_symbols) defined_syms |= set(e.data.assignments.keys()) - efsyms used_before_assignment.update(efsyms - defined_syms) free_syms |= efsyms @@ -1325,6 +1339,18 @@ def free_symbols(self) -> Set[str]: # Subtract symbols defined in inter-state edges and constants return free_syms - defined_syms + @property + def free_symbols(self) -> Set[str]: + """ + Returns a set of symbol names that are used by the SDFG, but not + defined within it. 
This property is used to determine the symbolic + parameters of the SDFG and verify that ``SDFG.symbols`` is complete. + + :note: Assumes that the graph is valid (i.e., without undefined or + overlapping symbols). + """ + return self.used_symbols(all_symbols=True) + def read_and_write_sets(self) -> Tuple[Set[AnyStr], Set[AnyStr]]: """ Determines what data containers are read and written in this SDFG. Does @@ -1373,8 +1399,8 @@ def arglist(self, scalars_only=False, free_symbols=None) -> Dict[str, dt.Data]: if not v.transient and isinstance(v, dt.Scalar) and not k.startswith('__dace') } - # Add global free symbols to scalar arguments - free_symbols = free_symbols if free_symbols is not None else self.free_symbols + # Add global free symbols used in the generated code to scalar arguments + free_symbols = free_symbols if free_symbols is not None else self.used_symbols(all_symbols=False) scalar_args.update({k: dt.Scalar(self.symbols[k]) for k in free_symbols if not k.startswith('__dace')}) # Fill up ordered dictionary diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index c354cd9d1f..a4a6648401 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -409,14 +409,12 @@ def scope_children(self, ################################################################### # Query, subgraph, and replacement methods - @property - def free_symbols(self) -> Set[str]: + def used_symbols(self, all_symbols: bool) -> Set[str]: """ - Returns a set of symbol names that are used, but not defined, in - this graph view (SDFG state or subgraph thereof). + Returns a set of symbol names that are used in the state. - :note: Assumes that the graph is valid (i.e., without undefined or - overlapping symbols). + :param all_symbols: If False, only returns the set of symbols that will be used + in the generated code and are needed as arguments. 
""" state = self.graph if isinstance(self, SubgraphView) else self sdfg = state.parent @@ -429,7 +427,7 @@ def free_symbols(self) -> Set[str]: new_symbols |= set(n.new_symbols(sdfg, self, {}).keys()) elif isinstance(n, nd.AccessNode): # Add data descriptor symbols - freesyms |= set(map(str, n.desc(sdfg).free_symbols)) + freesyms |= set(map(str, n.desc(sdfg).used_symbols(all_symbols))) elif (isinstance(n, nd.Tasklet) and n.language == dtypes.Language.Python): # Consider callbacks defined as symbols as free for stmt in n.code.code: @@ -438,14 +436,41 @@ def free_symbols(self) -> Set[str]: and astnode.func.id in sdfg.symbols): freesyms.add(astnode.func.id) - freesyms |= n.free_symbols + if hasattr(n, 'used_symbols'): + freesyms |= n.used_symbols(all_symbols) + else: + freesyms |= n.free_symbols + # Free symbols from memlets + def _is_leaf_memlet(e): + if isinstance(e.src, nd.ExitNode) and e.src_conn and e.src_conn.startswith('OUT_'): + return False + if isinstance(e.dst, nd.EntryNode) and e.dst_conn and e.dst_conn.startswith('IN_'): + return False + return True + for e in self.edges(): - freesyms |= e.data.free_symbols + # If used for code generation, only consider memlet tree leaves + if not all_symbols and not _is_leaf_memlet(e): + continue + + freesyms |= e.data.used_symbols(all_symbols) # Do not consider SDFG constants as symbols new_symbols.update(set(sdfg.constants.keys())) return freesyms - new_symbols + + @property + def free_symbols(self) -> Set[str]: + """ + Returns a set of symbol names that are used, but not defined, in + this graph view (SDFG state or subgraph thereof). + + :note: Assumes that the graph is valid (i.e., without undefined or + overlapping symbols). 
+ """ + return self.used_symbols(all_symbols=True) + def defined_symbols(self) -> Dict[str, dt.Data]: """ diff --git a/tests/codegen/symbol_arguments_test.py b/tests/codegen/symbol_arguments_test.py new file mode 100644 index 0000000000..3ca89ddd06 --- /dev/null +++ b/tests/codegen/symbol_arguments_test.py @@ -0,0 +1,54 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. + +import dace +import numpy as np + +N = dace.symbol('N') + + +def test_global_sizes(): + + @dace.program + def tester(A: dace.float64[N]): + for i in dace.map[0:10]: + A[i] = 2 + + sdfg = tester.to_sdfg() + # Since N is not used anywhere, it should not be listed in the arguments + assert 'N' not in sdfg.arglist() + + a = np.random.rand(20) + sdfg(a, N=20) + assert np.allclose(a[:10], 2) + + +def test_global_sizes_used(): + + @dace.program + def tester(A: dace.float64[N]): + for i in dace.map[0:10]: + with dace.tasklet: + a >> A[i] + a = N + + sdfg = tester.to_sdfg() + # N is used in a tasklet + assert 'N' in sdfg.arglist() + + +def test_global_sizes_multidim(): + + @dace.program + def tester(A: dace.float64[N, N]): + for i, j in dace.map[0:10, 0:10]: + A[i, j] = 2 + + sdfg = tester.to_sdfg() + # Here N is implicitly used in the index expression, so it should be in the arguments + assert 'N' in sdfg.arglist() + + +if __name__ == '__main__': + test_global_sizes() + test_global_sizes_used() + test_global_sizes_multidim() diff --git a/tests/transformations/mapfission_test.py b/tests/transformations/mapfission_test.py index 72dbebb089..609c075c21 100644 --- a/tests/transformations/mapfission_test.py +++ b/tests/transformations/mapfission_test.py @@ -60,533 +60,562 @@ def config(): return A, expected -class MapFissionTest(unittest.TestCase): - - def test_subgraph(self): - A, expected = config() - B = np.random.rand(2) - - graph = mapfission_sdfg() - self.assertGreater(graph.apply_transformations(MapFission), 0) - graph(A=A, B=B) - - self.assertTrue(np.allclose(B, expected)) 
- - def test_nested_sdfg(self): - A, expected = config() - B = np.random.rand(2) - - # Nest the subgraph within the outer map, then apply transformation - graph = mapfission_sdfg() - state = graph.nodes()[0] - topmap = next(node for node in state.nodes() if isinstance(node, nodes.MapEntry) and node.label == 'outer') - subgraph = state.scope_subgraph(topmap, include_entry=False, include_exit=False) - nest_state_subgraph(graph, state, subgraph) - self.assertGreater(graph.apply_transformations(MapFission), 0) - graph(A=A, B=B) - self.assertTrue(np.allclose(B, expected)) - - def test_nested_transient(self): - """ Test nested SDFGs with transients. """ - - # Inner SDFG - nsdfg = dace.SDFG('nested') - nsdfg.add_array('a', [1], dace.float64) - nsdfg.add_array('b', [1], dace.float64) - nsdfg.add_transient('t', [1], dace.float64) - - # a->t state - nstate = nsdfg.add_state() - irnode = nstate.add_read('a') - task = nstate.add_tasklet('t1', {'inp'}, {'out'}, 'out = 2*inp') - iwnode = nstate.add_write('t') - nstate.add_edge(irnode, None, task, 'inp', dace.Memlet.simple('a', '0')) - nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('t', '0')) - - # t->a state - first_state = nstate - nstate = nsdfg.add_state() - irnode = nstate.add_read('t') - task = nstate.add_tasklet('t2', {'inp'}, {'out'}, 'out = 3*inp') - iwnode = nstate.add_write('b') - nstate.add_edge(irnode, None, task, 'inp', dace.Memlet.simple('t', '0')) - nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('b', '0')) - - nsdfg.add_edge(first_state, nstate, dace.InterstateEdge()) - - # Outer SDFG - sdfg = dace.SDFG('nested_transient_fission') - sdfg.add_array('A', [2], dace.float64) - state = sdfg.add_state() - rnode = state.add_read('A') - wnode = state.add_write('A') - me, mx = state.add_map('outer', dict(i='0:2')) - nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'a'}, {'b'}) - state.add_memlet_path(rnode, me, nsdfg_node, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) - 
state.add_memlet_path(nsdfg_node, mx, wnode, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) - - self.assertGreater(sdfg.apply_transformations_repeated(MapFission), 0) - - # Test - A = np.random.rand(2) - expected = A * 6 - sdfg(A=A) - self.assertTrue(np.allclose(A, expected)) - - def test_inputs_outputs(self): - """ - Test subgraphs where the computation modules that are in the middle - connect to the outside. - """ - - sdfg = dace.SDFG('inputs_outputs_fission') - sdfg.add_array('in1', [2], dace.float64) - sdfg.add_array('in2', [2], dace.float64) - sdfg.add_scalar('tmp', dace.float64, transient=True) - sdfg.add_array('out1', [2], dace.float64) - sdfg.add_array('out2', [2], dace.float64) - state = sdfg.add_state() - in1 = state.add_read('in1') - in2 = state.add_read('in2') - out1 = state.add_write('out1') - out2 = state.add_write('out2') - me, mx = state.add_map('outer', dict(i='0:2')) - t1 = state.add_tasklet('t1', {'i1'}, {'o1', 'o2'}, 'o1 = i1 * 2; o2 = i1 * 5') - t2 = state.add_tasklet('t2', {'i1', 'i2'}, {'o1'}, 'o1 = i1 * i2') - state.add_memlet_path(in1, me, t1, dst_conn='i1', memlet=dace.Memlet.simple('in1', 'i')) - state.add_memlet_path(in2, me, t2, dst_conn='i2', memlet=dace.Memlet.simple('in2', 'i')) - state.add_edge(t1, 'o1', t2, 'i1', dace.Memlet.simple('tmp', '0')) - state.add_memlet_path(t2, mx, out1, src_conn='o1', memlet=dace.Memlet.simple('out1', 'i')) - state.add_memlet_path(t1, mx, out2, src_conn='o2', memlet=dace.Memlet.simple('out2', 'i')) - - self.assertGreater(sdfg.apply_transformations(MapFission), 0) - - # Test - A, B, C, D = tuple(np.random.rand(2) for _ in range(4)) - expected_C = (A * 2) * B - expected_D = A * 5 - sdfg(in1=A, in2=B, out1=C, out2=D) - self.assertTrue(np.allclose(C, expected_C)) - self.assertTrue(np.allclose(D, expected_D)) - - def test_multidim(self): - sdfg = dace.SDFG('mapfission_multidim') - sdfg.add_array('A', [2, 3], dace.float64) - state = sdfg.add_state() - me, mx = state.add_map('outer', dict(i='0:2', 
j='0:3')) - - nsdfg = dace.SDFG('nested') - nsdfg.add_array('a', [1], dace.float64) - nstate = nsdfg.add_state() - t = nstate.add_tasklet('reset', {}, {'out'}, 'out = 0') - a = nstate.add_write('a') - nstate.add_edge(t, 'out', a, None, dace.Memlet.simple('a', '0')) - nsdfg_node = state.add_nested_sdfg(nsdfg, None, {}, {'a'}) - - state.add_edge(me, None, nsdfg_node, None, dace.Memlet()) - anode = state.add_write('A') - state.add_memlet_path(nsdfg_node, mx, anode, src_conn='a', memlet=dace.Memlet.simple('A', 'i,j')) - - self.assertGreater(sdfg.apply_transformations_repeated(MapFission), 0) - - # Test - A = np.random.rand(2, 3) - sdfg(A=A) - self.assertTrue(np.allclose(A, np.zeros_like(A))) - - def test_offsets(self): - sdfg = dace.SDFG('mapfission_offsets') - sdfg.add_array('A', [20], dace.float64) - sdfg.add_scalar('interim', dace.float64, transient=True) - state = sdfg.add_state() - me, mx = state.add_map('outer', dict(i='10:20')) - - t1 = state.add_tasklet('addone', {'a'}, {'b'}, 'b = a + 1') - t2 = state.add_tasklet('addtwo', {'a'}, {'b'}, 'b = a + 2') - - aread = state.add_read('A') - awrite = state.add_write('A') - state.add_memlet_path(aread, me, t1, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) - state.add_edge(t1, 'b', t2, 'a', dace.Memlet.simple('interim', '0')) - state.add_memlet_path(t2, mx, awrite, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) - - self.assertGreater(sdfg.apply_transformations(MapFission), 0) - - dace.propagate_memlets_sdfg(sdfg) - sdfg.validate() - - # Test - A = np.random.rand(20) - expected = A.copy() - expected[10:] += 3 - sdfg(A=A) - self.assertTrue(np.allclose(A, expected)) - - def test_offsets_array(self): - sdfg = dace.SDFG('mapfission_offsets2') - sdfg.add_array('A', [20], dace.float64) - sdfg.add_array('interim', [1], dace.float64, transient=True) - state = sdfg.add_state() - me, mx = state.add_map('outer', dict(i='10:20')) - - t1 = state.add_tasklet('addone', {'a'}, {'b'}, 'b = a + 1') - interim = 
state.add_access('interim') - t2 = state.add_tasklet('addtwo', {'a'}, {'b'}, 'b = a + 2') - - aread = state.add_read('A') - awrite = state.add_write('A') - state.add_memlet_path(aread, me, t1, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) - state.add_edge(t1, 'b', interim, None, dace.Memlet.simple('interim', '0')) - state.add_edge(interim, None, t2, 'a', dace.Memlet.simple('interim', '0')) - state.add_memlet_path(t2, mx, awrite, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) - - self.assertGreater(sdfg.apply_transformations(MapFission), 0) - - dace.propagate_memlets_sdfg(sdfg) - sdfg.validate() - - # Test - A = np.random.rand(20) - expected = A.copy() - expected[10:] += 3 - sdfg(A=A) - self.assertTrue(np.allclose(A, expected)) - - def test_mapfission_with_symbols(self): - ''' - Tests MapFission in the case of a Map containing a NestedSDFG that is using some symbol from the top-level SDFG - missing from the NestedSDFG's symbol mapping. Please note that this is an unusual case that is difficult to - reproduce and ultimately unrelated to MapFission. Consider solving the underlying issue and then deleting this - test and the corresponding (obsolete) code in MapFission. 
- ''' - - M, N = dace.symbol('M'), dace.symbol('N') - - sdfg = dace.SDFG('tasklet_code_with_symbols') - sdfg.add_array('A', (M, N), dace.int32) - sdfg.add_array('B', (M, N), dace.int32) - - state = sdfg.add_state('parent', is_start_state=True) - me, mx = state.add_map('parent_map', {'i': '0:N'}) - - nsdfg = dace.SDFG('nested_sdfg') - nsdfg.add_scalar('inner_A', dace.int32) - nsdfg.add_scalar('inner_B', dace.int32) - - nstate = nsdfg.add_state('child', is_start_state=True) - na = nstate.add_access('inner_A') - nb = nstate.add_access('inner_B') - ta = nstate.add_tasklet('tasklet_A', {}, {'__out'}, '__out = M') - tb = nstate.add_tasklet('tasklet_B', {}, {'__out'}, '__out = M') - nstate.add_edge(ta, '__out', na, None, dace.Memlet.from_array('inner_A', nsdfg.arrays['inner_A'])) - nstate.add_edge(tb, '__out', nb, None, dace.Memlet.from_array('inner_B', nsdfg.arrays['inner_B'])) - - a = state.add_access('A') - b = state.add_access('B') - t = nodes.NestedSDFG('child_sdfg', nsdfg, {}, {'inner_A', 'inner_B'}, {}) - nsdfg.parent = state - nsdfg.parent_sdfg = sdfg - nsdfg.parent_nsdfg_node = t - state.add_node(t) - state.add_nedge(me, t, dace.Memlet()) - state.add_memlet_path(t, mx, a, memlet=dace.Memlet('A[0, i]'), src_conn='inner_A') - state.add_memlet_path(t, mx, b, memlet=dace.Memlet('B[0, i]'), src_conn='inner_B') - - num = sdfg.apply_transformations_repeated(MapFission) - self.assertTrue(num == 1) - - A = np.ndarray((2, 10), dtype=np.int32) - B = np.ndarray((2, 10), dtype=np.int32) - sdfg(A=A, B=B, M=2, N=10) - - ref = np.full((10, ), fill_value=2, dtype=np.int32) - - self.assertTrue(np.array_equal(A[0], ref)) - self.assertTrue(np.array_equal(B[0], ref)) - - def test_two_edges_through_map(self): - ''' - Tests MapFission in the case of a Map with a component that has two inputs from a single data container. In such - cases, using `fill_scope_connectors` will lead to broken Map connectors. 
The tests confirms that new code in the - transformation manually adding the appropriate Map connectors works properly. - ''' - - N = dace.symbol('N') - - sdfg = dace.SDFG('two_edges_through_map') - sdfg.add_array('A', (N, ), dace.int32) - sdfg.add_array('B', (N, ), dace.int32) - - state = sdfg.add_state('parent', is_start_state=True) - me, mx = state.add_map('parent_map', {'i': '0:N'}) - - nsdfg = dace.SDFG('nested_sdfg') - nsdfg.add_array('inner_A', (N, ), dace.int32) - nsdfg.add_scalar('inner_B', dace.int32) - - nstate = nsdfg.add_state('child', is_start_state=True) - na = nstate.add_access('inner_A') - nb = nstate.add_access('inner_B') - t = nstate.add_tasklet('tasklet', {'__in1', '__in2'}, {'__out'}, '__out = __in1 + __in2') - nstate.add_edge(na, None, t, '__in1', dace.Memlet('inner_A[i]')) - nstate.add_edge(na, None, t, '__in2', dace.Memlet('inner_A[N-i-1]')) - nstate.add_edge(t, '__out', nb, None, dace.Memlet.from_array('inner_B', nsdfg.arrays['inner_B'])) - - a = state.add_access('A') - b = state.add_access('B') - t = state.add_nested_sdfg(nsdfg, None, {'inner_A'}, {'inner_B'}, {'N': 'N', 'i': 'i'}) - state.add_memlet_path(a, me, t, memlet=dace.Memlet.from_array('A', sdfg.arrays['A']), dst_conn='inner_A') - state.add_memlet_path(t, mx, b, memlet=dace.Memlet('B[i]'), src_conn='inner_B') - - num = sdfg.apply_transformations_repeated(MapFission) - self.assertTrue(num == 1) - - A = np.arange(10, dtype=np.int32) - B = np.ndarray((10, ), dtype=np.int32) - sdfg(A=A, B=B, N=10) - - ref = np.full((10, ), fill_value=9, dtype=np.int32) - - self.assertTrue(np.array_equal(B, ref)) - - def test_if_scope(self): - - @dace.program - def map_with_if(A: dace.int32[10]): - for i in dace.map[0:10]: - if i < 5: - A[i] = 0 - else: - A[i] = 1 - - ref = np.array([0] * 5 + [1] * 5, dtype=np.int32) - - sdfg = map_with_if.to_sdfg() - val0 = np.ndarray((10, ), dtype=np.int32) - sdfg(A=val0) - self.assertTrue(np.array_equal(val0, ref)) - - sdfg.apply_transformations_repeated(MapFission) 
- - val1 = np.ndarray((10, ), dtype=np.int32) - sdfg(A=val1) - self.assertTrue(np.array_equal(val1, ref)) - - def test_if_scope_2(self): - - @dace.program - def map_with_if_2(A: dace.int32[10]): - for i in dace.map[0:10]: - j = i < 5 - if j: - A[i] = 0 - else: - A[i] = 1 - - ref = np.array([0] * 5 + [1] * 5, dtype=np.int32) - - sdfg = map_with_if_2.to_sdfg() - val0 = np.ndarray((10, ), dtype=np.int32) - sdfg(A=val0) - self.assertTrue(np.array_equal(val0, ref)) - - sdfg.apply_transformations_repeated(MapFission) - - val1 = np.ndarray((10, ), dtype=np.int32) - sdfg(A=val1) - self.assertTrue(np.array_equal(val1, ref)) - - def test_array_copy_outside_scope(self): - - """ - This test checks for two issues occuring when MapFission applies on a NestedSDFG with a state-subgraph - containing copies among AccessNodes. In such cases, these copies may end up outside the scope of the generated - Maps (after MapFssion), potentially leading to the following errors: - 1. The memlet subset corresponding to a NestedSDFG connector (input/output) may have its dimensionality - erroneously increased. - 2. The memlet subset corresponding to a NestedSDFG connector (input/output) may not be propagated even if it uses - the Map's parameters. 
- """ - - sdfg = dace.SDFG('array_copy_outside_scope') - iname, _ = sdfg.add_array('inp', (10,), dtype=dace.int32) - oname, _ = sdfg.add_array('out', (10,), dtype=dace.int32) - - nsdfg = dace.SDFG('nested_sdfg') - niname, nidesc = nsdfg.add_array('ninp', (1,), dtype=dace.int32) - ntname, ntdesc = nsdfg.add_scalar('ntmp', dtype=dace.int32, transient=True) - noname, nodesc = nsdfg.add_array('nout', (1,), dtype=dace.int32) - - nstate = nsdfg.add_state('nmain') - ninode = nstate.add_access(niname) - ntnode = nstate.add_access(ntname) - nonode = nstate.add_access(noname) - tasklet = nstate.add_tasklet('tasklet', {'__inp'}, {'__out'}, '__out = __inp + 1') - nstate.add_edge(ninode, None, tasklet, '__inp', dace.Memlet.from_array(niname, nidesc)) - nstate.add_edge(tasklet, '__out', ntnode, None, dace.Memlet.from_array(ntname, ntdesc)) - nstate.add_nedge(ntnode, nonode, dace.Memlet.from_array(noname, nodesc)) - - state = sdfg.add_state('main') - inode = state.add_access(iname) - onode = state.add_access(oname) - me, mx = state.add_map('map', {'i': '0:10'}) - snode = state.add_nested_sdfg(nsdfg, None, {'ninp'}, {'nout'}) - state.add_memlet_path(inode, me, snode, memlet=dace.Memlet(data=iname, subset='i'), dst_conn='ninp') - state.add_memlet_path(snode, mx, onode, memlet=dace.Memlet(data=oname, subset='i'), src_conn='nout') - - # Issue no. 1 will be caught by validation after MapFission - sdfg.apply_transformations(MapFission) - - # Issue no. 2 will be caught by code-generation due to `i` existing in a memlet outside the Map's scope. 
- A = np.arange(10, dtype=np.int32) - B = np.empty((10,), dtype=np.int32) - sdfg(inp=A, out=B) - assert np.array_equal(A+1, B) - - def test_single_data_multiple_connectors(self): - - outer_sdfg = dace.SDFG('single_data_multiple_connectors') - outer_sdfg.add_array('A', (2, 10), dtype=dace.int32) - outer_sdfg.add_array('B', (2, 10), dtype=dace.int32) - - inner_sdfg = dace.SDFG('inner') - inner_sdfg.add_array('A0', (10,), dtype=dace.int32) - inner_sdfg.add_array('A1', (10,), dtype=dace.int32) - inner_sdfg.add_array('B0', (10,), dtype=dace.int32) - inner_sdfg.add_array('B1', (10,), dtype=dace.int32) - - inner_state = inner_sdfg.add_state('inner_state', is_start_state=True) - - inner_state.add_mapped_tasklet(name='plus', - map_ranges={'j': '0:10'}, - inputs={'__a0': dace.Memlet(data='A0', subset='j'), - '__a1': dace.Memlet(data='A1', subset='j')}, - outputs={'__b0': dace.Memlet(data='B0', subset='j')}, - code='__b0 = __a0 + __a1', - external_edges=True) - inner_state.add_mapped_tasklet(name='minus', - map_ranges={'j': '0:10'}, - inputs={'__a0': dace.Memlet(data='A0', subset='j'), - '__a1': dace.Memlet(data='A1', subset='j')}, - outputs={'__b1': dace.Memlet(data='B1', subset='j')}, - code='__b1 = __a0 - __a1', - external_edges=True) +def test_subgraph(): + A, expected = config() + B = np.random.rand(2) + + graph = mapfission_sdfg() + assert graph.apply_transformations(MapFission) > 0 + graph(A=A, B=B) + + assert np.allclose(B, expected) + + +def test_nested_sdfg(): + A, expected = config() + B = np.random.rand(2) + + # Nest the subgraph within the outer map, then apply transformation + graph = mapfission_sdfg() + state = graph.nodes()[0] + topmap = next(node for node in state.nodes() if isinstance(node, nodes.MapEntry) and node.label == 'outer') + subgraph = state.scope_subgraph(topmap, include_entry=False, include_exit=False) + nest_state_subgraph(graph, state, subgraph) + assert graph.apply_transformations(MapFission) > 0 + graph(A=A, B=B) + assert np.allclose(B, 
expected) + + +def test_nested_transient(): + """ Test nested SDFGs with transients. """ + + # Inner SDFG + nsdfg = dace.SDFG('nested') + nsdfg.add_array('a', [1], dace.float64) + nsdfg.add_array('b', [1], dace.float64) + nsdfg.add_transient('t', [1], dace.float64) + + # a->t state + nstate = nsdfg.add_state() + irnode = nstate.add_read('a') + task = nstate.add_tasklet('t1', {'inp'}, {'out'}, 'out = 2*inp') + iwnode = nstate.add_write('t') + nstate.add_edge(irnode, None, task, 'inp', dace.Memlet.simple('a', '0')) + nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('t', '0')) + + # t->a state + first_state = nstate + nstate = nsdfg.add_state() + irnode = nstate.add_read('t') + task = nstate.add_tasklet('t2', {'inp'}, {'out'}, 'out = 3*inp') + iwnode = nstate.add_write('b') + nstate.add_edge(irnode, None, task, 'inp', dace.Memlet.simple('t', '0')) + nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('b', '0')) + + nsdfg.add_edge(first_state, nstate, dace.InterstateEdge()) + + # Outer SDFG + sdfg = dace.SDFG('nested_transient_fission') + sdfg.add_array('A', [2], dace.float64) + state = sdfg.add_state() + rnode = state.add_read('A') + wnode = state.add_write('A') + me, mx = state.add_map('outer', dict(i='0:2')) + nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'a'}, {'b'}) + state.add_memlet_path(rnode, me, nsdfg_node, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) + state.add_memlet_path(nsdfg_node, mx, wnode, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) + + assert sdfg.apply_transformations_repeated(MapFission) > 0 + + # Test + A = np.random.rand(2) + expected = A * 6 + sdfg(A=A) + assert np.allclose(A, expected) + + +def test_inputs_outputs(): + """ + Test subgraphs where the computation modules that are in the middle + connect to the outside. 
+ """ + + sdfg = dace.SDFG('inputs_outputs_fission') + sdfg.add_array('in1', [2], dace.float64) + sdfg.add_array('in2', [2], dace.float64) + sdfg.add_scalar('tmp', dace.float64, transient=True) + sdfg.add_array('out1', [2], dace.float64) + sdfg.add_array('out2', [2], dace.float64) + state = sdfg.add_state() + in1 = state.add_read('in1') + in2 = state.add_read('in2') + out1 = state.add_write('out1') + out2 = state.add_write('out2') + me, mx = state.add_map('outer', dict(i='0:2')) + t1 = state.add_tasklet('t1', {'i1'}, {'o1', 'o2'}, 'o1 = i1 * 2; o2 = i1 * 5') + t2 = state.add_tasklet('t2', {'i1', 'i2'}, {'o1'}, 'o1 = i1 * i2') + state.add_memlet_path(in1, me, t1, dst_conn='i1', memlet=dace.Memlet.simple('in1', 'i')) + state.add_memlet_path(in2, me, t2, dst_conn='i2', memlet=dace.Memlet.simple('in2', 'i')) + state.add_edge(t1, 'o1', t2, 'i1', dace.Memlet.simple('tmp', '0')) + state.add_memlet_path(t2, mx, out1, src_conn='o1', memlet=dace.Memlet.simple('out1', 'i')) + state.add_memlet_path(t1, mx, out2, src_conn='o2', memlet=dace.Memlet.simple('out2', 'i')) + + assert sdfg.apply_transformations(MapFission) > 0 + + # Test + A, B, C, D = tuple(np.random.rand(2) for _ in range(4)) + expected_C = (A * 2) * B + expected_D = A * 5 + sdfg(in1=A, in2=B, out1=C, out2=D) + assert np.allclose(C, expected_C) + assert np.allclose(D, expected_D) + + +def test_multidim(): + sdfg = dace.SDFG('mapfission_multidim') + sdfg.add_array('A', [2, 3], dace.float64) + state = sdfg.add_state() + me, mx = state.add_map('outer', dict(i='0:2', j='0:3')) + + nsdfg = dace.SDFG('nested') + nsdfg.add_array('a', [1], dace.float64) + nstate = nsdfg.add_state() + t = nstate.add_tasklet('reset', {}, {'out'}, 'out = 0') + a = nstate.add_write('a') + nstate.add_edge(t, 'out', a, None, dace.Memlet.simple('a', '0')) + nsdfg_node = state.add_nested_sdfg(nsdfg, None, {}, {'a'}) + + state.add_edge(me, None, nsdfg_node, None, dace.Memlet()) + anode = state.add_write('A') + state.add_memlet_path(nsdfg_node, mx, 
anode, src_conn='a', memlet=dace.Memlet.simple('A', 'i,j')) + + assert sdfg.apply_transformations_repeated(MapFission) > 0 + + # Test + A = np.random.rand(2, 3) + sdfg(A=A) + assert np.allclose(A, np.zeros_like(A)) + + +def test_offsets(): + sdfg = dace.SDFG('mapfission_offsets') + sdfg.add_array('A', [20], dace.float64) + sdfg.add_scalar('interim', dace.float64, transient=True) + state = sdfg.add_state() + me, mx = state.add_map('outer', dict(i='10:20')) + + t1 = state.add_tasklet('addone', {'a'}, {'b'}, 'b = a + 1') + t2 = state.add_tasklet('addtwo', {'a'}, {'b'}, 'b = a + 2') + + aread = state.add_read('A') + awrite = state.add_write('A') + state.add_memlet_path(aread, me, t1, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) + state.add_edge(t1, 'b', t2, 'a', dace.Memlet.simple('interim', '0')) + state.add_memlet_path(t2, mx, awrite, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) + + assert sdfg.apply_transformations(MapFission) > 0 + + dace.propagate_memlets_sdfg(sdfg) + sdfg.validate() + + # Test + A = np.random.rand(20) + expected = A.copy() + expected[10:] += 3 + sdfg(A=A) + assert np.allclose(A, expected) + + +def test_offsets_array(): + sdfg = dace.SDFG('mapfission_offsets2') + sdfg.add_array('A', [20], dace.float64) + sdfg.add_array('interim', [1], dace.float64, transient=True) + state = sdfg.add_state() + me, mx = state.add_map('outer', dict(i='10:20')) + + t1 = state.add_tasklet('addone', {'a'}, {'b'}, 'b = a + 1') + interim = state.add_access('interim') + t2 = state.add_tasklet('addtwo', {'a'}, {'b'}, 'b = a + 2') + + aread = state.add_read('A') + awrite = state.add_write('A') + state.add_memlet_path(aread, me, t1, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) + state.add_edge(t1, 'b', interim, None, dace.Memlet.simple('interim', '0')) + state.add_edge(interim, None, t2, 'a', dace.Memlet.simple('interim', '0')) + state.add_memlet_path(t2, mx, awrite, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) + + assert 
sdfg.apply_transformations(MapFission) > 0 + + dace.propagate_memlets_sdfg(sdfg) + sdfg.validate() + + # Test + A = np.random.rand(20) + expected = A.copy() + expected[10:] += 3 + sdfg(A=A) + assert np.allclose(A, expected) + + +def test_mapfission_with_symbols(): + """ + Tests MapFission in the case of a Map containing a NestedSDFG that is using some symbol from the top-level SDFG + missing from the NestedSDFG's symbol mapping. Please note that this is an unusual case that is difficult to + reproduce and ultimately unrelated to MapFission. Consider solving the underlying issue and then deleting this + test and the corresponding (obsolete) code in MapFission. + """ + + M, N = dace.symbol('M'), dace.symbol('N') - outer_state = outer_sdfg.add_state('outer_state', is_start_state=True) + sdfg = dace.SDFG('tasklet_code_with_symbols') + sdfg.add_array('A', (M, N), dace.int32) + sdfg.add_array('B', (M, N), dace.int32) - a = outer_state.add_access('A') - b = outer_state.add_access('B') + state = sdfg.add_state('parent', is_start_state=True) + me, mx = state.add_map('parent_map', {'i': '0:N'}) - me, mx = outer_state.add_map('map', {'i': '0:2'}) - inner_sdfg_node = outer_state.add_nested_sdfg(inner_sdfg, None, {'A0', 'A1'}, {'B0', 'B1'}) + nsdfg = dace.SDFG('nested_sdfg') + nsdfg.add_scalar('inner_A', dace.int32) + nsdfg.add_scalar('inner_B', dace.int32) - outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='0, 0:10'), dst_conn='A0') - outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='1, 0:10'), dst_conn='A1') - outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='0, 0:10'), src_conn='B0') - outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='1, 0:10'), src_conn='B1') + nstate = nsdfg.add_state('child', is_start_state=True) + na = nstate.add_access('inner_A') + nb = nstate.add_access('inner_B') + ta = 
nstate.add_tasklet('tasklet_A', {}, {'__out'}, '__out = M') + tb = nstate.add_tasklet('tasklet_B', {}, {'__out'}, '__out = M') + nstate.add_edge(ta, '__out', na, None, dace.Memlet.from_array('inner_A', nsdfg.arrays['inner_A'])) + nstate.add_edge(tb, '__out', nb, None, dace.Memlet.from_array('inner_B', nsdfg.arrays['inner_B'])) - sdutils.consolidate_edges(outer_sdfg) - - A = np.arange(20, dtype=np.int32).reshape((2, 10)).copy() - ref = np.empty_like(A) - ref_sdfg = copy.deepcopy(outer_sdfg) - ref_sdfg.name = f"{ref_sdfg.name}_ref" - ref_sdfg(A=A, B=ref) + a = state.add_access('A') + b = state.add_access('B') + t = state.add_nested_sdfg(nsdfg, None, {}, {'inner_A', 'inner_B'}) + state.add_nedge(me, t, dace.Memlet()) + state.add_memlet_path(t, mx, a, memlet=dace.Memlet('A[0, i]'), src_conn='inner_A') + state.add_memlet_path(t, mx, b, memlet=dace.Memlet('B[0, i]'), src_conn='inner_B') - MapFission.apply_to(outer_sdfg, expr_index=1, map_entry=me, nested_sdfg=inner_sdfg_node) - val = np.empty_like(A) - outer_sdfg(A=A, B=val) + num = sdfg.apply_transformations_repeated(MapFission) + assert num == 1 - assert np.array_equal(val, ref) + A = np.ndarray((2, 10), dtype=np.int32) + B = np.ndarray((2, 10), dtype=np.int32) + sdfg(A=A, B=B, M=2, N=10) - def test_dependent_symbol(self): + ref = np.full((10, ), fill_value=2, dtype=np.int32) - outer_sdfg = dace.SDFG('map_fission_with_dependent_symbol') + assert np.array_equal(A[0], ref) + assert np.array_equal(B[0], ref) - outer_sdfg.add_symbol('fidx', dace.int32) - outer_sdfg.add_symbol('lidx', dace.int32) - outer_sdfg.add_array('A', (2, 10), dtype=dace.int32) - outer_sdfg.add_array('B', (2, 10), dtype=dace.int32) +def test_two_edges_through_map(): + """ + Tests MapFission in the case of a Map with a component that has two inputs from a single data container. In such + cases, using `fill_scope_connectors` will lead to broken Map connectors. 
The tests confirms that new code in the + transformation manually adding the appropriate Map connectors works properly. + """ - inner_sdfg = dace.SDFG('inner') + N = dace.symbol('N') - inner_sdfg.add_symbol('first', dace.int32) - inner_sdfg.add_symbol('last', dace.int32) + sdfg = dace.SDFG('two_edges_through_map') + sdfg.add_array('A', (N, ), dace.int32) + sdfg.add_array('B', (N, ), dace.int32) - inner_sdfg.add_array('A0', (10,), dtype=dace.int32) - inner_sdfg.add_array('A1', (10,), dtype=dace.int32) - inner_sdfg.add_array('B0', (10,), dtype=dace.int32) - inner_sdfg.add_array('B1', (10,), dtype=dace.int32) + state = sdfg.add_state('parent', is_start_state=True) + me, mx = state.add_map('parent_map', {'i': '0:N'}) - inner_state = inner_sdfg.add_state('inner_state', is_start_state=True) + nsdfg = dace.SDFG('nested_sdfg') + nsdfg.add_array('inner_A', (N, ), dace.int32) + nsdfg.add_scalar('inner_B', dace.int32) - inner_state.add_mapped_tasklet(name='plus', - map_ranges={'j': 'first:last'}, - inputs={'__a0': dace.Memlet(data='A0', subset='j'), - '__a1': dace.Memlet(data='A1', subset='j')}, - outputs={'__b0': dace.Memlet(data='B0', subset='j')}, - code='__b0 = __a0 + __a1', - external_edges=True) + nstate = nsdfg.add_state('child', is_start_state=True) + na = nstate.add_access('inner_A') + nb = nstate.add_access('inner_B') + t = nstate.add_tasklet('tasklet', {'__in1', '__in2'}, {'__out'}, '__out = __in1 + __in2') + nstate.add_edge(na, None, t, '__in1', dace.Memlet('inner_A[i]')) + nstate.add_edge(na, None, t, '__in2', dace.Memlet('inner_A[N-i-1]')) + nstate.add_edge(t, '__out', nb, None, dace.Memlet.from_array('inner_B', nsdfg.arrays['inner_B'])) + + a = state.add_access('A') + b = state.add_access('B') + t = state.add_nested_sdfg(nsdfg, None, {'inner_A'}, {'inner_B'}, {'N': 'N', 'i': 'i'}) + state.add_memlet_path(a, me, t, memlet=dace.Memlet.from_array('A', sdfg.arrays['A']), dst_conn='inner_A') + state.add_memlet_path(t, mx, b, memlet=dace.Memlet('B[i]'), 
src_conn='inner_B') + + num = sdfg.apply_transformations_repeated(MapFission) + assert num == 1 + + A = np.arange(10, dtype=np.int32) + B = np.ndarray((10, ), dtype=np.int32) + sdfg(A=A, B=B, N=10) + + ref = np.full((10, ), fill_value=9, dtype=np.int32) + + assert np.array_equal(B, ref) + + +def test_if_scope(): - inner_sdfg2 = dace.SDFG('inner2') + @dace.program + def map_with_if(A: dace.int32[10]): + for i in dace.map[0:10]: + if i < 5: + A[i] = 0 + else: + A[i] = 1 - inner_sdfg2.add_symbol('first', dace.int32) - inner_sdfg2.add_symbol('last', dace.int32) + ref = np.array([0] * 5 + [1] * 5, dtype=np.int32) - inner_sdfg2.add_array('A0', (10,), dtype=dace.int32) - inner_sdfg2.add_array('A1', (10,), dtype=dace.int32) - inner_sdfg2.add_array('B1', (10,), dtype=dace.int32) + sdfg = map_with_if.to_sdfg() + val0 = np.ndarray((10, ), dtype=np.int32) + sdfg(A=val0) + assert np.array_equal(val0, ref) + + sdfg.apply_transformations_repeated(MapFission) + + val1 = np.ndarray((10, ), dtype=np.int32) + sdfg(A=val1) + assert np.array_equal(val1, ref) + + +def test_if_scope_2(): + + @dace.program + def map_with_if_2(A: dace.int32[10]): + for i in dace.map[0:10]: + j = i < 5 + if j: + A[i] = 0 + else: + A[i] = 1 + + ref = np.array([0] * 5 + [1] * 5, dtype=np.int32) + + sdfg = map_with_if_2.to_sdfg() + val0 = np.ndarray((10, ), dtype=np.int32) + sdfg(A=val0) + assert np.array_equal(val0, ref) + + sdfg.apply_transformations_repeated(MapFission) + + val1 = np.ndarray((10, ), dtype=np.int32) + sdfg(A=val1) + assert np.array_equal(val1, ref) + + +def test_array_copy_outside_scope(): + """ + This test checks for two issues occuring when MapFission applies on a NestedSDFG with a state-subgraph + containing copies among AccessNodes. In such cases, these copies may end up outside the scope of the generated + Maps (after MapFssion), potentially leading to the following errors: + 1. 
The memlet subset corresponding to a NestedSDFG connector (input/output) may have its dimensionality + erroneously increased. + 2. The memlet subset corresponding to a NestedSDFG connector (input/output) may not be propagated even if it uses + the Map's parameters. + """ + + sdfg = dace.SDFG('array_copy_outside_scope') + iname, _ = sdfg.add_array('inp', (10, ), dtype=dace.int32) + oname, _ = sdfg.add_array('out', (10, ), dtype=dace.int32) + + nsdfg = dace.SDFG('nested_sdfg') + niname, nidesc = nsdfg.add_array('ninp', (1, ), dtype=dace.int32) + ntname, ntdesc = nsdfg.add_scalar('ntmp', dtype=dace.int32, transient=True) + noname, nodesc = nsdfg.add_array('nout', (1, ), dtype=dace.int32) - inner_state2 = inner_sdfg2.add_state('inner_state2', is_start_state=True) + nstate = nsdfg.add_state('nmain') + ninode = nstate.add_access(niname) + ntnode = nstate.add_access(ntname) + nonode = nstate.add_access(noname) + tasklet = nstate.add_tasklet('tasklet', {'__inp'}, {'__out'}, '__out = __inp + 1') + nstate.add_edge(ninode, None, tasklet, '__inp', dace.Memlet.from_array(niname, nidesc)) + nstate.add_edge(tasklet, '__out', ntnode, None, dace.Memlet.from_array(ntname, ntdesc)) + nstate.add_nedge(ntnode, nonode, dace.Memlet.from_array(noname, nodesc)) - inner_state2.add_mapped_tasklet(name='minus', - map_ranges={'j': 'first:last'}, - inputs={'__a0': dace.Memlet(data='A0', subset='j'), - '__a1': dace.Memlet(data='A1', subset='j')}, - outputs={'__b1': dace.Memlet(data='B1', subset='j')}, - code='__b1 = __a0 - __a1', - external_edges=True) - - nsdfg = inner_state.add_nested_sdfg(inner_sdfg2, None, {'A0', 'A1'}, {'B1'}) - a0 = inner_state.add_access('A0') - a1 = inner_state.add_access('A1') - b1 = inner_state.add_access('B1') + state = sdfg.add_state('main') + inode = state.add_access(iname) + onode = state.add_access(oname) + me, mx = state.add_map('map', {'i': '0:10'}) + snode = state.add_nested_sdfg(nsdfg, None, {'ninp'}, {'nout'}) + state.add_memlet_path(inode, me, snode, 
memlet=dace.Memlet(data=iname, subset='i'), dst_conn='ninp') + state.add_memlet_path(snode, mx, onode, memlet=dace.Memlet(data=oname, subset='i'), src_conn='nout') + + # Issue no. 1 will be caught by validation after MapFission + sdfg.apply_transformations(MapFission) + + # Issue no. 2 will be caught by code-generation due to `i` existing in a memlet outside the Map's scope. + A = np.arange(10, dtype=np.int32) + B = np.empty((10, ), dtype=np.int32) + sdfg(inp=A, out=B) + assert np.array_equal(A + 1, B) + + +def test_single_data_multiple_connectors(): + + outer_sdfg = dace.SDFG('single_data_multiple_connectors') + outer_sdfg.add_array('A', (2, 10), dtype=dace.int32) + outer_sdfg.add_array('B', (2, 10), dtype=dace.int32) + + inner_sdfg = dace.SDFG('inner') + inner_sdfg.add_array('A0', (10, ), dtype=dace.int32) + inner_sdfg.add_array('A1', (10, ), dtype=dace.int32) + inner_sdfg.add_array('B0', (10, ), dtype=dace.int32) + inner_sdfg.add_array('B1', (10, ), dtype=dace.int32) + + inner_state = inner_sdfg.add_state('inner_state', is_start_state=True) + + inner_state.add_mapped_tasklet(name='plus', + map_ranges={'j': '0:10'}, + inputs={ + '__a0': dace.Memlet(data='A0', subset='j'), + '__a1': dace.Memlet(data='A1', subset='j') + }, + outputs={'__b0': dace.Memlet(data='B0', subset='j')}, + code='__b0 = __a0 + __a1', + external_edges=True) + inner_state.add_mapped_tasklet(name='minus', + map_ranges={'j': '0:10'}, + inputs={ + '__a0': dace.Memlet(data='A0', subset='j'), + '__a1': dace.Memlet(data='A1', subset='j') + }, + outputs={'__b1': dace.Memlet(data='B1', subset='j')}, + code='__b1 = __a0 - __a1', + external_edges=True) + + outer_state = outer_sdfg.add_state('outer_state', is_start_state=True) + + a = outer_state.add_access('A') + b = outer_state.add_access('B') + + me, mx = outer_state.add_map('map', {'i': '0:2'}) + inner_sdfg_node = outer_state.add_nested_sdfg(inner_sdfg, None, {'A0', 'A1'}, {'B0', 'B1'}) + + outer_state.add_memlet_path(a, me, inner_sdfg_node, 
memlet=dace.Memlet(data='A', subset='0, 0:10'), dst_conn='A0') + outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='1, 0:10'), dst_conn='A1') + outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='0, 0:10'), src_conn='B0') + outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='1, 0:10'), src_conn='B1') + + sdutils.consolidate_edges(outer_sdfg) + + A = np.arange(20, dtype=np.int32).reshape((2, 10)).copy() + ref = np.empty_like(A) + ref_sdfg = copy.deepcopy(outer_sdfg) + ref_sdfg.name = f"{ref_sdfg.name}_ref" + ref_sdfg(A=A, B=ref) + + MapFission.apply_to(outer_sdfg, expr_index=1, map_entry=me, nested_sdfg=inner_sdfg_node) + val = np.empty_like(A) + outer_sdfg(A=A, B=val) + + assert np.array_equal(val, ref) + + +def test_dependent_symbol(): + + outer_sdfg = dace.SDFG('map_fission_with_dependent_symbol') + + outer_sdfg.add_symbol('fidx', dace.int32) + outer_sdfg.add_symbol('lidx', dace.int32) + + outer_sdfg.add_array('A', (2, 10), dtype=dace.int32) + outer_sdfg.add_array('B', (2, 10), dtype=dace.int32) + + inner_sdfg = dace.SDFG('inner') + + inner_sdfg.add_symbol('first', dace.int32) + inner_sdfg.add_symbol('last', dace.int32) + + inner_sdfg.add_array('A0', (10, ), dtype=dace.int32) + inner_sdfg.add_array('A1', (10, ), dtype=dace.int32) + inner_sdfg.add_array('B0', (10, ), dtype=dace.int32) + inner_sdfg.add_array('B1', (10, ), dtype=dace.int32) + + inner_state = inner_sdfg.add_state('inner_state', is_start_state=True) + + inner_state.add_mapped_tasklet(name='plus', + map_ranges={'j': 'first:last'}, + inputs={ + '__a0': dace.Memlet(data='A0', subset='j'), + '__a1': dace.Memlet(data='A1', subset='j') + }, + outputs={'__b0': dace.Memlet(data='B0', subset='j')}, + code='__b0 = __a0 + __a1', + external_edges=True) + + inner_sdfg2 = dace.SDFG('inner2') + + inner_sdfg2.add_symbol('first', dace.int32) + inner_sdfg2.add_symbol('last', dace.int32) + + 
inner_sdfg2.add_array('A0', (10, ), dtype=dace.int32) + inner_sdfg2.add_array('A1', (10, ), dtype=dace.int32) + inner_sdfg2.add_array('B1', (10, ), dtype=dace.int32) + + inner_state2 = inner_sdfg2.add_state('inner_state2', is_start_state=True) + + inner_state2.add_mapped_tasklet(name='minus', + map_ranges={'j': 'first:last'}, + inputs={ + '__a0': dace.Memlet(data='A0', subset='j'), + '__a1': dace.Memlet(data='A1', subset='j') + }, + outputs={'__b1': dace.Memlet(data='B1', subset='j')}, + code='__b1 = __a0 - __a1', + external_edges=True) - inner_state.add_edge(a0, None, nsdfg, 'A0', dace.Memlet(data='A0', subset='0:10')) - inner_state.add_edge(a1, None, nsdfg, 'A1', dace.Memlet(data='A1', subset='0:10')) - inner_state.add_edge(nsdfg, 'B1', b1, None, dace.Memlet(data='B1', subset='0:10')) + nsdfg = inner_state.add_nested_sdfg(inner_sdfg2, None, {'A0', 'A1'}, {'B1'}) + a0 = inner_state.add_access('A0') + a1 = inner_state.add_access('A1') + b1 = inner_state.add_access('B1') - outer_state = outer_sdfg.add_state('outer_state', is_start_state=True) + inner_state.add_edge(a0, None, nsdfg, 'A0', dace.Memlet(data='A0', subset='0:10')) + inner_state.add_edge(a1, None, nsdfg, 'A1', dace.Memlet(data='A1', subset='0:10')) + inner_state.add_edge(nsdfg, 'B1', b1, None, dace.Memlet(data='B1', subset='0:10')) - a = outer_state.add_access('A') - b = outer_state.add_access('B') + outer_state = outer_sdfg.add_state('outer_state', is_start_state=True) - me, mx = outer_state.add_map('map', {'i': '0:2'}) - inner_sdfg_node = outer_state.add_nested_sdfg(inner_sdfg, None, {'A0', 'A1'}, {'B0', 'B1'}, - symbol_mapping={'first': 'max(0, i - fidx)', - 'last': 'min(10, i + lidx)'}) + a = outer_state.add_access('A') + b = outer_state.add_access('B') - outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='0, 0:10'), dst_conn='A0') - outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='1, 0:10'), dst_conn='A1') - 
outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='0, 0:10'), src_conn='B0') - outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='1, 0:10'), src_conn='B1') + me, mx = outer_state.add_map('map', {'i': '0:2'}) + inner_sdfg_node = outer_state.add_nested_sdfg(inner_sdfg, + None, {'A0', 'A1'}, {'B0', 'B1'}, + symbol_mapping={ + 'first': 'max(0, i - fidx)', + 'last': 'min(10, i + lidx)' + }) - sdutils.consolidate_edges(outer_sdfg) - A = np.arange(20, dtype=np.int32).reshape((2, 10)).copy() - ref = np.zeros_like(A) - ref_sdfg = copy.deepcopy(outer_sdfg) - ref_sdfg.name = f"{ref_sdfg.name}_ref" - ref_sdfg(A=A, B=ref, fidx=1, lidx=5) + outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='0, 0:10'), dst_conn='A0') + outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='1, 0:10'), dst_conn='A1') + outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='0, 0:10'), src_conn='B0') + outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='1, 0:10'), src_conn='B1') - MapFission.apply_to(outer_sdfg, expr_index=1, map_entry=me, nested_sdfg=inner_sdfg_node) - outer_sdfg.apply_transformations_repeated(InlineSDFG) - val = np.zeros_like(A) - outer_sdfg(A=A, B=val, fidx=1, lidx=5) + sdutils.consolidate_edges(outer_sdfg) + A = np.arange(20, dtype=np.int32).reshape((2, 10)).copy() + ref = np.zeros_like(A) + ref_sdfg = copy.deepcopy(outer_sdfg) + ref_sdfg.name = f"{ref_sdfg.name}_ref" + ref_sdfg(A=A, B=ref, fidx=1, lidx=5) - assert np.array_equal(val, ref) + MapFission.apply_to(outer_sdfg, expr_index=1, map_entry=me, nested_sdfg=inner_sdfg_node) + outer_sdfg.apply_transformations_repeated(InlineSDFG) + val = np.zeros_like(A) + outer_sdfg(A=A, B=val, fidx=1, lidx=5) + assert np.array_equal(val, ref) if __name__ == '__main__': - unittest.main() + test_subgraph() + test_nested_sdfg() 
+ test_nested_transient() + test_inputs_outputs() + test_multidim() + test_offsets() + test_offsets_array() + test_mapfission_with_symbols() + test_two_edges_through_map() + test_if_scope() + test_if_scope_2() + test_array_copy_outside_scope() + test_single_data_multiple_connectors() + test_dependent_symbol() From 7ad61767fa8263af7b164e4bbc5a0d77772d5814 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 3 Aug 2023 13:43:24 +0200 Subject: [PATCH 112/127] Added test. --- dace/sdfg/nodes.py | 4 -- tests/sdfg/validation/nested_sdfg_test.py | 47 +++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 91284013f9..28431deeea 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -636,10 +636,6 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context f'Connector "{conn}" was given but is not a registered data descriptor in the nested SDFG. ' 'Example: parameter passed to a function without a matching array within it.') for dname, desc in self.sdfg.arrays.items(): - # TODO(later): Disallow scalars without access nodes (so that this - # check passes for them too). 
- # if isinstance(desc, data.Scalar): - # continue if not desc.transient and dname not in connectors: raise NameError('Data descriptor "%s" not found in nested SDFG connectors' % dname) if dname in connectors and desc.transient: diff --git a/tests/sdfg/validation/nested_sdfg_test.py b/tests/sdfg/validation/nested_sdfg_test.py index 398a1635ef..100568507e 100644 --- a/tests/sdfg/validation/nested_sdfg_test.py +++ b/tests/sdfg/validation/nested_sdfg_test.py @@ -64,6 +64,53 @@ def test_inout_connector_validation_fail(): assert False, "SDFG should not validate" +def test_nested_sdfg_with_transient_connector(): + + sdfg = dace.SDFG('nested_main') + sdfg.add_array('A', [2], dace.float32) + + def mystate(state, src, dst): + src_node = state.add_read(src) + dst_node = state.add_write(dst) + tasklet = state.add_tasklet('aaa2', {'a'}, {'b'}, 'b = a + 1') + + # input path (src->tasklet[a]) + state.add_memlet_path(src_node, tasklet, dst_conn='a', memlet=dace.Memlet(data=src, subset='0')) + # output path (tasklet[b]->dst) + state.add_memlet_path(tasklet, dst_node, src_conn='b', memlet=dace.Memlet(data=dst, subset='0')) + + + sub_sdfg = dace.SDFG('nested_sub') + sub_sdfg.add_scalar('sA', dace.float32) + sub_sdfg.add_scalar('sB', dace.float32, transient=True) + sub_sdfg.add_scalar('sC', dace.float32, transient=True) + + state0 = sub_sdfg.add_state('subs0') + mystate(state0, 'sA', 'sB') + state1 = sub_sdfg.add_state('subs1') + mystate(state1, 'sB', 'sC') + + sub_sdfg.add_edge(state0, state1, dace.InterstateEdge()) + + + state = sdfg.add_state('s0') + me, mx = state.add_map('mymap', dict(k='0:2')) + nsdfg = state.add_nested_sdfg(sub_sdfg, sdfg, {'sA'}, {'sC'}) + Ain = state.add_read('A') + Aout = state.add_write('A') + + state.add_memlet_path(Ain, me, nsdfg, memlet=dace.Memlet(data='A', subset='k'), dst_conn='sA') + state.add_memlet_path(nsdfg, mx, Aout, memlet=dace.Memlet(data='A', subset='k'), src_conn='sC') + + try: + sdfg.validate() + except dace.sdfg.InvalidSDFGError: + return 
+ + assert False, "SDFG should not validate" + + if __name__ == "__main__": test_inout_connector_validation_success() test_inout_connector_validation_fail() + test_nested_sdfg_with_transient_connector() From b47d82b72decce012b088602acc9b8290da04f8e Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 3 Aug 2023 13:55:34 +0200 Subject: [PATCH 113/127] Add fix plus testcase --- dace/frontend/fortran/fortran_parser.py | 1 + tests/fortran/array_test.py | 50 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index 6d1be7138a..d7112892fe 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -463,6 +463,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, if i.type == "ALL": shape.append(array.shape[indices]) mysize = mysize * array.shape[indices] + index_list.append(None) else: raise NotImplementedError("Index in ParDecl should be ALL") else: diff --git a/tests/fortran/array_test.py b/tests/fortran/array_test.py index 8685628012..a8ece680a6 100644 --- a/tests/fortran/array_test.py +++ b/tests/fortran/array_test.py @@ -11,6 +11,7 @@ from dace.frontend.fortran import fortran_parser from fparser.two.symbol_table import SymbolTable from dace.sdfg import utils as sdutil +from dace.sdfg.nodes import AccessNode import dace.frontend.fortran.ast_components as ast_components import dace.frontend.fortran.ast_transforms as ast_transforms @@ -167,6 +168,54 @@ def test_fortran_frontend_input_output_connector(): assert (a[1, 2] == 0) +def test_fortran_frontend_memlet_in_map_test(): + """ + Tests that no assumption is made where the iteration variable is inside a memlet subset + """ + test_string = """ + PROGRAM memlet_range_test + implicit None + REAL INP(100, 10) + REAL OUT(100, 10) + CALL memlet_range_test_routine(INP, OUT) + END PROGRAM + + SUBROUTINE memlet_range_test_routine(INP, OUT) + REAL 
INP(100, 10) + REAL OUT(100, 10) + DO I=1,100 + CALL inner_loops(INP(I, :), OUT(I, :)) + ENDDO + END SUBROUTINE memlet_range_test_routine + + SUBROUTINE inner_loops(INP, OUT) + REAL INP(10) + REAL OUT(10) + DO J=1,10 + OUT(J) = INP(J) + 1 + ENDDO + END SUBROUTINE inner_loops + + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "memlet_range_test") + sdfg.simplify() + # Expect that start is begin of for loop -> only one out edge to guard defining iterator variable + assert len(sdfg.out_edges(sdfg.start_state)) == 1 + iter_var = symbolic.symbol(list(sdfg.out_edges(sdfg.start_state)[0].data.assignments.keys())[0]) + + for state in sdfg.states(): + if len(state.nodes()) > 1: + for node in state.nodes(): + if isinstance(node, AccessNode) and node.data in ['INP', 'OUT']: + edges = [*state.in_edges(node), *state.out_edges(node)] + # There should be only one edge in/to the access node + assert len(edges) == 1 + memlet = edges[0].data + # Check that the correct memlet has the iteration variable + assert memlet.subset[0] == (iter_var, iter_var, 1) + assert memlet.subset[1] == (1, 10, 1) + + if __name__ == "__main__": test_fortran_frontend_array_3dmap() @@ -174,3 +223,4 @@ def test_fortran_frontend_input_output_connector(): test_fortran_frontend_input_output_connector() test_fortran_frontend_array_ranges() test_fortran_frontend_twoconnector() + test_fortran_frontend_memlet_in_map_test() From 4c824a310a53c2aefd6d03113dda091f4c48bad8 Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 3 Aug 2023 13:59:20 +0200 Subject: [PATCH 114/127] Tried to undo wrong update of dependency --- dace/external/hlslib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/external/hlslib b/dace/external/hlslib index 1403cd016c..1b5b3aee5d 160000 --- a/dace/external/hlslib +++ b/dace/external/hlslib @@ -1 +1 @@ -Subproject commit 1403cd016ce63a9961eeb3899bea70c873a929ce +Subproject commit 1b5b3aee5dab19adcc443fa9a7cd45244bd246b1 From 
7171ecc79c716137465a4e05e5dd204ab7bba2d8 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 3 Aug 2023 22:04:07 -0700 Subject: [PATCH 115/127] Fix for None set properties (#1345) --- dace/properties.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dace/properties.py b/dace/properties.py index 6e883f8549..951a0564cc 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -888,9 +888,13 @@ def from_string(s): return [eval(i) for i in re.sub(r"[\{\}\(\)\[\]]", "", s).split(",")] def to_json(self, l): + if l is None: + return None return list(sorted(l)) def from_json(self, l, sdfg=None): + if l is None: + return None return set(l) def __get__(self, obj, objtype=None): From 425fed6ba3941d4fa46499e69eec6c0702522137 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Fri, 4 Aug 2023 07:49:20 -0700 Subject: [PATCH 116/127] Add Object to defined types in code generation and some documentation (#1343) Co-authored-by: acalotoiu <61420859+acalotoiu@users.noreply.github.com> --- dace/codegen/dispatcher.py | 13 +++++++------ dace/codegen/targets/cpp.py | 6 +++--- dace/codegen/targets/cpu.py | 2 +- samples/codegen/tensor_cores.py | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 0b4f58d5ef..be032556a0 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -19,12 +19,13 @@ class DefinedType(aenum.AutoNumberEnum): :see: DefinedMemlets """ - Pointer = () - Scalar = () - Stream = () - StreamArray = () - FPGA_ShiftRegister = () - ArrayInterface = () + Pointer = () # Pointer + Scalar = () # A copyable scalar moved by value (e.g., POD) + Object = () # An object moved by reference + Stream = () # A stream object moved by reference and accessed via a push/pop API + StreamArray = () # An array of Streams + FPGA_ShiftRegister = () # A shift-register object used in FPGA code generation + ArrayInterface = () # An object representing an interface to an array, used mostly in FPGA 
class DefinedMemlets: diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index afbc6fca12..264311a45c 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -108,7 +108,7 @@ def copy_expr( elif def_type == DefinedType.FPGA_ShiftRegister: return expr - elif def_type in [DefinedType.Scalar, DefinedType.Stream]: + elif def_type in [DefinedType.Scalar, DefinedType.Stream, DefinedType.Object]: if add_offset: raise TypeError("Tried to offset address of scalar {}: {}".format(data_name, offset_cppstr)) @@ -327,7 +327,7 @@ def make_const(expr: str) -> str: ref = '&' if is_scalar else '' defined_type = DefinedType.Scalar if is_scalar else DefinedType.Pointer offset_expr = '' - elif defined_type == DefinedType.Stream: + elif defined_type in (DefinedType.Stream, DefinedType.Object): typedef = defined_ctype ref = '&' offset_expr = '' @@ -1232,7 +1232,7 @@ def visit_Name(self, node: ast.Name): defined_type = None if (self.allow_casts and isinstance(dtype, dtypes.pointer) and memlet.subset.num_elements() == 1): return ast.parse(f"{name}[0]").body[0].value - elif (self.allow_casts and (defined_type == DefinedType.Stream or defined_type == DefinedType.StreamArray) + elif (self.allow_casts and (defined_type in (DefinedType.Stream, DefinedType.StreamArray)) and memlet.dynamic): return ast.parse(f"{name}.pop()").body[0].value else: diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 3b7b592775..9bca137d51 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -1228,7 +1228,7 @@ def memlet_ctor(self, sdfg, memlet, dtype, is_output): ptrname = cpp.ptr(memlet.data, sdfg.arrays[memlet.data], sdfg, self._frame) def_type, _ = self._dispatcher.defined_vars.get(ptrname) - if def_type in [DefinedType.Stream, DefinedType.StreamArray]: + if def_type in [DefinedType.Stream, DefinedType.Object, DefinedType.StreamArray]: return self.memlet_stream_ctor(sdfg, memlet) elif def_type in [DefinedType.Pointer, 
DefinedType.Scalar]: diff --git a/samples/codegen/tensor_cores.py b/samples/codegen/tensor_cores.py index 92ea28eacf..eaad543e6c 100644 --- a/samples/codegen/tensor_cores.py +++ b/samples/codegen/tensor_cores.py @@ -98,7 +98,7 @@ def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, # Add the ctype to defined_vars so that the codegen can properly pass # fragments to functions as an object reference. - self._dispatcher.defined_vars.add(name, DefinedType.Stream, ctype) + self._dispatcher.defined_vars.add(name, DefinedType.Object, ctype) def deallocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Array, function_stream: CodeIOStream, callsite_stream: CodeIOStream): From 20240a8552108c5939ad088d3c62c47c39da39e7 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Fri, 4 Aug 2023 07:49:49 -0700 Subject: [PATCH 117/127] Fix symbolic parsing for ternary operators (#1346) Co-authored-by: acalotoiu <61420859+acalotoiu@users.noreply.github.com> --- dace/runtime/include/dace/pyinterop.h | 5 +++++ dace/symbolic.py | 21 +++++++++++++++++++- tests/passes/scalar_to_symbol_test.py | 28 +++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/dace/runtime/include/dace/pyinterop.h b/dace/runtime/include/dace/pyinterop.h index e8f255af70..f93cbab770 100644 --- a/dace/runtime/include/dace/pyinterop.h +++ b/dace/runtime/include/dace/pyinterop.h @@ -52,5 +52,10 @@ template static DACE_HDFI T Abs(T val) { return abs(val); } +template +DACE_CONSTEXPR DACE_HDFI typename std::common_type::type IfExpr(bool condition, const T& iftrue, const U& iffalse) +{ + return condition ? 
iftrue : iffalse; +} #endif // __DACE_INTEROP_H diff --git a/dace/symbolic.py b/dace/symbolic.py index 01440d465e..0ab6e3f6ff 100644 --- a/dace/symbolic.py +++ b/dace/symbolic.py @@ -658,6 +658,21 @@ def eval(cls, x, y): def _eval_is_boolean(self): return True +class IfExpr(sympy.Function): + + @classmethod + def eval(cls, x, y, z): + """ + Evaluates a ternary operator. + + :param x: Predicate. + :param y: If true return this. + :param z: If false return this. + :return: Return value (literal or symbolic). + """ + if x.is_Boolean: + return (y if x else z) + class BitwiseAnd(sympy.Function): pass @@ -968,6 +983,9 @@ def visit_Constant(self, node): def visit_NameConstant(self, node): return self.visit_Constant(node) + def visit_IfExp(self, node): + new_node = ast.Call(func=ast.Name(id='IfExpr', ctx=ast.Load), args=[node.test, node.body, node.orelse], keywords=[]) + return ast.copy_location(new_node, node) class BitwiseOpConverter(ast.NodeTransformer): """ @@ -1050,6 +1068,7 @@ def pystr_to_symbolic(expr, symbol_map=None, simplify=None) -> sympy.Basic: 'RightShift': RightShift, 'int_floor': int_floor, 'int_ceil': int_ceil, + 'IfExpr': IfExpr, 'Mod': sympy.Mod, } # _clash1 enables all one-letter variables like N as symbols @@ -1059,7 +1078,7 @@ def pystr_to_symbolic(expr, symbol_map=None, simplify=None) -> sympy.Basic: if isinstance(expr, str): # Sympy processes "not/and/or" as direct evaluation. Replace with # And/Or(x, y), Not(x) - if re.search(r'\bnot\b|\band\b|\bor\b|\bNone\b|==|!=|\bis\b', expr): + if re.search(r'\bnot\b|\band\b|\bor\b|\bNone\b|==|!=|\bis\b|\bif\b', expr): expr = unparse(SympyBooleanConverter().visit(ast.parse(expr).body[0])) # NOTE: If the expression contains bitwise operations, replace them with user-functions. 
diff --git a/tests/passes/scalar_to_symbol_test.py b/tests/passes/scalar_to_symbol_test.py index 9ec23e3886..02cc57a204 100644 --- a/tests/passes/scalar_to_symbol_test.py +++ b/tests/passes/scalar_to_symbol_test.py @@ -666,6 +666,32 @@ def prog(inp: dace.int32[4, 2], out: dace.float64[5, 5]): sdfg.compile() +@pytest.mark.parametrize('compile_time_evaluatable', (False, True)) +def test_ternary_expression(compile_time_evaluatable): + sdfg = dace.SDFG('tester') + sdfg.add_symbol('N', dace.int32) + sdfg.add_symbol('M', dace.int32) + sdfg.add_scalar('a', dace.int32, transient=True) + state = sdfg.add_state() + + if compile_time_evaluatable: + expr = '1 if N > N else 2' + else: + expr = '1 if N > M else 2' + + # Test that symbolic conversion works + symexpr = dace.symbolic.pystr_to_symbolic(expr) + if compile_time_evaluatable: + assert symexpr == 2 + + t = state.add_tasklet('doit', {}, {'out'}, f'out = {expr}') + state.add_edge(t, 'out', state.add_access('a'), None, dace.Memlet('a[0]')) + + promoted = scalar_to_symbol.ScalarToSymbolPromotion().apply_pass(sdfg, {}) + assert promoted == {'a'} + sdfg.compile() + + if __name__ == '__main__': test_find_promotable() test_promote_simple() @@ -687,3 +713,5 @@ def prog(inp: dace.int32[4, 2], out: dace.float64[5, 5]): test_multiple_boolop() test_multidim_cpp() test_dynamic_mapind() + test_ternary_expression(False) + test_ternary_expression(True) From 22718af782d2e36ea7004aa00c79b8fce176fe03 Mon Sep 17 00:00:00 2001 From: Cliff Hodel <111381329+hodelcl@users.noreply.github.com> Date: Wed, 16 Aug 2023 14:15:03 +0200 Subject: [PATCH 118/127] Work Depth Analysis for SDFGs (#1327) * initial push of work_depth analysis script * adding tests to work_depth analysis * rename work depth analysis * todos added * code ready for PR * yapf for formatting * put tests into dace/tests/sdfg * fixed import after merge * merged propgatate_states_symbolically into propagate_states * fixed format issue in work_depth.py * small bugfix --------- 
Co-authored-by: Cliff Hodel Co-authored-by: Cliff Hodel Co-authored-by: Philipp Schaad --- dace/sdfg/propagation.py | 51 +- dace/sdfg/work_depth_analysis/helpers.py | 331 ++++++++++ dace/sdfg/work_depth_analysis/work_depth.py | 653 ++++++++++++++++++++ tests/sdfg/work_depth_tests.py | 201 ++++++ 4 files changed, 1224 insertions(+), 12 deletions(-) create mode 100644 dace/sdfg/work_depth_analysis/helpers.py create mode 100644 dace/sdfg/work_depth_analysis/work_depth.py create mode 100644 tests/sdfg/work_depth_tests.py diff --git a/dace/sdfg/propagation.py b/dace/sdfg/propagation.py index 89ba6928c7..0fec4812b7 100644 --- a/dace/sdfg/propagation.py +++ b/dace/sdfg/propagation.py @@ -10,7 +10,7 @@ import itertools import functools import sympy -from sympy import ceiling +from sympy import ceiling, Symbol from sympy.concrete.summations import Sum import warnings import networkx as nx @@ -564,8 +564,7 @@ def _annotate_loop_ranges(sdfg, unannotated_cycle_states): Annotate each valid for loop construct with its loop variable ranges. :param sdfg: The SDFG in which to look. - :param unannotated_cycle_states: List of states in cycles without valid - for loop ranges. + :param unannotated_cycle_states: List of lists. Each sub-list contains the states of one unannotated cycle. """ # We import here to avoid cyclic imports. @@ -652,7 +651,7 @@ def _annotate_loop_ranges(sdfg, unannotated_cycle_states): res = find_for_loop(sdfg, guard, begin, itervar=itvar) if res is None: # No range detected, mark as unbounded. - unannotated_cycle_states.extend(cycle) + unannotated_cycle_states.append(cycle) else: itervar, rng, _ = res @@ -674,10 +673,10 @@ def _annotate_loop_ranges(sdfg, unannotated_cycle_states): else: # There's no guard state, so this cycle marks all states in it as # dynamically unbounded. 
- unannotated_cycle_states.extend(cycle) + unannotated_cycle_states.append(cycle) -def propagate_states(sdfg) -> None: +def propagate_states(sdfg, concretize_dynamic_unbounded=False) -> None: """ Annotate the states of an SDFG with the number of executions. @@ -728,6 +727,9 @@ def propagate_states(sdfg) -> None: once. :param sdfg: The SDFG to annotate. + :param concretize_dynamic_unbounded: If True, we annotate dyncamic unbounded states with symbols of the + form "num_execs_{sdfg_id}_{loop_start_state_id}". Hence, for each + unbounded loop its states will have the same number of symbolic executions. :note: This operates on the SDFG in-place. """ @@ -759,6 +761,9 @@ def propagate_states(sdfg) -> None: # cycle should be marked as unannotated. unannotated_cycle_states = [] _annotate_loop_ranges(sdfg, unannotated_cycle_states) + if not concretize_dynamic_unbounded: + # Flatten the list. This keeps the old behavior of propagate_states. + unannotated_cycle_states = [state for cycle in unannotated_cycle_states for state in cycle] # Keep track of states that fully merge a previous conditional split. We do # this so we can remove the dynamic executions flag for those states. @@ -800,7 +805,7 @@ def propagate_states(sdfg) -> None: # The only exception to this rule: If the state is in an # unannotated loop, i.e. should be annotated as dynamic # unbounded instead, we do that. - if (state in unannotated_cycle_states): + if (not concretize_dynamic_unbounded) and state in unannotated_cycle_states: state.executions = 0 state.dynamic_executions = True else: @@ -872,17 +877,39 @@ def propagate_states(sdfg) -> None: else: # Conditional split or unannotated (dynamic unbounded) loop. unannotated_loop_edge = None - for oedge in out_edges: - if oedge.dst in unannotated_cycle_states: - # This is an unannotated loop down this branch. 
- unannotated_loop_edge = oedge + if concretize_dynamic_unbounded: + to_remove = [] + for oedge in out_edges: + for cycle in unannotated_cycle_states: + if oedge.dst in cycle: + # This is an unannotated loop down this branch. + unannotated_loop_edge = oedge + # remove cycle, since it is now annotated with symbol + to_remove.append(cycle) + + for c in to_remove: + unannotated_cycle_states.remove(c) + else: + for oedge in out_edges: + if oedge.dst in unannotated_cycle_states: + # This is an unannotated loop down this branch. + unannotated_loop_edge = oedge if unannotated_loop_edge is not None: # Traverse as an unbounded loop. out_edges.remove(unannotated_loop_edge) for oedge in out_edges: traversal_q.append((oedge.dst, state.executions, False, itvar_stack)) - traversal_q.append((unannotated_loop_edge.dst, 0, True, itvar_stack)) + if concretize_dynamic_unbounded: + # Here we introduce the num_exec symbol and propagate it down the loop. + # We can always assume these symbols to be non-negative. + traversal_q.append( + (unannotated_loop_edge.dst, + Symbol(f'num_execs_{sdfg.sdfg_id}_{sdfg.node_id(unannotated_loop_edge.dst)}', + nonnegative=True), False, itvar_stack)) + else: + # Propagate dynamic unbounded. + traversal_q.append((unannotated_loop_edge.dst, 0, True, itvar_stack)) else: # Traverse as a conditional split. proposed_executions = state.executions diff --git a/dace/sdfg/work_depth_analysis/helpers.py b/dace/sdfg/work_depth_analysis/helpers.py new file mode 100644 index 0000000000..a80e769f64 --- /dev/null +++ b/dace/sdfg/work_depth_analysis/helpers.py @@ -0,0 +1,331 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Helper functions used by the work depth analysis. 
""" + +from dace import SDFG, SDFGState, nodes +from collections import deque +from typing import List, Dict, Set, Tuple, Optional, Union +import networkx as nx + +NodeT = str +EdgeT = Tuple[NodeT, NodeT] + + +class NodeCycle: + + nodes: Set[NodeT] = [] + + def __init__(self, nodes: List[NodeT]) -> None: + self.nodes = set(nodes) + + @property + def length(self) -> int: + return len(self.nodes) + + +UUID_SEPARATOR = '/' + + +def ids_to_string(sdfg_id, state_id=-1, node_id=-1, edge_id=-1): + return (str(sdfg_id) + UUID_SEPARATOR + str(state_id) + UUID_SEPARATOR + str(node_id) + UUID_SEPARATOR + + str(edge_id)) + + +def get_uuid(element, state=None): + if isinstance(element, SDFG): + return ids_to_string(element.sdfg_id) + elif isinstance(element, SDFGState): + return ids_to_string(element.parent.sdfg_id, element.parent.node_id(element)) + elif isinstance(element, nodes.Node): + return ids_to_string(state.parent.sdfg_id, state.parent.node_id(state), state.node_id(element)) + else: + return ids_to_string(-1) + + +def get_domtree(graph: nx.DiGraph, start_node: str, idom: Dict[str, str] = None): + idom = idom or nx.immediate_dominators(graph, start_node) + + alldominated = {n: set() for n in graph.nodes} + domtree = nx.DiGraph() + + for node, dom in idom.items(): + if node is dom: + continue + domtree.add_edge(dom, node) + alldominated[dom].add(node) + + nextidom = idom[dom] + ndom = nextidom if nextidom != dom else None + + while ndom: + alldominated[ndom].add(node) + nextidom = idom[ndom] + ndom = nextidom if nextidom != ndom else None + + # 'Rank' the tree, i.e., annotate each node with the level it is on. 
+ q = deque() + q.append((start_node, 0)) + while q: + node, level = q.popleft() + domtree.add_node(node, level=level) + for s in domtree.successors(node): + q.append((s, level + 1)) + + return alldominated, domtree + + +def get_backedges(graph: nx.DiGraph, + start: Optional[NodeT], + strict: bool = False) -> Union[Set[EdgeT], Tuple[Set[EdgeT], Set[EdgeT]]]: + '''Find all backedges in a directed graph. + + Note: + This algorithm has an algorithmic complexity of O((|V|+|E|)*C) for a + graph with vertices V, edges E, and C cycles. + + Args: + graph (nx.DiGraph): The graph for which to search backedges. + start (str): Start node of the graph. If no start is provided, a node + with no incoming edges is used as the start. If no such node can + be found, a `ValueError` is raised. + + Returns: + A set of backedges in the graph. + + Raises: + ValueError: If no `start` is provided and the graph contains no nodes + with no incoming edges. + ''' + backedges = set() + eclipsed_backedges = set() + + if start is None: + for node in graph.nodes(): + if graph.in_degree(node) == 0: + start = node + break + if start is None: + raise ValueError('No start node provided and no start node could ' + 'be determined automatically') + + # Gather all cycles in the graph. Cycles are represented as a sequence of + # nodes. + # O((|V|+|E|)*(C+1)), for C cycles. + all_cycles_nx: List[List[NodeT]] = nx.cycles.simple_cycles(graph) + #all_cycles_nx: List[List[NodeT]] = nx.simple_cycles(graph) + all_cycles: Set[NodeCycle] = set() + for cycle in all_cycles_nx: + all_cycles.add(NodeCycle(cycle)) + + # Construct a dictionary mapping a node to the cycles containing that node. + # O(|V|*|C|) + cycle_map: Dict[NodeT, Set[NodeCycle]] = dict() + for cycle in all_cycles: + for node in cycle.nodes: + try: + cycle_map[node].add(cycle) + except KeyError: + cycle_map[node] = set([cycle]) + + # Do a BFS traversal of the graph to detect the back edges. 
+ # For each node that is part of an (unhandled) cycle, find the longest + # still unhandled cycle and try to use it to find the back edge for it. + bfs_frontier = [start] + visited: Set[NodeT] = set([start]) + handled_cycles: Set[NodeCycle] = set() + unhandled_cycles = all_cycles + while bfs_frontier: + node = bfs_frontier.pop(0) + pred = [p for p in graph.predecessors(node) if p not in visited] + longest_cycles: Dict[NodeT, NodeCycle] = dict() + try: + cycles = cycle_map[node] + remove_cycles = set() + for cycle in cycles: + if cycle not in handled_cycles: + for p in pred: + if p in cycle.nodes: + if p not in longest_cycles: + longest_cycles[p] = cycle + else: + if cycle.length > longest_cycles[p].length: + longest_cycles[p] = cycle + else: + remove_cycles.add(cycle) + for cycle in remove_cycles: + cycles.remove(cycle) + except KeyError: + longest_cycles = dict() + + # For the current node, find the incoming edge which belongs to the + # cycle and has not been visited yet, which indicates a backedge. + node_backedge_candidates: Set[Tuple[EdgeT, NodeCycle]] = set() + for p, longest_cycle in longest_cycles.items(): + handled_cycles.add(longest_cycle) + unhandled_cycles.remove(longest_cycle) + cycle_map[node].remove(longest_cycle) + backedge_candidates = graph.in_edges(node) + for candidate in backedge_candidates: + src = candidate[0] + dst = candidate[0] + if src not in visited and src in longest_cycle.nodes: + node_backedge_candidates.add((candidate, longest_cycle)) + if not strict: + backedges.add(candidate) + + # Make sure that any cycle containing this back edge is + # not evaluated again, i.e., mark as handled. 
+ remove_cycles = set() + for cycle in unhandled_cycles: + if src in cycle.nodes and dst in cycle.nodes: + handled_cycles.add(cycle) + remove_cycles.add(cycle) + for cycle in remove_cycles: + unhandled_cycles.remove(cycle) + + # If strict is set, we only report the longest cycle's back edges for + # any given node, and separately return any other backedges as + # 'eclipsed' backedges. In the case of a while-loop, for example, + # the loop edge is considered a backedge, while a continue inside the + # loop is considered an 'eclipsed' backedge. + if strict: + longest_candidate: Tuple[EdgeT, NodeCycle] = None + eclipsed_candidates = set() + for be_candidate in node_backedge_candidates: + if longest_candidate is None: + longest_candidate = be_candidate + elif longest_candidate[1].length < be_candidate[1].length: + eclipsed_candidates.add(longest_candidate[0]) + longest_candidate = be_candidate + else: + eclipsed_candidates.add(be_candidate[0]) + if longest_candidate is not None: + backedges.add(longest_candidate[0]) + if eclipsed_candidates: + eclipsed_backedges.update(eclipsed_candidates) + + # Continue BFS. + for neighbour in graph.successors(node): + if neighbour not in visited: + visited.add(neighbour) + bfs_frontier.append(neighbour) + + if strict: + return backedges, eclipsed_backedges + else: + return backedges + + +def find_loop_guards_tails_exits(sdfg_nx: nx.DiGraph): + """ + Detects loops in a SDFG. For each loop, it identifies (node, oNode, exit). + We know that there is a backedge from oNode to node that creates the loop and that exit is the exit state of the loop. + + :param sdfg_nx: The networkx representation of a SDFG. 
+ """ + + # preparation phase: compute dominators, backedges etc + for node in sdfg_nx.nodes(): + if sdfg_nx.in_degree(node) == 0: + start = node + break + if start is None: + raise ValueError('No start node could be determined') + + # sdfg can have multiple end nodes --> not good for postDomTree + # --> add a new end node + artificial_end_node = 'artificial_end_node' + sdfg_nx.add_node(artificial_end_node) + for node in sdfg_nx.nodes(): + if sdfg_nx.out_degree(node) == 0 and node != artificial_end_node: + # this is an end node of the sdfg + sdfg_nx.add_edge(node, artificial_end_node) + + # sanity check: + if sdfg_nx.in_degree(artificial_end_node) == 0: + raise ValueError('No end node could be determined in the SDFG') + + # compute dominators and backedges + iDoms = nx.immediate_dominators(sdfg_nx, start) + allDom, domTree = get_domtree(sdfg_nx, start, iDoms) + + reversed_sdfg_nx = sdfg_nx.reverse() + iPostDoms = nx.immediate_dominators(reversed_sdfg_nx, artificial_end_node) + allPostDoms, postDomTree = get_domtree(reversed_sdfg_nx, artificial_end_node, iPostDoms) + + backedges = get_backedges(sdfg_nx, start) + backedgesDstDict = {} + for be in backedges: + if be[1] in backedgesDstDict: + backedgesDstDict[be[1]].add(be) + else: + backedgesDstDict[be[1]] = set([be]) + + # This list will be filled with triples (node, oNode, exit), one triple for each loop construct in the SDFG. + # There will always be a backedge from oNode to node. Either node or oNode will be the corresponding loop guard, + # depending on whether it is a while-do or a do-while loop. exit will always be the exit state of the loop. + nodes_oNodes_exits = [] + + # iterate over all nodes + for node in sdfg_nx.nodes(): + # Check if any backedge ends in node. 
+ if node in backedgesDstDict: + inc_backedges = backedgesDstDict[node] + + # gather all successors of node that are not reached by backedges + successors = [] + for edge in sdfg_nx.out_edges(node): + if not edge in backedges: + successors.append(edge[1]) + + # For each incoming backedge, we want to find oNode and exit. There can be multiple backedges, in case + # we have a continue statement in the original code. But we can handle these backedges normally. + for be in inc_backedges: + # since node has an incoming backedge, it is either a loop guard or loop tail + # oNode will exactly be the other thing + oNode = be[0] + exitCandidates = set() + # search for exit candidates: + # a state is a exit candidate if: + # - it is in successor and it does not dominate oNode (else it dominates + # the last loop state, and hence is inside the loop itself) + # - is is a successor of oNode (but not node) + # This handles both cases of while-do and do-while loops + for succ in successors: + if succ != oNode and oNode not in allDom[succ]: + exitCandidates.add(succ) + for succ in sdfg_nx.successors(oNode): + if succ != node: + exitCandidates.add(succ) + + if len(exitCandidates) == 0: + raise ValueError('failed to find any exit nodes') + elif len(exitCandidates) > 1: + # Find the exit candidate that sits highest up in the + # postdominator tree (i.e., has the lowest level). + # That must be the exit node (it must post-dominate) + # everything inside the loop. If there are multiple + # candidates on the lowest level (i.e., disjoint set of + # postdominated nodes), there are multiple exit paths, + # and they all share one level. 
+ cand = exitCandidates.pop() + minSet = set([cand]) + minLevel = nx.get_node_attributes(postDomTree, 'level')[cand] + for cand in exitCandidates: + curr_level = nx.get_node_attributes(postDomTree, 'level')[cand] + if curr_level < minLevel: + # new minimum found + minLevel = curr_level + minSet.clear() + minSet.add(cand) + elif curr_level == minLevel: + # add cand to curr set + minSet.add(cand) + + if len(minSet) > 0: + exitCandidates = minSet + else: + raise ValueError('failed to find exit minSet') + + # now we have a triple (node, oNode, exitCandidates) + nodes_oNodes_exits.append((node, oNode, exitCandidates)) + + return nodes_oNodes_exits diff --git a/dace/sdfg/work_depth_analysis/work_depth.py b/dace/sdfg/work_depth_analysis/work_depth.py new file mode 100644 index 0000000000..a05fe10266 --- /dev/null +++ b/dace/sdfg/work_depth_analysis/work_depth.py @@ -0,0 +1,653 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Work depth analysis for any input SDFG. Can be used with the DaCe VS Code extension or +from command line as a Python script. """ + +import argparse +from collections import deque +from dace.sdfg import nodes as nd, propagation, InterstateEdge +from dace import SDFG, SDFGState, dtypes +from dace.subsets import Range +from typing import Tuple, Dict +import os +import sympy as sp +from copy import deepcopy +from dace.libraries.blas import MatMul +from dace.libraries.standard import Reduce, Transpose +from dace.symbolic import pystr_to_symbolic +import ast +import astunparse +import warnings + +from dace.sdfg.work_depth_analysis.helpers import get_uuid, find_loop_guards_tails_exits + + +def get_array_size_symbols(sdfg): + """ + Returns all symbols that appear isolated in shapes of the SDFG's arrays. + These symbols can then be assumed to be positive. + + :note: This only works if a symbol appears in isolation, i.e. array A[N]. + If we have A[N+1], we cannot assume N to be positive. 
+ :param sdfg: The SDFG in which it searches for symbols. + :return: A set containing symbols which we can assume to be positive. + """ + symbols = set() + for _, _, arr in sdfg.arrays_recursive(): + for s in arr.shape: + if isinstance(s, sp.Symbol): + symbols.add(s) + return symbols + + +def posify_certain_symbols(expr, syms_to_posify): + """ + Takes an expression and evaluates it while assuming that certain symbols are positive. + + :param expr: The expression to evaluate. + :param syms_to_posify: List of symbols we assume to be positive. + :note: This is adapted from the Sympy function posify. + """ + + expr = sp.sympify(expr) + + reps = {s: sp.Dummy(s.name, positive=True, **s.assumptions0) for s in syms_to_posify if s.is_positive is None} + expr = expr.subs(reps) + return expr.subs({r: s for s, r in reps.items()}) + + +def symeval(val, symbols): + """ + Takes a sympy expression and substitutes its symbols according to a dict { old_symbol: new_symbol}. + + :param val: The expression we are updating. + :param symbols: Dictionary of key value pairs { old_symbol: new_symbol}. 
+ """ + first_replacement = {pystr_to_symbolic(k): pystr_to_symbolic('__REPLSYM_' + k) for k in symbols.keys()} + second_replacement = {pystr_to_symbolic('__REPLSYM_' + k): v for k, v in symbols.items()} + return val.subs(first_replacement).subs(second_replacement) + + +def evaluate_symbols(base, new): + result = {} + for k, v in new.items(): + result[k] = symeval(v, base) + return result + + +def count_work_matmul(node, symbols, state): + A_memlet = next(e for e in state.in_edges(node) if e.dst_conn == '_a') + B_memlet = next(e for e in state.in_edges(node) if e.dst_conn == '_b') + C_memlet = next(e for e in state.out_edges(node) if e.src_conn == '_c') + result = 2 # Multiply, add + # Batch + if len(C_memlet.data.subset) == 3: + result *= symeval(C_memlet.data.subset.size()[0], symbols) + # M*N + result *= symeval(C_memlet.data.subset.size()[-2], symbols) + result *= symeval(C_memlet.data.subset.size()[-1], symbols) + # K + result *= symeval(A_memlet.data.subset.size()[-1], symbols) + return result + + +def count_work_reduce(node, symbols, state): + result = 0 + if node.wcr is not None: + result += count_arithmetic_ops_code(node.wcr) + in_memlet = None + in_edges = state.in_edges(node) + if in_edges is not None and len(in_edges) == 1: + in_memlet = in_edges[0] + if in_memlet is not None and in_memlet.data.volume is not None: + result *= in_memlet.data.volume + else: + result = 0 + return result + + +LIBNODES_TO_WORK = { + MatMul: count_work_matmul, + Transpose: lambda *args: 0, + Reduce: count_work_reduce, +} + + +def count_depth_matmul(node, symbols, state): + # For now we set it equal to work: see comments in count_depth_reduce just below + return count_work_matmul(node, symbols, state) + + +def count_depth_reduce(node, symbols, state): + # depth of reduction is log2 of the work + # TODO: Can we actually assume this? Or is it equal to the work? + # Another thing to consider is that we essetially do NOT count wcr edges as operations for now... 
+ + # return sp.ceiling(sp.log(count_work_reduce(node, symbols, state), 2)) + # set it equal to work for now + return count_work_reduce(node, symbols, state) + + +LIBNODES_TO_DEPTH = { + MatMul: count_depth_matmul, + Transpose: lambda *args: 0, + Reduce: count_depth_reduce, +} + +bigo = sp.Function('bigo') +PYFUNC_TO_ARITHMETICS = { + 'float': 0, + 'dace.float64': 0, + 'dace.int64': 0, + 'math.exp': 1, + 'exp': 1, + 'math.tanh': 1, + 'sin': 1, + 'cos': 1, + 'tanh': 1, + 'math.sqrt': 1, + 'sqrt': 1, + 'atan2:': 1, + 'min': 0, + 'max': 0, + 'ceiling': 0, + 'floor': 0, + 'abs': 0 +} + + +class ArithmeticCounter(ast.NodeVisitor): + + def __init__(self): + self.count = 0 + + def visit_BinOp(self, node): + if isinstance(node.op, ast.MatMult): + raise NotImplementedError('MatMult op count requires shape ' + 'inference') + self.count += 1 + return self.generic_visit(node) + + def visit_UnaryOp(self, node): + self.count += 1 + return self.generic_visit(node) + + def visit_Call(self, node): + fname = astunparse.unparse(node.func)[:-1] + if fname not in PYFUNC_TO_ARITHMETICS: + print( + 'WARNING: Unrecognized python function "%s". If this is a type conversion, like "dace.float64", then this is fine.' + % fname) + return self.generic_visit(node) + self.count += PYFUNC_TO_ARITHMETICS[fname] + return self.generic_visit(node) + + def visit_AugAssign(self, node): + return self.visit_BinOp(node) + + def visit_For(self, node): + raise NotImplementedError + + def visit_While(self, node): + raise NotImplementedError + + +def count_arithmetic_ops_code(code): + ctr = ArithmeticCounter() + if isinstance(code, (tuple, list)): + for stmt in code: + ctr.visit(stmt) + elif isinstance(code, str): + ctr.visit(ast.parse(code)) + else: + ctr.visit(code) + return ctr.count + + +class DepthCounter(ast.NodeVisitor): + # so far this is identical to the ArithmeticCounter above. 
+ def __init__(self): + self.count = 0 + + def visit_BinOp(self, node): + if isinstance(node.op, ast.MatMult): + raise NotImplementedError('MatMult op count requires shape ' + 'inference') + self.count += 1 + return self.generic_visit(node) + + def visit_UnaryOp(self, node): + self.count += 1 + return self.generic_visit(node) + + def visit_Call(self, node): + fname = astunparse.unparse(node.func)[:-1] + if fname not in PYFUNC_TO_ARITHMETICS: + print( + 'WARNING: Unrecognized python function "%s". If this is a type conversion, like "dace.float64", then this is fine.' + % fname) + return self.generic_visit(node) + self.count += PYFUNC_TO_ARITHMETICS[fname] + return self.generic_visit(node) + + def visit_AugAssign(self, node): + return self.visit_BinOp(node) + + def visit_For(self, node): + raise NotImplementedError + + def visit_While(self, node): + raise NotImplementedError + + +def count_depth_code(code): + # so far this is the same as the work counter, since work = depth for each tasklet, as we can't assume any parallelism + ctr = ArithmeticCounter() + if isinstance(code, (tuple, list)): + for stmt in code: + ctr.visit(stmt) + elif isinstance(code, str): + ctr.visit(ast.parse(code)) + else: + ctr.visit(code) + return ctr.count + + +def tasklet_work(tasklet_node, state): + if tasklet_node.code.language == dtypes.Language.CPP: + for oedge in state.out_edges(tasklet_node): + return bigo(oedge.data.num_accesses) + + elif tasklet_node.code.language == dtypes.Language.Python: + return count_arithmetic_ops_code(tasklet_node.code.code) + else: + # other languages not implemented, count whole tasklet as work of 1 + warnings.warn('Work of tasklets only properly analyzed for Python or CPP. For all other ' + 'languages work = 1 will be counted for each tasklet.') + return 1 + + +def tasklet_depth(tasklet_node, state): + # TODO: how to get depth of CPP tasklets? 
+ # For now we use depth == work: + if tasklet_node.code.language == dtypes.Language.CPP: + for oedge in state.out_edges(tasklet_node): + return bigo(oedge.data.num_accesses) + if tasklet_node.code.language == dtypes.Language.Python: + return count_depth_code(tasklet_node.code.code) + else: + # other languages not implemented, count whole tasklet as work of 1 + warnings.warn('Depth of tasklets only properly analyzed for Python code. For all other ' + 'languages depth = 1 will be counted for each tasklet.') + return 1 + + +def get_tasklet_work(node, state): + return tasklet_work(node, state), -1 + + +def get_tasklet_work_depth(node, state): + return tasklet_work(node, state), tasklet_depth(node, state) + + +def get_tasklet_avg_par(node, state): + return tasklet_work(node, state), tasklet_depth(node, state) + + +def sdfg_work_depth(sdfg: SDFG, w_d_map: Dict[str, Tuple[sp.Expr, sp.Expr]], analyze_tasklet, + symbols) -> Tuple[sp.Expr, sp.Expr]: + """ + Analyze the work and depth of a given SDFG. + First we determine the work and depth of each state. Then we break loops in the state machine, such that we get a DAG. + Lastly, we compute the path with most work and the path with the most depth in order to get the total work depth. + + :param sdfg: The SDFG to analyze. + :param w_d_map: Dictionary which will save the result. + :param analyze_tasklet: Function used to analyze tasklet nodes. + :param symbols: A dictionary mapping local nested SDFG symbols to global symbols. + :return: A tuple containing the work and depth of the SDFG. + """ + + # First determine the work and depth of each state individually. + # Keep track of the work and depth for each state in a dictionary, where work and depth are multiplied by the number + # of times the state will be executed. 
+ state_depths: Dict[SDFGState, sp.Expr] = {} + state_works: Dict[SDFGState, sp.Expr] = {} + for state in sdfg.nodes(): + state_work, state_depth = state_work_depth(state, w_d_map, analyze_tasklet, symbols) + state_works[state] = sp.simplify(state_work * state.executions) + state_depths[state] = sp.simplify(state_depth * state.executions) + w_d_map[get_uuid(state)] = (state_works[state], state_depths[state]) + + # Prepare the SDFG for a depth analysis by breaking loops. This removes the edge between the last loop state and + # the guard, and instead places an edge between the last loop state and the exit state. + # This transforms the state machine into a DAG. Hence, we can find the "heaviest" and "deepest" paths in linear time. + # Additionally, construct a dummy exit state and connect every state that has no outgoing edges to it. + + # identify all loops in the SDFG + nodes_oNodes_exits = find_loop_guards_tails_exits(sdfg._nx) + + # Now we need to go over each triple (node, oNode, exits). For each triple, we + # - remove edge (oNode, node), i.e. the backward edge + # - for all exits e, add edge (oNode, e). This edge may already exist + for node, oNode, exits in nodes_oNodes_exits: + sdfg.remove_edge(sdfg.edges_between(oNode, node)[0]) + for e in exits: + if len(sdfg.edges_between(oNode, e)) == 0: + # no edge there yet + sdfg.add_edge(oNode, e, InterstateEdge()) + + # add a dummy exit to the SDFG, such that each path ends there. + dummy_exit = sdfg.add_state('dummy_exit') + for state in sdfg.nodes(): + if len(sdfg.out_edges(state)) == 0 and state != dummy_exit: + sdfg.add_edge(state, dummy_exit, InterstateEdge()) + + # These two dicts save the current length of the "heaviest", resp. "deepest", paths at each state. + work_map: Dict[SDFGState, sp.Expr] = {} + depth_map: Dict[SDFGState, sp.Expr] = {} + # The dummy state has 0 work and depth. 
+ state_depths[dummy_exit] = sp.sympify(0) + state_works[dummy_exit] = sp.sympify(0) + + # Perform a BFS traversal of the state machine and calculate the maximum work / depth at each state. Only advance to + # the next state in the BFS if all incoming edges have been visited, to ensure the maximum work / depth expressions + # have been calculated. + traversal_q = deque() + traversal_q.append((sdfg.start_state, sp.sympify(0), sp.sympify(0), None)) + visited = set() + while traversal_q: + state, depth, work, ie = traversal_q.popleft() + + if ie is not None: + visited.add(ie) + + n_depth = sp.simplify(depth + state_depths[state]) + n_work = sp.simplify(work + state_works[state]) + + # If we are analysing average parallelism, we don't search "heaviest" and "deepest" paths separately, but we want one + # single path with the least average parallelsim (of all paths with more than 0 work). + if analyze_tasklet == get_tasklet_avg_par: + if state in depth_map: # and hence als state in work_map + # if current path has 0 depth, we don't do anything. 
+ if n_depth != 0: + # see if we need to update the work and depth of the current state + # we update if avg parallelism of new incoming path is less than current avg parallelism + old_avg_par = sp.simplify(work_map[state] / depth_map[state]) + new_avg_par = sp.simplify(n_work / n_depth) + + if depth_map[state] == 0 or new_avg_par < old_avg_par: + # old value was divided by zero or new path gives actually worse avg par, then we keep new value + depth_map[state] = n_depth + work_map[state] = n_work + else: + depth_map[state] = n_depth + work_map[state] = n_work + else: + # search heaviest and deepest path separately + if state in depth_map: # and consequently also in work_map + depth_map[state] = sp.Max(depth_map[state], n_depth) + work_map[state] = sp.Max(work_map[state], n_work) + else: + depth_map[state] = n_depth + work_map[state] = n_work + + out_edges = sdfg.out_edges(state) + # only advance after all incoming edges were visited (meaning that current work depth values of state are final). + if any(iedge not in visited for iedge in sdfg.in_edges(state)): + pass + else: + for oedge in out_edges: + traversal_q.append((oedge.dst, depth_map[state], work_map[state], oedge)) + + try: + max_depth = depth_map[dummy_exit] + max_work = work_map[dummy_exit] + except KeyError: + # If we get a KeyError above, this means that the traversal never reached the dummy_exit state. + # This happens if the loops were not properly detected and broken. + raise Exception( + 'Analysis failed, since not all loops got detected. It may help to use more structured loop constructs.') + + sdfg_result = (sp.simplify(max_work), sp.simplify(max_depth)) + w_d_map[get_uuid(sdfg)] = sdfg_result + return sdfg_result + + +def scope_work_depth(state: SDFGState, + w_d_map: Dict[str, sp.Expr], + analyze_tasklet, + symbols, + entry: nd.EntryNode = None) -> Tuple[sp.Expr, sp.Expr]: + """ + Analyze the work and depth of a scope. 
+ This works by traversing through the scope analyzing the work and depth of each encountered node. + Depending on what kind of node we encounter, we do the following: + - EntryNode: Recursively analyze work depth of scope. + - Tasklet: use analyze_tasklet to get work depth of tasklet node. + - NestedSDFG: After translating its local symbols to global symbols, we analyze the nested SDFG recursively. + - LibraryNode: Library nodes are analyzed with special functions depending on their type. + Work inside a state can simply be summed up, but for the depth we need to find the longest path. Since dataflow is a DAG, + this can be done in linear time by traversing the graph in topological order. + + :param state: The state in which the scope to analyze is contained. + :param sym_map: A dictionary mapping symbols to their values. + :param entry: The entry node of the scope to analyze. If None, the entire state is analyzed. + :return: A tuple containing the work and depth of the scope. + """ + + # find the work and depth of each node + # for maps and nested SDFG, we do it recursively + work = sp.sympify(0) + max_depth = sp.sympify(0) + scope_nodes = state.scope_children()[entry] + scope_exit = None if entry is None else state.exit_node(entry) + for node in scope_nodes: + # add node to map + w_d_map[get_uuid(node, state)] = (sp.sympify(0), sp.sympify(0)) + if isinstance(node, nd.EntryNode): + # If the scope contains an entry node, we need to recursively analyze the sub-scope of the entry node first. + # The resulting work/depth are summarized into the entry node + s_work, s_depth = scope_work_depth(state, w_d_map, analyze_tasklet, symbols, node) + # add up work for whole state, but also save work for this sub-scope scope in w_d_map + work += s_work + w_d_map[get_uuid(node, state)] = (s_work, s_depth) + elif node == scope_exit: + # don't do anything for exit nodes, everthing handled already in the corresponding entry node. 
+ pass + elif isinstance(node, nd.Tasklet): + # add up work for whole state, but also save work for this node in w_d_map + t_work, t_depth = analyze_tasklet(node, state) + work += t_work + w_d_map[get_uuid(node, state)] = (sp.sympify(t_work), sp.sympify(t_depth)) + elif isinstance(node, nd.NestedSDFG): + # keep track of nested symbols: "symbols" maps local nested SDFG symbols to global symbols. + # We only want global symbols in our final work depth expressions. + nested_syms = {} + nested_syms.update(symbols) + nested_syms.update(evaluate_symbols(symbols, node.symbol_mapping)) + # Nested SDFGs are recursively analyzed first. + nsdfg_work, nsdfg_depth = sdfg_work_depth(node.sdfg, w_d_map, analyze_tasklet, nested_syms) + + # add up work for whole state, but also save work for this nested SDFG in w_d_map + work += nsdfg_work + w_d_map[get_uuid(node, state)] = (nsdfg_work, nsdfg_depth) + elif isinstance(node, nd.LibraryNode): + lib_node_work = LIBNODES_TO_WORK[type(node)](node, symbols, state) + work += lib_node_work + lib_node_depth = -1 # not analyzed + if analyze_tasklet != get_tasklet_work: + # we are analyzing depth + lib_node_depth = LIBNODES_TO_DEPTH[type(node)](node, symbols, state) + w_d_map[get_uuid(node, state)] = (lib_node_work, lib_node_depth) + + if entry is not None: + # If the scope being analyzed is a map, multiply the work by the number of iterations of the map. + if isinstance(entry, nd.MapEntry): + nmap: nd.Map = entry.map + range: Range = nmap.range + n_exec = range.num_elements_exact() + work = work * sp.simplify(n_exec) + else: + print('WARNING: Only Map scopes are supported in work analysis for now. Assuming 1 iteration.') + + # Work inside a state can simply be summed up. But now we need to find the depth of a state (i.e. longest path). + # Since dataflow graph is a DAG, this can be done in linear time. 
+ max_depth = sp.sympify(0) + # only do this if we are analyzing depth + if analyze_tasklet == get_tasklet_work_depth or analyze_tasklet == get_tasklet_avg_par: + # Calculate the maximum depth of the scope by finding the 'deepest' path from the source to the sink. This is done by + # a traversal in topological order, where each node propagates its current max depth for all incoming paths. + traversal_q = deque() + visited = set() + # find all starting nodes + if entry: + # the entry is the starting node + traversal_q.append((entry, sp.sympify(0), None)) + else: + for node in scope_nodes: + if len(state.in_edges(node)) == 0: + # This node is a start node of the traversal + traversal_q.append((node, sp.sympify(0), None)) + # this map keeps track of the length of the longest path ending at each state so far seen. + depth_map = {} + while traversal_q: + node, in_depth, in_edge = traversal_q.popleft() + + if in_edge is not None: + visited.add(in_edge) + + n_depth = sp.simplify(in_depth + w_d_map[get_uuid(node, state)][1]) + + if node in depth_map: + depth_map[node] = sp.Max(depth_map[node], n_depth) + else: + depth_map[node] = n_depth + + out_edges = state.out_edges(node) + # Only advance to next node, if all incoming edges have been visited or the current node is the entry (aka starting node). + # If the current node is the exit of the scope, we stop, such that we don't leave the scope. + if (all(iedge in visited for iedge in state.in_edges(node)) or node == entry) and node != scope_exit: + # If we encounter a nested map, we must not analyze its contents (as they have already been recursively analyzed). + # Hence, we continue from the outgoing edges of the corresponding exit. 
+ if isinstance(node, nd.EntryNode) and node != entry: + exit_node = state.exit_node(node) + # replace out_edges with the out_edges of the scope exit node + out_edges = state.out_edges(exit_node) + for oedge in out_edges: + traversal_q.append((oedge.dst, depth_map[node], oedge)) + if len(out_edges) == 0 or node == scope_exit: + # We have reached an end node --> update max_depth + max_depth = sp.Max(max_depth, depth_map[node]) + + # summarise work / depth of the whole scope in the dictionary + scope_result = (sp.simplify(work), sp.simplify(max_depth)) + w_d_map[get_uuid(state)] = scope_result + return scope_result + + +def state_work_depth(state: SDFGState, w_d_map: Dict[str, sp.Expr], analyze_tasklet, + symbols) -> Tuple[sp.Expr, sp.Expr]: + """ + Analyze the work and depth of a state. + + :param state: The state to analyze. + :param w_d_map: The result will be saved to this map. + :param analyze_tasklet: Function used to analyze tasklet nodes. + :param symbols: A dictionary mapping local nested SDFG symbols to global symbols. + :return: A tuple containing the work and depth of the state. + """ + work, depth = scope_work_depth(state, w_d_map, analyze_tasklet, symbols, None) + return work, depth + + +def analyze_sdfg(sdfg: SDFG, w_d_map: Dict[str, sp.Expr], analyze_tasklet) -> None: + """ + Analyze a given SDFG. We can either analyze work, work and depth or average parallelism. + + :note: SDFGs should have split interstate edges. This means there should be no interstate edges containing both a + condition and an assignment. + :param sdfg: The SDFG to analyze. + :param w_d_map: Dictionary of SDFG elements to (work, depth) tuples. Result will be saved in here. + :param analyze_tasklet: The function used to analyze tasklet nodes. Analyzes either just work, work and depth or average parallelism. + """ + + # deepcopy such that original sdfg not changed + sdfg = deepcopy(sdfg) + + # Run state propagation for all SDFGs recursively. 
This is necessary to determine the number of times each state + # will be executed, or to determine upper bounds for that number (such as in the case of branching) + for sd in sdfg.all_sdfgs_recursive(): + propagation.propagate_states(sd, concretize_dynamic_unbounded=True) + + # Analyze the work and depth of the SDFG. + symbols = {} + sdfg_work_depth(sdfg, w_d_map, analyze_tasklet, symbols) + + # Note: This posify could be done more often to improve performance. + array_symbols = get_array_size_symbols(sdfg) + for k, (v_w, v_d) in w_d_map.items(): + # The symeval replaces nested SDFG symbols with their global counterparts. + v_w = posify_certain_symbols(symeval(v_w, symbols), array_symbols) + v_d = posify_certain_symbols(symeval(v_d, symbols), array_symbols) + w_d_map[k] = (v_w, v_d) + + +################################################################################ +# Utility functions for running the analysis from the command line ############# +################################################################################ + + +def main() -> None: + + parser = argparse.ArgumentParser('work_depth', + usage='python work_depth.py [-h] filename --analyze {work,workDepth,avgPar}', + description='Analyze the work/depth of an SDFG.') + + parser.add_argument('filename', type=str, help='The SDFG file to analyze.') + parser.add_argument('--analyze', + choices=['work', 'workDepth', 'avgPar'], + default='workDepth', + help='Choose what to analyze. 
Default: workDepth') + + args = parser.parse_args() + + if not os.path.exists(args.filename): + print(args.filename, 'does not exist.') + exit() + + if args.analyze == 'workDepth': + analyze_tasklet = get_tasklet_work_depth + elif args.analyze == 'avgPar': + analyze_tasklet = get_tasklet_avg_par + elif args.analyze == 'work': + analyze_tasklet = get_tasklet_work + + sdfg = SDFG.from_file(args.filename) + work_depth_map = {} + analyze_sdfg(sdfg, work_depth_map, analyze_tasklet) + + if args.analyze == 'workDepth': + for k, v, in work_depth_map.items(): + work_depth_map[k] = (str(sp.simplify(v[0])), str(sp.simplify(v[1]))) + elif args.analyze == 'work': + for k, v, in work_depth_map.items(): + work_depth_map[k] = str(sp.simplify(v[0])) + elif args.analyze == 'avgPar': + for k, v, in work_depth_map.items(): + work_depth_map[k] = str(sp.simplify(v[0] / v[1]) if str(v[1]) != '0' else 0) # work / depth = avg par + + result_whole_sdfg = work_depth_map[get_uuid(sdfg)] + + print(80 * '-') + if args.analyze == 'workDepth': + print("Work:\t", result_whole_sdfg[0]) + print("Depth:\t", result_whole_sdfg[1]) + elif args.analyze == 'work': + print("Work:\t", result_whole_sdfg) + elif args.analyze == 'avgPar': + print("Average Parallelism:\t", result_whole_sdfg) + print(80 * '-') + + +if __name__ == '__main__': + main() diff --git a/tests/sdfg/work_depth_tests.py b/tests/sdfg/work_depth_tests.py new file mode 100644 index 0000000000..133afe8ae4 --- /dev/null +++ b/tests/sdfg/work_depth_tests.py @@ -0,0 +1,201 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Contains test cases for the work depth analysis. 
""" +import dace as dc +from dace.sdfg.work_depth_analysis.work_depth import analyze_sdfg, get_tasklet_work_depth +from dace.sdfg.work_depth_analysis.helpers import get_uuid +import sympy as sp + +from dace.transformation.interstate import NestSDFG +from dace.transformation.dataflow import MapExpansion + +# TODO: add tests for library nodes (e.g. reduce, matMul) + +N = dc.symbol('N') +M = dc.symbol('M') +K = dc.symbol('K') + + +@dc.program +def single_map(x: dc.float64[N], y: dc.float64[N], z: dc.float64[N]): + z[:] = x + y + + +@dc.program +def single_for_loop(x: dc.float64[N], y: dc.float64[N]): + for i in range(N): + x[i] += y[i] + + +@dc.program +def if_else(x: dc.int64[1000], y: dc.int64[1000], z: dc.int64[1000], sum: dc.int64[1]): + if x[10] > 50: + z[:] = x + y # 1000 work, 1 depth + else: + for i in range(100): # 100 work, 100 depth + sum += x[i] + + +@dc.program +def if_else_sym(x: dc.int64[N], y: dc.int64[N], z: dc.int64[N], sum: dc.int64[1]): + if x[10] > 50: + z[:] = x + y # N work, 1 depth + else: + for i in range(K): # K work, K depth + sum += x[i] + + +@dc.program +def nested_sdfg(x: dc.float64[N], y: dc.float64[N], z: dc.float64[N]): + single_map(x, y, z) + single_for_loop(x, y) + + +@dc.program +def nested_maps(x: dc.float64[N, M], y: dc.float64[N, M], z: dc.float64[N, M]): + z[:, :] = x + y + + +@dc.program +def nested_for_loops(x: dc.float64[N], y: dc.float64[K]): + for i in range(N): + for j in range(K): + x[i] += y[j] + + +@dc.program +def nested_if_else(x: dc.int64[N], y: dc.int64[N], z: dc.int64[N], sum: dc.int64[1]): + if x[10] > 50: + if x[9] > 50: + z[:] = x + y # N work, 1 depth + z[:] += 2 * x # 2*N work, 2 depth --> total outer if: 3*N work, 3 depth + else: + if y[9] > 50: + for i in range(K): + sum += x[i] # K work, K depth + else: + for j in range(M): + sum += x[j] # M work, M depth + z[:] = x + y # N work, depth 1 --> total inner else: M+N work, M+1 depth + # --> total outer else: Max(K, M+N) work, Max(K, M+1) depth + # --> total 
over both branches: Max(K, M+N, 3*N) work, Max(K, M+1, 3) depth + + +@dc.program +def max_of_positive_symbol(x: dc.float64[N]): + if x[0] > 0: + for i in range(2 * N): # work 2*N^2, depth 2*N + x += 1 + else: + for j in range(3 * N): # work 3*N^2, depth 3*N + x += 1 + # total is work 3*N^2, depth 3*N without any max + + +@dc.program +def multiple_array_sizes(x: dc.int64[N], y: dc.int64[N], z: dc.int64[N], x2: dc.int64[M], y2: dc.int64[M], + z2: dc.int64[M], x3: dc.int64[K], y3: dc.int64[K], z3: dc.int64[K]): + if x[0] > 0: + z[:] = 2 * x + y # work 2*N, depth 2 + elif x[1] > 0: + z2[:] = 2 * x2 + y2 # work 2*M + 3, depth 5 + z2[0] += 3 + z[1] + z[2] + elif x[2] > 0: + z3[:] = 2 * x3 + y3 # work 2*K, depth 2 + elif x[3] > 0: + z[:] = 3 * x + y + 1 # work 3*N, depth 3 + # --> work= Max(3*N, 2*M, 2*K) and depth = 5 + + +@dc.program +def unbounded_while_do(x: dc.float64[N]): + while x[0] < 100: + x += 1 + + +@dc.program +def unbounded_do_while(x: dc.float64[N]): + while True: + x += 1 + if x[0] >= 100: + break + + +@dc.program +def unbounded_nonnegify(x: dc.float64[N]): + while x[0] < 100: + if x[1] < 42: + x += 3 * x + else: + x += x + + +@dc.program +def continue_for_loop(x: dc.float64[N]): + for i in range(N): + if x[i] > 100: + continue + x += 1 + + +@dc.program +def break_for_loop(x: dc.float64[N]): + for i in range(N): + if x[i] > 100: + break + x += 1 + + +@dc.program +def break_while_loop(x: dc.float64[N]): + while x[0] > 10: + if x[1] > 100: + break + x += 1 + + +tests_cases = [ + (single_map, (N, 1)), + (single_for_loop, (N, N)), + (if_else, (1000, 100)), + (if_else_sym, (sp.Max(K, N), sp.Max(1, K))), + (nested_sdfg, (2 * N, N + 1)), + (nested_maps, (M * N, 1)), + (nested_for_loops, (K * N, K * N)), + (nested_if_else, (sp.Max(K, 3 * N, M + N), sp.Max(3, K, M + 1))), + (max_of_positive_symbol, (3 * N**2, 3 * N)), + (multiple_array_sizes, (sp.Max(2 * K, 3 * N, 2 * M + 3), 5)), + (unbounded_while_do, (sp.Symbol('num_execs_0_2', nonnegative=True) * N, 
sp.Symbol('num_execs_0_2', + nonnegative=True))), + # We get this Max(1, num_execs), since it is a do-while loop, but the num_execs symbol does not capture this. + (unbounded_do_while, (sp.Max(1, sp.Symbol('num_execs_0_1', nonnegative=True)) * N, + sp.Max(1, sp.Symbol('num_execs_0_1', nonnegative=True)))), + (unbounded_nonnegify, (2 * sp.Symbol('num_execs_0_7', nonnegative=True) * N, + 2 * sp.Symbol('num_execs_0_7', nonnegative=True))), + (continue_for_loop, (sp.Symbol('num_execs_0_6', nonnegative=True) * N, sp.Symbol('num_execs_0_6', + nonnegative=True))), + (break_for_loop, (N**2, N)), + (break_while_loop, (sp.Symbol('num_execs_0_5', nonnegative=True) * N, sp.Symbol('num_execs_0_5', nonnegative=True))) +] + + +def test_work_depth(): + good = 0 + failed = 0 + exception = 0 + failed_tests = [] + for test, correct in tests_cases: + w_d_map = {} + sdfg = test.to_sdfg() + if 'nested_sdfg' in test.name: + sdfg.apply_transformations(NestSDFG) + if 'nested_maps' in test.name: + sdfg.apply_transformations(MapExpansion) + + analyze_sdfg(sdfg, w_d_map, get_tasklet_work_depth) + res = w_d_map[get_uuid(sdfg)] + # check result + assert correct == res + + +if __name__ == '__main__': + test_work_depth() From 1cb9f9fa459390df0267b1f9365bb62793563b95 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 17 Aug 2023 13:58:33 +0200 Subject: [PATCH 119/127] Added support for StructureViews. 
--- dace/codegen/compiled_sdfg.py | 2 +- dace/codegen/dispatcher.py | 4 ++-- dace/codegen/targets/cpu.py | 20 ++++++++++++++++---- dace/codegen/targets/framecode.py | 2 +- dace/data.py | 1 + dace/sdfg/utils.py | 2 +- 6 files changed, 22 insertions(+), 9 deletions(-) diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 863e804802..9ee0772eeb 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -473,7 +473,7 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: else: warnings.warn(f'Casting scalar argument "{a}" from {type(arg).__name__} to {atype.dtype.type}') arglist[i] = atype.dtype.type(arg) - elif (isinstance(atype, dt.Array) and isinstance(arg, np.ndarray) + elif (isinstance(atype, dt.Array) and isinstance(arg, np.ndarray) and not isinstance(atype, dt.StructArray) and atype.dtype.as_numpy_dtype() != arg.dtype): # Make exception for vector types if (isinstance(atype.dtype, dtypes.vector) and atype.dtype.vtype.as_numpy_dtype() == arg.dtype): diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 0b4f58d5ef..5972f5759d 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -504,11 +504,11 @@ def get_copy_dispatcher(self, src_node, dst_node, edge, sdfg, state): dst_is_data = True # Skip copies to/from views where edge matches - if src_is_data and isinstance(src_node.desc(sdfg), dt.View): + if src_is_data and isinstance(src_node.desc(sdfg), (dt.StructureView, dt.View)): e = sdutil.get_view_edge(state, src_node) if e is edge: return None - if dst_is_data and isinstance(dst_node.desc(sdfg), dt.View): + if dst_is_data and isinstance(dst_node.desc(sdfg), (dt.StructureView, dt.View)): e = sdutil.get_view_edge(state, dst_node) if e is edge: return None diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 3cd262e050..1fa4778806 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -215,9 +215,21 @@ def 
allocate_view(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.A ancestor=0, is_write=is_write) if not declared: - declaration_stream.write(f'{atype} {aname};', sdfg, state_id, node) ctypedef = dtypes.pointer(nodedesc.dtype).ctype self._dispatcher.declared_arrays.add(aname, DefinedType.Pointer, ctypedef) + if isinstance(nodedesc, data.StructureView): + for k, v in nodedesc.members.items(): + if isinstance(v, data.Data): + ctypedef = dtypes.pointer(v.dtype).ctype if isinstance(v, data.Array) else v.dtype.ctype + defined_type = DefinedType.Scalar if isinstance(v, data.Scalar) else DefinedType.Pointer + self._dispatcher.declared_arrays.add(f"{name}.{k}", defined_type, ctypedef) + self._dispatcher.defined_vars.add(f"{name}.{k}", defined_type, ctypedef) + # TODO: Find a better way to do this (the issue is with pointers of pointers) + if atype.endswith('*'): + atype = atype[:-1] + if value.startswith('&'): + value = value[1:] + declaration_stream.write(f'{atype} {aname};', sdfg, state_id, node) allocation_stream.write(f'{aname} = {value};', sdfg, state_id, node) def allocate_reference(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.AccessNode, @@ -311,7 +323,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d if not isinstance(nodedesc.dtype, dtypes.opaque): arrsize_bytes = arrsize * nodedesc.dtype.bytes - if isinstance(nodedesc, data.Structure): + if isinstance(nodedesc, data.Structure) and not isinstance(nodedesc, data.StructureView): declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type}();\n") define_var(name, DefinedType.Pointer, nodedesc.ctype) for k, v in nodedesc.members.items(): @@ -322,7 +334,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d self.allocate_array(sdfg, dfg, state_id, nodes.AccessNode(f"{name}.{k}"), v, function_stream, declaration_stream, allocation_stream) return - if isinstance(nodedesc, data.View): + if 
isinstance(nodedesc, (data.StructureView, data.View)): return self.allocate_view(sdfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) if isinstance(nodedesc, data.Reference): return self.allocate_reference(sdfg, dfg, state_id, node, function_stream, declaration_stream, @@ -487,7 +499,7 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, dtypes.AllocationLifetime.External) self._dispatcher.declared_arrays.remove(alloc_name, is_global=is_global) - if isinstance(nodedesc, (data.Scalar, data.View, data.Stream, data.Reference)): + if isinstance(nodedesc, (data.Scalar, data.StructureView, data.View, data.Stream, data.Reference)): return elif (nodedesc.storage == dtypes.StorageType.CPU_Heap or (nodedesc.storage == dtypes.StorageType.Register and symbolic.issymbolic(arrsize, sdfg.constants))): diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 52915f51b5..9ee5c2ef17 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -749,7 +749,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): instances = access_instances[sdfg.sdfg_id][name] # A view gets "allocated" everywhere it appears - if isinstance(desc, data.View): + if isinstance(desc, (data.StructureView, data.View)): for s, n in instances: self.to_allocate[s].append((sdfg, s, n, False, True, False)) self.to_allocate[s].append((sdfg, s, n, False, False, True)) diff --git a/dace/data.py b/dace/data.py index 99d7ffc774..bf771db1d4 100644 --- a/dace/data.py +++ b/dace/data.py @@ -510,6 +510,7 @@ def validate(self): if self.lifetime != dtypes.AllocationLifetime.Scope: raise ValueError('Only Scope allocation lifetime is supported for Views') + @make_properties class Scalar(Data): """ Data descriptor of a scalar value. 
""" diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index d08518b10c..3396335ece 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1396,7 +1396,7 @@ def is_nonfree_sym_dependent(node: nd.AccessNode, desc: dt.Data, state: SDFGStat :param state: the state that contains the node :param fsymbols: the free symbols to check against """ - if isinstance(desc, dt.View): + if isinstance(desc, (dt.StructureView, dt.View)): # Views can be non-free symbol dependent due to the adjacent edges. e = get_view_edge(state, node) if e.data: From 5a2c4602c2341f057a5159c3cbe2437f33ab24e8 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 17 Aug 2023 13:58:58 +0200 Subject: [PATCH 120/127] Added tests for StructArrays. --- tests/sdfg/data/struct_array_test.py | 184 +++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/sdfg/data/struct_array_test.py diff --git a/tests/sdfg/data/struct_array_test.py b/tests/sdfg/data/struct_array_test.py new file mode 100644 index 0000000000..9b40379e53 --- /dev/null +++ b/tests/sdfg/data/struct_array_test.py @@ -0,0 +1,184 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+import ctypes +import dace +import numpy as np + +from scipy import sparse + + +def test_read_struct_array(): + + L, M, N, nnz = (dace.symbol(s) for s in ('L', 'M', 'N', 'nnz')) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], + name='CSRMatrix') + csr_obj_view = dace.data.StructureView(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], + name='CSRMatrix', + transient=True) + + sdfg = dace.SDFG('array_of_csr_to_dense') + + sdfg.add_datadesc('A', csr_obj[L]) + sdfg.add_array('B', [L, M, N], dace.float32) + + sdfg.add_datadesc('vcsr', csr_obj_view) + sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) + + state = sdfg.add_state() + + A = state.add_access('A') + B = state.add_access('B') + + bme, bmx = state.add_map('b', dict(b='0:L')) + bme.map.schedule = dace.ScheduleType.Sequential + + vcsr = state.add_access('vcsr') + indptr = state.add_access('vindptr') + indices = state.add_access('vindices') + data = state.add_access('vdata') + + state.add_memlet_path(A, bme, vcsr, dst_conn='views', memlet=dace.Memlet(data='A', subset='b')) + state.add_edge(vcsr, None, indptr, 'views', memlet=dace.Memlet.from_array('vcsr.indptr', csr_obj.members['indptr'])) + state.add_edge(vcsr, None, indices, 'views', memlet=dace.Memlet.from_array('vcsr.indices', csr_obj.members['indices'])) + state.add_edge(vcsr, None, data, 'views', memlet=dace.Memlet.from_array('vcsr.data', csr_obj.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + jme.add_in_connector('stop') + t = 
state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='vindices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, bmx, B, memlet=dace.Memlet(data='B', subset='b, 0:M, 0:N', volume=1), src_conn='__out') + + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = np.ndarray((10,), dtype=sparse.csr_matrix) + dace_A = np.ndarray((10,), dtype=ctypes.c_void_p) + B = np.zeros((10, 20, 20), dtype=np.float32) + + ctypes_A = [] + for b in range(10): + A[b] = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + ctypes_obj = csr_obj.dtype._typeclass.as_ctypes()(indptr=A[b].indptr.__array_interface__['data'][0], + indices=A[b].indices.__array_interface__['data'][0], + data=A[b].data.__array_interface__['data'][0]) + ctypes_A.append(ctypes_obj) # This is needed to keep the object alive ... 
+ dace_A[b] = ctypes.addressof(ctypes_obj) + + func(A=dace_A, B=B, L=A.shape[0], M=A[0].shape[0], N=A[0].shape[1], nnz=A[0].nnz) + ref = np.ndarray((10, 20, 20), dtype=np.float32) + for b in range(10): + ref[b] = A[b].toarray() + + assert np.allclose(B, ref) + + +def test_write_struct_array(): + + L, M, N, nnz = (dace.symbol(s) for s in ('L', 'M', 'N', 'nnz')) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], + name='CSRMatrix') + csr_obj_view = dace.data.StructureView(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], + name='CSRMatrix', + transient=True) + + sdfg = dace.SDFG('array_dense_to_csr') + + sdfg.add_array('A', [L, M, N], dace.float32) + sdfg.add_datadesc('B', csr_obj[L]) + + sdfg.add_datadesc('vcsr', csr_obj_view) + sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) + + # Make If + if_before = sdfg.add_state('if_before') + if_guard = sdfg.add_state('if_guard') + if_body = sdfg.add_state('if_body') + if_after = sdfg.add_state('if_after') + sdfg.add_edge(if_before, if_guard, dace.InterstateEdge()) + sdfg.add_edge(if_guard, if_body, dace.InterstateEdge(condition='A[k, i, j] != 0')) + sdfg.add_edge(if_body, if_after, dace.InterstateEdge(assignments={'idx': 'idx + 1'})) + sdfg.add_edge(if_guard, if_after, dace.InterstateEdge(condition='A[k, i, j] == 0')) + A = if_body.add_access('A') + vcsr = if_body.add_access('vcsr') + B = if_body.add_access('B') + indices = if_body.add_access('vindices') + data = if_body.add_access('vdata') + if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='k, i, j', other_subset='idx')) + if_body.add_edge(data, 'views', vcsr, None, 
dace.Memlet(data='vcsr.data', subset='0:nnz')) + t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') + if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='vindices', subset='idx')) + if_body.add_edge(indices, 'views', vcsr, None, dace.Memlet(data='vcsr.indices', subset='0:nnz')) + if_body.add_edge(vcsr, 'views', B, None, dace.Memlet(data='B', subset='k')) + # Make For Loop for j + j_before, j_guard, j_after = sdfg.add_loop(None, + if_before, + None, + 'j', + '0', + 'j < N', + 'j + 1', + loop_end_state=if_after) + # Make For Loop for i + i_before, i_guard, i_after = sdfg.add_loop(None, j_before, None, 'i', '0', 'i < M', 'i + 1', loop_end_state=j_after) + sdfg.start_state = sdfg.node_id(i_before) + i_before_guard = sdfg.edges_between(i_before, i_guard)[0] + i_before_guard.data.assignments['idx'] = '0' + vcsr = i_guard.add_access('vcsr') + B = i_guard.add_access('B') + indptr = i_guard.add_access('vindptr') + t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') + i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='i')) + i_guard.add_edge(indptr, 'views', vcsr, None, dace.Memlet(data='vcsr.indptr', subset='0:M+1')) + i_guard.add_edge(vcsr, 'views', B, None, dace.Memlet(data='B', subset='k')) + vcsr = i_after.add_access('vcsr') + B = i_after.add_access('B') + indptr = i_after.add_access('vindptr') + t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') + i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) + i_after.add_edge(indptr, 'views', vcsr, None, dace.Memlet(data='vcsr.indptr', subset='0:M+1')) + i_after.add_edge(vcsr, 'views', B, None, dace.Memlet(data='B', subset='k')) + + k_before, k_guard, k_after = sdfg.add_loop(None, i_before, None, 'k', '0', 'k < L', 'k + 1', loop_end_state=i_after) + + func = sdfg.compile() + + rng = np.random.default_rng(42) + B = np.ndarray((10,), dtype=sparse.csr_matrix) + dace_B = np.ndarray((10,), dtype=ctypes.c_void_p) + A 
= np.empty((10, 20, 20), dtype=np.float32) + + ctypes_B = [] + for b in range(10): + B[b] = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + A[b] = B[b].toarray() + nnz = B[b].nnz + B[b].indptr[:] = -1 + B[b].indices[:] = -1 + B[b].data[:] = -1 + ctypes_obj = csr_obj.dtype._typeclass.as_ctypes()(indptr=B[b].indptr.__array_interface__['data'][0], + indices=B[b].indices.__array_interface__['data'][0], + data=B[b].data.__array_interface__['data'][0]) + ctypes_B.append(ctypes_obj) # This is needed to keep the object alive ... + dace_B[b] = ctypes.addressof(ctypes_obj) + + func(A=A, B=dace_B, L=B.shape[0], M=B[0].shape[0], N=B[0].shape[1], nnz=nnz) + for b in range(10): + assert np.allclose(A[b], B[b].toarray()) + + +if __name__ == '__main__': + test_read_struct_array() + test_write_struct_array() From f1b0c73dffee4468119cd1575edecc9f1fa7bdab Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 17 Aug 2023 15:15:24 +0200 Subject: [PATCH 121/127] Fixed serialization. 
--- dace/data.py | 22 +++++++++++++++++++++- dace/properties.py | 2 +- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/dace/data.py b/dace/data.py index bf771db1d4..37d532ac44 100644 --- a/dace/data.py +++ b/dace/data.py @@ -1102,9 +1102,29 @@ def __init__(self, pool=False): self.stype = stype - dtype = stype.dtype + if stype: + dtype = stype.dtype + else: + dtype = dtypes.int8 super(StructArray, self).__init__(dtype, shape, transient, allow_conflicts, storage, location, strides, offset, may_alias, lifetime, alignment, debuginfo, total_size, start_offset, optional, pool) + + @classmethod + def from_json(cls, json_obj, context=None): + # Create dummy object + ret = cls(None, ()) + serialize.set_properties_from_json(ret, json_obj, context=context) + + # Default shape-related properties + if not ret.offset: + ret.offset = [0] * len(ret.shape) + if not ret.strides: + # Default strides are C-ordered + ret.strides = [_prod(ret.shape[i + 1:]) for i in range(len(ret.shape))] + if ret.total_size == 0: + ret.total_size = _prod(ret.shape) + + return ret @make_properties diff --git a/dace/properties.py b/dace/properties.py index fb37ec7a7c..0bec65d0ec 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -1408,7 +1408,7 @@ def to_string(obj): def to_json(self, obj): if obj is None: return None - return obj.dtype.to_json() + return obj.to_json() @staticmethod def from_json(obj, context=None): From 82c2bb82315fdb94a2033b84295ed888859c5b62 Mon Sep 17 00:00:00 2001 From: Tiziano De Matteis <5871117+TizianoDeMatteis@users.noreply.github.com> Date: Mon, 21 Aug 2023 16:44:27 +0200 Subject: [PATCH 122/127] Have memory type as argument for fpga auto interleave (#1352) Co-authored-by: Tiziano De Matteis --- dace/transformation/auto/fpga.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dace/transformation/auto/fpga.py b/dace/transformation/auto/fpga.py index 4295699cdb..573341e1f6 100644 --- a/dace/transformation/auto/fpga.py +++ 
b/dace/transformation/auto/fpga.py @@ -44,24 +44,28 @@ def fpga_global_to_local(sdfg: SDFG, max_size: int = 1048576) -> None: print(f'Applied {len(converted)} Global-To-Local{": " if len(converted)>0 else "."} {", ".join(converted)}') -def fpga_rr_interleave_containers_to_banks(sdfg: SDFG, num_banks: int = 4): +def fpga_rr_interleave_containers_to_banks(sdfg: SDFG, num_banks: int = 4, memory_type: str = "DDR"): """ Allocates the (global) arrays to FPGA off-chip memory banks, interleaving them in a Round-Robin (RR) fashion. This applies to all the arrays in the SDFG hierarchy. :param sdfg: The SDFG to operate on. :param num_banks: number of off-chip memory banks to consider + :param memory_type: type of off-chip memory, either "DDR" or "HBM" (if the target FPGA supports it) :return: a list containing the number of (transient) arrays allocated to each bank :note: Operates in-place on the SDFG. """ + if memory_type.upper() not in {"DDR", "HBM"}: + raise ValueError("Memory type should be either \"DDR\" or \"HBM\"") + # keep track of memory allocated to each bank num_allocated = [0 for i in range(num_banks)] i = 0 for sd, aname, desc in sdfg.arrays_recursive(): if not isinstance(desc, dt.Stream) and desc.storage == dtypes.StorageType.FPGA_Global and desc.transient: - desc.location["memorytype"] = "ddr" + desc.location["memorytype"] = memory_type.upper() desc.location["bank"] = str(i % num_banks) num_allocated[i % num_banks] = num_allocated[i % num_banks] + 1 i = i + 1 From c5889a4e3092a89a5466f6b8c2fe29d3ea3ad1a1 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 21 Aug 2023 17:20:43 +0200 Subject: [PATCH 123/127] Addressed comments. 
--- dace/codegen/targets/cpp.py | 2 ++ dace/codegen/targets/cpu.py | 15 +++++++++------ dace/data.py | 6 +++--- dace/dtypes.py | 2 +- dace/properties.py | 8 +++++--- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 093a324d9a..d3d4f50ccd 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -370,6 +370,8 @@ def make_const(expr: str) -> str: # Register defined variable dispatcher.defined_vars.add(pointer_name, defined_type, typedef, allow_shadowing=True) + # NOTE: `expr` may only be a name or a sequence of names and dots. The latter indicates nested data and structures. + # NOTE: Since structures are implemented as pointers, we replace dots with arrows. expr = expr.replace('.', '->') return (typedef + ref, pointer_name, expr) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 20615a3136..0464672390 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -55,10 +55,13 @@ def __init__(self, frame_codegen, sdfg): # Keep track of generated NestedSDG, and the name of the assigned function self._generated_nested_sdfg = dict() + # NOTE: Multi-nesting with StructArrays must be further investigated. 
def _visit_structure(struct: data.Structure, args: dict, prefix: str = ''): for k, v in struct.members.items(): if isinstance(v, data.Structure): _visit_structure(v, args, f'{prefix}.{k}') + elif isinstance(v, data.StructArray): + _visit_structure(v.stype, args, f'{prefix}.{k}') elif isinstance(v, data.Data): args[f'{prefix}.{k}'] = v @@ -71,11 +74,7 @@ def _visit_structure(struct: data.Structure, args: dict, prefix: str = ''): elif isinstance(arg_type, data.StructArray): desc = sdfg.arrays[name] desc = desc.stype - for attr in dir(desc): - value = getattr(desc, attr) - if isinstance(value, data.Data): - assert attr in sdfg.arrays - arglist[attr] = value + _visit_structure(desc, arglist, name) for name, arg_type in arglist.items(): if isinstance(arg_type, (data.Scalar, data.Structure)): @@ -300,6 +299,8 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d name = node.data alloc_name = cpp.ptr(name, nodedesc, sdfg, self._frame) name = alloc_name + # NOTE: `expr` may only be a name or a sequence of names and dots. The latter indicates nested data and + # NOTE: structures. Since structures are implemented as pointers, we replace dots with arrows. 
alloc_name = alloc_name.replace('.', '->') if nodedesc.transient is False: @@ -324,7 +325,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d arrsize_bytes = arrsize * nodedesc.dtype.bytes if isinstance(nodedesc, data.Structure) and not isinstance(nodedesc, data.StructureView): - declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type}();\n") + declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type};\n") define_var(name, DefinedType.Pointer, nodedesc.ctype) for k, v in nodedesc.members.items(): if isinstance(v, data.Data): @@ -1183,6 +1184,8 @@ def memlet_definition(self, if not types: types = self._dispatcher.defined_vars.get(ptr, is_global=True) var_type, ctypedef = types + # NOTE: `expr` may only be a name or a sequence of names and dots. The latter indicates nested data and + # NOTE: structures. Since structures are implemented as pointers, we replace dots with arrows. ptr = ptr.replace('.', '->') if fpga.is_fpga_array(desc): diff --git a/dace/data.py b/dace/data.py index 37d532ac44..5f05cbfcc8 100644 --- a/dace/data.py +++ b/dace/data.py @@ -374,7 +374,7 @@ class Structure(Data): desc="Dictionary of structure members", from_json=_arrays_from_json, to_json=_arrays_to_json) - name = Property(dtype=str, desc="Structure name") + name = Property(dtype=str, desc="Structure type name") def __init__(self, members: Dict[str, Data], @@ -478,7 +478,7 @@ def as_arg(self, with_types=True, for_call=False, name=None): def __getitem__(self, s): """ This is syntactic sugar that allows us to define an array type with the following syntax: ``Structure[N,M]`` - :return: A ``data.Array`` data descriptor. + :return: A ``data.StructArray`` data descriptor. 
""" if isinstance(s, list) or isinstance(s, tuple): return StructArray(self, tuple(s)) @@ -1084,7 +1084,7 @@ class StructArray(Array): stype = NestedDataClassProperty(allow_none=True, default=None) def __init__(self, - stype, + stype: Structure, shape, transient=False, allow_conflicts=False, diff --git a/dace/dtypes.py b/dace/dtypes.py index 888f74f6b9..f0bac23958 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -835,9 +835,9 @@ def as_ctypes(self): fields.append((k, v.as_ctypes())) else: fields.append((k, _FFI_CTYPES[v.type])) - # fields = sorted(fields, key=lambda f: f[0]) # Create new struct class. struct_class = type("NewStructClass", (ctypes.Structure, ), {"_fields_": fields}) + # NOTE: Each call to `type` returns a different class, so we need to cache it to ensure uniqueness. _FFI_CTYPES[self] = struct_class return struct_class diff --git a/dace/properties.py b/dace/properties.py index 0bec65d0ec..0adcfe3e97 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -1392,12 +1392,14 @@ def __get__(self, obj, objtype=None) -> 'Data': @property def dtype(self): - return pydoc.locate("dace.data.Data") + from dace import data as dt + return dt.Data @staticmethod def from_string(s): - dtype = pydoc.locate("dace.data.{}".format(s)) - if dtype is None or not isinstance(dtype, pydoc.locate("dace.data.Data")): + from dace import data as dt + dtype = getattr(dt, s, None) + if dtype is None or not isinstance(dtype, dt.Data): raise ValueError("Not a valid data type: {}".format(s)) return dtype From eabbd1d6cd451556813ffea93cfa771767ef8561 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 22 Aug 2023 15:52:45 +0200 Subject: [PATCH 124/127] Addressed comments. 
--- dace/data.py | 27 +++++++++++---------------- dace/properties.py | 4 ++++ tests/sdfg/data/struct_array_test.py | 23 +++++++++++------------ tests/sdfg/data/structure_test.py | 8 -------- 4 files changed, 26 insertions(+), 36 deletions(-) diff --git a/dace/data.py b/dace/data.py index 5f05cbfcc8..3b571e6537 100644 --- a/dace/data.py +++ b/dace/data.py @@ -5,7 +5,7 @@ from collections import OrderedDict from numbers import Number -from typing import Any, Dict, List, Optional, Sequence, Set, Tuple +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union import numpy import sympy as sp @@ -19,7 +19,8 @@ from dace import serialize, symbolic from dace.codegen import cppunparse from dace.properties import (DebugInfoProperty, DictProperty, EnumProperty, ListProperty, NestedDataClassProperty, - Property, ShapeProperty, SymbolicProperty, TypeClassProperty, make_properties) + OrderedDictProperty, Property, ShapeProperty, SymbolicProperty, TypeClassProperty, + make_properties) def create_datadescriptor(obj, no_custom_desc=False): @@ -370,15 +371,14 @@ def _arrays_from_json(obj, context=None): class Structure(Data): """ Base class for structures. 
""" - members = Property(dtype=OrderedDict, - desc="Dictionary of structure members", - from_json=_arrays_from_json, - to_json=_arrays_to_json) + members = OrderedDictProperty(default=OrderedDict(), + desc="Dictionary of structure members", + from_json=_arrays_from_json, + to_json=_arrays_to_json) name = Property(dtype=str, desc="Structure type name") def __init__(self, - members: Dict[str, Data], - order: List[str] = None, + members: Union[Dict[str, Data], List[Tuple[str, Data]]], name: str = 'Structure', transient: bool = False, storage: dtypes.StorageType = dtypes.StorageType.Default, @@ -386,19 +386,14 @@ def __init__(self, lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, debuginfo: dtypes.DebugInfo = None): - order = order or list(members.keys()) - if set(members.keys()) != set(order): - raise ValueError('Order must contain all members of the structure.') - - # TODO: Should we make a deep-copy here? - self.members = OrderedDict((k, members[k]) for k in order) - + self.members = OrderedDict(members) for k, v in self.members.items(): v.transient = transient + self.name = name fields_and_types = OrderedDict() symbols = set() - for k, v in members.items(): + for k, v in self.members.items(): if isinstance(v, Structure): symbols |= v.free_symbols fields_and_types[k] = (v.dtype, str(v.total_size)) diff --git a/dace/properties.py b/dace/properties.py index 0adcfe3e97..61e569341f 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -145,11 +145,15 @@ def fs(obj, *args, **kwargs): self._from_json = lambda *args, **kwargs: dace.serialize.from_json(*args, known_type=dtype, **kwargs) else: self._from_json = from_json + if self.from_json != from_json: + self.from_json = from_json if to_json is None: self._to_json = dace.serialize.to_json else: self._to_json = to_json + if self.to_json != to_json: + self.to_json = to_json if meta_to_json is None: diff --git a/tests/sdfg/data/struct_array_test.py b/tests/sdfg/data/struct_array_test.py index 
9b40379e53..8e0f2f4739 100644 --- a/tests/sdfg/data/struct_array_test.py +++ b/tests/sdfg/data/struct_array_test.py @@ -10,12 +10,11 @@ def test_read_struct_array(): L, M, N, nnz = (dace.symbol(s) for s in ('L', 'M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') - csr_obj_view = dace.data.StructureView(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], - name='CSRMatrix', - transient=True) + csr_obj_view = dace.data.StructureView( + [('indptr', dace.int32[M + 1]), ('indices', dace.int32[nnz]), ('data', dace.float32[nnz])], + name='CSRMatrix', + transient=True) sdfg = dace.SDFG('array_of_csr_to_dense') @@ -84,13 +83,13 @@ def test_read_struct_array(): def test_write_struct_array(): L, M, N, nnz = (dace.symbol(s) for s in ('L', 'M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], - name='CSRMatrix') - csr_obj_view = dace.data.StructureView(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], - name='CSRMatrix', - transient=True) + csr_obj = dace.data.Structure( + [('indptr', dace.int32[M + 1]), ('indices', dace.int32[nnz]), ('data', dace.float32[nnz])], + name='CSRMatrix') + csr_obj_view = dace.data.StructureView( + dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + name='CSRMatrix', + transient=True) sdfg = dace.SDFG('array_dense_to_csr') diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 995aacb2fd..02b8f0c174 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -12,7 +12,6 @@ def test_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = 
dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense') @@ -69,7 +68,6 @@ def test_write_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('dense_to_csr') @@ -147,10 +145,8 @@ def test_local_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') tmp_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix', transient=True) @@ -258,7 +254,6 @@ def test_local_structure(): def test_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -320,7 +315,6 @@ def test_write_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -402,7 +396,6 @@ def test_direct_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense_direct') @@ 
-453,7 +446,6 @@ def test_direct_read_structure(): def test_direct_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') From c5ca99ad37e7ceef6da71026c3c8bb579f64117f Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 29 Aug 2023 23:05:10 -0700 Subject: [PATCH 125/127] Eliminate extraneous branch-end gotos in code generation (#1355) --- dace/codegen/control_flow.py | 77 +++++++++++++++----- dace/codegen/targets/framecode.py | 2 +- tests/codegen/control_flow_detection_test.py | 29 ++++++++ 3 files changed, 88 insertions(+), 20 deletions(-) diff --git a/dace/codegen/control_flow.py b/dace/codegen/control_flow.py index 182604c892..1b97241e47 100644 --- a/dace/codegen/control_flow.py +++ b/dace/codegen/control_flow.py @@ -82,6 +82,9 @@ class ControlFlow: # a string with its generated code. 
dispatch_state: Callable[[SDFGState], str] + # The parent control flow block of this one, used to avoid generating extraneous ``goto``s + parent: Optional['ControlFlow'] + @property def first_state(self) -> SDFGState: """ @@ -222,11 +225,18 @@ def as_cpp(self, codegen, symbols) -> str: out_edges = sdfg.out_edges(elem.state) for j, e in enumerate(out_edges): if e not in self.gotos_to_ignore: - # If this is the last generated edge and it leads - # to the next state, skip emitting goto + # Skip gotos to immediate successors successor = None - if (j == (len(out_edges) - 1) and (i + 1) < len(self.elements)): - successor = self.elements[i + 1].first_state + # If this is the last generated edge + if j == (len(out_edges) - 1): + if (i + 1) < len(self.elements): + # If last edge leads to next state in block + successor = self.elements[i + 1].first_state + elif i == len(self.elements) - 1: + # If last edge leads to first state in next block + next_block = _find_next_block(self) + if next_block is not None: + successor = next_block.first_state expr += elem.generate_transition(sdfg, e, successor) else: @@ -478,13 +488,14 @@ def children(self) -> List[ControlFlow]: def _loop_from_structure(sdfg: SDFG, guard: SDFGState, enter_edge: Edge[InterstateEdge], leave_edge: Edge[InterstateEdge], back_edges: List[Edge[InterstateEdge]], - dispatch_state: Callable[[SDFGState], str]) -> Union[ForScope, WhileScope]: + dispatch_state: Callable[[SDFGState], + str], parent_block: GeneralBlock) -> Union[ForScope, WhileScope]: """ Helper method that constructs the correct structured loop construct from a set of states. Can construct for or while loops. 
""" - body = GeneralBlock(dispatch_state, [], [], [], [], [], True) + body = GeneralBlock(dispatch_state, parent_block, [], [], [], [], [], True) guard_inedges = sdfg.in_edges(guard) increment_edges = [e for e in guard_inedges if e in back_edges] @@ -535,10 +546,10 @@ def _loop_from_structure(sdfg: SDFG, guard: SDFGState, enter_edge: Edge[Intersta # Also ignore assignments in increment edge (handled in for stmt) body.assignments_to_ignore.append(increment_edge) - return ForScope(dispatch_state, itvar, guard, init, condition, update, body, init_edges) + return ForScope(dispatch_state, parent_block, itvar, guard, init, condition, update, body, init_edges) # Otherwise, it is a while loop - return WhileScope(dispatch_state, guard, condition, body) + return WhileScope(dispatch_state, parent_block, guard, condition, body) def _cases_from_branches( @@ -617,6 +628,31 @@ def _child_of(node: SDFGState, parent: SDFGState, ptree: Dict[SDFGState, SDFGSta return False +def _find_next_block(block: ControlFlow) -> Optional[ControlFlow]: + """ + Returns the immediate successor control flow block. + """ + # Find block in parent + parent = block.parent + if parent is None: + return None + ind = next(i for i, b in enumerate(parent.children) if b is block) + if ind == len(parent.children) - 1 or isinstance(parent, (IfScope, IfElseChain, SwitchCaseScope)): + # If last block, or other children are not reachable from current node (branches), + # recursively continue upwards + return _find_next_block(parent) + return parent.children[ind + 1] + + +def _reset_block_parents(block: ControlFlow): + """ + Fixes block parents after processing. 
+ """ + for child in block.children: + child.parent = block + _reset_block_parents(child) + + def _structured_control_flow_traversal(sdfg: SDFG, start: SDFGState, ptree: Dict[SDFGState, SDFGState], @@ -645,7 +681,7 @@ def _structured_control_flow_traversal(sdfg: SDFG, """ def make_empty_block(): - return GeneralBlock(dispatch_state, [], [], [], [], [], True) + return GeneralBlock(dispatch_state, parent_block, [], [], [], [], [], True) # Traverse states in custom order visited = set() if visited is None else visited @@ -657,7 +693,7 @@ def make_empty_block(): if node in visited or node is stop: continue visited.add(node) - stateblock = SingleState(dispatch_state, node) + stateblock = SingleState(dispatch_state, parent_block, node) oe = sdfg.out_edges(node) if len(oe) == 0: # End state @@ -708,12 +744,14 @@ def make_empty_block(): if (len(oe) == 2 and oe[0].data.condition_sympy() == sp.Not(oe[1].data.condition_sympy())): # If without else if oe[0].dst is mergestate: - branch_block = IfScope(dispatch_state, sdfg, node, oe[1].data.condition, cblocks[oe[1]]) + branch_block = IfScope(dispatch_state, parent_block, sdfg, node, oe[1].data.condition, + cblocks[oe[1]]) elif oe[1].dst is mergestate: - branch_block = IfScope(dispatch_state, sdfg, node, oe[0].data.condition, cblocks[oe[0]]) + branch_block = IfScope(dispatch_state, parent_block, sdfg, node, oe[0].data.condition, + cblocks[oe[0]]) else: - branch_block = IfScope(dispatch_state, sdfg, node, oe[0].data.condition, cblocks[oe[0]], - cblocks[oe[1]]) + branch_block = IfScope(dispatch_state, parent_block, sdfg, node, oe[0].data.condition, + cblocks[oe[0]], cblocks[oe[1]]) else: # If there are 2 or more edges (one is not the negation of the # other): @@ -721,10 +759,10 @@ def make_empty_block(): if switch: # If all edges are of form "x == y" for a single x and # integer y, it is a switch/case - branch_block = SwitchCaseScope(dispatch_state, sdfg, node, switch[0], switch[1]) + branch_block = SwitchCaseScope(dispatch_state, 
parent_block, sdfg, node, switch[0], switch[1]) else: # Otherwise, create if/else if/.../else goto exit chain - branch_block = IfElseChain(dispatch_state, sdfg, node, + branch_block = IfElseChain(dispatch_state, parent_block, sdfg, node, [(e.data.condition, cblocks[e] if e in cblocks else make_empty_block()) for e in oe]) # End of branch classification @@ -739,11 +777,11 @@ def make_empty_block(): loop_exit = None scope = None if ptree[oe[0].dst] == node and ptree[oe[1].dst] != node: - scope = _loop_from_structure(sdfg, node, oe[0], oe[1], back_edges, dispatch_state) + scope = _loop_from_structure(sdfg, node, oe[0], oe[1], back_edges, dispatch_state, parent_block) body_start = oe[0].dst loop_exit = oe[1].dst elif ptree[oe[1].dst] == node and ptree[oe[0].dst] != node: - scope = _loop_from_structure(sdfg, node, oe[1], oe[0], back_edges, dispatch_state) + scope = _loop_from_structure(sdfg, node, oe[1], oe[0], back_edges, dispatch_state, parent_block) body_start = oe[1].dst loop_exit = oe[0].dst @@ -836,7 +874,8 @@ def structured_control_flow_tree(sdfg: SDFG, dispatch_state: Callable[[SDFGState if len(common_frontier) == 1: branch_merges[state] = next(iter(common_frontier)) - root_block = GeneralBlock(dispatch_state, [], [], [], [], [], True) + root_block = GeneralBlock(dispatch_state, None, [], [], [], [], [], True) _structured_control_flow_traversal(sdfg, sdfg.start_state, ptree, branch_merges, back_edges, dispatch_state, root_block) + _reset_block_parents(root_block) return root_block diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 9ee5c2ef17..dfdbbb392b 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -471,7 +471,7 @@ def dispatch_state(state: SDFGState) -> str: # If disabled, generate entire graph as general control flow block states_topological = list(sdfg.topological_sort(sdfg.start_state)) last = states_topological[-1] - cft = cflow.GeneralBlock(dispatch_state, + cft = 
cflow.GeneralBlock(dispatch_state, None, [cflow.SingleState(dispatch_state, s, s is last) for s in states_topological], [], [], [], [], False) diff --git a/tests/codegen/control_flow_detection_test.py b/tests/codegen/control_flow_detection_test.py index 99d6a39b29..982140f7ed 100644 --- a/tests/codegen/control_flow_detection_test.py +++ b/tests/codegen/control_flow_detection_test.py @@ -120,6 +120,33 @@ def test_single_outedge_branch(): assert np.allclose(res, 2) +def test_extraneous_goto(): + + @dace.program + def tester(a: dace.float64[20]): + if a[0] < 0: + a[1] = 1 + a[2] = 1 + + sdfg = tester.to_sdfg(simplify=True) + assert 'goto' not in sdfg.generate_code()[0].code + + +def test_extraneous_goto_nested(): + + @dace.program + def tester(a: dace.float64[20]): + if a[0] < 0: + if a[0] < 1: + a[1] = 1 + else: + a[1] = 2 + a[2] = 1 + + sdfg = tester.to_sdfg(simplify=True) + assert 'goto' not in sdfg.generate_code()[0].code + + if __name__ == '__main__': test_for_loop_detection() test_invalid_for_loop_detection() @@ -128,3 +155,5 @@ def test_single_outedge_branch(): test_edge_sympy_function('TrueFalse') test_edge_sympy_function('SwitchCase') test_single_outedge_branch() + test_extraneous_goto() + test_extraneous_goto_nested() From c34de8e3336343b0f11bddd0b61099ab1f22eb47 Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Sat, 2 Sep 2023 15:34:08 +0200 Subject: [PATCH 126/127] TaskletFusion: Fix additional edges in case of none-connectors --- .../transformation/dataflow/tasklet_fusion.py | 3 ++ tests/transformations/tasklet_fusion_test.py | 44 +++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/dace/transformation/dataflow/tasklet_fusion.py b/dace/transformation/dataflow/tasklet_fusion.py index 99f8f625be..d6b4a3039b 100644 --- a/dace/transformation/dataflow/tasklet_fusion.py +++ b/dace/transformation/dataflow/tasklet_fusion.py @@ -249,6 +249,9 @@ def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG): t1.language) for in_edge in graph.in_edges(t1): 
+ if in_edge.src_conn is None and isinstance(in_edge.src, dace.nodes.EntryNode): + if len(new_tasklet.in_connectors) > 0: + continue graph.add_edge(in_edge.src, in_edge.src_conn, new_tasklet, in_edge.dst_conn, in_edge.data) for in_edge in graph.in_edges(t2): diff --git a/tests/transformations/tasklet_fusion_test.py b/tests/transformations/tasklet_fusion_test.py index c7fd6802d5..743010e8c9 100644 --- a/tests/transformations/tasklet_fusion_test.py +++ b/tests/transformations/tasklet_fusion_test.py @@ -213,6 +213,49 @@ def test_map_with_tasklets(language: str, with_data: bool): ref = map_with_tasklets.f(A, B) assert (np.allclose(C, ref)) +def test_none_connector(): + @dace.program + def sdfg_none_connector(A: dace.float32[32], B: dace.float32[32]): + tmp = dace.define_local([32], dace.float32) + for i in dace.map[0:32]: + with dace.tasklet: + a >> tmp[i] + a = 0 + + tmp2 = dace.define_local([32], dace.float32) + for i in dace.map[0:32]: + with dace.tasklet: + a << A[i] + b >> tmp2[i] + b = a + 1 + + + for i in dace.map[0:32]: + with dace.tasklet: + a << tmp[i] + b << tmp2[i] + c >> B[i] + c = a + b + + sdfg = sdfg_none_connector.to_sdfg() + sdfg.simplify() + applied = sdfg.apply_transformations_repeated(MapFusion) + assert applied == 2 + + map_entry = None + for node in sdfg.start_state.nodes(): + if isinstance(node, dace.nodes.MapEntry): + map_entry = node + break + + assert map_entry is not None + assert len([edge.src_conn for edge in sdfg.start_state.out_edges(map_entry) if edge.src_conn is None]) == 1 + + applied = sdfg.apply_transformations_repeated(TaskletFusion) + assert applied == 2 + + assert sdfg.start_state.out_degree(map_entry) == 1 + assert len([edge.src_conn for edge in sdfg.start_state.out_edges(map_entry) if edge.src_conn is None]) == 0 if __name__ == '__main__': test_basic() @@ -224,3 +267,4 @@ def test_map_with_tasklets(language: str, with_data: bool): test_map_with_tasklets(language='Python', with_data=True) test_map_with_tasklets(language='CPP', 
with_data=False) test_map_with_tasklets(language='CPP', with_data=True) + test_none_connector() From f95f8162a4e77d7a386ccd20c9e4ef71a3ad9787 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 4 Sep 2023 23:58:33 -0700 Subject: [PATCH 127/127] Fix dynamic memlet propagation condition (#1364) --- dace/sdfg/propagation.py | 4 ++-- tests/python_frontend/argument_test.py | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/propagation.py b/dace/sdfg/propagation.py index 0fec4812b7..0554775dcd 100644 --- a/dace/sdfg/propagation.py +++ b/dace/sdfg/propagation.py @@ -1477,8 +1477,8 @@ def propagate_subset(memlets: List[Memlet], new_memlet.volume = simplify(sum(m.volume for m in memlets) * functools.reduce(lambda a, b: a * b, rng.size(), 1)) if any(m.dynamic for m in memlets): new_memlet.dynamic = True - elif symbolic.issymbolic(new_memlet.volume) and any(s not in defined_variables - for s in new_memlet.volume.free_symbols): + if symbolic.issymbolic(new_memlet.volume) and any(s not in defined_variables + for s in new_memlet.volume.free_symbols): new_memlet.dynamic = True new_memlet.volume = 0 diff --git a/tests/python_frontend/argument_test.py b/tests/python_frontend/argument_test.py index 1f43337eb8..cb47188029 100644 --- a/tests/python_frontend/argument_test.py +++ b/tests/python_frontend/argument_test.py @@ -2,6 +2,7 @@ import dace import pytest +import numpy as np N = dace.symbol('N') @@ -16,5 +17,29 @@ def test_extra_args(): imgcpy([[1, 2], [3, 4]], [[4, 3], [2, 1]], 0.0, 1.0) +def test_missing_arguments_regression(): + + def nester(a, b, T): + for i, j in dace.map[0:20, 0:20]: + start = 0 + end = min(T, 6) + + elem: dace.float64 = 0 + for ii in range(start, end): + if ii % 2 == 0: + elem += b[ii] + + a[j, i] = elem + + @dace.program + def tester(x: dace.float64[20, 20]): + gdx = np.ones((10, ), dace.float64) + for T in range(2): + nester(x, gdx, T) + + tester.to_sdfg().compile() + + if __name__ == '__main__': 
test_extra_args() + test_missing_arguments_regression()