From 78a1a03052aeb0b95422fa96be2db251b30464da Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 26 Jul 2022 17:17:54 +0200 Subject: [PATCH 001/392] Added module, method, and attribute replacements to allow parsing of mpi4py compatible cartesian comm methods, bcast, and allreduce. --- dace/frontend/common/distr.py | 293 +++++++++++++++++++++++++--------- 1 file changed, 221 insertions(+), 72 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 98a8f23e87..c201e7ce14 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -16,6 +16,150 @@ RankType = Union[Integral, str, symbolic.symbol, symbolic.SymExpr, symbolic.sympy.Basic] +##### MPI Cartesian Communicators + + +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Create_cart') +@oprepo.replaces('dace.comm.Cart_create') +def _cart_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, dims: ShapeType): + """ Creates a process-grid and adds it to the DaCe program. The process-grid is implemented with [MPI_Cart_create](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_create.html). + :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. + :return: Name of the new process-grid descriptor. + """ + pgrid_name = sdfg.add_pgrid(dims) + + # Dummy tasklet adds MPI variables to the program's state. + from dace.libraries.mpi import Dummy + tasklet = Dummy(pgrid_name, [ + f'MPI_Comm {pgrid_name}_comm;', + f'MPI_Group {pgrid_name}_group;', + f'int {pgrid_name}_coords[{len(dims)}];', + f'int {pgrid_name}_dims[{len(dims)}];', + f'int {pgrid_name}_rank;', + f'int {pgrid_name}_size;', + f'bool {pgrid_name}_valid;', + ]) + + state.add_node(tasklet) + + # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. + _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) + wnode = state.add_write(pgrid_name) + state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) + + return pgrid_name + + +@oprepo.replaces_method('Intracomm', 'Create_cart') +def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', dims: ShapeType): + """ Equivalent to `dace.comm.Cart_create(dims). + :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. + :return: Name of the new process-grid descriptor. + """ + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + return _cart_create(pv, sdfg, state, dims) + + + +@oprepo.replaces('dace.comm.Cart_sub') +def _cart_sub(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + parent_grid: str, + color: Sequence[Union[Integral, bool]], + exact_grid: RankType = None): + """ Partitions the `parent_grid` to lower-dimensional sub-grids and adds them to the DaCe program. + The sub-grids are implemented with [MPI_Cart_sub](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_sub.html). + :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`). + :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped (see `remain_dims` input of `MPI_Cart_sub`). + :param exact_grid: [DEVELOPER] If set then, out of all the sub-grids created, only the one that contains the rank with id `exact_grid` will be utilized for collective communication. + :return: Name of the new sub-grid descriptor. 
+ """ + pgrid_name = sdfg.add_pgrid(parent_grid=parent_grid, color=color, exact_grid=exact_grid) + + # Count sub-grid dimensions. + pgrid_ndims = sum([bool(c) for c in color]) + + # Dummy tasklet adds MPI variables to the program's state. + from dace.libraries.mpi import Dummy + tasklet = Dummy(pgrid_name, [ + f'MPI_Comm {pgrid_name}_comm;', + f'MPI_Group {pgrid_name}_group;', + f'int {pgrid_name}_coords[{pgrid_ndims}];', + f'int {pgrid_name}_dims[{pgrid_ndims}];', + f'int {pgrid_name}_rank;', + f'int {pgrid_name}_size;', + f'bool {pgrid_name}_valid;', + ]) + + state.add_node(tasklet) + + # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. + _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) + wnode = state.add_write(pgrid_name) + state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) + + return pgrid_name + + +@oprepo.replaces_method('ProcessGrid', 'Sub') +def _pgrid_sub(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + parent_grid: str, + color: Sequence[Union[Integral, bool]]): + """ Equivalent to `dace.comm.Cart_sub(parent_grid, color). + :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`). + :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped (see `remain_dims` input of `MPI_Cart_sub`). + :return: Name of the new sub-grid descriptor. + """ + + return _cart_sub(pv, sdfg, state, parent_grid, color) + + +@oprepo.replaces_operator('ProcessGrid', 'Eq', otherclass='Comm') +@oprepo.replaces_operator('ProcessGrid', 'Is', otherclass='Comm') +def _pgrid_eq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: str, op2: 'Comm'): + from mpi4py import MPI + if op2 is MPI.COMM_WORLD or op2 is MPI.COMM_NULL: + return False + return True + + +@oprepo.replaces_operator('Comm', 'Eq', otherclass='ProcessGrid') +@oprepo.replaces_operator('Comm', 'Is', otherclass='ProcessGrid') +def _comm_eq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'str'): + from mpi4py import MPI + if op1 is MPI.COMM_WORLD or op1 is MPI.COMM_NULL: + return False + return True + + +@oprepo.replaces_operator('ProcessGrid', 'NotEq', otherclass='Comm') +@oprepo.replaces_operator('ProcessGrid', 'IsNot', otherclass='Comm') +def _pgrid_neq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: str, op2: 'Comm'): + from mpi4py import MPI + if op2 is MPI.COMM_WORLD or op2 is MPI.COMM_NULL: + return True + return False + + +@oprepo.replaces_operator('Comm', 'NotEq', otherclass='ProcessGrid') +@oprepo.replaces_operator('Comm', 'IsNot', otherclass='ProcessGrid') +def _comm_neq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'str'): + from mpi4py import MPI + if op1 is MPI.COMM_WORLD or op1 is MPI.COMM_NULL: + return True + return False + + +##### MPI Collectives + + +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Bcast') @oprepo.replaces('dace.comm.Bcast') def _bcast(pv: 'ProgramVisitor', sdfg: SDFG, @@ -45,6 +189,41 @@ def _bcast(pv: 'ProgramVisitor', return None +@oprepo.replaces_method('Intracomm', 'Bcast') +def _intracomm_bcast(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + icomm: 'Intracomm', + buffer: str, + root: Union[str, sp.Expr, Number] = 0): + + """ Equivalent to `dace.comm.Bcast(buffer, root)`. 
""" + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + return _bcast(pv, sdfg, state, buffer, root) + + +@oprepo.replaces_method('ProcessGrid', 'Bcast') +def _pgrid_bcast(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + pgrid: str, + buffer: str, + root: Union[str, sp.Expr, Number] = 0): + + """ Equivalent to `dace.comm.Bcast(buffer, root, grid=pgrid)`. """ + + return _bcast(pv, sdfg, state, buffer, root, grid=pgrid) + + +def _mpi4py_to_MPI(MPI, op): + if op is MPI.SUM: + return 'MPI_SUM' + raise NotImplementedError + + @oprepo.replaces('dace.comm.Reduce') def _Reduce(pv: 'ProgramVisitor', sdfg: SDFG, @@ -75,8 +254,9 @@ def _Reduce(pv: 'ProgramVisitor', return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') -def _Allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): +def _allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): from dace.libraries.mpi.nodes.allreduce import Allreduce @@ -90,6 +270,46 @@ def _Allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, return None +@oprepo.replaces_method('Intracomm', 'Allreduce') +def _intracomm_allreduce(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + icomm: 'Intracomm', + inp_buffer: 'InPlace', + out_buffer: str, + op: str): + + """ Equivalent to `dace.comm.Allreduce(out_buffer, op)`. """ + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + if inp_buffer != MPI.IN_PLACE: + raise ValueError('DaCe currently supports in-place Allreduce only.') + if isinstance(op, MPI.Op): + op = _mpi4py_to_MPI(MPI, op) + return _allreduce(pv, sdfg, state, out_buffer, op) + + +@oprepo.replaces_method('ProcessGrid', 'Allreduce') +def _pgrid_allreduce(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + pgrid: str, + inp_buffer: 'InPlace', + out_buffer: str, + op: str): + + """ Equivalent to `dace.comm.Allreduce(out_buffer, op, grid=pgrid)`. """ + + from mpi4py import MPI + if inp_buffer != MPI.IN_PLACE: + raise ValueError('DaCe currently supports in-place Allreduce only.') + if isinstance(op, MPI.Op): + op = _mpi4py_to_MPI(MPI, op) + return _allreduce(pv, sdfg, state, out_buffer, op, grid=pgrid) + + @oprepo.replaces('dace.comm.Scatter') def _scatter(pv: 'ProgramVisitor', sdfg: SDFG, @@ -519,77 +739,6 @@ def _wait(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, request: str): return None -@oprepo.replaces('dace.comm.Cart_create') -def _cart_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, dims: ShapeType): - """ Creates a process-grid and adds it to the DaCe program. The process-grid is implemented with [MPI_Cart_create](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_create.html). - :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. - :return: Name of the new process-grid descriptor. - """ - pgrid_name = sdfg.add_pgrid(dims) - - # Dummy tasklet adds MPI variables to the program's state. 
- from dace.libraries.mpi import Dummy - tasklet = Dummy(pgrid_name, [ - f'MPI_Comm {pgrid_name}_comm;', - f'MPI_Group {pgrid_name}_group;', - f'int {pgrid_name}_coords[{len(dims)}];', - f'int {pgrid_name}_dims[{len(dims)}];', - f'int {pgrid_name}_rank;', - f'int {pgrid_name}_size;', - f'bool {pgrid_name}_valid;', - ]) - - state.add_node(tasklet) - - # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. - _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) - wnode = state.add_write(pgrid_name) - state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) - - return pgrid_name - - -@oprepo.replaces('dace.comm.Cart_sub') -def _cart_sub(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - parent_grid: str, - color: Sequence[Union[Integral, bool]], - exact_grid: RankType = None): - """ Partitions the `parent_grid` to lower-dimensional sub-grids and adds them to the DaCe program. - The sub-grids are implemented with [MPI_Cart_sub](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_sub.html). - :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`). - :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped (see `remain_dims` input of `MPI_Cart_sub`). - :param exact_grid: [DEVELOPER] If set then, out of all the sub-grids created, only the one that contains the rank with id `exact_grid` will be utilized for collective communication. - :return: Name of the new sub-grid descriptor. - """ - pgrid_name = sdfg.add_pgrid(parent_grid=parent_grid, color=color, exact_grid=exact_grid) - - # Count sub-grid dimensions. - pgrid_ndims = sum([bool(c) for c in color]) - - # Dummy tasklet adds MPI variables to the program's state. - from dace.libraries.mpi import Dummy - tasklet = Dummy(pgrid_name, [ - f'MPI_Comm {pgrid_name}_comm;', - f'MPI_Group {pgrid_name}_group;', - f'int {pgrid_name}_coords[{pgrid_ndims}];', - f'int {pgrid_name}_dims[{pgrid_ndims}];', - f'int {pgrid_name}_rank;', - f'int {pgrid_name}_size;', - f'bool {pgrid_name}_valid;', - ]) - - state.add_node(tasklet) - - # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. - _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) - wnode = state.add_write(pgrid_name) - state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) - - return pgrid_name - - @oprepo.replaces('dace.comm.Subarray') def _subarray(pv: 'ProgramVisitor', sdfg: SDFG, From a13a81d139515511405dfff05303d2dbd77a9982 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 26 Jul 2022 17:18:51 +0200 Subject: [PATCH 002/392] ProcessGrids now appear in defined variables and are explicitely returned by the gettype method. --- dace/frontend/python/newast.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index b41745ffaa..f1bdcbd97b 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1117,6 +1117,13 @@ def __init__(self, # Indirections self.indirections = dict() + # Add mpi4py.MPI.COMM_WORLD aliases to variables + # try: + # from mpi4py import MPI + # self.variables.update({k: "MPI_COMM_WORLD" for k, v in self.globals.items() if v is MPI.COMM_WORLD}) + # except: + # pass + @classmethod def progress_count(cls) -> int: """ Returns the number of parsed SDFGs so far within this run. 
""" @@ -1267,6 +1274,14 @@ def defined(self): # TODO: Is there a case of a variable-symbol? result.update({k: self.sdfg.symbols[v] for k, v in self.variables.items() if v in self.sdfg.symbols}) + # MPI-related stuff + result.update({k: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids}) + # try: + # from mpi4py import MPI + # result.update({k: v for k, v in self.globals.items() if v is MPI.COMM_WORLD}) + # except: + # pass + return result def _add_state(self, label=None): @@ -4453,7 +4468,9 @@ def _gettype(self, opnode: ast.AST) -> List[Tuple[str, str]]: result = [] for operand in operands: - if isinstance(operand, str) and operand in self.sdfg.arrays: + if isinstance(operand, str) and operand in self.sdfg.process_grids: + result.append((operand, type(self.sdfg.process_grids[operand]).__name__)) + elif isinstance(operand, str) and operand in self.sdfg.arrays: result.append((operand, type(self.sdfg.arrays[operand]).__name__)) elif isinstance(operand, str) and operand in self.scope_arrays: result.append((operand, type(self.scope_arrays[operand]).__name__)) From a8d56901b7dc8372131951e3a026fb11df4052e1 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 26 Jul 2022 17:19:30 +0200 Subject: [PATCH 003/392] Added MPIResolver to resolve mpi4py-related constants during preprocessing. --- dace/frontend/python/preprocessing.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 9f39648f09..9b6d3650c1 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -1316,6 +1316,28 @@ def find_disallowed_statements(node: ast.AST): return None +class MPIResolver(ast.NodeTransformer): + """ Resolves mpi4py-related constants, e.g., mpi4py.MPI.COMM_WORLD. """ + def __init__(self, globals: Dict[str, Any]): + from mpi4py import MPI + self.globals = globals + self.MPI = MPI + + def visit_Name(self, node: ast.Name) -> Union[ast.Name, ast.Attribute]: + if node.id in self.globals: + obj = self.globals[node.id] + if isinstance(obj, self.MPI.Comm): + lattr = ast.Attribute(ast.Name(id='mpi4py', ctx=ast.Load), attr='MPI') + if obj is self.MPI.COMM_WORLD: + return ast.copy_location(ast.Attribute(value=lattr, attr='COMM_WORLD'), node) + elif obj is self.MPI.COMM_NULL: + return ast.copy_location(ast.Attribute(value=lattr, attr='COMM_NULL'), node) + else: + raise DaceSyntaxError('Only the COMM_WORLD and COMM_NULL mpi4py.MPI communicators can be used ' + 'directly inside a DaCe Python program.') + return node + + def preprocess_dace_program(f: Callable[..., Any], argtypes: Dict[str, data.Data], global_vars: Dict[str, Any], @@ -1356,6 +1378,11 @@ def preprocess_dace_program(f: Callable[..., Any], newmod = global_vars[mod] #del global_vars[mod] global_vars[modval] = newmod + + try: + src_ast = MPIResolver(global_vars).visit(src_ast) + except ModuleNotFoundError: + pass # Resolve constants to their values (if they are not already defined in this scope) # and symbols to their names From ac480bd2ec75cf0587be113707f2ac570a8c44da Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 26 Jul 2022 17:20:09 +0200 Subject: [PATCH 004/392] Added mpi4py compatiblity tests. 
--- tests/library/mpi/mpi4py_test.py | 181 +++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 tests/library/mpi/mpi4py_test.py diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py new file mode 100644 index 0000000000..cc9968e4f8 --- /dev/null +++ b/tests/library/mpi/mpi4py_test.py @@ -0,0 +1,181 @@ +# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +import numpy as np +import pytest + + + +@pytest.mark.mpi +def test_process_grid_bcast(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def pgrid_bcast(A: dace.int32[10]): + pgrid = MPI.COMM_WORLD.Create_cart([1, size]) + if pgrid != MPI.COMM_NULL: + pgrid.Bcast(A) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = pgrid_bcast.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + if rank == 0: + A = np.arange(10, dtype=np.int32) + A_ref = A.copy() + else: + A = np.zeros((10, ), dtype=np.int32) + A_ref = A.copy() + + func(A=A) + pgrid_bcast.f(A_ref) + + assert(np.array_equal(A, A_ref)) + + +@pytest.mark.mpi +def test_sub_grid_bcast(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def subgrid_bcast(A: dace.int32[10], rank: dace.int32): + pgrid = commworld.Create_cart([2, size // 2]) + if pgrid != MPI.COMM_NULL: + sgrid = pgrid.Sub([False, True]) + pgrid.Bcast(A) + B = np.empty_like(A) + B[:] = rank % 10 + if pgrid != MPI.COMM_NULL: + sgrid.Bcast(B) + A[:] = B + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = subgrid_bcast.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + if rank == 0: + A = np.arange(10, dtype=np.int32) + else: + A = np.ones((10, ), dtype=np.int32) + A_ref = A.copy() + + func(A=A, rank=rank) + subgrid_bcast.f(A_ref, rank) + + assert(np.array_equal(A, A_ref)) + + +def initialize_3mm(b_NI: int, b_NJ: int, b_NK: int, b_NL: int, b_NM: int, + ts_NI: int, ts_NJ: int, ts_NK, ts_NL: int, ts_NM: int, + NI: int, NJ: int, NK: int, NL: int, NM: int, + datatype: type = np.float64): + + A = np.fromfunction(lambda i, k: b_NK + k + 1, (ts_NI, ts_NK), dtype=datatype) + B = np.eye(ts_NK, ts_NJ, b_NK - b_NJ) + C = np.fromfunction(lambda j, m: b_NJ + j + 1, (ts_NJ, ts_NM), dtype=datatype) + D = np.eye(ts_NM, ts_NL, b_NM - b_NL) + + if b_NI + ts_NI > NI: + A[NI - b_NI:] = 0 + if b_NJ + ts_NJ > NJ: + B[:, NJ - b_NJ:] = 0 + C[NJ - b_NJ:] = 0 + if b_NK + ts_NJ > NK: + A[:, NK - b_NK:] = 0 + B[NK - b_NK:] = 0 + if b_NL + ts_NL > NL: + D[:NL - b_NL] = 0 + if b_NM + ts_NM > NM: + C[:NM - b_NM] = 0 + D[NM - b_NM:] = 0 + + return A, B, C, D + + +@pytest.mark.mpi +def test_3mm(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def k3mm(A, B, C, D): + cart_comm = commworld.Create_cart([1, size, 1]) + if cart_comm != MPI.COMM_NULL: + + ab_reduce_comm = cart_comm.Sub([False, False, True]) + cd_reduce_comm = cart_comm.Sub([True, False, False]) + abcd_reduce_comm = cart_comm.Sub([False, True, False]) + + ab = A @ B + ab_reduce_comm.Allreduce(MPI.IN_PLACE, ab, op=MPI.SUM) + cd = C @ D + cd_reduce_comm.Allreduce(MPI.IN_PLACE, cd, op=MPI.SUM) + E = ab @ cd + 
abcd_reduce_comm.Allreduce(MPI.IN_PLACE, E, op=MPI.SUM) + + return E + + N = 128 + assert(size <= 128) + + NI, NJ, NK, NL, NM = (N,) * 5 + PNI, PNJ, PNK, PNL, PNM = 1, 2, 1, 1, 1 + + cart_comm = commworld.Create_cart([1, size, 1]) + cart_rank = cart_comm.Get_rank() + cart_size = cart_comm.Get_size() + cart_coords = cart_comm.Get_coords(cart_rank) + + ts_NI = int(np.ceil(NI / PNI)) + ts_NJ = int(np.ceil(NJ / PNJ)) + ts_NK = int(np.ceil(NJ / PNK)) + ts_NL = int(np.ceil(NL / PNL)) + ts_NM = int(np.ceil(NM / PNM)) + + b_NI = cart_coords[0] * ts_NI + b_NJ = cart_coords[1] * ts_NJ + b_NK = cart_coords[2] * ts_NK + b_NL = cart_coords[2] * ts_NL + b_NM = cart_coords[0] * ts_NM + A, B, C, D = initialize_3mm(b_NI, b_NJ, b_NK, b_NL, b_NM, ts_NI, ts_NJ, ts_NK, ts_NL, ts_NM, NI, NJ, NK, NL, NM) + + sdfg = None + if rank == 0: + sdfg = k3mm.to_sdfg(A=A, B=B, C=C, D=D) + func = utils.distributed_compile(sdfg, commworld) + + E = func(A=A, B=B, C=C, D=D) + commworld.Barrier() + E_ref = k3mm.f(A, B, C, D) + commworld.Barrier() + + if E_ref is not None: + assert(np.array_equal(E, E_ref)) + + + +if __name__ == "__main__": + + test_process_grid_bcast() + test_sub_grid_bcast() + test_3mm() From d2292669d7b4b88fc2987239a3805ed67b60164a Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:54:26 +0200 Subject: [PATCH 005/392] Made opaque type for MPI_Request a basic dace type. --- dace/dtypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dace/dtypes.py b/dace/dtypes.py index 0055eef837..a622b697c2 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1115,6 +1115,7 @@ def isconstant(var): float64 = typeclass(numpy.float64) complex64 = typeclass(numpy.complex64) complex128 = typeclass(numpy.complex128) +MPI_Request = opaque('MPI_Request') @undefined_safe_enum From f933974fe9d378716c16b499e76f3e805d1b2fba Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:55:30 +0200 Subject: [PATCH 006/392] Adjusted existing Isend/Irecv replacements and added new ones for mpi4py compatibility. 
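With these replacements (plus the MPI_Request opaque type from the previous commit), nonblocking point-to-point communication can be written in the mpi4py style inside a @dace.program. A minimal sketch of the enabled pattern, mirroring the ring test added later in this series (commworld is the mpi4py.MPI.COMM_WORLD handle; rank and size are program arguments):

    src = (rank - 1) % size
    dst = (rank + 1) % size
    req = np.empty((2,), dtype=MPI.Request)   # becomes an array of dace.MPI_Request
    sbuf = np.full((1,), rank, dtype=np.int32)
    req[0] = commworld.Isend(sbuf, dst, tag=0)
    rbuf = np.empty((1,), dtype=np.int32)
    req[1] = commworld.Irecv(rbuf, src, tag=0)
    MPI.Request.Waitall(req)

When no request array is passed to dace.comm.Isend/Irecv, the replacement now allocates a transient MPI_Request array itself and returns its name, which is what makes the assignment form above possible.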
--- dace/frontend/common/distr.py | 99 +++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 4 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index c201e7ce14..c34fe54f41 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -370,6 +370,8 @@ def _gather(pv: 'ProgramVisitor', return None +##### Point-To-Point Communication + @oprepo.replaces('dace.comm.Send') def _send(pv: 'ProgramVisitor', sdfg: SDFG, @@ -442,13 +444,19 @@ def _send(pv: 'ProgramVisitor', return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Isend') @oprepo.replaces('dace.comm.Isend') def _isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, dst: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number], request: str): + tag: Union[str, sp.Expr, Number], request: str = None, grid: str = None): from dace.libraries.mpi.nodes.isend import Isend - libnode = Isend('_Isend_') + ret_req = False + if not request: + ret_req = True + request, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + + libnode = Isend('_Isend_', grid=grid) buf_range = None if isinstance(buffer, tuple): @@ -523,9 +531,47 @@ def _isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, dst: state.add_edge(tag_node, None, libnode, '_tag', tag_mem) state.add_edge(libnode, '_request', req_node, None, req_mem) + if ret_req: + return request return None +@oprepo.replaces_method('Intracomm', 'Isend') +def _intracomm_isend(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + icomm: 'Intracomm', + buffer: str, + dst: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number]): + + """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req)`. """ + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + req, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + _isend(pv, sdfg, state, buffer, dst, tag, req) + return req + + +@oprepo.replaces_method('ProcessGrid', 'Isend') +def _pgrid_isend(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + pgrid: str, + buffer: str, + dst: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number]): + + """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req, grid=pgrid)`. 
""" + + from mpi4py import MPI + req, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + _isend(pv, sdfg, state, buffer, dst, tag, req, grid=pgrid) + return req + + @oprepo.replaces('dace.comm.Recv') def _recv(pv: 'ProgramVisitor', sdfg: SDFG, @@ -598,13 +644,19 @@ def _recv(pv: 'ProgramVisitor', return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Irecv') @oprepo.replaces('dace.comm.Irecv') def _irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, src: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number], request: str): + tag: Union[str, sp.Expr, Number], request: str = None, grid: str = None): from dace.libraries.mpi.nodes.irecv import Irecv - libnode = Irecv('_Irecv_') + ret_req = False + if not request: + ret_req = True + request, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + + libnode = Irecv('_Irecv_', grid=grid) buf_range = None if isinstance(buffer, tuple): @@ -677,9 +729,47 @@ def _irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, src: state.add_edge(tag_node, None, libnode, '_tag', tag_mem) state.add_edge(libnode, '_request', req_node, None, req_mem) + if ret_req: + return request return None +@oprepo.replaces_method('Intracomm', 'Irecv') +def _intracomm_irecv(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + icomm: 'Intracomm', + buffer: str, + src: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number]): + + """ Equivalent to `dace.comm.Irecv(buffer, src, tag, req)`. """ + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + req, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + _irecv(pv, sdfg, state, buffer, src, tag, req) + return req + + +@oprepo.replaces_method('ProcessGrid', 'Irecv') +def _pgrid_irecv(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + pgrid: str, + buffer: str, + src: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number]): + + """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req, grid=pgrid)`. """ + + from mpi4py import MPI + req, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + _irecv(pv, sdfg, state, buffer, src, tag, req, grid=pgrid) + return req + + @oprepo.replaces('dace.comm.Wait') def _wait(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, request: str): @@ -713,6 +803,7 @@ def _wait(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, request: str): return None +@oprepo.replaces('mpi4py.MPI.Request.Waitall') @oprepo.replaces('dace.comm.Waitall') def _wait(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, request: str): From c2240131bb32005dc81746e963d4bd20e94fc2a8 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:57:04 +0200 Subject: [PATCH 007/392] Adjusted visit_Attribute of MPI_Resolver to not trigger to calls of MPI.Request. Added preprocessor class for converting modulo expressions for C/C++ compatibility. 
--- dace/frontend/python/preprocessing.py | 46 +++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 9b6d3650c1..f465ae8e02 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -1322,20 +1322,61 @@ def __init__(self, globals: Dict[str, Any]): from mpi4py import MPI self.globals = globals self.MPI = MPI + self.parent = None + + def visit(self, node): + node.parent = self.parent + self.parent = node + node = super().visit(node) + if isinstance(node, ast.AST): + self.parent = node.parent + return node def visit_Name(self, node: ast.Name) -> Union[ast.Name, ast.Attribute]: + self.generic_visit(node) if node.id in self.globals: obj = self.globals[node.id] if isinstance(obj, self.MPI.Comm): lattr = ast.Attribute(ast.Name(id='mpi4py', ctx=ast.Load), attr='MPI') if obj is self.MPI.COMM_WORLD: - return ast.copy_location(ast.Attribute(value=lattr, attr='COMM_WORLD'), node) + newnode = ast.copy_location(ast.Attribute(value=lattr, attr='COMM_WORLD'), node) + newnode.parent = node.parent + return newnode elif obj is self.MPI.COMM_NULL: - return ast.copy_location(ast.Attribute(value=lattr, attr='COMM_NULL'), node) + newnode = ast.copy_location(ast.Attribute(value=lattr, attr='COMM_NULL'), node) + newnode.parent = node.parent + return newnode else: raise DaceSyntaxError('Only the COMM_WORLD and COMM_NULL mpi4py.MPI communicators can be used ' 'directly inside a DaCe Python program.') return node + + def visit_Attribute(self, node: ast.Attribute) -> ast.Attribute: + self.generic_visit(node) + if isinstance(node.attr, str) and node.attr == 'Request': + try: + val = astutils.evalnode(node, self.globals) + if val is self.MPI.Request and not isinstance(node.parent, ast.Attribute): + newnode = ast.copy_location( + ast.Attribute(value=ast.Name(id='dace', ctx=ast.Load), attr='MPI_Request'), node) + newnode.parent = node.parent + return newnode + except SyntaxError: + pass + return node + + +class ModuloConverter(ast.NodeTransformer): + """ Converts a % b expressions to (a + b) % b for C/C++ compatibility. """ + + def visit_BinOp(self, node: ast.BinOp) -> ast.BinOp: + if isinstance(node.op, ast.Mod): + left = self.generic_visit(node.left) + right = self.generic_visit(node.right) + newleft = ast.copy_location(ast.BinOp(left=left, op=ast.Add(), right=copy.deepcopy(right)), left) + node.left = newleft + return node + return self.generic_visit(node) def preprocess_dace_program(f: Callable[..., Any], @@ -1383,6 +1424,7 @@ def preprocess_dace_program(f: Callable[..., Any], src_ast = MPIResolver(global_vars).visit(src_ast) except ModuleNotFoundError: pass + src_ast = ModuloConverter().visit(src_ast) # Resolve constants to their values (if they are not already defined in this scope) # and symbols to their names From ebe22ed82b20bc7105c020b135e0e07a4416cbd2 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:57:35 +0200 Subject: [PATCH 008/392] Replacement for numpy full now also works with (scalar) data. 
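This covers calls such as np.full((1,), rank, dtype=np.int32) in the Isend/Irecv test, where the fill value is a scalar program argument rather than a compile-time constant: the generated map now reads the scalar through a memlet instead of raising a syntax error. A minimal sketch of the newly supported form (hypothetical program name, assuming the usual imports):

    import dace
    import numpy as np

    @dace.program
    def fill_from_scalar(rank: dace.int32):
        # fill value is runtime data, not a Number or symbolic expression
        return np.full((10,), rank, dtype=np.int32)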
--- dace/frontend/python/replacements.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 4a673f1179..411f8e551c 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -281,26 +281,38 @@ def _numpy_full(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, shape: Shape, - fill_value: Union[sp.Expr, Number], + fill_value: Union[sp.Expr, Number, data.Scalar], dtype: dace.typeclass = None): """ Creates and array of the specified shape and initializes it with the fill value. """ + is_data = False if isinstance(fill_value, (Number, np.bool_)): vtype = dtypes.DTYPE_TO_TYPECLASS[type(fill_value)] elif isinstance(fill_value, sp.Expr): vtype = _sym_type(fill_value) else: - raise mem_parser.DaceSyntaxError(pv, None, "Fill value {f} must be a number!".format(f=fill_value)) + is_data = True + vtype = sdfg.arrays[fill_value].dtype + # raise mem_parser.DaceSyntaxError(pv, None, "Fill value {f} must be a number!".format(f=fill_value)) dtype = dtype or vtype name, _ = sdfg.add_temp_transient(shape, dtype) - state.add_mapped_tasklet( - '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape)}, {}, - "__out = {}".format(fill_value), - dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), - external_edges=True) + if is_data: + state.add_mapped_tasklet( + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, + dict(__inp=dace.Memlet(data=fill_value, subset='0')), + "__out = __inp", + dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), + external_edges=True) + else: + state.add_mapped_tasklet( + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, {}, + "__out = {}".format(fill_value), + dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), + external_edges=True) return name From 2bfcea90c615ef4443cb14a43b69541952fdd184 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:58:09 +0200 Subject: [PATCH 009/392] Isend/Irecv can now use communicators other than COMM_WORLD. 
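The library nodes gain an optional grid property and emit __state-><grid>_comm in place of MPI_COMM_WORLD, so nonblocking point-to-point calls can now be issued on a process grid created in the program. A sketch of the intended usage (building on the replacements from the earlier commits; sbuf and dst are placeholders, and dst is a rank within the grid communicator):

    pgrid = MPI.COMM_WORLD.Create_cart([2, size // 2])
    if pgrid != MPI.COMM_NULL:
        req = pgrid.Isend(sbuf, dst, tag=0)  # lowered to MPI_Isend on the pgrid communicator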
--- dace/libraries/mpi/nodes/irecv.py | 12 ++++++++++-- dace/libraries/mpi/nodes/isend.py | 21 ++++++++++++--------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/dace/libraries/mpi/nodes/irecv.py b/dace/libraries/mpi/nodes/irecv.py index 903bed7543..ad43cb4103 100644 --- a/dace/libraries/mpi/nodes/irecv.py +++ b/dace/libraries/mpi/nodes/irecv.py @@ -20,6 +20,11 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): if buffer.dtype.veclen > 1: raise NotImplementedError + + comm = "MPI_COMM_WORLD" + if node.grid: + comm = f"__state->{node.grid}_comm" + code = "" if ddt is not None: code = f"""static MPI_Datatype newtype; @@ -33,7 +38,7 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): mpi_dtype_str = "newtype" count_str = "1" buffer_offset = 0 #this is here because the frontend already changes the pointer - code += f"MPI_Irecv(_buffer, {count_str}, {mpi_dtype_str}, _src, _tag, MPI_COMM_WORLD, _request);" + code += f"MPI_Irecv(_buffer, {count_str}, {mpi_dtype_str}, int(_src), int(_tag), {comm}, _request);" if ddt is not None: code += f"""// MPI_Type_free(&newtype); """ @@ -58,8 +63,11 @@ class Irecv(MPINode): } default_implementation = "MPI" - def __init__(self, name, *args, **kwargs): + grid = dace.properties.Property(dtype=str, allow_none=True, default=None) + + def __init__(self, name, grid=None, *args, **kwargs): super().__init__(name, *args, inputs={"_src", "_tag"}, outputs={"_buffer", "_request"}, **kwargs) + self.grid = grid def validate(self, sdfg, state): """ diff --git a/dace/libraries/mpi/nodes/isend.py b/dace/libraries/mpi/nodes/isend.py index 342bf2b420..cfd69e46ab 100644 --- a/dace/libraries/mpi/nodes/isend.py +++ b/dace/libraries/mpi/nodes/isend.py @@ -20,6 +20,10 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): if buffer.dtype.veclen > 1: raise NotImplementedError + + comm = "MPI_COMM_WORLD" + if node.grid: + comm = f"__state->{node.grid}_comm" code = "" @@ -40,7 +44,7 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): mpi_dtype_str = "newtype" count_str = "1" buffer_offset = 0 - code += f"MPI_Isend(&(_buffer[{buffer_offset}]), {count_str}, {mpi_dtype_str}, _dest, _tag, MPI_COMM_WORLD, _request);" + code += f"MPI_Isend(&(_buffer[{buffer_offset}]), {count_str}, {mpi_dtype_str}, int(_dest), int(_tag), {comm}, _request);" if ddt is not None: code += f"""// MPI_Type_free(&newtype); """ @@ -68,13 +72,12 @@ class Isend(MPINode): } default_implementation = "MPI" - # Object fields - n = dace.properties.SymbolicProperty(allow_none=True, default=None) - + grid = dace.properties.Property(dtype=str, allow_none=True, default=None) nosync = dace.properties.Property(dtype=bool, default=False, desc="Do not sync if memory is on GPU") - def __init__(self, name, *args, **kwargs): + def __init__(self, name, grid=None, *args, **kwargs): super().__init__(name, *args, inputs={"_buffer", "_dest", "_tag"}, outputs={"_request"}, **kwargs) + self.grid = grid def validate(self, sdfg, state): """ @@ -93,10 +96,10 @@ def validate(self, sdfg, state): if e.src_conn == "_request": req = sdfg.arrays[e.data.data] - if dest.dtype.base_type != dace.dtypes.int32: - raise ValueError("Source must be an integer!") - if tag.dtype.base_type != dace.dtypes.int32: - raise ValueError("Tag must be an integer!") + # if dest.dtype.base_type != dace.dtypes.int32: + # raise ValueError("Destination must be an integer!") + # if tag.dtype.base_type != dace.dtypes.int32: + # raise ValueError("Tag must be an integer!") count_str = "XXX" for _, 
_, _, dst_conn, data in state.in_edges(self): From fd0dc4076f4032abb1f39df7fafb153714152c6c Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 27 Jul 2022 10:58:58 +0200 Subject: [PATCH 010/392] Added mpi4py-compatible Isend/Irecv test. --- tests/library/mpi/mpi4py_test.py | 37 +++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index cc9968e4f8..7c314b7516 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -173,9 +173,40 @@ def k3mm(A, B, C, D): assert(np.array_equal(E, E_ref)) +@pytest.mark.mpi +def test_isend_irecv(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def chain(rank: dace.int32, size: dace.int32): + src = (rank - 1) % size + dst = (rank + 1) % size + req = np.empty((2, ), dtype=MPI.Request) + sbuf = np.full((1,), rank, dtype=np.int32) + req[0] = commworld.Isend(sbuf, dst, tag=0) + rbuf = np.empty((1, ), dtype=np.int32) + req[1] = commworld.Irecv(rbuf, src, tag=0) + MPI.Request.Waitall(req) + return rbuf + + sdfg = None + if rank == 0: + sdfg = chain.to_sdfg(simplify=True) + func = utils.distributed_compile(sdfg, commworld) + + val = func(rank=rank, size=size) + ref = chain.f(rank, size) + + assert(val[0] == ref[0]) + if __name__ == "__main__": - test_process_grid_bcast() - test_sub_grid_bcast() - test_3mm() + # test_process_grid_bcast() + # test_sub_grid_bcast() + # test_3mm() + test_isend_irecv() From 630b0cb4846e80ee23f3851284a5293aff93d883 Mon Sep 17 00:00:00 2001 From: Reid Wahl Date: Wed, 15 Mar 2023 00:08:18 -0700 Subject: [PATCH 011/392] Doc: Installation: Add OpenBLAS symlink tip On some systems with OpenBLAS installed, libblas.so and liblapacke.so may need to be created manually as symlinks to libopenblas.so. Signed-off-by: Reid Wahl --- AUTHORS | 1 + doc/setup/installation.rst | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/AUTHORS b/AUTHORS index 1e6f0e3a5d..b987562424 100644 --- a/AUTHORS +++ b/AUTHORS @@ -32,5 +32,6 @@ Neville Walo Lukas Trümper Cliff Hodel Tiancheng Chen +Reid Wahl and other contributors listed in https://github.com/spcl/dace/graphs/contributors diff --git a/doc/setup/installation.rst b/doc/setup/installation.rst index 14be68a71e..6eb266dc7c 100644 --- a/doc/setup/installation.rst +++ b/doc/setup/installation.rst @@ -130,7 +130,8 @@ Common issues with the DaCe Python module * **BLAS libraries**: When using fast BLAS operators (for example, matrix multiplication with Intel MKL), sometimes CMake cannot find the required include files or libraries on its own. If a library is installed but not found, add the include folders to the ``CPATH`` environment variable, and the library folders to the ``LIBRARY_PATH`` and ``LD_LIBRARY_PATH`` environment - variables. + variables. If OpenBLAS is installed but not found, also ensure that ``libblas.so`` and ``liblapacke.so`` exist in the + library folders as symbolic links pointing to ``libopenblas.so``. * **Bug in DaCe**: If you suspect an issue happens within DaCe, see :ref:`debugging` for ways to pinpoint the source of the issue. 
From b8f7a4c5f2c941bdecb2229ce118aeeac162eb91 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 21 Mar 2023 22:02:18 -0700 Subject: [PATCH 012/392] Fix parentheses in codegen --- dace/codegen/targets/cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 3ffac97b11..4ffa29452d 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1863,7 +1863,7 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_ block_expr = '(%s * %s + threadIdx.z)' % (block_expr, _topy(block_dims[2])) # true dim i = z / ('*'.join(kdims[i+1:])) % kdims[i] - block_expr = '(%s / (%s)) %% (%s)' % ( + block_expr = '((%s / (%s)) %% (%s))' % ( block_expr, _topy(functools.reduce(sympy.Mul, kdims[i + 1:], 1)), _topy(kdims[i]), From b327c0f472c8185cabc46f0373aa820b473b38a6 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 22 Mar 2023 17:48:56 -0700 Subject: [PATCH 013/392] Fix atomic operation detection for exactly-overlapping ranges --- dace/codegen/targets/cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index ef4eb71aa0..181ddbada6 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -641,7 +641,7 @@ def _check_range_conflicts(subset, a, itersym, b, step): # If False or indeterminate, the range may # overlap across iterations - if ((re - rb) > m[a] * step) != False: + if ((re - rb) >= m[a] * step) != False: continue m = re.match(a * itersym + b) From d3a969666bf82c4677f6e31a888995eb1cd4dbb9 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 22 Mar 2023 17:49:28 -0700 Subject: [PATCH 014/392] Fix report reading with threads --- dace/optimization/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/optimization/utils.py b/dace/optimization/utils.py index 25aaca91db..2f415cd949 100644 --- a/dace/optimization/utils.py +++ b/dace/optimization/utils.py @@ -151,7 +151,7 @@ def _subprocess_measure(cutout_json: Dict, dreport, repetitions: int, q: mp.Queu csdfg.finalize() report = cutout.get_latest_report() - durations = next(iter(next(iter(report.durations.values())).values())) + durations = next(iter(next(iter(next(iter(report.durations.values())).values())).values())) q.put(np.median(np.array(durations))) class MeasureProcess(mp.Process): From d49473ee1add7a91f0684d4ed56a143176f69a03 Mon Sep 17 00:00:00 2001 From: Luo-Yihang Date: Thu, 23 Mar 2023 14:23:12 +0800 Subject: [PATCH 015/392] add profiling for jocobi_1d --- samples/distributed/jacobi_1d.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/samples/distributed/jacobi_1d.py b/samples/distributed/jacobi_1d.py index 6e0478981a..3361f73237 100644 --- a/samples/distributed/jacobi_1d.py +++ b/samples/distributed/jacobi_1d.py @@ -4,6 +4,7 @@ import numpy as np import os import sys +import timeit from dace.sdfg.utils import load_precompiled_sdfg from mpi4py import MPI @@ -78,6 +79,10 @@ def init_data(N, datatype): return A, B +def time_to_ms(raw): + return int(round(raw * 1000)) + + if __name__ == "__main__": # Initialization @@ -104,10 +109,33 @@ def init_data(N, datatype): build_folder = dc.Config.get('default_build_folder') mpi_func = load_precompiled_sdfg(os.path.join(build_folder, jacobi_1d_dist.name)) + ldict = locals() + + comm.Barrier() + mpi_func(A=A, B=B, TSTEPS=TSTEPS, N=N, lN=lN, rank=rank, size=size) + comm.Barrier() + + stmt = 
("mpi_func(A=A, B=B, TSTEPS=TSTEPS, N=N, " + "lN=lN, rank=rank, size=size)") + setup = "A, B = init_data(N, np.float64); comm.Barrier()" + repeat = 10 + + raw_time_list = timeit.repeat(stmt, setup=setup, repeat=repeat, number=1, globals=ldict) + raw_time = np.median(raw_time_list) + + comm.Barrier() + if rank == 0: + ms_time = time_to_ms(raw_time) + print("Median is {}ms".format(ms_time)) + refA, refB = init_data(N, np.float64) - jacobi_1d_shared(TSTEPS, refA, refB) + shared_sdfg = jacobi_1d_shared.compile() + shared_sdfg(A=refA, B=refB, TSTEPS=TSTEPS, N=N) + + print("=======Validation=======") assert (np.allclose(A, refA)) assert (np.allclose(B, refB)) + print("OK") From 00b94c924a0dfc716821c1c65fcf44b368e802f9 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 23 Mar 2023 03:29:44 -0700 Subject: [PATCH 016/392] Add node label to instrumentation report viewer (#1227) --- dace/codegen/instrumentation/report.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dace/codegen/instrumentation/report.py b/dace/codegen/instrumentation/report.py index 3aad20bcd4..cb0b545784 100644 --- a/dace/codegen/instrumentation/report.py +++ b/dace/codegen/instrumentation/report.py @@ -194,7 +194,8 @@ def _get_runtimes_string(self, string, row_format, colw, - with_element_heading=True): + with_element_heading=True, + title=''): indent = '' if len(runtimes) > 0: element_label = '' @@ -208,7 +209,10 @@ def _get_runtimes_string(self, # No parent state row present yet, print it. string += row_format.format('|-State (' + str(element[1]) + ')', '', '', '', '', width=colw) state = element[1] - element_label = '| |-Node (' + str(element[2]) + ')' + if title: + element_label = '| |-Node (' + str(element[2]) + ', ' + title + ')' + else: + element_label = '| |-Node (' + str(element[2]) + ')' indent = '| | |' elif element[0] > -1 and element[1] > -1: # This element is a state. @@ -350,7 +354,7 @@ def __str__(self): label = "" string, sdfg, state = self._get_runtimes_string(label, runtimes, element, sdfg, state, string, - row_format, COLW, with_element_heading) + row_format, COLW, with_element_heading, event) with_element_heading = False From 05a6e0dfdee33e0e6c293bfa9ddafaa91c569c37 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 23 Mar 2023 21:38:54 +0100 Subject: [PATCH 017/392] Adding output arrays to input arrays if they are not written fully in the SDFG. --- .../interstate/gpu_transform_sdfg.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/dace/transformation/interstate/gpu_transform_sdfg.py b/dace/transformation/interstate/gpu_transform_sdfg.py index 06dae1a611..b0d8b58329 100644 --- a/dace/transformation/interstate/gpu_transform_sdfg.py +++ b/dace/transformation/interstate/gpu_transform_sdfg.py @@ -1,7 +1,7 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. """ Contains inter-state transformations of an SDFG to run on the GPU. """ -from dace import data, memlet, dtypes, registry, sdfg as sd, symbolic +from dace import data, memlet, dtypes, registry, sdfg as sd, symbolic, subsets as sbs, propagate_memlets_sdfg from dace.sdfg import nodes, scope from dace.sdfg import utils as sdutil from dace.transformation import transformation, helpers as xfh @@ -162,6 +162,9 @@ def apply(self, _, sdfg: sd.SDFG): output_nodes = [] global_code_nodes: Dict[sd.SDFGState, nodes.Tasklet] = defaultdict(list) + # Propagate memlets to ensure that we can find the true array subsets that are written. 
+ propagate_memlets_sdfg(sdfg) + for state in sdfg.nodes(): sdict = state.scope_dict() for node in state.nodes(): @@ -214,6 +217,26 @@ def apply(self, _, sdfg: sd.SDFG): name = sdfg.add_datadesc('gpu_' + onodename, newdesc, find_new_name=True) cloned_arrays[onodename] = name + # The following ensures that when writing to a subset of an array, we don't overwrite the rest of the array + # when copying back to the host. This is done by adding the array to the `inputs_nodes,` while will copy + # the entire array to the GPU. + if (onodename, onode) not in input_nodes: + found_full_write = False + full_subset = sbs.Range.from_array(onode) + try: + for state in sdfg.nodes(): + for node in state.nodes(): + if (isinstance(node, nodes.AccessNode) and node.data == onodename): + for e in state.in_edges(node): + if e.data.get_dst_subset(e, state) == full_subset: + found_full_write = True + raise StopIteration + except StopIteration: + assert found_full_write + if not found_full_write: + input_nodes.append((onodename, onode)) + + # Replace nodes for state in sdfg.nodes(): for node in state.nodes(): From 665a94aceeb93bb95d35b4abfe88b25828d3dfe3 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 23 Mar 2023 21:39:17 +0100 Subject: [PATCH 018/392] Added test. --- tests/transformations/gpu_transform_test.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/transformations/gpu_transform_test.py b/tests/transformations/gpu_transform_test.py index d6814273a6..8ecdcc186e 100644 --- a/tests/transformations/gpu_transform_test.py +++ b/tests/transformations/gpu_transform_test.py @@ -58,6 +58,26 @@ def main_program(a: dace.int32): assert np.array_equal(out, np.array([0, 10] * 5, dtype=np.int32)) +def test_write_subset(): + + @dace.program + def write_subset(A: dace.int32[20, 20]): + for i, j in dace.map[2:18, 2:18]: + A[i, j] = i + j + + sdfg = write_subset.to_sdfg(simplify=True) + sdfg.apply_transformations(GPUTransformSDFG) + + ref = np.ones((20, 20), dtype=np.int32) + val = np.copy(ref) + + write_subset.f(ref) + sdfg(A=val) + + assert np.array_equal(ref, val) + + if __name__ == '__main__': test_toplevel_transient_lifetime() test_scalar_to_symbol_in_nested_sdfg() + test_write_subset() From c9e59a07ce22326ab493794531e9602cf95a417d Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 23 Mar 2023 14:06:49 -0700 Subject: [PATCH 019/392] Fix internal subscript access if already existed (#1228) --- dace/frontend/python/newast.py | 22 ++++++++------- tests/numpy/subarray_in_nested_call_test.py | 31 ++++++++++++++++++++- tests/python_frontend/indirections_test.py | 19 +++++++++++++ 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 9789a433e2..d9a6458bf8 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -3020,23 +3020,25 @@ def _add_read_access(self, arr_type: data.Data = None): if name in self.sdfg.arrays: return (name, None) - elif (name, rng, 'w') in self.accesses: - return self.accesses[(name, rng, 'w')] - elif (name, rng, 'r') in self.accesses: - return self.accesses[(name, rng, 'r')] elif name in self.variables: return (self.variables[name], None) + + if (name, rng, 'w') in self.accesses: + new_name, new_rng = self.accesses[(name, rng, 'w')] + elif (name, rng, 'r') in self.accesses: + new_name, new_rng = self.accesses[(name, rng, 'r')] elif name in self.scope_vars: new_name, new_rng = self._add_access(name, rng, 'r', target, new_name, arr_type) - 
full_rng = subsets.Range.from_array(self.sdfg.arrays[new_name]) - if (_subset_has_indirection(rng, self) or _subset_is_local_symbol_dependent(rng, self)): - new_name, new_rng = self.make_slice(new_name, rng) - elif full_rng != new_rng: - new_name, new_rng = self.make_slice(new_name, new_rng) - return (new_name, new_rng) else: raise NotImplementedError + full_rng = subsets.Range.from_array(self.sdfg.arrays[new_name]) + if (_subset_has_indirection(rng, self) or _subset_is_local_symbol_dependent(rng, self)): + new_name, new_rng = self.make_slice(new_name, rng) + elif full_rng != new_rng: + new_name, new_rng = self.make_slice(new_name, new_rng) + return (new_name, new_rng) + def _add_write_access(self, name: str, rng: subsets.Range, diff --git a/tests/numpy/subarray_in_nested_call_test.py b/tests/numpy/subarray_in_nested_call_test.py index 09227b3f83..322c660da0 100644 --- a/tests/numpy/subarray_in_nested_call_test.py +++ b/tests/numpy/subarray_in_nested_call_test.py @@ -1,4 +1,5 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import math import numpy as np import dace @@ -45,6 +46,34 @@ def test_inout_connector(): assert np.allclose(a, ref) +def test_indirect_symbolic_access(): + + @dace.program + def tester(a: dace.float64[20], b: dace.float64[3], c: dace.float64[19]): + for i in dace.map[0:10]: + xtmp: dace.float64 = 0 + + local_offset_i = (i + 1) % 2 + + if local_offset_i < 3 % 2: + for kx in dace.unroll(range(math.ceil(3 / 2))): + ind_i = (i + 1 - kx * 2) // 2 + kind_i = local_offset_i + kx * 2 + if ind_i >= 0 and ind_i < 20: + xtmp += a[ind_i] * b[kind_i] + + c[i] = xtmp + + a = np.random.rand(20) + b = np.random.rand(15) + c = np.random.rand(10) + refc = np.copy(c) + tester.f(a, b, refc) + tester(a, b, c) + assert np.allclose(c, refc) + + if __name__ == '__main__': test() test_inout_connector() + test_indirect_symbolic_access() diff --git a/tests/python_frontend/indirections_test.py b/tests/python_frontend/indirections_test.py index 0ea6cf5cf0..c59dffb922 100644 --- a/tests/python_frontend/indirections_test.py +++ b/tests/python_frontend/indirections_test.py @@ -59,6 +59,23 @@ def test_indirection_scalar_nsdfg(): assert (np.allclose(res, A[x])) +@dc.program +def indirection_scalar2_nsdfg(A: dc.float64[10], x: dc.int32[10]): + B = np.empty_like(A) + for i in dc.map[0:A.shape[0]]: + a = x[i] + B[i] = A[a] + B[i] = A[a] + return B + + +def test_indirection_scalar2_nsdfg(): + A = np.random.randn(10).astype(np.float64) + x = np.random.randint(0, 10, size=(10, ), dtype=np.int32) + res = indirection_scalar2_nsdfg(A, x) + assert (np.allclose(res, A[x])) + + @dc.program def indirection_scalar_assign_nsdfg(A: dc.float64[10], x: dc.int32[10]): B = np.empty_like(A) @@ -169,6 +186,7 @@ def test_indirection_scalar_range(): def test_indirection_scalar_range_nsdfg(): + @dc.program def indirection_scalar_range_nsdfg(A: dc.float64[10], x: dc.int32[11]): B = np.empty_like(A) @@ -374,6 +392,7 @@ def test_spmv(): test_indirection_scalar_assign() test_indirection_scalar_augassign() test_indirection_scalar_nsdfg() + test_indirection_scalar2_nsdfg() test_indirection_scalar_assign_nsdfg() test_indirection_scalar_augassign_nsdfg() test_indirection_scalar_multi() From 9a7f90fd336a4311159f9328dbd328cf9962b22d Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 23 Mar 2023 14:27:30 -0700 Subject: [PATCH 020/392] Add test --- tests/codegen/wcr_atomic_test.py | 61 ++++++++++++++++++++++++++++++++ 1 file 
changed, 61 insertions(+) create mode 100644 tests/codegen/wcr_atomic_test.py diff --git a/tests/codegen/wcr_atomic_test.py b/tests/codegen/wcr_atomic_test.py new file mode 100644 index 0000000000..005561ccb2 --- /dev/null +++ b/tests/codegen/wcr_atomic_test.py @@ -0,0 +1,61 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Tests atomic WCR detection in code generation. """ +import dace +import numpy as np + +N = dace.symbol('N') + + +def test_wcr_overlapping_atomic(): + + @dace.program + def tester(A: dace.float32[2 * N + 3]): + for i in dace.map[0:N]: + A[2 * i:2 * i + 3] += 1 + + sdfg = tester.to_sdfg() + code: str = sdfg.generate_code()[0].code + assert code.count('atomic') == 1 + + +def test_wcr_strided_atomic(): + + @dace.program + def tester(A: dace.float32[2 * N]): + for i in dace.map[1:N - 1]: + A[2 * i - 1:2 * i + 2] += 1 + + sdfg = tester.to_sdfg() + code: str = sdfg.generate_code()[0].code + assert code.count('atomic') == 1 + + +def test_wcr_strided_nonatomic(): + + @dace.program + def tester(A: dace.float32[2 * N + 3]): + for i in dace.map[0:N]: + A[2 * i:2 * i + 2] += 1 + + sdfg = tester.to_sdfg() + code: str = sdfg.generate_code()[0].code + assert code.count('atomic') == 0 + + +def test_wcr_strided_nonatomic_offset(): + + @dace.program + def tester(A: dace.float32[2 * N]): + for i in dace.map[1:N - 1]: + A[2 * i - 1:2 * i + 1] += 1 + + sdfg = tester.to_sdfg() + code: str = sdfg.generate_code()[0].code + assert code.count('atomic') == 0 + + +if __name__ == '__main__': + test_wcr_overlapping_atomic() + test_wcr_strided_atomic() + test_wcr_strided_nonatomic() + test_wcr_strided_nonatomic_offset() From b006b3694624b3142ceab4d9b7cfcdb476c39a01 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 23 Mar 2023 14:30:07 -0700 Subject: [PATCH 021/392] Fix test --- tests/codegen/wcr_atomic_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/codegen/wcr_atomic_test.py b/tests/codegen/wcr_atomic_test.py index 005561ccb2..f1495282b9 100644 --- a/tests/codegen/wcr_atomic_test.py +++ b/tests/codegen/wcr_atomic_test.py @@ -15,7 +15,7 @@ def tester(A: dace.float32[2 * N + 3]): sdfg = tester.to_sdfg() code: str = sdfg.generate_code()[0].code - assert code.count('atomic') == 1 + assert code.count('atomic(') == 1 def test_wcr_strided_atomic(): @@ -27,7 +27,7 @@ def tester(A: dace.float32[2 * N]): sdfg = tester.to_sdfg() code: str = sdfg.generate_code()[0].code - assert code.count('atomic') == 1 + assert code.count('atomic(') == 1 def test_wcr_strided_nonatomic(): @@ -39,7 +39,7 @@ def tester(A: dace.float32[2 * N + 3]): sdfg = tester.to_sdfg() code: str = sdfg.generate_code()[0].code - assert code.count('atomic') == 0 + assert code.count('atomic(') == 0 def test_wcr_strided_nonatomic_offset(): @@ -51,7 +51,7 @@ def tester(A: dace.float32[2 * N]): sdfg = tester.to_sdfg() code: str = sdfg.generate_code()[0].code - assert code.count('atomic') == 0 + assert code.count('atomic(') == 0 if __name__ == '__main__': From feaa964329a995a8dd2c173c58a5af4c43a22db8 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 23 Mar 2023 22:39:43 +0100 Subject: [PATCH 022/392] Improved condition. 
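The full-write check now also walks the memlet tree of the candidate edge and rejects it if any memlet is dynamic or its volume is not an exact multiple of the written subset size. In particular, indirect writes such as the following (inside a @dace.program; this is the case exercised by the tests added in the next commit) no longer count as overwriting the whole array, so the array is still copied to the GPU before the kernel runs:

    for i, j in dace.map[2:18, 2:18]:
        A[x[i], y[j]] = i + j  # dynamic subset: not a full overwrite of A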
--- .../transformation/interstate/gpu_transform_sdfg.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/dace/transformation/interstate/gpu_transform_sdfg.py b/dace/transformation/interstate/gpu_transform_sdfg.py index b0d8b58329..7d0e7be4d2 100644 --- a/dace/transformation/interstate/gpu_transform_sdfg.py +++ b/dace/transformation/interstate/gpu_transform_sdfg.py @@ -8,6 +8,7 @@ from dace.properties import Property, make_properties from collections import defaultdict from copy import deepcopy as dc +from sympy import floor from typing import Dict gpu_storage = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned] @@ -218,7 +219,7 @@ def apply(self, _, sdfg: sd.SDFG): cloned_arrays[onodename] = name # The following ensures that when writing to a subset of an array, we don't overwrite the rest of the array - # when copying back to the host. This is done by adding the array to the `inputs_nodes,` while will copy + # when copying back to the host. This is done by adding the array to the `inputs_nodes,` which will copy # the entire array to the GPU. if (onodename, onode) not in input_nodes: found_full_write = False @@ -229,6 +230,15 @@ def apply(self, _, sdfg: sd.SDFG): if (isinstance(node, nodes.AccessNode) and node.data == onodename): for e in state.in_edges(node): if e.data.get_dst_subset(e, state) == full_subset: + is_full = True + for pe in state.memlet_tree(e): + vol = pe.data.volume + size = pe.data.get_dst_subset(pe, state).num_elements() + if pe.data.dynamic or vol / size != floor(vol / size): + is_full = False + break + if not is_full: + continue found_full_write = True raise StopIteration except StopIteration: @@ -236,7 +246,6 @@ def apply(self, _, sdfg: sd.SDFG): if not found_full_write: input_nodes.append((onodename, onode)) - # Replace nodes for state in sdfg.nodes(): for node in state.nodes(): From 883caa705da8a788acf2358ac12e99999b2c7e8b Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 23 Mar 2023 22:39:57 +0100 Subject: [PATCH 023/392] Added more tests. 
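The condition added above only accepts a write as covering the whole array when, along the memlet tree, the propagated volume is an exact multiple of the destination subset size and the memlet is not dynamic. A symbolic sanity check of that divisibility test, using the same sympy floor the patch imports (the volumes and sizes below are illustrative assumptions, not taken from the patch):

    import sympy
    from sympy import floor

    N = sympy.Symbol('N', positive=True, integer=True)

    def may_be_full_write(dynamic, vol, size):
        # Same test as inside the new memlet-tree loop: a dynamic memlet, or a volume
        # that is not an exact multiple of the subset size, disqualifies the write.
        return not (dynamic or vol / size != floor(vol / size))

    # N map iterations, each writing one full row of an (N, N) array:
    print(may_be_full_write(False, N * N, N))          # True  -> no host copy-in needed
    # N map iterations, each writing 3 elements of a (2*N + 3)-element array:
    print(may_be_full_write(False, 3 * N, 2 * N + 3))  # False -> array is still copied in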
--- tests/transformations/gpu_transform_test.py | 52 +++++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/tests/transformations/gpu_transform_test.py b/tests/transformations/gpu_transform_test.py index 8ecdcc186e..a0bf072e5d 100644 --- a/tests/transformations/gpu_transform_test.py +++ b/tests/transformations/gpu_transform_test.py @@ -45,13 +45,13 @@ def nested_program(a: dace.int32, out: dace.int32[10]): else: out[i] = 10 a /= 2 - + @dace.program def main_program(a: dace.int32): - out = np.ndarray((10,), dtype=np.int32) + out = np.ndarray((10, ), dtype=np.int32) nested_program(a, out) return out - + sdfg = main_program.to_sdfg(simplify=False) sdfg.apply_transformations(GPUTransformSDFG) out = sdfg(a=4) @@ -64,7 +64,7 @@ def test_write_subset(): def write_subset(A: dace.int32[20, 20]): for i, j in dace.map[2:18, 2:18]: A[i, j] = i + j - + sdfg = write_subset.to_sdfg(simplify=True) sdfg.apply_transformations(GPUTransformSDFG) @@ -77,7 +77,51 @@ def write_subset(A: dace.int32[20, 20]): assert np.array_equal(ref, val) +@pytest.mark.gpu +def test_write_full(): + + M, N = dace.symbol('M'), dace.symbol('N') + + @dace.program + def write_full(A: dace.int32[M, N]): + for i, j in dace.map[0:M, 0:N]: + A[i, j] = i + j + + sdfg = write_full.to_sdfg(simplify=True) + sdfg.apply_transformations(GPUTransformSDFG) + + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, dace.nodes.AccessNode) and node.data == 'A': + assert state.out_degree(node) == 0 + + +@pytest.mark.gpu +def test_write_subset_dynamic(): + + @dace.program + def write_subset_dynamic(A: dace.int32[20, 20], x: dace.int32[20], y: dace.int32[20]): + for i, j in dace.map[2:18, 2:18]: + A[x[i], y[j]] = i + j + + sdfg = write_subset_dynamic.to_sdfg(simplify=True) + sdfg.apply_transformations(GPUTransformSDFG) + + ref = np.ones((20, 20), dtype=np.int32) + val = np.copy(ref) + + x = np.random.permutation(20).astype(np.int32) + y = np.random.permutation(20).astype(np.int32) + + write_subset_dynamic.f(ref, x, y) + sdfg(A=val, x=x, y=y) + + assert np.array_equal(ref, val) + + if __name__ == '__main__': test_toplevel_transient_lifetime() test_scalar_to_symbol_in_nested_sdfg() test_write_subset() + test_write_full() + test_write_subset_dynamic() From 853de8ccf3294486529f41e938d8366a3cc8b313 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 23 Mar 2023 23:27:33 +0100 Subject: [PATCH 024/392] Moved the GPU mark to the correct test. 
--- tests/transformations/gpu_transform_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/transformations/gpu_transform_test.py b/tests/transformations/gpu_transform_test.py index a0bf072e5d..f6d299e630 100644 --- a/tests/transformations/gpu_transform_test.py +++ b/tests/transformations/gpu_transform_test.py @@ -58,6 +58,7 @@ def main_program(a: dace.int32): assert np.array_equal(out, np.array([0, 10] * 5, dtype=np.int32)) +@pytest.mark.gpu def test_write_subset(): @dace.program @@ -77,7 +78,6 @@ def write_subset(A: dace.int32[20, 20]): assert np.array_equal(ref, val) -@pytest.mark.gpu def test_write_full(): M, N = dace.symbol('M'), dace.symbol('N') From 7782956e304337d9611cf718a3e9410f64aed393 Mon Sep 17 00:00:00 2001 From: Philipp Schaad Date: Fri, 24 Mar 2023 10:06:36 +0100 Subject: [PATCH 025/392] Scalar Write Shadow Pass Hotfix (#1233) --- dace/transformation/passes/analysis.py | 16 ++- ...calar_write_shadow_scopes_analysis_test.py | 101 ++++++++++++++---- 2 files changed, 92 insertions(+), 25 deletions(-) diff --git a/dace/transformation/passes/analysis.py b/dace/transformation/passes/analysis.py index 9f9fb06d23..1ca92d5ffd 100644 --- a/dace/transformation/passes/analysis.py +++ b/dace/transformation/passes/analysis.py @@ -324,12 +324,13 @@ def depends_on(self): def _find_dominating_write( self, desc: str, state: SDFGState, read: Union[nd.AccessNode, InterstateEdge], access_nodes: Dict[SDFGState, Tuple[Set[nd.AccessNode], Set[nd.AccessNode]]], - state_idom: Dict[SDFGState, SDFGState], access_sets: Dict[SDFGState, Tuple[Set[str], Set[str]]] + state_idom: Dict[SDFGState, SDFGState], access_sets: Dict[SDFGState, Tuple[Set[str], Set[str]]], + no_self_shadowing: bool = False ) -> Optional[Tuple[SDFGState, nd.AccessNode]]: if isinstance(read, nd.AccessNode): # If the read is also a write, it shadows itself. iedges = state.in_edges(read) - if len(iedges) > 0 and any(not e.data.is_empty() for e in iedges): + if len(iedges) > 0 and any(not e.data.is_empty() for e in iedges) and not no_self_shadowing: return (state, read) # Find a dominating write within the same state. @@ -337,7 +338,7 @@ def _find_dominating_write( closest_candidate = None write_nodes = access_nodes[desc][state][1] for cand in write_nodes: - if nxsp.has_path(state._nx, cand, read): + if cand != read and nxsp.has_path(state._nx, cand, read): if closest_candidate is None or nxsp.has_path(state._nx, closest_candidate, cand): closest_candidate = cand if closest_candidate is not None: @@ -411,6 +412,15 @@ def apply_pass(self, top_sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[i desc, state, oedge.data, access_nodes, idom, access_sets ) result[desc][write].add((state, oedge.data)) + # Take care of any write nodes that have not been assigned to a scope yet, i.e., writes that are not + # dominating any reads and are thus not part of the results yet. + for state in desc_states_with_nodes: + for write_node in access_nodes[desc][state][1]: + if not (state, write_node) in result[desc]: + write = self._find_dominating_write( + desc, state, write_node, access_nodes, idom, access_sets, no_self_shadowing=True + ) + result[desc][write].add((state, write_node)) # If any write A is dominated by another write B and any reads in B's scope are also reachable by A, # then merge A and its scope into B's scope. 
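The hotfix above leans on the immediate-dominator tree of the state graph: only a write in a dominating state can shadow a read, and no_self_shadowing keeps a read-write access node from being chosen as its own dominating write. DaCe obtains the dominator tree from its own cfg utilities; conceptually it is the same information networkx computes, as in this small illustration (state names are invented):

    import networkx as nx

    # Diamond-shaped state graph: a write in 'guard' dominates both branches,
    # but a write in 'left' does not dominate 'merge'.
    g = nx.DiGraph([('init', 'guard'), ('guard', 'left'), ('guard', 'merge'), ('left', 'merge')])
    idom = nx.immediate_dominators(g, 'init')
    print(idom['merge'])  # 'guard' -> only the write in 'guard' can shadow reads in 'merge'
    print(idom['left'])   # 'guard'

This is exactly the situation exercised by the test_dominationless_write_branch test added in the diff that follows.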
diff --git a/tests/passes/scalar_write_shadow_scopes_analysis_test.py b/tests/passes/scalar_write_shadow_scopes_analysis_test.py index 9d8fed54c0..b833a12a94 100644 --- a/tests/passes/scalar_write_shadow_scopes_analysis_test.py +++ b/tests/passes/scalar_write_shadow_scopes_analysis_test.py @@ -94,12 +94,16 @@ def test_scalar_write_shadow_split(): pipeline = Pipeline([ScalarWriteShadowScopes()]) results = pipeline.apply_pass(sdfg, {})[ScalarWriteShadowScopes.__name__] - assert results[0]['tmp'][(loop_1_1, tmp1_write)] == set([(loop_1_2, loop1_read_tmp)]) - assert results[0]['tmp'][(loop_2_1, tmp2_write)] == set([(loop_2_2, loop2_read_tmp)]) - assert results[0]['A'][None] == set([(loop_1_1, a1_read), (loop_1_2, loop1_read_a), (loop_2_1, a2_read), - (loop_2_2, loop2_read_a)]) - assert results[0]['B'][None] == set([(loop_1_1, b1_read), (loop_1_2, loop1_read_b), (loop_2_1, b2_read), - (loop_2_2, loop2_read_b)]) + assert results[0]['tmp'][(loop_1_1, tmp1_write)] == {(loop_1_2, loop1_read_tmp)} + assert results[0]['tmp'][(loop_2_1, tmp2_write)] == {(loop_2_2, loop2_read_tmp)} + assert results[0]['A'][None] == { + (loop_1_1, a1_read), (loop_1_2, loop1_read_a), (loop_2_1, a2_read), (loop_2_2, loop2_read_a), + (loop_2_2, loop2_write_a), (loop_1_2, loop1_write_a) + } + assert results[0]['B'][None] == { + (loop_1_1, b1_read), (loop_1_2, loop1_read_b), (loop_2_1, b2_read), (loop_2_2, loop2_read_b), + (loop_2_2, loop2_write_b), (loop_1_2, loop1_write_b) + } def test_scalar_write_shadow_fused(): @@ -176,10 +180,10 @@ def test_scalar_write_shadow_fused(): pipeline = Pipeline([ScalarWriteShadowScopes()]) results = pipeline.apply_pass(sdfg, {})[ScalarWriteShadowScopes.__name__] - assert results[0]['tmp'][(loop_1, tmp1_read_write)] == set([(loop_1, tmp1_read_write)]) - assert results[0]['tmp'][(loop_2, tmp2_read_write)] == set([(loop_2, tmp2_read_write)]) - assert results[0]['A'][None] == set([(loop_1, a1_read), (loop_2, a2_read)]) - assert results[0]['B'][None] == set([(loop_1, b1_read), (loop_2, b2_read)]) + assert results[0]['tmp'][(loop_1, tmp1_read_write)] == {(loop_1, tmp1_read_write)} + assert results[0]['tmp'][(loop_2, tmp2_read_write)] == {(loop_2, tmp2_read_write)} + assert results[0]['A'][None] == {(loop_1, a1_read), (loop_2, a2_read), (loop_1, a1_write), (loop_2, a2_write)} + assert results[0]['B'][None] == {(loop_1, b1_read), (loop_2, b2_read), (loop_1, b1_write), (loop_2, b2_write)} def test_scalar_write_shadow_interstate_self(): @@ -270,12 +274,16 @@ def test_scalar_write_shadow_interstate_self(): pipeline = Pipeline([ScalarWriteShadowScopes()]) results = pipeline.apply_pass(sdfg, {})[ScalarWriteShadowScopes.__name__] - assert results[0]['tmp'][(loop_1_1, tmp1_write)] == set([(loop_1_2, loop1_read_tmp), (loop_1_1, tmp1_edge)]) - assert results[0]['tmp'][(loop_2_1, tmp2_write)] == set([(loop_2_2, loop2_read_tmp), (loop_2_1, tmp2_edge)]) - assert results[0]['A'][None] == set([(loop_1_1, a1_read), (loop_1_2, loop1_read_a), (loop_2_1, a2_read), - (loop_2_2, loop2_read_a)]) - assert results[0]['B'][None] == set([(loop_1_1, b1_read), (loop_1_2, loop1_read_b), (loop_2_1, b2_read), - (loop_2_2, loop2_read_b)]) + assert results[0]['tmp'][(loop_1_1, tmp1_write)] == {(loop_1_2, loop1_read_tmp), (loop_1_1, tmp1_edge)} + assert results[0]['tmp'][(loop_2_1, tmp2_write)] == {(loop_2_2, loop2_read_tmp), (loop_2_1, tmp2_edge)} + assert results[0]['A'][None] == { + (loop_1_1, a1_read), (loop_1_2, loop1_read_a), (loop_2_1, a2_read), (loop_2_2, loop2_read_a), + (loop_1_2, loop1_write_a), (loop_2_2, loop2_write_a) 
+ } + assert results[0]['B'][None] == { + (loop_1_1, b1_read), (loop_1_2, loop1_read_b), (loop_2_1, b2_read), (loop_2_2, loop2_read_b), + (loop_1_2, loop1_write_b), (loop_2_2, loop2_write_b) + } def test_scalar_write_shadow_interstate_pred(): @@ -370,12 +378,16 @@ def test_scalar_write_shadow_interstate_pred(): pipeline = Pipeline([ScalarWriteShadowScopes()]) results = pipeline.apply_pass(sdfg, {})[ScalarWriteShadowScopes.__name__] - assert results[0]['tmp'][(loop_1_1, tmp1_write)] == set([(loop_1_3, loop1_read_tmp), (loop_1_2, tmp1_edge)]) - assert results[0]['tmp'][(loop_2_1, tmp2_write)] == set([(loop_2_3, loop2_read_tmp), (loop_2_2, tmp2_edge)]) - assert results[0]['A'][None] == set([(loop_1_1, a1_read), (loop_1_3, loop1_read_a), (loop_2_1, a2_read), - (loop_2_3, loop2_read_a)]) - assert results[0]['B'][None] == set([(loop_1_1, b1_read), (loop_1_3, loop1_read_b), (loop_2_1, b2_read), - (loop_2_3, loop2_read_b)]) + assert results[0]['tmp'][(loop_1_1, tmp1_write)] == {(loop_1_3, loop1_read_tmp), (loop_1_2, tmp1_edge)} + assert results[0]['tmp'][(loop_2_1, tmp2_write)] == {(loop_2_3, loop2_read_tmp), (loop_2_2, tmp2_edge)} + assert results[0]['A'][None] == { + (loop_1_1, a1_read), (loop_1_3, loop1_read_a), (loop_2_1, a2_read), (loop_2_3, loop2_read_a), + (loop_1_3, loop1_write_a), (loop_2_3, loop2_write_a) + } + assert results[0]['B'][None] == { + (loop_1_1, b1_read), (loop_1_3, loop1_read_b), (loop_2_1, b2_read), (loop_2_3, loop2_read_b), + (loop_1_3, loop1_write_b), (loop_2_3, loop2_write_b) + } def test_loop_fake_shadow(): @@ -509,6 +521,50 @@ def test_loop_real_shadow(): assert res[0]['A'][(loop2, loop2_access)] == {(loop2, loop2_access)} +def test_dominationless_write_branch(): + sdfg = dace.SDFG('dominationless_write_branch') + sdfg.add_array('A', [1], dace.float64, transient=True) + sdfg.add_array('B', [1], dace.float64) + + init = sdfg.add_state('init') + guard = sdfg.add_state('guard') + left = sdfg.add_state('left') + merge = sdfg.add_state('merge') + + init_a = init.add_access('A') + init_b = init.add_access('B') + init_t1 = init.add_tasklet('init_1', {}, {'a'}, 'a = 0') + init_t2 = init.add_tasklet('init_1', {'a'}, {'b'}, 'b = a + 1') + init.add_edge(init_t1, 'a', init_a, None, dace.Memlet('A[0]')) + init.add_edge(init_a, None, init_t2, 'a', dace.Memlet('A[0]')) + init.add_edge(init_t2, 'b', init_b, None, dace.Memlet('B[0]')) + + guard_a = guard.add_access('A') + guard_t1 = guard.add_tasklet('guard_1', {}, {'a'}, 'a = 1') + guard.add_edge(guard_t1, 'a', guard_a, None, dace.Memlet('A[0]')) + + left_a = left.add_access('A') + left_t1 = left.add_tasklet('left_1', {}, {'a'}, 'a = 2') + left.add_edge(left_t1, 'a', left_a, None, dace.Memlet('A[0]')) + + merge_a = merge.add_access('A') + merge_b = merge.add_access('B') + merge_t1 = merge.add_tasklet('merge_1', {'a'}, {'b'}, 'b = a + 1') + merge.add_edge(merge_a, None, merge_t1, 'a', dace.Memlet('A[0]')) + merge.add_edge(merge_t1, 'b', merge_b, None, dace.Memlet('B[0]')) + + sdfg.add_edge(init, guard, dace.InterstateEdge()) + sdfg.add_edge(guard, left, dace.InterstateEdge(condition='B[0] < 10')) + sdfg.add_edge(guard, merge, dace.InterstateEdge(condition='B[0] >= 10')) + sdfg.add_edge(left, merge, dace.InterstateEdge()) + + ppl = Pipeline([ScalarWriteShadowScopes()]) + res = ppl.apply_pass(sdfg, {})[ScalarWriteShadowScopes.__name__] + + assert res[0]['A'][(init, init_a)] == {(init, init_a)} + assert res[0]['A'][(guard, guard_a)] == {(merge, merge_a), (left, left_a)} + + if __name__ == '__main__': test_scalar_write_shadow_split() 
test_scalar_write_shadow_fused() @@ -517,3 +573,4 @@ def test_loop_real_shadow(): test_loop_fake_shadow() test_loop_fake_complex_shadow() test_loop_real_shadow() + test_dominationless_write_branch() From be86f043bb15079b9a37ca1d4e58eb52ce0ccc94 Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Sat, 25 Mar 2023 14:53:59 +0100 Subject: [PATCH 026/392] LIKWID: increase num_events --- dace/codegen/instrumentation/likwid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/codegen/instrumentation/likwid.py b/dace/codegen/instrumentation/likwid.py index f87b6c52c2..b40d27546f 100644 --- a/dace/codegen/instrumentation/likwid.py +++ b/dace/codegen/instrumentation/likwid.py @@ -62,7 +62,7 @@ def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen): #include #include -#define MAX_NUM_EVENTS 64 +#define MAX_NUM_EVENTS 256 ''' global_stream.write(header_code, sdfg) From ce44817362097c651021641406c9777845bee41d Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Sat, 25 Mar 2023 14:54:14 +0100 Subject: [PATCH 027/392] Minor fix in cutout API --- dace/sdfg/analysis/cutout.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dace/sdfg/analysis/cutout.py b/dace/sdfg/analysis/cutout.py index f1aaed48d7..b557cb185e 100644 --- a/dace/sdfg/analysis/cutout.py +++ b/dace/sdfg/analysis/cutout.py @@ -212,8 +212,7 @@ def singlestate_cutout(cls, # Remove remaining dangling connectors from scope nodes and add new data containers corresponding to accesses # for dangling connectors on other nodes. translation_add_pairs: Set[Tuple[nd.AccessNode, nd.AccessNode]] = set() - for orig_node in in_translation.keys(): - new_node = in_translation[orig_node] + for orig_node, new_node in in_translation.items(): if isinstance(new_node, nd.Node): if isinstance(orig_node, (nd.EntryNode, nd.ExitNode)): used_connectors = set(e.dst_conn for e in new_state.in_edges(new_node)) From d34c7eb04bb51c5c76b20fea700e5e104c9670ec Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Sat, 25 Mar 2023 14:54:56 +0100 Subject: [PATCH 028/392] MapTiling: Tile trivial check added --- dace/transformation/dataflow/tiling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/transformation/dataflow/tiling.py b/dace/transformation/dataflow/tiling.py index 44cac119f1..cd15997ca5 100644 --- a/dace/transformation/dataflow/tiling.py +++ b/dace/transformation/dataflow/tiling.py @@ -78,7 +78,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG): dim_idx -= removed_maps # If tile size is trivial, skip strip-mining map dimension - if tile_size == map_entry.map.range.size()[dim_idx]: + if not self.tile_trivial and tile_size == map_entry.map.range.size()[dim_idx]: continue stripmine = StripMining() From 5e055369639f1b983a4cfac0b64d6c17eee9aec2 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 27 Mar 2023 12:17:25 +0200 Subject: [PATCH 029/392] Added helper method for making internal Map writes external. 
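The one-line tiling.py change above makes the skip of trivial dimensions conditional: with tile_trivial left at its default the behaviour is unchanged, while setting it explicitly now strip-mines a dimension even when the tile size equals the full map range. A hedged usage sketch (the program, shape and tile size are made up for illustration):

    import dace
    import numpy as np
    from dace.transformation.dataflow import MapTiling

    @dace.program
    def scale(A: dace.float64[64]):
        for i in dace.map[0:64]:
            A[i] = 2 * A[i]

    sdfg = scale.to_sdfg()
    # Tile size equal to the map range: previously always skipped, now tiled on request.
    applied = sdfg.apply_transformations(MapTiling, options={'tile_sizes': (64, ), 'tile_trivial': True})
    print(applied)  # expected: 1

    A = np.ones(64)
    sdfg(A=A)
    assert np.allclose(A, 2.0)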
--- dace/transformation/helpers.py | 83 ++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index 417642619d..74c8fa84b0 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -1339,3 +1339,86 @@ def can_run_state_on_fpga(state: SDFGState): return False return True + + +def make_map_internal_write_external(sdfg: SDFG, state: SDFGState, map_exit: nodes.MapExit, access: nodes.AccessNode, + sink: nodes.AccessNode): + """ + Any writes to the Access node `access` that occur inside the Map with exit node `map_exit` are redirected to the + Access node `sink` that is outside the Map. This method will remove, if possible, `access` and replace it with a + transient. + + :param sdfg: The SDFG in which the Access node resides. + :param state: The State in which the Access node resides. + :param map_exit: The exit node of the Map. + :param access: The Access node being written inside the Map. + :param sink: The Access node to be written outside the Map. + """ + + # Special case for scalars: if there is no write conflict resolution, then abort, since it is implied that the + # scalar is thread-local. + if isinstance(access.desc(sdfg), data.Scalar): + if any(e.data.wcr is None for e in state.in_edges(access)): + return + + # Compute the union of the destination subsets of the edges that write to `access.` + in_union = None + for e in state.in_edges(access): + subset = e.data.get_dst_subset(e, state) + if in_union is None: + in_union = subset + else: + in_union = in_union.union(subset) + + # Check if the union covers the output edges of `access.` + covers_out = True + if in_union is None: + covers_out = False + else: + for e in state.out_edges(access): + subset = e.data.get_src_subset(e, state) + if not in_union.covers(subset): + covers_out = False + break + + # If the union covers the output edges of `access`, then we can remove `access` and replace it with a transient. + if covers_out: + shape = in_union.size() + if shape == [1]: + name, _ = sdfg.add_scalar(access.data, access.desc(sdfg).dtype, transient=True, find_new_name=True) + else: + name, _ = sdfg.add_array(access.data, shape, access.desc(sdfg).dtype, transient=True, find_new_name=True) + new_n = state.add_access(name) + for e in state.in_edges(access): + src_subset = e.data.get_src_subset(e, state) + dst_subset = e.data.get_dst_subset(e, state) + state.add_edge( + e.src, e.src_conn, new_n, None, + Memlet(data=name, subset=dst_subset.offset_new(dst_subset, negative=True), other_subset=src_subset)) + state.add_memlet_path(new_n, + map_exit, + sink, + memlet=Memlet(data=sink.data, + subset=copy.deepcopy(dst_subset), + other_subset=dst_subset.offset_new(dst_subset, negative=True))) + for e in state.out_edges(access): + src_subset = e.data.get_src_subset(e, state) + dst_subset = e.data.get_dst_subset(e, state) + state.add_edge(new_n, + None, + e.dst, + e.dst_conn, + memlet=Memlet(data=name, + subset=src_subset.offset(src_subset, negative=True), + other_subset=dst_subset)) + state.remove_node(access) + # Otherwise, we only add a memlet path to the sink. 
+ else: + for e in state.in_edges(access): + subset = e.data.get_dst_subset(e, state) + state.add_memlet_path(access, + map_exit, + sink, + memlet=Memlet(data=sink.data, + subset=copy.deepcopy(subset), + other_subset=copy.deepcopy(subset))) From ff59dae62277b711c6ee43eff3519dd6c496e57e Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 27 Mar 2023 12:17:46 +0200 Subject: [PATCH 030/392] LoopToMap now makes use of new helper method. --- dace/transformation/interstate/loop_to_map.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/dace/transformation/interstate/loop_to_map.py b/dace/transformation/interstate/loop_to_map.py index 0923b0c568..4501ac1f9a 100644 --- a/dace/transformation/interstate/loop_to_map.py +++ b/dace/transformation/interstate/loop_to_map.py @@ -301,7 +301,7 @@ def test_read_memlet(self, sdfg: SDFG, state: SDFGState, edge: gr.MultiConnector return False return True - + def _is_array_thread_local(self, name: str, itervar: str, sdfg: SDFG, states: List[SDFGState]) -> bool: """ This helper method checks whether an array used exclusively in the body of a detected for-loop is thread-local, @@ -529,6 +529,15 @@ def apply(self, _, sdfg: sd.SDFG): source_nodes = body.source_nodes() sink_nodes = body.sink_nodes() + # Check intermediate notes + intermediate_nodes = [] + for node in body.nodes(): + if isinstance(node, nodes.AccessNode) and body.in_degree(node) > 0 and node not in sink_nodes: + # Scalars written without WCR must be thread-local + if isinstance(node.desc(sdfg), dt.Scalar) and any(e.data.wcr is None for e in body.in_edges(node)): + continue + intermediate_nodes.append(node) + map = nodes.Map(body.label + "_map", [itervar], [(start, end, step)]) entry = nodes.MapEntry(map) exit = nodes.MapExit(map) @@ -579,6 +588,14 @@ def apply(self, _, sdfg: sd.SDFG): body.add_edge_pair(exit, e.src, n, new_memlet, internal_connector=e.src_conn) else: body.add_nedge(n, exit, memlet.Memlet()) + intermediate_sinks = {} + for n in intermediate_nodes: + if n.data in intermediate_sinks: + sink = intermediate_sinks[n.data] + else: + sink = body.add_access(n.data) + intermediate_sinks[n.data] = sink + helpers.make_map_internal_write_external(sdfg, body, exit, n, sink) # Here we handle the direct edges among source and sink access nodes. for e in direct_edges: From e25b591e9ee4cf66227ae7419ea18ab35630fb17 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 27 Mar 2023 12:17:58 +0200 Subject: [PATCH 031/392] Added test. 
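With the helper in place, LoopToMap (above) reroutes writes to intermediate access nodes through the freshly created map exit instead of leaving them dangling inside the new scope. The same helper can be applied to a hand-built map; the sketch below only illustrates the intended call shape and makes assumptions about the SDFG structure (it is not part of the patch):

    from dace.sdfg import nodes
    from dace.transformation import helpers

    def externalize_internal_writes(sdfg, state, map_entry):
        # Mirror every access node written inside the map scope to an access node
        # outside of it, the way LoopToMap now treats its intermediate nodes.
        map_exit = state.exit_node(map_entry)
        sinks = {}
        for node in state.scope_children()[map_entry]:
            if isinstance(node, nodes.AccessNode) and state.in_degree(node) > 0:
                sink = sinks.setdefault(node.data, state.add_access(node.data))
                helpers.make_map_internal_write_external(sdfg, state, map_exit, node, sink)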
--- tests/transformations/loop_to_map_test.py | 58 +++++++++++++++++++---- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/tests/transformations/loop_to_map_test.py b/tests/transformations/loop_to_map_test.py index b549ffa420..73a0bfc4cc 100644 --- a/tests/transformations/loop_to_map_test.py +++ b/tests/transformations/loop_to_map_test.py @@ -166,6 +166,7 @@ def test_loop_to_map_variable_reassigned(n=None): def test_output_copy(): + @dace.program def l2mtest_copy(A: dace.float64[20, 20]): for i in range(1, 20): @@ -185,6 +186,7 @@ def l2mtest_copy(A: dace.float64[20, 20]): def test_output_accumulate(): + @dace.program def l2mtest_accumulate(A: dace.float64[20, 20]): for i in range(1, 20): @@ -242,6 +244,7 @@ def detect_greater(i: _[0:size]): def test_empty_loop(): + @dace.program def empty_loop(): for i in range(10): @@ -344,6 +347,7 @@ def test_need_for_transient(): start = i * 10 assert np.array_equal(B[i], np.arange(start + 9, start - 1, -1, dtype=np.int32)) + def test_iteration_variable_used_outside(): N = dace.symbol("N", dace.int32) @@ -427,6 +431,7 @@ def test_symbol_array_mix(overwrite): assert sdfg.apply_transformations(LoopToMap) == (1 if overwrite else 0) + @pytest.mark.parametrize('parallel', (False, True)) def test_symbol_array_mix_2(parallel): sdfg = dace.SDFG('tester') @@ -495,8 +500,8 @@ def test_shared_local_transient_single_state(): sdfg.add_edge(body, guard, dace.InterstateEdge(assignments={'i': 'i + 1'})) sdfg.add_edge(guard, end, dace.InterstateEdge(condition='i >= 10')) - sdfg.add_array('A', (10,), dace.int32, transient=True) - sdfg.add_array('__return', (10,), dace.int32) + sdfg.add_array('A', (10, ), dace.int32, transient=True) + sdfg.add_array('__return', (10, ), dace.int32) t1 = body.add_tasklet('t1', {}, {'__out'}, '__out = 5 + j') anode = body.add_access('A') @@ -533,8 +538,8 @@ def test_thread_local_transient_single_state(): sdfg.add_edge(body, guard, dace.InterstateEdge(assignments={'i': 'i + 1'})) sdfg.add_edge(guard, end, dace.InterstateEdge(condition='i >= 10')) - sdfg.add_array('A', (i+1,), dace.int32, transient=True) - sdfg.add_array('__return', (10,), dace.int32) + sdfg.add_array('A', (i + 1, ), dace.int32, transient=True) + sdfg.add_array('__return', (10, ), dace.int32) t1 = body.add_tasklet('t1', {}, {'__out'}, '__out = 5 + j') anode = body.add_access('A') @@ -570,8 +575,8 @@ def test_shared_local_transient_multi_state(): sdfg.add_edge(body1, guard, dace.InterstateEdge(assignments={'i': 'i + 1'})) sdfg.add_edge(guard, end, dace.InterstateEdge(condition='i >= 10')) - sdfg.add_array('A', (10,), dace.int32, transient=True) - sdfg.add_array('__return', (10,), dace.int32) + sdfg.add_array('A', (10, ), dace.int32, transient=True) + sdfg.add_array('__return', (10, ), dace.int32) t1 = body0.add_tasklet('t1', {}, {'__out'}, '__out = 5 + i + 1') anode0 = body0.add_access('A') @@ -611,8 +616,8 @@ def test_thread_local_transient_multi_state(): sdfg.add_edge(body1, guard, dace.InterstateEdge(assignments={'i': 'i + 1'})) sdfg.add_edge(guard, end, dace.InterstateEdge(condition='i >= 10')) - sdfg.add_array('A', (i+1,), dace.int32, transient=True) - sdfg.add_array('__return', (10,), dace.int32) + sdfg.add_array('A', (i + 1, ), dace.int32, transient=True) + sdfg.add_array('__return', (10, ), dace.int32) t1 = body0.add_tasklet('t1', {}, {'__out'}, '__out = 5 + i + 1') anode0 = body0.add_access('A') @@ -642,7 +647,7 @@ def nested_loops(A: dace.int32[10, 10, 10], l: dace.int32): ref = np.arange(1000, dtype=np.int32).reshape(10, 10, 10) nested_loops.f(ref, 5) 
- + sdfg = nested_loops.to_sdfg() def find_loop(sdfg: dace.SDFG, itervar: str) -> Tuple[dace.SDFGState, dace.SDFGState, dace.SDFGState]: @@ -683,6 +688,40 @@ def find_loop(sdfg: dace.SDFG, itervar: str) -> Tuple[dace.SDFGState, dace.SDFGS assert np.allclose(ref, val) +def test_internal_write(): + + @dace.program + def internal_write(inp0: dace.int32[10], inp1: dace.int32[10], out: dace.int32[10]): + tmp = np.ndarray((10, ), dtype=np.int32) + for i in range(10): + tmp[i] = inp0[i] + 5 + out[i] = inp1[i] + tmp[i] + + sdfg = internal_write.to_sdfg(simplify=False) + from dace.transformation.pass_pipeline import Pipeline + from dace.transformation.passes import FuseStates + mypass = Pipeline([FuseStates()]) + mypass.apply_pass(sdfg, {}) + sdfg.apply_transformations_repeated(LoopToMap) + for node, state in sdfg.all_nodes_recursive(): + if isinstance(node, nodes.AccessNode): + if isinstance(node.desc(state.parent), dace.data.Scalar) and any(e.data.wcr is None + for e in state.in_edges(node)): + continue + assert state.scope_dict()[node] is None + + rng = np.random.default_rng(42) + inp0 = rng.integers(0, 100, size=10, dtype=np.int32) + inp1 = rng.integers(0, 100, size=10, dtype=np.int32) + ref = np.empty((10, ), dtype=np.int32) + val = np.empty((10, ), dtype=np.int32) + + internal_write.f(inp0, inp1, ref) + internal_write(inp0, inp1, val) + + assert np.array_equal(val, ref) + + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -717,3 +756,4 @@ def find_loop(sdfg: dace.SDFG, itervar: str) -> Tuple[dace.SDFGState, dace.SDFGS test_shared_local_transient_multi_state() test_thread_local_transient_multi_state() test_nested_loops() + test_internal_write() From 87a1af1c4b224fd055a952fe51fe5ee33f5d74cb Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 3 Apr 2023 10:41:15 +0200 Subject: [PATCH 032/392] Updated algorithm computing an InterstateEdge's free symbols and algorithm computing symbols that are used before being assigned in an SDFG. --- dace/sdfg/sdfg.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 6a63aa8cdb..bee601e7b1 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import ast import collections import copy @@ -214,7 +214,31 @@ def read_symbols(self) -> Set[str]: @property def free_symbols(self) -> Set[str]: """ Returns a set of symbols used in this edge's properties. """ - return self.read_symbols() - set(self.assignments.keys()) + # NOTE: The former algorithm for computing an edge's free symbols was: + # `self.read_symbols() - set(self.assignments.keys())` + # The issue with the above algorithm is that any symbols that are first read and then assigned will not + # be considered free symbols. For example, the former algorithm will fail for the following edges: + # - assignments = {'i': 'i + 1'} + # - condition = 'i < 10', assignments = {'i': '3'} + # - assignments = {'j': 'i + 1', 'i': '3'} + # The new algorithm below addresses the issue by iterating over the edge's condition and assignments and + # exlcuding keys from being considered "defined" if they have been already read. 
+ + # Symbols in conditions are always free, because the condition is executed before the assignments + cond_symbols = set(map(str, dace.symbolic.symbols_in_ast(self.condition.code[0]))) + # Symbols in assignment keys are candidate defined symbols + lhs_symbols = set() + # Symbols in assignment values are candidate free symbols + rhs_symbols = set() + for lhs, rhs in self.assignments.items(): + # Always add the RHS symbols to the set of candidate free symbols + rhs_symbols |= symbolic.free_symbols_and_functions(rhs) + # Add the LHS to the set of candidate defined symbols ONLY if it has not been read yet + # This also solves the ordering issue that may arise in cases like the 3rd example above + if lhs not in cond_symbols and lhs not in rhs_symbols: + lhs_symbols.add(lhs) + # Return the set of candidate free symbols minus the set of candidate defined symbols + return (cond_symbols | rhs_symbols) - lhs_symbols def replace_dict(self, repl: Dict[str, str], replace_keys=True) -> None: """ @@ -1288,8 +1312,11 @@ def free_symbols(self) -> Set[str]: # Add free inter-state symbols for e in self.out_edges(state): - defined_syms |= set(e.data.assignments.keys()) + # NOTE: First we get the true InterstateEdge free symbols, then we compute the newly defined symbols by + # subtracting the (true) free symbols from the edge's assignment keys. This way we can correctly + # compute the symbols that are used before being assigned. efsyms = e.data.free_symbols + defined_syms |= set(e.data.assignments.keys()) - efsyms used_before_assignment.update(efsyms - defined_syms) free_syms |= efsyms From 4f6550cd3931e1489094c28517a36e7ad3e86d41 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 3 Apr 2023 10:41:36 +0200 Subject: [PATCH 033/392] Added free symbols-related tests. --- tests/sdfg/free_symbols_test.py | 61 ++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/tests/sdfg/free_symbols_test.py b/tests/sdfg/free_symbols_test.py index 81a8c03a20..3d162203d1 100644 --- a/tests/sdfg/free_symbols_test.py +++ b/tests/sdfg/free_symbols_test.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
import dace import math @@ -72,8 +72,67 @@ def test_constants(): assert sdfg.free_symbols == {'M', 'N'} +def test_interstate_edge_symbols(): + i, j, k = (dace.symbol(s) for s in 'ijk') + + edge = dace.InterstateEdge(assignments={'i': 'j + k'}) + assert 'j' in edge.free_symbols + assert 'k' in edge.free_symbols + assert 'i' not in edge.free_symbols + + edge = dace.InterstateEdge(assignments={'i': 'i+1'}) + assert 'i' in edge.free_symbols + + edge = dace.InterstateEdge(condition='i < j', assignments={'i': '3'}) + assert 'i' in edge.free_symbols + assert 'j' in edge.free_symbols + + edge = dace.InterstateEdge(assignments={'j': 'i + 1', 'i': '3'}) + assert 'i' in edge.free_symbols + assert 'j' not in edge.free_symbols + + +def test_nested_sdfg_free_symbols(): + i, j, k = (dace.symbol(s) for s in 'ijk') + + outer_sdfg = dace.SDFG('outer') + outer_init_state = outer_sdfg.add_state('outer_init') + outer_guard_state = outer_sdfg.add_state('outer_guard') + outer_body_state_1 = outer_sdfg.add_state('outer_body_1') + outer_body_state_2 = outer_sdfg.add_state('outer_body_2') + outer_exit_state = outer_sdfg.add_state('outer_exit') + outer_sdfg.add_edge(outer_init_state, outer_guard_state, dace.InterstateEdge(assignments={'i': '0'})) + outer_sdfg.add_edge(outer_guard_state, outer_body_state_1, + dace.InterstateEdge(condition='i < 10', assignments={'j': 'i + 1'})) + outer_sdfg.add_edge(outer_guard_state, outer_exit_state, dace.InterstateEdge(condition='i >= 10')) + outer_sdfg.add_edge(outer_body_state_1, outer_guard_state, + dace.InterstateEdge(condition='j >= 10', assignments={'i': 'i + 1'})) + outer_sdfg.add_edge(outer_body_state_1, outer_body_state_2, dace.InterstateEdge(condition='j < 10')) + outer_sdfg.add_edge(outer_body_state_2, outer_body_state_1, dace.InterstateEdge(assignments={'j': 'j + 1'})) + + inner_sdfg = dace.SDFG('inner') + inner_init_state = inner_sdfg.add_state('inner_init') + inner_guard_state = inner_sdfg.add_state('inner_guard') + inner_body_state = inner_sdfg.add_state('inner_body') + inner_exit_state = inner_sdfg.add_state('inner_exit') + inner_sdfg.add_edge(inner_init_state, inner_guard_state, dace.InterstateEdge(assignments={'k': 'j + 1'})) + inner_sdfg.add_edge(inner_guard_state, inner_body_state, dace.InterstateEdge(condition='k < 10')) + inner_sdfg.add_edge(inner_guard_state, inner_exit_state, + dace.InterstateEdge(condition='k >= 10', assignments={'j': 'j + 1'})) + inner_sdfg.add_edge(inner_body_state, inner_guard_state, dace.InterstateEdge(assignments={'k': 'k + 1'})) + + outer_body_state_2.add_nested_sdfg(inner_sdfg, None, {}, {}, symbol_mapping={'j': 'j'}) + + assert not outer_sdfg.free_symbols + assert 'i' not in inner_sdfg.free_symbols + assert 'j' in inner_sdfg.free_symbols + assert 'k' not in inner_sdfg.free_symbols + + if __name__ == '__main__': test_single_state() test_state_subgraph() test_sdfg() test_constants() + test_interstate_edge_symbols() + test_nested_sdfg_free_symbols() From 101017c89b0e1f7ee75ad5bedbc39a2d3f447fb0 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 3 Apr 2023 11:28:50 +0200 Subject: [PATCH 034/392] Updated the helper method for the case where the intermediate write is coming out of nested Maps. 
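Reading the new tests together with the sdfg.py change: an assignment key only becomes a defined symbol if it was not already read on the same edge, and at SDFG level the defined set grows by assignments.keys() - free_symbols per edge. The consequence can be seen on two minimal SDFGs (illustrative only; the SDFG-level result also depends on the order in which states are visited):

    import dace

    # 'i' is read and assigned on the same edge, so it remains free for the edge
    # and, unless mapped from an enclosing SDFG, for the whole SDFG.
    counter = dace.SDFG('counter')
    first, second = counter.add_state('first'), counter.add_state('second')
    counter.add_edge(first, second, dace.InterstateEdge(assignments={'i': 'i + 1'}))
    print(counter.free_symbols)    # expected: {'i'}

    # 'i' is assigned from a constant before any read, so it is defined by the edge.
    init_use = dace.SDFG('init_then_use')
    a, b, c = (init_use.add_state(n) for n in 'abc')
    init_use.add_edge(a, b, dace.InterstateEdge(assignments={'i': '0'}))
    init_use.add_edge(b, c, dace.InterstateEdge(condition='i < 10'))
    print(init_use.free_symbols)   # expected: set()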
--- dace/transformation/helpers.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index 74c8fa84b0..252005f87a 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -1360,6 +1360,9 @@ def make_map_internal_write_external(sdfg: SDFG, state: SDFGState, map_exit: nod if isinstance(access.desc(sdfg), data.Scalar): if any(e.data.wcr is None for e in state.in_edges(access)): return + # Ignore views + if isinstance(access.desc(sdfg), data.View): + return # Compute the union of the destination subsets of the edges that write to `access.` in_union = None @@ -1389,12 +1392,24 @@ def make_map_internal_write_external(sdfg: SDFG, state: SDFGState, map_exit: nod else: name, _ = sdfg.add_array(access.data, shape, access.desc(sdfg).dtype, transient=True, find_new_name=True) new_n = state.add_access(name) + visited = set() for e in state.in_edges(access): + if e in visited: + continue + offset = e.data.get_dst_subset(e, state) + # NOTE: There can be nested Maps. Therefore, we need to iterate over the MemletTree. + for e2 in state.memlet_tree(e): + if e2 in visited: + continue + visited.add(e2) + src_subset = e2.data.get_src_subset(e2, state) + dst_subset = e2.data.get_dst_subset(e2, state) + dst = new_n if e2.dst is access else e2.dst + state.add_edge( + e2.src, e2.src_conn, dst, e2.dst_conn, + Memlet(data=name, subset=dst_subset.offset_new(offset, negative=True), other_subset=src_subset)) src_subset = e.data.get_src_subset(e, state) dst_subset = e.data.get_dst_subset(e, state) - state.add_edge( - e.src, e.src_conn, new_n, None, - Memlet(data=name, subset=dst_subset.offset_new(dst_subset, negative=True), other_subset=src_subset)) state.add_memlet_path(new_n, map_exit, sink, @@ -1402,6 +1417,11 @@ def make_map_internal_write_external(sdfg: SDFG, state: SDFGState, map_exit: nod subset=copy.deepcopy(dst_subset), other_subset=dst_subset.offset_new(dst_subset, negative=True))) for e in state.out_edges(access): + if e in visited: + continue + visited.add(e) + # NOTE: We assume here that the intermediate write is happening just before the Map's exit node. + # Is this always correct? src_subset = e.data.get_src_subset(e, state) dst_subset = e.data.get_dst_subset(e, state) state.add_edge(new_n, @@ -1411,6 +1431,8 @@ def make_map_internal_write_external(sdfg: SDFG, state: SDFGState, map_exit: nod memlet=Memlet(data=name, subset=src_subset.offset(src_subset, negative=True), other_subset=dst_subset)) + for e in visited: + state.remove_edge(e) state.remove_node(access) # Otherwise, we only add a memlet path to the sink. else: From 7db36f455553a5bcba5eb562d6989892eb7a736d Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 3 Apr 2023 11:39:59 +0200 Subject: [PATCH 035/392] Updated test to make sense in light of new free symbol computation algorithm and its effects on SDFG validation. --- .../transformations/state_elimination_test.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/transformations/state_elimination_test.py b/tests/transformations/state_elimination_test.py index 828580a85c..4946c2c653 100644 --- a/tests/transformations/state_elimination_test.py +++ b/tests/transformations/state_elimination_test.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import dace from dace.transformation.interstate import EndStateElimination, StateAssignElimination, StateFusion @@ -16,18 +16,24 @@ def test_eliminate_end_state(): def test_eliminate_end_state_noassign(): + outer_sdfg = dace.SDFG('state_elimination_test_outer') + outer_state = outer_sdfg.add_state() + sdfg = dace.SDFG('state_elimination_test') state1 = sdfg.add_state() state2 = sdfg.add_state() state3 = sdfg.add_state() sdfg.add_edge(state1, state2, dace.InterstateEdge()) sdfg.add_edge(state2, state3, dace.InterstateEdge(assignments=dict(k='k + 1'))) - sdfg.simplify() - sdfg.simplify() - assert sdfg.number_of_nodes() == 2 - sdfg.apply_transformations(EndStateElimination) - sdfg.simplify() - assert sdfg.number_of_nodes() == 1 + + nsdfg = outer_state.add_nested_sdfg(sdfg, outer_sdfg, {}, {}, symbol_mapping={'k': 3}) + + nsdfg.sdfg.simplify() + nsdfg.sdfg.simplify() + assert nsdfg.sdfg.number_of_nodes() == 2 + nsdfg.sdfg.apply_transformations(EndStateElimination) + nsdfg.sdfg.simplify() + assert nsdfg.sdfg.number_of_nodes() == 1 def test_state_assign_elimination(): From 47b7d1ae08b6b607ca486cdcad42a8bddff64609 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 3 Apr 2023 18:37:34 +0200 Subject: [PATCH 036/392] Taking care of outgoing memlet trees --- dace/transformation/helpers.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index 252005f87a..474ae351c5 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -1419,18 +1419,18 @@ def make_map_internal_write_external(sdfg: SDFG, state: SDFGState, map_exit: nod for e in state.out_edges(access): if e in visited: continue - visited.add(e) - # NOTE: We assume here that the intermediate write is happening just before the Map's exit node. - # Is this always correct? - src_subset = e.data.get_src_subset(e, state) - dst_subset = e.data.get_dst_subset(e, state) - state.add_edge(new_n, - None, - e.dst, - e.dst_conn, - memlet=Memlet(data=name, - subset=src_subset.offset(src_subset, negative=True), - other_subset=dst_subset)) + offset = e.data.get_src_subset(e, state) + # NOTE: There can be nested Maps. Therefore, we need to iterate over the MemletTree. + for e2 in state.memlet_tree(e): + if e2 in visited: + continue + visited.add(e2) + src_subset = e2.data.get_src_subset(e2, state) + dst_subset = e2.data.get_dst_subset(e2, state) + src = new_n if e2.src is access else e2.src + state.add_edge( + src, e2.src_conn, e2.dst, e2.dst_conn, + Memlet(data=name, subset=src_subset.offset_new(offset, negative=True), other_subset=dst_subset)) for e in visited: state.remove_edge(e) state.remove_node(access) From fad121b9ca4e61150e556213a5e2d01391799231 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 3 Apr 2023 18:38:10 +0200 Subject: [PATCH 037/392] Do not try to move Views outside the Map. 
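The commit above extends the helper to walk the whole outgoing memlet tree, since with nested maps the edge leaving the access node is only the root of a larger tree of connected memlets. Outside of the transformation the same traversal is handy for inspection; a small sketch (the helper name and leaf criterion are mine, not from the patch):

    from dace.sdfg import nodes

    def leaf_memlets_of(state, access_node):
        # Collect the innermost edges of every memlet tree rooted at the edges leaving
        # an access node, using the same visited-set pattern as the updated helper.
        visited, leaves = set(), []
        for edge in state.out_edges(access_node):
            if edge in visited:
                continue
            for e in state.memlet_tree(edge):
                if e in visited:
                    continue
                visited.add(e)
                if not isinstance(e.dst, (nodes.EntryNode, nodes.ExitNode)):
                    leaves.append(e)
        return leaves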
--- dace/transformation/interstate/loop_to_map.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dace/transformation/interstate/loop_to_map.py b/dace/transformation/interstate/loop_to_map.py index 4501ac1f9a..47f48a2e55 100644 --- a/dace/transformation/interstate/loop_to_map.py +++ b/dace/transformation/interstate/loop_to_map.py @@ -590,6 +590,8 @@ def apply(self, _, sdfg: sd.SDFG): body.add_nedge(n, exit, memlet.Memlet()) intermediate_sinks = {} for n in intermediate_nodes: + if isinstance(sdfg.arrays[n.data], dt.View): + continue if n.data in intermediate_sinks: sink = intermediate_sinks[n.data] else: From 4c9c6f1314feafe21cb696d1973f97c401370bb6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Apr 2023 19:24:54 +0200 Subject: [PATCH 038/392] Fixed issue with nested symbol dependency in array accesses. --- dace/frontend/python/newast.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index d9a6458bf8..e6dda139e4 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -2913,8 +2913,11 @@ def _add_access( for s, sr in self.symbols.items(): if s in symbolic.symlist(r).values(): ignore_indices.append(i) - if any(t in self.sdfg.arrays for t in sr.free_symbols): + if any(t in self.sdfg.arrays or t in (str(sym) for sym in self.symbols) + for t in sr.free_symbols): sym_rng.append(subsets.Range([(0, parent_array.shape[i] - 1, 1)])) + repl_dict = {} + break else: sym_rng.append(sr) # NOTE: Assume that the i-th index of the range is From 905c681d47bc6f787ac1ef6b0b345346ce401890 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Apr 2023 19:25:05 +0200 Subject: [PATCH 039/392] Added test. --- .../nested_name_accesses_test.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/python_frontend/nested_name_accesses_test.py b/tests/python_frontend/nested_name_accesses_test.py index cd495c4c24..7333ee044f 100644 --- a/tests/python_frontend/nested_name_accesses_test.py +++ b/tests/python_frontend/nested_name_accesses_test.py @@ -138,6 +138,24 @@ def nested_offset_access(inp: dc.float64[6, 5, 5]): assert (np.allclose(out, ref)) +def test_nested_offset_access_nested_dependency(): + @dc.program + def nested_offset_access_nested_dep(inp: dc.float64[6, 5, 5]): + out = np.zeros((5, 5, 5), np.float64) + for i, j in dc.map[0:5, 0:5]: + out[i, j, 0] = 0.25 * (inp[i + 1, j, 1] + inp[i, j, 1]) + for k in range(1, 4): + for l in range(k, 5): + out[i, j, k] = 0.25 * (inp[i + 1, j, l - k + 1] + inp[i, j, l - k + 1]) + return out + + inp = np.reshape(np.arange(6 * 5 * 5, dtype=np.float64), (6, 5, 5)).copy() + out = nested_offset_access_nested_dep(inp) + ref = nested_offset_access_nested_dep.f(inp) + assert (np.allclose(out, ref)) + + + if __name__ == "__main__": test_nested_name_accesses() test_nested_offset_access() @@ -146,3 +164,4 @@ def nested_offset_access(inp: dc.float64[6, 5, 5]): test_nested_multi_offset_access_dappy() test_nested_dec_offset_access() test_nested_dec_offset_access_dappy() + test_nested_offset_access_nested_dependency() From b27cfe2a0237566d48ce55f4dfabcc7baf2ae201 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Apr 2023 21:26:17 +0200 Subject: [PATCH 040/392] Attemptin codecov uploaded migration. 
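The newast.py fix above falls back to the full array range whenever a local symbol's own range depends on arrays or other local symbols, because the usual offset-by-f(min(range)) trick from the surrounding comment is then no longer well defined. For the well-defined case, the offsetting rule can be checked symbolically (values follow the example in the comment):

    import sympy

    i, j, k = sympy.symbols('i j k')

    # Memlet range [i + 1, j, k + 1] with k a local loop symbol over range(1, 4):
    index = k + 1
    k_start, k_end, k_step = 1, 3, 1        # as a dace.subsets.Range triple
    offset = index.subs(k, k_start)         # f(min(range)) = 2
    print(sympy.simplify(index - offset))   # k - 1: the index of the squeezed connector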
--- .github/workflows/general-ci.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/general-ci.yml b/.github/workflows/general-ci.yml index b01eb34822..d656a10371 100644 --- a/.github/workflows/general-ci.yml +++ b/.github/workflows/general-ci.yml @@ -31,8 +31,10 @@ jobs: sudo apt-get install -y libpapi-dev papi-tools # Instrumentation dependencies sudo apt-get install -y verilator # RTL simulation dependencies python -m pip install --upgrade pip - pip install flake8 pytest-xdist coverage codecov + pip install flake8 pytest-xdist coverage pip install -e ".[testing]" + curl -Os https://uploader.codecov.io/latest/linux/codecov + chmod +x codecov - name: Test dependencies run: | @@ -52,7 +54,7 @@ jobs: export DACE_optimizer_automatic_simplification=${{ matrix.simplify }} fi pytest -n auto --cov-report=xml --cov=dace --tb=short -m "not gpu and not verilator and not tensorflow and not mkl and not sve and not papi and not mlir and not lapack and not fpga and not mpi and not rtl_hardware and not scalapack and not datainstrument" - codecov + ./codecov - name: Test OpenBLAS LAPACK run: | @@ -68,7 +70,7 @@ jobs: export DACE_optimizer_automatic_simplification=${{ matrix.simplify }} fi pytest -n 1 --cov-report=xml --cov=dace --tb=short -m "lapack" - codecov + ./codecov - name: Run other tests run: | @@ -81,4 +83,4 @@ jobs: ./tests/polybench_test.sh ./tests/xform_test.sh coverage combine .; coverage report; coverage xml - codecov + ./codecov From e1690ddd58b371ec970c5affa2b106472741a188 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Apr 2023 23:05:47 +0200 Subject: [PATCH 041/392] Added dappy support. --- dace/frontend/python/newast.py | 53 +++++++++++-------- .../nested_name_accesses_test.py | 23 ++++++++ 2 files changed, 54 insertions(+), 22 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index e6dda139e4..20cba37e3e 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -728,29 +728,38 @@ def _add_access( for s, sr in self.symbols.items(): if s in symbolic.symlist(r).values(): ignore_indices.append(i) - sym_rng.append(sr) - # NOTE: Assume that the i-th index of the range is - # dependent on a local symbol s, i.e, rng[i] = f(s). - # Therefore, the i-th index will not be squeezed - # even if it has length equal to 1. However, it must - # still be offsetted by f(min(sr)), so that the indices - # for the squeezed connector start from 0. - # Example: - # Memlet range: [i+1, j, k+1] - # k: local symbol with range(1, 4) - # i,j: global symbols - # Squeezed range: [f(k)] = [k+1] - # Offset squeezed range: [f(k)-f(min(range(1, 4)))] = - # [f(k)-f(1)] = [k-1] - # NOTE: The code takes into account the case where an - # index is dependent on multiple symbols. See also - # tests/python_frontend/nested_name_accesses_test.py. - step = sr[0][2] - if (step < 0) == True: - repl_dict[s] = sr[0][1] + if any(t in self.sdfg.arrays or t in (str(sym) for sym in self.symbols) + for t in sr.free_symbols): + sym_rng.append(subsets.Range([(0, parent_array.shape[i] - 1, 1)])) + repl_dict = {} + break else: - repl_dict[s] = sr[0][0] - offset.append(r[0].subs(repl_dict)) + sym_rng.append(sr) + # NOTE: Assume that the i-th index of the range is + # dependent on a local symbol s, i.e, rng[i] = f(s). + # Therefore, the i-th index will not be squeezed + # even if it has length equal to 1. 
However, it must + # still be offsetted by f(min(sr)), so that the indices + # for the squeezed connector start from 0. + # Example: + # Memlet range: [i+1, j, k+1] + # k: local symbol with range(1, 4) + # i,j: global symbols + # Squeezed range: [f(k)] = [k+1] + # Offset squeezed range: [f(k)-f(min(range(1, 4)))] = + # [f(k)-f(1)] = [k-1] + # NOTE: The code takes into account the case where an + # index is dependent on multiple symbols. See also + # tests/python_frontend/nested_name_accesses_test.py. + step = sr[0][2] + if (step < 0) == True: + repl_dict[s] = sr[0][1] + else: + repl_dict[s] = sr[0][0] + if repl_dict: + offset.append(r[0].subs(repl_dict)) + else: + offset.append(0) if ignore_indices: tmp_memlet = Memlet.simple(parent_name, rng) diff --git a/tests/python_frontend/nested_name_accesses_test.py b/tests/python_frontend/nested_name_accesses_test.py index 7333ee044f..41a5b77e95 100644 --- a/tests/python_frontend/nested_name_accesses_test.py +++ b/tests/python_frontend/nested_name_accesses_test.py @@ -150,6 +150,28 @@ def nested_offset_access_nested_dep(inp: dc.float64[6, 5, 5]): return out inp = np.reshape(np.arange(6 * 5 * 5, dtype=np.float64), (6, 5, 5)).copy() + with dc.config.set_temporary('testing', 'serialization', value=False): + out = nested_offset_access_nested_dep(inp) + ref = nested_offset_access_nested_dep.f(inp) + assert (np.allclose(out, ref)) + + +def test_nested_offset_access_nested_dependency_dappy(): + @dc.program + def nested_offset_access_nested_dep(inp: dc.float64[6, 5, 10]): + out = np.zeros((5, 5, 10), np.float64) + for i, j in dc.map[0:5, 0:5]: + out[i, j, 0] = 0.25 * (inp[i + 1, j, 1] + inp[i, j, 1]) + for k in range(1, 5): + for l in range(k, 4): + with dc.tasklet(): + in1 << inp[i + 1, j, k + l + 1] + in2 << inp[i, j, k + l + 1] + out1 >> out[i, j, k + l] + out1 = 0.25 * (in1 + in2) + return out + + inp = np.reshape(np.arange(6 * 5 * 10, dtype=np.float64), (6, 5, 10)).copy() out = nested_offset_access_nested_dep(inp) ref = nested_offset_access_nested_dep.f(inp) assert (np.allclose(out, ref)) @@ -165,3 +187,4 @@ def nested_offset_access_nested_dep(inp: dc.float64[6, 5, 5]): test_nested_dec_offset_access() test_nested_dec_offset_access_dappy() test_nested_offset_access_nested_dependency() + test_nested_offset_access_nested_dependency_dappy() From 862f5bbfde07120b36c180408d1913cb961e03b0 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Apr 2023 23:07:59 +0200 Subject: [PATCH 042/392] Updated the rest of the CI --- .github/workflows/fpga-ci.yml | 6 ++++-- .github/workflows/gpu-ci.yml | 6 ++++-- .github/workflows/heterogeneous-ci.yml | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/fpga-ci.yml b/.github/workflows/fpga-ci.yml index a3b38c4536..9da22c5157 100644 --- a/.github/workflows/fpga-ci.yml +++ b/.github/workflows/fpga-ci.yml @@ -20,9 +20,11 @@ jobs: rm -rf .dacecache tests/.dacecache . /opt/setupenv python -m pip install --upgrade pip - pip install pytest-xdist flake8 coverage codecov + pip install pytest-xdist flake8 coverage pip uninstall -y dace pip install -e ".[testing]" + curl -Os https://uploader.codecov.io/latest/linux/codecov + chmod +x codecov - name: Run FPGA Tests run: | @@ -35,7 +37,7 @@ jobs: reachable=0 ping -W 2 -c 1 codecov.io || reachable=$? 
if [ $reachable -eq 0 ]; then - codecov + ./codecov else echo "Codecov.io is unreachable" fi diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 7f4a3a8f0e..7c3be4e62e 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -20,10 +20,12 @@ jobs: rm -rf .dacecache tests/.dacecache . /opt/setupenv python -m pip install --upgrade pip - pip install flake8 pytest-xdist coverage codecov + pip install flake8 pytest-xdist coverage pip install mpi4py pip uninstall -y dace pip install -e ".[testing]" + curl -Os https://uploader.codecov.io/latest/linux/codecov + chmod +x codecov - name: Test dependencies run: | @@ -52,7 +54,7 @@ jobs: reachable=0 ping -W 2 -c 1 codecov.io || reachable=$? if [ $reachable -eq 0 ]; then - codecov + ./codecov else echo "Codecov.io is unreachable" fi diff --git a/.github/workflows/heterogeneous-ci.yml b/.github/workflows/heterogeneous-ci.yml index c8232544d4..0c5d246349 100644 --- a/.github/workflows/heterogeneous-ci.yml +++ b/.github/workflows/heterogeneous-ci.yml @@ -20,10 +20,12 @@ jobs: rm -rf .dacecache tests/.dacecache . /opt/setupenv python -m pip install --upgrade pip - pip install flake8 pytest-xdist coverage codecov + pip install flake8 pytest-xdist coverage pip install mpi4py pytest-mpi pip uninstall -y dace pip install -e ".[testing]" + curl -Os https://uploader.codecov.io/latest/linux/codecov + chmod +x codecov - name: Test dependencies run: | @@ -73,7 +75,7 @@ jobs: reachable=0 ping -W 2 -c 1 codecov.io || reachable=$? if [ $reachable -eq 0 ]; then - codecov + ./codecov else echo "Codecov.io is unreachable" fi From 341461cc8bbc44b6adb10af6ac98dd512ef14a95 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 13 Apr 2023 10:12:32 +0200 Subject: [PATCH 043/392] Updated test to override serialization-enabling env variable. --- tests/python_frontend/nested_name_accesses_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/python_frontend/nested_name_accesses_test.py b/tests/python_frontend/nested_name_accesses_test.py index 41a5b77e95..ffc2b68e40 100644 --- a/tests/python_frontend/nested_name_accesses_test.py +++ b/tests/python_frontend/nested_name_accesses_test.py @@ -1,6 +1,7 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import dace as dc import numpy as np +import os N = dc.symbol('N') @@ -150,8 +151,11 @@ def nested_offset_access_nested_dep(inp: dc.float64[6, 5, 5]): return out inp = np.reshape(np.arange(6 * 5 * 5, dtype=np.float64), (6, 5, 5)).copy() + last_value = os.environ.get('DACE_testing_serialization', '0') + os.environ['DACE_testing_serialization'] = '0' with dc.config.set_temporary('testing', 'serialization', value=False): out = nested_offset_access_nested_dep(inp) + os.environ['DACE_testing_serialization'] = last_value ref = nested_offset_access_nested_dep.f(inp) assert (np.allclose(out, ref)) From 66816e73d4ab9c76e4c7bdcbfed0d8c60283613c Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 18 Apr 2023 19:24:36 +0200 Subject: [PATCH 044/392] Ensure that the array is not thread-local. 
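The helpers.py change that follows guards against arrays that are actually thread-local: if none of the subsets written into the access node mention a map parameter, every iteration writes the same elements and the container must stay inside the map. The essence of that early-return test, as a standalone sketch over memlet subsets (the function and parameter names are illustrative):

    def depends_on_map_params(write_subsets, map_params):
        # write_subsets: dace.subsets.Subset objects of the edges writing the access node.
        # Returns False when the array is thread-local, i.e. no destination subset uses
        # any map parameter - the case in which the new code aborts early.
        params = set(map_params)
        return any(str(s) in params for subset in write_subsets for s in subset.free_symbols)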
--- dace/transformation/helpers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index 474ae351c5..73da318e94 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -1366,13 +1366,20 @@ def make_map_internal_write_external(sdfg: SDFG, state: SDFGState, map_exit: nod # Compute the union of the destination subsets of the edges that write to `access.` in_union = None + map_dependency = False for e in state.in_edges(access): subset = e.data.get_dst_subset(e, state) + if any(str(s) in map_exit.map.params for s in subset.free_symbols): + map_dependency = True if in_union is None: in_union = subset else: in_union = in_union.union(subset) + # If none of the input subsets depend on the map parameters, then abort, since the array is thread-local. + if not map_dependency: + return + # Check if the union covers the output edges of `access.` covers_out = True if in_union is None: From 3c5114cec87eb2492c4b98bbf490ec6718c9e818 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 18 Apr 2023 15:50:08 -0700 Subject: [PATCH 045/392] Inline preprocessor command --- dace/frontend/python/interface.py | 10 ++ dace/frontend/python/preprocessing.py | 68 ++++++++- .../inline_preprocessing_test.py | 135 ++++++++++++++++++ 3 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 tests/python_frontend/inline_preprocessing_test.py diff --git a/dace/frontend/python/interface.py b/dace/frontend/python/interface.py index f199805f69..ea1970dafd 100644 --- a/dace/frontend/python/interface.py +++ b/dace/frontend/python/interface.py @@ -334,6 +334,16 @@ def nounroll(generator): yield from generator +def inline(expression): + """ + Explicitly annotates that an expression should be evaluated and inlined during parsing. + + :param expression: The expression to evaluate. + :note: Only use with stateless and compile-time evaluateable expressions! + """ + return expression + + def in_program() -> bool: """ Returns True if in a DaCe program parsing context. This function can be used to test whether the current diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 0f6f1b3320..1efcb6d38e 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -52,6 +52,7 @@ class StructTransformer(ast.NodeTransformer): A Python AST transformer that replaces ``Call`` nodes to create structs with the custom ``StructInitializer`` AST node. 
""" + def __init__(self, gvars): super().__init__() self._structs = {k: v for k, v in gvars.items() if isinstance(v, dtypes.struct)} @@ -597,6 +598,8 @@ def global_value_to_node(self, # From this point on, any failure will result in a callback newnode = ast.Name(id=cbname, ctx=ast.Load()) + if isinstance(parent_node, ast.Call): + newnode.oldnode = parent_node.func # Decorated or functions with missing source code sast, _, _, _ = astutils.function_to_ast(value) @@ -748,7 +751,7 @@ def visit_Subscript(self, node: ast.Subscript) -> Any: return self._visit_potential_constant(node, True) def visit_Call(self, node: ast.Call) -> Any: - from dace.frontend.python.interface import in_program # Avoid import loop + from dace.frontend.python.interface import in_program, inline # Avoid import loop if hasattr(node.func, 'n') and isinstance(node.func.n, SDFGConvertible): # Skip already-parsed calls @@ -760,6 +763,9 @@ def visit_Call(self, node: ast.Call) -> Any: # Built-in functions are resolved directly if global_func is in_program: return self.global_value_to_node(True, parent_node=node, qualname=astutils.unparse(node), recurse=True) + # Inline contents are kept as-is + if global_func is inline: + return node if self.resolve_functions: global_val = astutils.evalnode(node, self.globals) @@ -787,6 +793,8 @@ def visit_Call(self, node: ast.Call) -> Any: detect_callables=callables) if newnode is not None: node.func = newnode + if hasattr(newnode, 'oldnode'): + node.oldnode = newnode.oldnode return self.generic_visit(node) return self.generic_visit(node) @@ -1206,6 +1214,63 @@ def visit_AsyncFor(self, node) -> Any: return self.visit_For(node) +class ExpressionInliner(ast.NodeTransformer): + """ + Replaces dace.inline() expressions by their bodies if they can be + compile-time evaluated. 
+ """ + + def __init__(self, globals: Dict[str, Any], filename: str, closure_resolver: GlobalResolver): + super().__init__() + self.globals = globals + self.filename = filename + self.resolver = closure_resolver + + def visit_Call(self, node: ast.Call) -> Any: + # Avoid import loop + from dace.frontend.python.interface import inline + + node = self.generic_visit(node) + + try: + nfunc = astutils.evalnode(node.func, self.globals) + except SyntaxError: + nfunc = None + + if nfunc is not inline: + return node + + if len(node.args) != 1: + raise DaceSyntaxError(None, node, 'dace.inline must be called with one argument') + + # Try to inline the expression on the current AST + try: + contents = astutils.evalnode(node.args[0], self.globals) + except SyntaxError: + raise DaceSyntaxError( + None, node, 'Cannot inline expression with dace.inline, it ' + 'cannot be evaluated at compile time.') + + ########################################## + + # Already AST + def _convert_to_ast(contents: Any): + if isinstance(contents, ast.AST): + newnode = contents + elif isinstance(contents, (numbers.Number, str)): + # Compatibility check since Python changed their AST nodes + newnode = astutils.create_constant(contents) + elif isinstance(contents, (list, tuple, set)): + newnode = ast.copy_location(ast.Tuple(elts=[_convert_to_ast(c) for c in contents], ctx=ast.Load()), + node) + else: + # Augment closure with new value + newnode = self.resolver.global_value_to_node(e, node, f'inlined_{id(contents)}', True, keep_object=True) + return newnode + + return _convert_to_ast(contents) + + class CallTreeResolver(ast.NodeVisitor): def __init__(self, closure: SDFGClosure, globals: Dict[str, Any]) -> None: @@ -1525,6 +1590,7 @@ def check_code(src_ast): src_ast = closure_resolver.visit(src_ast) DisallowedAssignmentChecker(src_file).visit(src_ast) src_ast = LoopUnroller(resolved, src_file, closure_resolver).visit(src_ast) + src_ast = ExpressionInliner(resolved, src_file, closure_resolver).visit(src_ast) src_ast = ContextManagerInliner(resolved, src_file, closure_resolver).visit(src_ast) src_ast = ConditionalCodeResolver(resolved).visit(src_ast) src_ast = DeadCodeEliminator().visit(src_ast) diff --git a/tests/python_frontend/inline_preprocessing_test.py b/tests/python_frontend/inline_preprocessing_test.py new file mode 100644 index 0000000000..cfcd7e7ac8 --- /dev/null +++ b/tests/python_frontend/inline_preprocessing_test.py @@ -0,0 +1,135 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" +Tests the ``dace.inline`` preprocessor call. 
+""" + +import dace +from dace.frontend.python.common import DaceSyntaxError +import math +import numpy as np +import pytest + + +def _find_in_tasklet(sdfg: dace.SDFG, term: str) -> bool: + for n, _ in sdfg.all_nodes_recursive(): + if isinstance(n, dace.nodes.Tasklet) and term in n.code.as_string: + return True + return False + + +def _find_in_memlet(sdfg: dace.SDFG, term: str) -> bool: + for e, _ in sdfg.all_edges_recursive(): + if isinstance(e.data, dace.Memlet) and term in str(e.data.subset): + return True + return False + + +def test_inlinepp_simple(): + + def complex_function(a: int, b: float): + c = np.random.rand() + return int(c + ((math.ceil(b) + a) // 2) - c) + + N = 20 + + @dace.program + def tester(a): + # a[11] = 13 + a[dace.inline(complex_function(N + 1, 0.4))] = dace.inline(complex_function(5, N) + 1) + + a = np.random.rand(N) + tester(a) + assert np.allclose(a[11], 13) + + sdfg = tester.to_sdfg(a) + assert _find_in_tasklet(sdfg, '13'), 'Inlined expression not found in tasklets' + assert _find_in_memlet(sdfg, '11'), 'Inlined expression not found in memlets' + + +def test_inlinepp_fail(): + + def f(x): + return x + 1 + + @dace.program + def tester(a): + a[dace.inline(a[0])] = 1 + + a = np.random.rand(20) + with pytest.raises(DaceSyntaxError): + tester(a) + + +def test_inlinepp_tuple_retval(): + + def divmod(a, b): + return a // b, a % b + + @dace.program + def tester(a: dace.float64[20], b: dace.float64[20]): + for i in dace.map[0:20]: + d, m = dace.inline(divmod(4, 3)) + a[i] = d + b[i] = m + + a = np.random.rand(20) + b = np.random.rand(20) + tester(a, b) + d, m = divmod(4, 3) + assert np.allclose(a, d) + assert np.allclose(b, m) + + +def test_inlinepp_stateful(): + ctr = 11 + + def stateful(): + nonlocal ctr + ctr += 1 + return ctr + + @dace.program + def tester(a: dace.float64[3]): + a[0] = dace.inline(stateful()) + a[1] = dace.inline(stateful()) + a[2] = dace.inline(stateful() * 2) + + sdfg = tester.to_sdfg() + assert _find_in_tasklet(sdfg, '12') + assert _find_in_tasklet(sdfg, '13') + assert _find_in_tasklet(sdfg, '28') + + a = np.random.rand(3) + sdfg(a) + assert np.allclose(a, np.array([12, 13, 28])) + + +def test_inlinepp_in_unroll(): + ctr = 11 + + def stateful(i): + nonlocal ctr + ctr += 1 + return ctr + i + + @dace.program + def tester(a: dace.float64[3]): + for i in dace.unroll(range(3)): + a[i] = dace.inline(stateful(i)) + + sdfg = tester.to_sdfg() + assert _find_in_tasklet(sdfg, '12') + assert _find_in_tasklet(sdfg, '14') + assert _find_in_tasklet(sdfg, '16') + + a = np.random.rand(3) + sdfg(a) + assert np.allclose(a, np.array([12, 14, 16])) + + +if __name__ == '__main__': + test_inlinepp_simple() + test_inlinepp_fail() + test_inlinepp_tuple_retval() + test_inlinepp_stateful() + test_inlinepp_in_unroll() From 8bced0a5ffa97a4df9b9d18c1bb4f80749ad64d0 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Apr 2023 14:45:18 +0200 Subject: [PATCH 046/392] Do not add to intermediate nodes array accesses that are likely thread-local. 
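
This mirrors the thread-locality check introduced for the transformation
helpers: when LoopToMap converts a loop body into a map, an array that is only
written with subsets independent of the loop variable is in all likelihood a
per-iteration temporary, so it should not be registered as an intermediate
output of the generated map. A hypothetical example (not taken from this patch)
of the pattern the check targets:

    import dace
    import numpy as np

    @dace.program
    def loop_with_local_temp(A: dace.float64[64], B: dace.float64[64]):
        for i in range(64):
            # `tmp` is always written at the constant subset [0]; it carries no
            # data across iterations and can stay local to each map iteration.
            tmp = np.zeros((1, ), dtype=np.float64)
            tmp[0] = A[i] * 2.0
            B[i] = tmp[0]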
--- dace/transformation/interstate/loop_to_map.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dace/transformation/interstate/loop_to_map.py b/dace/transformation/interstate/loop_to_map.py index 47f48a2e55..0b0baabddb 100644 --- a/dace/transformation/interstate/loop_to_map.py +++ b/dace/transformation/interstate/loop_to_map.py @@ -536,6 +536,15 @@ def apply(self, _, sdfg: sd.SDFG): # Scalars written without WCR must be thread-local if isinstance(node.desc(sdfg), dt.Scalar) and any(e.data.wcr is None for e in body.in_edges(node)): continue + # Arrays written with subsets that do not depend on the loop variable must be thread-local + map_dependency = False + for e in state.in_edges(node): + subset = e.data.get_dst_subset(e, state) + if any(str(s) == itervar for s in subset.free_symbols): + map_dependency = True + break + if not map_dependency: + continue intermediate_nodes.append(node) map = nodes.Map(body.label + "_map", [itervar], [(start, end, step)]) From ee663e480ec4c9631d5b3830806e03237fcb9171 Mon Sep 17 00:00:00 2001 From: Tiancheng Chen Date: Wed, 19 Apr 2023 18:50:59 +0200 Subject: [PATCH 047/392] Fix map indirection (#1240) --- dace/frontend/python/newast.py | 9 ++--- .../indirection_in_map_test.py | 40 +++++++++++++++++++ 2 files changed, 44 insertions(+), 5 deletions(-) create mode 100644 tests/python_frontend/indirection_in_map_test.py diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 20cba37e3e..3b22f1649d 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -254,8 +254,7 @@ def repl_callback(repldict): except Exception: # Print the offending line causing the exception li = visitor.current_lineinfo - print('Exception raised while parsing DaCe program:\n' - f' in File "{li.filename}", line {li.start_line}') + print('Exception raised while parsing DaCe program:\n' f' in File "{li.filename}", line {li.start_line}') lines = preprocessed_ast.src.split('\n') lineid = li.start_line - preprocessed_ast.src_line - 1 if lineid >= 0 and lineid < len(lines): @@ -602,7 +601,6 @@ class TaskletTransformer(ExtNodeTransformer): """ A visitor that traverses a data-centric tasklet, removes memlet annotations and returns input and output memlets. """ - def __init__(self, visitor, defined, @@ -1793,7 +1791,9 @@ def _parse_map_inputs(self, name: str, params: List[Tuple[str, str]], if candidate in self.variables and self.variables[candidate] in self.sdfg.arrays: candidate = self.variables[candidate] - if candidate in self.sdfg.arrays and isinstance(self.sdfg.arrays[candidate], data.Scalar): + if candidate in self.sdfg.arrays and (isinstance(self.sdfg.arrays[candidate], data.Scalar) or + (isinstance(self.sdfg.arrays[candidate], data.Array) + and self.sdfg.arrays[candidate].shape == (1, ))): newvar = '__%s_%s%d' % (name, vid, ctr) repldict[atomstr] = newvar map_inputs[newvar] = Memlet.from_array(candidate, self.sdfg.arrays[candidate]) @@ -4804,7 +4804,6 @@ def _parse_subscript_slice(self, """ Parses the slice attribute of an ast.Subscript node. Scalar data are promoted to symbols. """ - def _promote(node: ast.AST) -> Union[Any, str, symbolic.symbol]: node_str = astutils.unparse(node) sym = None diff --git a/tests/python_frontend/indirection_in_map_test.py b/tests/python_frontend/indirection_in_map_test.py new file mode 100644 index 0000000000..0377101cd1 --- /dev/null +++ b/tests/python_frontend/indirection_in_map_test.py @@ -0,0 +1,40 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+import dace +import numpy as np +import scipy as sp +import scipy.sparse as sparse + +M = dace.symbol('M') +N = dace.symbol('N') +K = dace.symbol('K') +nnz_A = dace.symbol('nnz_A') +nnz_B = dace.symbol('nnz_B') +''' +C = A @ B +[M, K] = [M, N] @ [N, K] +C[i, k] = A[i, j] * B[j, k] +''' + + +@dace.program +def spmspm_csr_csr(A2_pos: dace.int32[M + 1], A2_crd: dace.int32[nnz_A], A_val: dace.float64[nnz_A], + B2_pos: dace.int32[N + 1], B2_crd: dace.int32[nnz_B], B_val: dace.float64[nnz_B], + C: dace.float64[M, K]): + for i in dace.map[0:M]: + for pj in dace.map[A2_pos[i]:A2_pos[i + 1]]: + for pk in dace.map[B2_pos[A2_crd[pj]]:B2_pos[A2_crd[pj] + 1]]: + C[i, B2_crd[pk]] += A_val[pj] * B_val[pk] + + +def test_spmspm_csr_csr(): + csr_A = sparse.random(200, 100, density=0.5, format='csr') + csr_B = sparse.random(100, 150, density=0.5, format='csr') + ref_dense_C = (csr_A @ csr_B).todense() + dace_dense_C = np.zeros_like(ref_dense_C) + spmspm_csr_csr(csr_A.indptr, np.copy(csr_A.indices), np.copy(csr_A.data), csr_B.indptr, np.copy(csr_B.indices), + np.copy(csr_B.data), dace_dense_C) + assert np.allclose(ref_dense_C, dace_dense_C) + + +if __name__ == '__main__': + test_spmspm_csr_csr() From 42d9d7a61dcdcadbf796af8292cef2538aef3cff Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Apr 2023 21:07:10 +0200 Subject: [PATCH 048/392] DaCeKeywordRemover.visit_BinOp: Abort special pow handling is the evaluated node is a number and is not equal to the parsed value (datatype mismatch). --- dace/codegen/targets/cpp.py | 14 ++++++++------ ..._tasket_test.py => unparse_tasklet_test.py} | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 7 deletions(-) rename tests/codegen/{unparse_tasket_test.py => unparse_tasklet_test.py} (72%) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 181ddbada6..868419db3d 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -8,6 +8,7 @@ import functools import itertools import math +import numbers import warnings import sympy as sp @@ -1251,12 +1252,13 @@ def visit_BinOp(self, node: ast.BinOp): if isinstance(node.op, ast.Pow): from dace.frontend.python import astutils try: - unparsed = symbolic.pystr_to_symbolic( - astutils.evalnode(node.right, { - **self.constants, 'dace': dace, - 'math': math - })) - evaluated = symbolic.symstr(symbolic.evaluate(unparsed, self.constants), cpp_mode=True) + evaluated_node = astutils.evalnode(node.right, {**self.constants, 'dace': dace,'math': math}) + unparsed = symbolic.pystr_to_symbolic(evaluated_node) + evaluated_constant = symbolic.evaluate(unparsed, self.constants) + evaluated = symbolic.symstr(evaluated_constant, cpp_mode=True) + value = ast.parse(evaluated).body[0].value + if isinstance(evaluated_node, numbers.Number) and evaluated_node != value.n: + raise TypeError node.right = ast.parse(evaluated).body[0].value except (TypeError, AttributeError, NameError, KeyError, ValueError, SyntaxError): return self.generic_visit(node) diff --git a/tests/codegen/unparse_tasket_test.py b/tests/codegen/unparse_tasklet_test.py similarity index 72% rename from tests/codegen/unparse_tasket_test.py rename to tests/codegen/unparse_tasklet_test.py index 388d765714..5281c109ba 100644 --- a/tests/codegen/unparse_tasket_test.py +++ b/tests/codegen/unparse_tasklet_test.py @@ -1,5 +1,6 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import dace +import numpy as np def test_integer_power(): @@ -46,7 +47,22 @@ def program(a: dace.float64[10], b: dace.float64[10]): program.to_sdfg(simplify=False).compile() +def test_pow_with_implicit_casting(): + + @dace.program + def f32_pow_failure(array): + return array**3.3 + + rng = np.random.default_rng(42) + arr = rng.random((10, ), dtype=np.float32) + ref = f32_pow_failure.f(arr) + val = f32_pow_failure(arr) + assert np.allclose(ref, val) + assert ref.dtype == val.dtype + + if __name__ == '__main__': test_integer_power() test_integer_power_constant() test_equality() + test_pow_with_implicit_casting() From 4e0980f77f0eddba372f593ff29b36e64b20132d Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Apr 2023 20:39:06 +0200 Subject: [PATCH 049/392] Repropagate the edge itself is there is no next (previous) edge. This happens when the memlet path stops in a MapEntry due to carrying a dynamic Map input. --- dace/transformation/dataflow/map_interchange.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/dace/transformation/dataflow/map_interchange.py b/dace/transformation/dataflow/map_interchange.py index a8cad69f8d..0e935f1dfd 100644 --- a/dace/transformation/dataflow/map_interchange.py +++ b/dace/transformation/dataflow/map_interchange.py @@ -117,11 +117,19 @@ def apply(self, graph: SDFGState, sdfg: SDFG): for e in new_entry_edges: path = graph.memlet_path(e) index = next(i for i, edge in enumerate(path) if e is edge) - e.data.subset = propagate_memlet(graph, path[index + 1].data, outer_map_entry, True).subset + if index < len(path) - 1: + edge_to_propagate = path[index + 1] + else: + edge_to_propagate = e + e.data.subset = propagate_memlet(graph, edge_to_propagate.data, outer_map_entry, True).subset for e in new_exit_edges: path = graph.memlet_path(e) index = next(i for i, edge in enumerate(path) if e is edge) - e.data.subset = propagate_memlet(graph, path[index - 1].data, outer_map_exit, True).subset + if index > 0: + edge_to_propagate = path[index - 1] + else: + edge_to_propagate = e + e.data.subset = propagate_memlet(graph, edge_to_propagate.data, outer_map_exit, True).subset @staticmethod def annotates_memlets(): From 00ff22b9ea945d6bb8e76ba7a306c4d8499854d3 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Apr 2023 20:44:08 +0200 Subject: [PATCH 050/392] Added test. --- .../dataflow/map_interchange.py | 2 +- tests/transformations/map_interchange_test.py | 65 ++++++++++++++++++- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/dace/transformation/dataflow/map_interchange.py b/dace/transformation/dataflow/map_interchange.py index 0e935f1dfd..659175f137 100644 --- a/dace/transformation/dataflow/map_interchange.py +++ b/dace/transformation/dataflow/map_interchange.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ Implements the map interchange transformation. """ from dace.sdfg import SDFG, SDFGState diff --git a/tests/transformations/map_interchange_test.py b/tests/transformations/map_interchange_test.py index 45c7831e74..3ded7c5d53 100644 --- a/tests/transformations/map_interchange_test.py +++ b/tests/transformations/map_interchange_test.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import dace import numpy as np from dace.transformation.dataflow import MapInterchange @@ -43,5 +43,68 @@ def test_map_interchange(): assert np.allclose(B, expected) +def test_map_interchange_with_dynamic_map_inputs(): + + C1_dimension = dace.symbol('C1_dimension') + C2_dimension = dace.symbol('C2_dimension') + D1_dimension = dace.symbol('D1_dimension') + D2_dimension = dace.symbol('D2_dimension') + size_A_vals = dace.symbol('size_A_vals') + size_B2_crd = dace.symbol('size_B2_crd') + size_B2_pos = dace.symbol('size_B2_pos') + size_B_vals = dace.symbol('size_B_vals') + size_C_vals = dace.symbol('size_C_vals') + size_D_vals = dace.symbol('size_D_vals') + + @dace.program + def sched_sddmm0compute(A_vals: dace.float64[size_A_vals], B2_crd: dace.int32[size_B2_crd], + B2_pos: dace.int32[size_B2_pos], B_vals: dace.float64[size_B_vals], + C_vals: dace.float64[size_C_vals], D_vals: dace.float64[size_D_vals]): + + for i in dace.map[0:C1_dimension:1]: + for j in dace.map[0:D1_dimension:1]: + jC = i * C2_dimension + j + for kB in dace.map[B2_pos[i]:B2_pos[(i + 1)]:1]: + k = B2_crd[kB] + kD = j * D2_dimension + k + A_vals[kB] = A_vals[kB] + (B_vals[kB] * C_vals[jC]) * D_vals[kD] + + sdfg = sched_sddmm0compute.to_sdfg() + + # Find MapEntries of Maps over 'j' and 'kB' + ome, ime = None, None + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, dace.sdfg.nodes.MapEntry): + if node.map.params[0] == 'j': + ome = node + elif node.map.params[0] == 'kB': + ime = node + + # Assert the pattern MapEntry[j] -> MapEntry[kB] exists + assert ome is not None and ime is not None + state = sdfg.states()[0] + assert len(list(state.edges_between(ome, ime))) > 0 + assert len(list(state.edges_between(ime, ome))) == 0 + + # Interchange the Maps + MapInterchange.apply_to(sdfg, outer_map_entry=ome, inner_map_entry=ime) + ome, ime = None, None + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, dace.sdfg.nodes.MapEntry): + if node.map.params[0] == 'j': + ome = node + elif node.map.params[0] == 'kB': + ime = node + + # Assert the pattern MapEntry[kB] -> MapEntry[j] exists + assert ome is not None and ime is not None + state = sdfg.states()[0] + assert len(list(state.edges_between(ome, ime))) == 0 + assert len(list(state.edges_between(ime, ome))) > 0 + + if __name__ == '__main__': test_map_interchange() + test_map_interchange_with_dynamic_map_inputs() From 604a4b863c3f06fe664e4d2d2cd94fe030453bc3 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Apr 2023 21:23:48 +0200 Subject: [PATCH 051/392] Test does not make sense without SDFG simplification. Updated. 
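
Without simplification, the maps that the Python frontend generates for this
program are typically separated by nested SDFGs, so the direct
MapEntry[j] -> MapEntry[kB] adjacency that both the test and
MapInterchange.apply_to rely on never appears within a single state. Parsing
with to_sdfg(simplify=True) inlines those nested SDFGs first, which is the
configuration the test is meant to exercise.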
--- tests/transformations/map_interchange_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/transformations/map_interchange_test.py b/tests/transformations/map_interchange_test.py index 3ded7c5d53..eaaaf86c91 100644 --- a/tests/transformations/map_interchange_test.py +++ b/tests/transformations/map_interchange_test.py @@ -69,7 +69,7 @@ def sched_sddmm0compute(A_vals: dace.float64[size_A_vals], B2_crd: dace.int32[si kD = j * D2_dimension + k A_vals[kB] = A_vals[kB] + (B_vals[kB] * C_vals[jC]) * D_vals[kD] - sdfg = sched_sddmm0compute.to_sdfg() + sdfg = sched_sddmm0compute.to_sdfg(simplify=True) # Find MapEntries of Maps over 'j' and 'kB' ome, ime = None, None From ce2df2d9619930082577852227faf6f86f22bbd2 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Apr 2023 14:32:38 +0200 Subject: [PATCH 052/392] Added support for dnyamic thread blocks with step != 1. --- dace/codegen/targets/cuda.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 4ffa29452d..50b44261a0 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1961,6 +1961,8 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, # generator) callsite_stream.write('{', sdfg, state_id, scope_entry) + open_if = False + if scope_map.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic: if self.backend == 'hip': raise NotImplementedError('Dynamic thread-block maps on HIP are currently unsupported') @@ -1973,8 +1975,8 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, raise ValueError('Block size has to be constant for block-wide dynamic map schedule (got %s)' % str(bdim)) total_block_size *= bdim - if _expr(scope_map.range[0][2]) != 1: - raise NotImplementedError('Skip not implemented for dynamic thread-block map schedule') + # if _expr(scope_map.range[0][2]) != 1: + # raise NotImplementedError('Skip not implemented for dynamic thread-block map schedule') ##### TODO (later): Generalize # Find thread-block param map and its name @@ -2000,20 +2002,42 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, 'if ({} < {}) {{'.format(outer_scope.map.params[0], _topy(subsets.Range(outer_scope.map.range[::-1]).max_element()[0] + 1)), sdfg, state_id, scope_entry) + open_if = True for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): callsite_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), sdfg, state_id, scope_entry) + dynmap_var = scope_map.params[0] + dynmap_begin = scope_map.range[0][0] + dynmap_end = scope_map.range[0][1] + 1 + dynmap_step = scope_map.range[0][2] + if dynmap_step != 1: + dynmap_var = f'{dynmap_var}_idx' + dynmap_begin = 0 + dynmap_end = f'int_ceil({dynmap_end - dynmap_begin}, {dynmap_step})' + # callsite_stream.write( + # '__dace_dynmap_begin = {begin};\n' + # '__dace_dynmap_end = {end};'.format(begin=scope_map.range[0][0], end=scope_map.range[0][1] + 1), sdfg, + # state_id, scope_entry) callsite_stream.write( '__dace_dynmap_begin = {begin};\n' - '__dace_dynmap_end = {end};'.format(begin=scope_map.range[0][0], end=scope_map.range[0][1] + 1), sdfg, + '__dace_dynmap_end = {end};'.format(begin=dynmap_begin, end=dynmap_end), sdfg, state_id, scope_entry) # close if callsite_stream.write('}', sdfg, state_id, scope_entry) + # callsite_stream.write( + # 
'dace::DynamicMap<{fine_grained}, {bsize}>::' + # 'schedule(dace_dyn_map_shared, __dace_dynmap_begin, ' + # '__dace_dynmap_end, {kmapIdx}, [&](auto {kmapIdx}, ' + # 'auto {param}) {{'.format(fine_grained=('true' if Config.get_bool( + # 'compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), + # bsize=total_block_size, + # kmapIdx=outer_scope.map.params[0], + # param=scope_map.params[0]), sdfg, state_id, scope_entry) callsite_stream.write( 'dace::DynamicMap<{fine_grained}, {bsize}>::' 'schedule(dace_dyn_map_shared, __dace_dynmap_begin, ' @@ -2022,7 +2046,10 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, 'compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), bsize=total_block_size, kmapIdx=outer_scope.map.params[0], - param=scope_map.params[0]), sdfg, state_id, scope_entry) + param=dynmap_var), sdfg, state_id, scope_entry) + + if dynmap_step != 1: + callsite_stream.write(f'auto {scope_map.params[0]} = {scope_map.range[0][0]} + {dynmap_step} * {dynmap_var};', sdfg, state_id, scope_entry) elif scope_map.schedule == dtypes.ScheduleType.GPU_Device: dfg_kernel = self._kernel_state.scope_subgraph(self._kernel_map) From 0d825eeaba1606e52fb5e2954138f3d87c83f8eb Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Apr 2023 14:48:41 +0200 Subject: [PATCH 053/392] Dynamic Map inputs must be set both in the if-condition and the schedule body of the dynamic thread block. --- dace/codegen/targets/cuda.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 50b44261a0..6b80825903 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1961,8 +1961,6 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, # generator) callsite_stream.write('{', sdfg, state_id, scope_entry) - open_if = False - if scope_map.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic: if self.backend == 'hip': raise NotImplementedError('Dynamic thread-block maps on HIP are currently unsupported') @@ -2004,6 +2002,10 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, state_id, scope_entry) open_if = True + # NOTE: Dynamic map inputs must be defined both outside and inside the dynamic Map schedule. + # They define inside the schedule the bounds of the any nested Maps. + # They define outside the schedule the bounds of the dynamic Map's for-loop invocation. + # NOTE: The value of the dynamic Map's variable may differ inside and outside the schedule. 
for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): callsite_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, @@ -2048,6 +2050,11 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, kmapIdx=outer_scope.map.params[0], param=dynmap_var), sdfg, state_id, scope_entry) + for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): + callsite_stream.write( + self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, + e.dst.in_connectors[e.dst_conn]), sdfg, state_id, scope_entry) + if dynmap_step != 1: callsite_stream.write(f'auto {scope_map.params[0]} = {scope_map.range[0][0]} + {dynmap_step} * {dynmap_var};', sdfg, state_id, scope_entry) From 3989e9826c17d3dee0230f7a2c8f6b9379e57089 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Apr 2023 14:50:44 +0200 Subject: [PATCH 054/392] Clean up --- dace/codegen/targets/cuda.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 6b80825903..16780fd8d5 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1973,8 +1973,6 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, raise ValueError('Block size has to be constant for block-wide dynamic map schedule (got %s)' % str(bdim)) total_block_size *= bdim - # if _expr(scope_map.range[0][2]) != 1: - # raise NotImplementedError('Skip not implemented for dynamic thread-block map schedule') ##### TODO (later): Generalize # Find thread-block param map and its name @@ -2000,7 +1998,6 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, 'if ({} < {}) {{'.format(outer_scope.map.params[0], _topy(subsets.Range(outer_scope.map.range[::-1]).max_element()[0] + 1)), sdfg, state_id, scope_entry) - open_if = True # NOTE: Dynamic map inputs must be defined both outside and inside the dynamic Map schedule. # They define inside the schedule the bounds of the any nested Maps. 
@@ -2019,27 +2016,13 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, dynmap_var = f'{dynmap_var}_idx' dynmap_begin = 0 dynmap_end = f'int_ceil({dynmap_end - dynmap_begin}, {dynmap_step})' - # callsite_stream.write( - # '__dace_dynmap_begin = {begin};\n' - # '__dace_dynmap_end = {end};'.format(begin=scope_map.range[0][0], end=scope_map.range[0][1] + 1), sdfg, - # state_id, scope_entry) callsite_stream.write( '__dace_dynmap_begin = {begin};\n' - '__dace_dynmap_end = {end};'.format(begin=dynmap_begin, end=dynmap_end), sdfg, - state_id, scope_entry) + '__dace_dynmap_end = {end};'.format(begin=dynmap_begin, end=dynmap_end), sdfg, state_id, scope_entry) # close if callsite_stream.write('}', sdfg, state_id, scope_entry) - # callsite_stream.write( - # 'dace::DynamicMap<{fine_grained}, {bsize}>::' - # 'schedule(dace_dyn_map_shared, __dace_dynmap_begin, ' - # '__dace_dynmap_end, {kmapIdx}, [&](auto {kmapIdx}, ' - # 'auto {param}) {{'.format(fine_grained=('true' if Config.get_bool( - # 'compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), - # bsize=total_block_size, - # kmapIdx=outer_scope.map.params[0], - # param=scope_map.params[0]), sdfg, state_id, scope_entry) callsite_stream.write( 'dace::DynamicMap<{fine_grained}, {bsize}>::' 'schedule(dace_dyn_map_shared, __dace_dynmap_begin, ' @@ -2049,14 +2032,16 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, bsize=total_block_size, kmapIdx=outer_scope.map.params[0], param=dynmap_var), sdfg, state_id, scope_entry) - + for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): callsite_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), sdfg, state_id, scope_entry) - + if dynmap_step != 1: - callsite_stream.write(f'auto {scope_map.params[0]} = {scope_map.range[0][0]} + {dynmap_step} * {dynmap_var};', sdfg, state_id, scope_entry) + callsite_stream.write( + f'auto {scope_map.params[0]} = {scope_map.range[0][0]} + {dynmap_step} * {dynmap_var};', sdfg, + state_id, scope_entry) elif scope_map.schedule == dtypes.ScheduleType.GPU_Device: dfg_kernel = self._kernel_state.scope_subgraph(self._kernel_map) From d9e643264f230dcd6f4f9862eb4ee53f881d1bc3 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Apr 2023 14:51:09 +0200 Subject: [PATCH 055/392] Added tests. --- tests/dynamic_tb_map_cudatest.py | 150 +++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/tests/dynamic_tb_map_cudatest.py b/tests/dynamic_tb_map_cudatest.py index 411b973dc9..ea217497a3 100644 --- a/tests/dynamic_tb_map_cudatest.py +++ b/tests/dynamic_tb_map_cudatest.py @@ -12,8 +12,10 @@ @dace.program(dace.uint32[H + 1], dace.uint32[nnz], dace.float32[nnz], dace.float32[W], dace.float32[H]) def spmv(A_row, A_col, A_val, x, b): + @dace.mapscope(_[0:H]) def compute_row(i): + @dace.map(_[A_row[i]:A_row[i + 1]]) def compute(j): a << A_val[j] @@ -64,5 +66,153 @@ def test_dynamic_map(): assert diff <= 1e-5 +def _copy_to_gpu(sdfg): + for k, v in sdfg.arrays.items(): + if not v.transient and isinstance(v, dace.data.Array): + v.storage = dace.dtypes.StorageType.GPU_Global + + +@pytest.mark.gpu +def test_nested_dynamic_map(): + """ Tests the case where the dynamic map inputs are defined in an outer scope. 
""" + + M = dace.symbol('M') + N = dace.symbol('N') + K = dace.symbol('K') + nnz_A = dace.symbol('nnz_A') + nnz_D = dace.symbol('nnz_D') + + @dace.program + def sddmm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.int32[M + 1], + A_vals: dace.float32[nnz_A], B: dace.float32[M, K], C: dace.float32[K, N]): + for i in dace.map[0:M]: + for j in dace.map[A2_pos[i]:A2_pos[i + 1]]: + for k in dace.map[0:K]: + D_vals[j] += A_vals[j] * B[i, k] * C[k, A2_crd[j]] + + sdfg = sddmm.to_sdfg(simplify=True) + + ime, jme, kme = None, None, None + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, dace.sdfg.nodes.MapEntry): + if node.map.params[0] == 'i': + ime = node + elif node.map.params[0] == 'j': + jme = node + elif node.map.params[0] == 'k': + kme = node + assert ime is not None and jme is not None and kme is not None + + from dace.transformation.dataflow import MapInterchange, TrivialTaskletElimination + MapInterchange.apply_to(sdfg, outer_map_entry=jme, inner_map_entry=kme) + sdfg.apply_transformations_repeated(TrivialTaskletElimination) + + sdfg.apply_gpu_transformations() + ime.map.schedule = dace.ScheduleType.GPU_Device + kme.map.schedule = dace.ScheduleType.GPU_ThreadBlock_Dynamic + + dtype = np.float32 + rng = np.random.default_rng(42) + problem_size = 1024 + density = 0.01 + B = rng.random((problem_size, problem_size), dtype=dtype) + C = rng.random((problem_size, problem_size), dtype=dtype) + A = scipy.sparse.random(problem_size, problem_size, density=density, format='csr', dtype=dtype, random_state=rng) + val = np.zeros_like(A.data) + ref = np.empty_like(A.data) + + sdfg(D_vals=val, + A2_crd=A.indices.copy(), + A2_pos=A.indptr.copy(), + A_vals=A.data.copy(), + B=B, + C=C, + M=problem_size, + N=problem_size, + K=problem_size, + nnz_A=A.nnz, + nnz_D=A.nnz) + tmp = B @ C + for row in range(problem_size): + for j in range(A.indptr[row], A.indptr[row + 1]): + col = A.indices[j] + ref[j] = A.data[j] * tmp[row, col] + assert np.allclose(val, ref.data) + + +@pytest.mark.gpu +def test_dynamic_map_with_step(): + + M = dace.symbol('M') + N = dace.symbol('N') + nnz_A = dace.symbol('nnz_A') + nnz_D = dace.symbol('nnz_D') + + @dace.program + def sddvm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.int32[M + 1], + A_vals: dace.float32[nnz_A], B: dace.float32[M], C: dace.float32[N]): + for i in dace.map[0:M]: + for j in dace.map[A2_pos[i]:A2_pos[i + 1]]: + D_vals[j] += A_vals[j] * B[i] * C[A2_crd[j]] + + sdfg = sddvm.to_sdfg(simplify=True) + + ime, jme = None, None + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, dace.sdfg.nodes.MapEntry): + if node.map.params[0] == 'i': + ime = node + elif node.map.params[0] == 'j': + jme = node + assert ime is not None and jme is not None + + from dace.transformation.dataflow import StripMining, TrivialTaskletElimination + sdfg.apply_transformations_repeated(TrivialTaskletElimination) + StripMining.apply_to(sdfg, map_entry=jme) + + tile_jme = None, None + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, dace.sdfg.nodes.MapEntry): + if node.map.params[0] == 'tile_j': + tile_jme = node + assert tile_jme is not None + + sdfg.apply_gpu_transformations() + ime.map.schedule = dace.ScheduleType.GPU_Device + tile_jme.map.schedule = dace.ScheduleType.GPU_ThreadBlock_Dynamic + + dtype = np.float32 + rng = np.random.default_rng(42) + problem_size = 1024 + density = 0.01 + B = rng.random((problem_size, ), dtype=dtype) + C = rng.random((problem_size, ), 
dtype=dtype) + A = scipy.sparse.random(problem_size, problem_size, density=density, format='csr', dtype=dtype, random_state=rng) + val = np.zeros_like(A.data) + ref = np.empty_like(A.data) + + sdfg(D_vals=val, + A2_crd=A.indices.copy(), + A2_pos=A.indptr.copy(), + A_vals=A.data.copy(), + B=B, + C=C, + M=problem_size, + N=problem_size, + nnz_A=A.nnz, + nnz_D=A.nnz) + tmp = np.outer(B, C) + for row in range(problem_size): + for j in range(A.indptr[row], A.indptr[row + 1]): + col = A.indices[j] + ref[j] = A.data[j] * tmp[row, col] + assert np.allclose(val, ref.data) + + if __name__ == '__main__': test_dynamic_map() + test_nested_dynamic_map() + test_dynamic_map_with_step() From cc99ddb0c01f68a0df48e1a638cfd6536b3bf47f Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Apr 2023 16:55:33 +0200 Subject: [PATCH 056/392] Fixed issue with redefinitions of dynamic Map inputs in the host-side GPU kernel invocation. --- dace/codegen/targets/cuda.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 16780fd8d5..056eec7e7e 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1600,6 +1600,10 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st (kernel_name, ', '.join(state_param + kernel_args_typed + extra_call_args_typed)), sdfg, state_id, scope_entry) + # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. + if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): + callsite_stream.write('{', sdfg, state_id, scope_entry) + # Synchronize all events leading to dynamic map range connectors for e in dace.sdfg.dynamic_map_inputs(state, scope_entry): if hasattr(e, '_cuda_event'): @@ -1619,6 +1623,10 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st for aname, arg in kernel_args.items()] + extra_call_args)), sdfg, state_id, scope_entry) + # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. + if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): + callsite_stream.write('}', sdfg, state_id, scope_entry) + synchronize_streams(sdfg, state, state_id, scope_entry, scope_exit, callsite_stream, self) # Instrumentation (post-kernel) From 5a410d533680a515b9160e0e0a163d38291d74f3 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Apr 2023 16:55:46 +0200 Subject: [PATCH 057/392] Added test. 
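
The new test constructs two back-to-back GPU_ThreadBlock_Dynamic maps that read
their ranges from the same dynamic inputs (row_start/row_end loaded from A_row)
and deliberately renames the dynamic connectors so both maps share identical
connector names. Without the scoping introduced in the previous commit, the
host-side kernel launches could emit the memlet definitions for these shared
dynamic inputs twice in the same scope, producing duplicate variable
definitions in the generated code; this test guards against that regression.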
--- tests/dynamic_tb_map_cudatest.py | 89 ++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 4 deletions(-) diff --git a/tests/dynamic_tb_map_cudatest.py b/tests/dynamic_tb_map_cudatest.py index ea217497a3..b24e5f2ea6 100644 --- a/tests/dynamic_tb_map_cudatest.py +++ b/tests/dynamic_tb_map_cudatest.py @@ -66,10 +66,90 @@ def test_dynamic_map(): assert diff <= 1e-5 -def _copy_to_gpu(sdfg): - for k, v in sdfg.arrays.items(): - if not v.transient and isinstance(v, dace.data.Array): - v.storage = dace.dtypes.StorageType.GPU_Global +@pytest.mark.gpu +def test_dynamic_maps(): + """ Tests the case of multiple dynamic maps in a row that share dynamic inputs.""" + + W = dace.symbol('W') + H = dace.symbol('H') + nnz = dace.symbol('nnz') + + @dace.program(dace.uint32[H + 1], dace.uint32[nnz], dace.float32[nnz], dace.float32[W], dace.float32[H], + dace.float32[H]) + def spmv_2x(A_row, A_col, A_val, x, b, c): + + for i in range(H): + row_start = A_row[i] + row_end = A_row[i + 1] + for j in dace.map[row_start:row_end]: + b[i] += A_val[j] * x[A_col[j]] + for j in dace.map[row_start:row_end]: + c[i] += A_val[j] * x[A_col[j]] + + height = 1024 + width = 1024 + + # Prepare spmv SDFG for GPU + sdfg = spmv_2x.to_sdfg() + # Rename dynamic inputs to cause name clashes + main_entry = None + main_dict = {} + for node, state in sdfg.all_nodes_recursive(): + if isinstance(node, dace.sdfg.nodes.MapEntry): + if main_entry is None: + main_entry = node + for e in dace.sdfg.dynamic_map_inputs(state, node): + main_dict[e.data.data] = e.dst_conn + else: + repl_dict = {} + for e in dace.sdfg.dynamic_map_inputs(state, node): + node.remove_in_connector(e.dst_conn) + node.add_in_connector(main_dict[e.data.data]) + repl_dict[e.dst_conn] = main_dict[e.data.data] + e._dst_conn = main_dict[e.data.data] + node.map.range.replace(repl_dict) + + sdfg.apply_gpu_transformations() + + for node in sdfg.all_nodes_recursive(): + if isinstance(node[0], dace.sdfg.nodes.MapEntry) \ + and node[0].schedule == dace.dtypes.ScheduleType.Sequential: + node[0].schedule = dace.dtypes.ScheduleType.GPU_ThreadBlock_Dynamic + + # Fill input data + # each row has up (including) 256 elements + A_row = np.random.randint(257, size=height + 1, dtype=dace.uint32.type) + A_row[0] = 0 + A_row = np.cumsum(A_row, dtype=dace.uint32.type) + + # Column data + A_col = dace.ndarray([A_row[height]], dtype=dace.uint32) + for i in range(height): + A_col[A_row[i]:A_row[i + 1]] = np.sort(np.random.choice(width, A_row[i + 1] - A_row[i], replace=False)) + + # values + A_val = np.random.rand(A_row[height]).astype(dace.float32.type) + + A_sparse = scipy.sparse.csr_matrix((A_val, A_col, A_row), dtype=dace.float32.type, shape=(1024, 1024)) + + x = np.random.rand(width).astype(dace.float32.type) + b = np.zeros(height, dtype=dace.float32.type) + c = np.zeros(height, dtype=dace.float32.type) + + sdfg(A_row=A_row, + A_col=A_col, + A_val=A_val, + x=x, + b=b, + c=c, + H=A_sparse.shape[0], + W=A_sparse.shape[1], + nnz=A_sparse.nnz) + + diff0 = np.linalg.norm(A_sparse.dot(x) - b) / float(height) + diff1 = np.linalg.norm(A_sparse.dot(x) - c) / float(height) + assert diff0 <= 1e-5 + assert diff1 <= 1e-5 @pytest.mark.gpu @@ -214,5 +294,6 @@ def sddvm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i if __name__ == '__main__': test_dynamic_map() + test_dynamic_maps() test_nested_dynamic_map() test_dynamic_map_with_step() From c0d3df9518811a9c5a1632643e0fa50d67e03f9f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 May 2023 21:37:17 +0000 Subject: [PATCH 058/392] Bump flask from 1.1.2 to 2.3.2 Bumps [flask](https://github.com/pallets/flask) from 1.1.2 to 2.3.2. - [Release notes](https://github.com/pallets/flask/releases) - [Changelog](https://github.com/pallets/flask/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/flask/compare/1.1.2...2.3.2) --- updated-dependencies: - dependency-name: flask dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index dcf0e2467b..4ac426d6f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ click==7.1.2 cmake==3.18.2.post1 decorator==4.4.2 distro==1.5.0 -Flask==1.1.2 +Flask==2.3.2 idna==2.10 itsdangerous==2.0.0a1 Jinja2==3.0.0a1 From eab9b6dc23ac259b1398d357ee32fa28eeb5cd02 Mon Sep 17 00:00:00 2001 From: C-TC Date: Tue, 2 May 2023 03:44:56 +0200 Subject: [PATCH 059/392] initial commit for gpu grid-strided tiling --- dace/transformation/dataflow/__init__.py | 3 +- .../dataflow/gpu_grid_stride_tiling.py | 285 ++++++++++++++++++ .../gpu_grid_stride_tiling_test.py | 77 +++++ 3 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 dace/transformation/dataflow/gpu_grid_stride_tiling.py create mode 100644 tests/transformations/gpu_grid_stride_tiling_test.py diff --git a/dace/transformation/dataflow/__init__.py b/dace/transformation/dataflow/__init__.py index a0db82845e..303f1d0a64 100644 --- a/dace/transformation/dataflow/__init__.py +++ b/dace/transformation/dataflow/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ This module initializes the dataflow transformations package. """ # Map-related @@ -22,6 +22,7 @@ from .buffer_tiling import BufferTiling from .vectorization import Vectorization from .copy_to_map import CopyToMap +from .gpu_grid_stride_tiling import GPUGridStridedTiling # Data-related from .stream_transient import StreamTransient, AccumulateTransient diff --git a/dace/transformation/dataflow/gpu_grid_stride_tiling.py b/dace/transformation/dataflow/gpu_grid_stride_tiling.py new file mode 100644 index 0000000000..644e3dca49 --- /dev/null +++ b/dace/transformation/dataflow/gpu_grid_stride_tiling.py @@ -0,0 +1,285 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" This module contains classes and functions that implement the grid-strided map tiling + transformation.""" + +from typing import Dict +import dace +from copy import deepcopy as dcpy +from dace import dtypes, subsets, symbolic +from dace.sdfg import SDFG, SDFGState +from dace.properties import make_properties, Property, SymbolicProperty +from dace.sdfg import nodes +from dace.sdfg import utils as sdutil +from dace.transformation import transformation +from dace.transformation.dataflow import MapInterchange +from dace.transformation.dataflow.strip_mining import calc_set_image, calc_set_union +import sympy + + +@make_properties +class GPUGridStridedTiling(transformation.SingleStateTransformation): + """ + Implements the grid-strided map tiling transformation on two nested maps. + + E.g. 
+ i = ib:ie:is -> j = jb:je:js + After transformation: + i0 = 0:GridDim -> j0 = 0:BlockDim -> i1 = ib+i0*is:ie:GridDim*is -> j1 = jb+j0*js:je:BlockDim*js + where GridDim = min(MaxGridDim, (ie-ib)//is) + """ + + outer_map_entry = transformation.PatternNode(nodes.MapEntry) + inner_map_entry = transformation.PatternNode(nodes.MapEntry) + + # Properties + + new_dim_prefix = Property(dtype=str, default="tile", desc="Prefix for new dimension name") + max_grid_dim = SymbolicProperty(default=65535, desc="Maximum grid dimension") + block_dim = Property(default=128, desc="Block dimension") + + @classmethod + def expressions(cls): + return [sdutil.node_path_graph(cls.outer_map_entry, cls.inner_map_entry)] + + def can_be_applied(self, graph, expr_index, sdfg, permissive=False): + + outer_map_entry = self.outer_map_entry + inner_map_entry = self.inner_map_entry + + # Check that the destination of all the outgoing edges + # from the outer map's entry is the inner map's entry. + for e in graph.out_edges(outer_map_entry): + if e.dst != inner_map_entry: + return False + # Check that the source of all the incoming edges + # to the inner map's entry is the outer map's entry. + for e in graph.in_edges(inner_map_entry): + if e.src != outer_map_entry: + return False + + # Check the edges between the exits of the two maps. + inner_map_exit = graph.exit_node(inner_map_entry) + outer_map_exit = graph.exit_node(outer_map_entry) + + # Check that the destination of all the outgoing edges + # from the inner map's exit is the outer map's exit. + for e in graph.out_edges(inner_map_exit): + if e.dst != outer_map_exit: + return False + # Check that the source of all the incoming edges + # to the outer map's exit is the inner map's exit. + for e in graph.in_edges(outer_map_exit): + if e.src != inner_map_exit: + return False + + # Currently only support nested maps with a single dimension in each. + if len(outer_map_entry.map.params) != 1 or len(inner_map_entry.map.params) != 1: + return False + + return True + + def _find_new_dim(self, sdfg: SDFG, state: SDFGState, entry: nodes.MapEntry, prefix: str, target_dim: str): + """ Finds a variable that is not already defined in scope. """ + candidate = '%s_%s' % (prefix, target_dim) + index = 1 + defined_vars = set(str(s) for s in (state.symbols_defined_at(entry).keys() | sdfg.symbols.keys())) + while candidate in defined_vars: + candidate = '%s%d_%s' % (prefix, index, target_dim) + index += 1 + return candidate + + def apply(self, graph: SDFGState, sdfg: SDFG): + i_entry = self.inner_map_entry + o_entry = self.outer_map_entry + i_exit = graph.exit_node(i_entry) + o_exit = graph.exit_node(o_entry) + + new_dim_prefix = self.new_dim_prefix + max_grid_dim = self.max_grid_dim + block_dim = self.block_dim + + max_grid_dim = symbolic.pystr_to_symbolic(max_grid_dim) + block_dim = symbolic.pystr_to_symbolic(block_dim) + + # Get the map params + o_from, o_to, o_step = o_entry.map.range[0] + i_from, i_to, i_step = i_entry.map.range[0] + + tile_o_dim_new = self._find_new_dim(sdfg, graph, o_entry, new_dim_prefix, o_entry.map.params[0]) + tile_i_dim_new = self._find_new_dim(sdfg, graph, i_entry, new_dim_prefix, i_entry.map.params[0]) + + grid_dim = sympy.Min(max_grid_dim, (o_to + 1 - o_from) // o_step) + + # TODO: how to deal with approximated values? 
+ # begin, end, step of all four maps + tile_o_range_new = (0, grid_dim - 1, 1) + tile_i_range_new = (0, block_dim - 1, 1) + o_range_new = (o_from + symbolic.pystr_to_symbolic(tile_o_dim_new) * o_step, o_to, grid_dim * o_step) + i_range_new = (i_from + symbolic.pystr_to_symbolic(tile_i_dim_new) * i_step, i_to, block_dim * i_step) + + # Create the new maps + tile_o_map = nodes.Map(o_entry.map.label, [tile_o_dim_new], + subsets.Range([tile_o_range_new]), + schedule=dtypes.ScheduleType.GPU_Device) + tile_i_map = nodes.Map(i_entry.map.label, [tile_i_dim_new], + subsets.Range([tile_i_range_new]), + schedule=dtypes.ScheduleType.GPU_ThreadBlock) + + # Create the new map entries and exits + tile_o_entry = nodes.MapEntry(tile_o_map) + tile_i_entry = nodes.MapEntry(tile_i_map) + tile_o_exit = nodes.MapExit(tile_o_map) + tile_i_exit = nodes.MapExit(tile_i_map) + + # Set block size + tile_i_entry.map.gpu_block_size = [self.block_dim, 1, 1] + + # Update Range and ScheduleType of the maps + o_entry.map.range = subsets.Range([o_range_new]) + o_entry.map.schedule = dtypes.ScheduleType.Sequential + i_entry.map.range = subsets.Range([i_range_new]) + i_entry.map.schedule = dtypes.ScheduleType.Sequential + + # Redirect edges + tile_o_entry.in_connectors = dcpy(o_entry.in_connectors) + tile_i_entry.in_connectors = dcpy(i_entry.in_connectors) + tile_o_exit.out_connectors = dcpy(o_exit.out_connectors) + tile_i_exit.out_connectors = dcpy(i_exit.out_connectors) + sdutil.change_edge_src(graph, o_exit, tile_o_exit) + sdutil.change_edge_src(graph, i_exit, tile_i_exit) + sdutil.change_edge_dest(graph, o_entry, tile_o_entry) + sdutil.change_edge_dest(graph, i_entry, tile_i_entry) + + # Connect previous map nodes and corresponding tile map nodes + # Code borrowed from StripMining transformation + for map_entry, new_map_entry, map_exit, new_map_exit in [ + (o_entry, tile_o_entry, o_exit, tile_o_exit), + (i_entry, tile_i_entry, i_exit, tile_i_exit), + ]: + # Create new entry edges + new_in_edges = dict() + entry_in_conn = {} + entry_out_conn = {} + for _src, src_conn, _dst, _, memlet in graph.out_edges(map_entry): + if (src_conn is not None and src_conn[:4] == 'OUT_' + and not isinstance(sdfg.arrays[memlet.data], dace.data.Scalar)): + new_subset = calc_set_image( + map_entry.map.params, + map_entry.map.range, + memlet.subset, + ) + conn = src_conn[4:] + key = (memlet.data, 'IN_' + conn, 'OUT_' + conn) + if key in new_in_edges.keys(): + old_subset = new_in_edges[key].subset + new_in_edges[key].subset = calc_set_union(old_subset, new_subset) + else: + entry_in_conn['IN_' + conn] = None + entry_out_conn['OUT_' + conn] = None + new_memlet = dcpy(memlet) + new_memlet.subset = new_subset + if memlet.dynamic: + new_memlet.num_accesses = memlet.num_accesses + else: + new_memlet.num_accesses = new_memlet.num_elements().simplify() + new_in_edges[key] = new_memlet + else: + if src_conn is not None and src_conn[:4] == 'OUT_': + conn = src_conn[4:] + in_conn = 'IN_' + conn + out_conn = 'OUT_' + conn + else: + in_conn = src_conn + out_conn = src_conn + if in_conn: + entry_in_conn[in_conn] = None + if out_conn: + entry_out_conn[out_conn] = None + new_in_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet) + new_map_entry.out_connectors = entry_out_conn + map_entry.in_connectors = entry_in_conn + for (_, in_conn, out_conn), memlet in new_in_edges.items(): + graph.add_edge(new_map_entry, out_conn, map_entry, in_conn, memlet) + + # Create new exit edges + new_out_edges = dict() + exit_in_conn = {} + exit_out_conn = {} + for _src, _, 
_dst, dst_conn, memlet in graph.in_edges(map_exit): + if (dst_conn is not None and dst_conn[:3] == 'IN_' + and not isinstance(sdfg.arrays[memlet.data], dace.data.Scalar)): + new_subset = calc_set_image( + map_entry.map.params, + map_entry.map.range, + memlet.subset, + ) + conn = dst_conn[3:] + key = (memlet.data, 'IN_' + conn, 'OUT_' + conn) + if key in new_out_edges.keys(): + old_subset = new_out_edges[key].subset + new_out_edges[key].subset = calc_set_union(old_subset, new_subset) + else: + exit_in_conn['IN_' + conn] = None + exit_out_conn['OUT_' + conn] = None + new_memlet = dcpy(memlet) + new_memlet.subset = new_subset + if memlet.dynamic: + new_memlet.num_accesses = memlet.num_accesses + else: + new_memlet.num_accesses = new_memlet.num_elements().simplify() + new_out_edges[key] = new_memlet + else: + if dst_conn is not None and dst_conn[:3] == 'IN_': + conn = dst_conn[3:] + in_conn = 'IN_' + conn + out_conn = 'OUT_' + conn + else: + in_conn = dst_conn + out_conn = dst_conn + if in_conn: + exit_in_conn[in_conn] = None + if out_conn: + exit_out_conn[out_conn] = None + new_out_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet) + new_map_exit.in_connectors = exit_in_conn + map_exit.out_connectors = exit_out_conn + for (_, in_conn, out_conn), memlet in new_out_edges.items(): + graph.add_edge(map_exit, out_conn, new_map_exit, in_conn, memlet) + + # if inner map contains dynamic range, need to move the dynamic connectors + # from tile_inner map entry to inner map entry to facilitate MapInterchange. + # Because we brute-forcely did sdutil.change_edge_dest(graph, i_entry, tile_i_entry) + # TODO: what about map exit connectors? + data_dict: Dict[str, dace.Memlet] = {} # map data array to memlet + for e in graph.edges_between(o_entry, tile_i_entry): + if e.dst_conn is not None and e.dst_conn[:3] != 'IN_' and e.src_conn[:4] == 'OUT_': + # trim edges + graph.remove_edge(e) + # add edges between tile_i_entry and i_entry + graph.add_edge(tile_i_entry, e.src_conn, i_entry, e.dst_conn, dcpy(e.data)) + + # add edges between o_entry and tile_i_entry + if e.data.data not in data_dict.keys(): + # new edge data, add to data_dict + data_dict[e.data.data] = dcpy(e.data) + in_conn = 'IN_' + e.src_conn[4:] + assert e.src_conn[4:] == e.data.data + graph.add_edge(o_entry, e.src_conn, tile_i_entry, in_conn, data_dict[e.data.data]) + else: + # already added edge data, just add edge volume + # TODO: how to add subset? + data_dict[e.data.data].volume += e.data.volume + + # trim connectors + tile_i_entry.remove_in_connector(e.dst_conn) + + # TODO: fix missing added connectors + + # sdfg.view() + + # Interchange middle two maps + MapInterchange.apply_to(sdfg, outer_map_entry=o_entry, inner_map_entry=tile_i_entry) + + @staticmethod + def annotates_memlets(): + return True diff --git a/tests/transformations/gpu_grid_stride_tiling_test.py b/tests/transformations/gpu_grid_stride_tiling_test.py new file mode 100644 index 0000000000..e1bf8aa1da --- /dev/null +++ b/tests/transformations/gpu_grid_stride_tiling_test.py @@ -0,0 +1,77 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+"""Tests for GPU grid-strided tiling transformation.""" +from numpy.random import default_rng +from typing import List, Tuple +from copy import deepcopy +import numpy as np +import cupy as cp +import dace +from dace.transformation.dataflow import MapInterchange, StripMining, MapReduceFusion, MapExpansion, MapToForLoop, TrivialTaskletElimination, GPUGridStridedTiling +from dace.transformation.interstate import GPUTransformSDFG + + +def copy_to_gpu(sdfg): + for k, v in sdfg.arrays.items(): + if not v.transient and isinstance(v, dace.data.Array): + v.storage = dace.dtypes.StorageType.GPU_Global + + +def find_map_entry(sdfg: dace.SDFG, map_name_list: List[str]) -> Tuple[dace.sdfg.nodes.MapEntry]: + if isinstance(map_name_list, str): + map_name_list = [ + map_name_list, + ] + ret_list = [None] * len(map_name_list) + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, dace.sdfg.nodes.MapEntry): + for i, map_name in enumerate(map_name_list): + if map_name == node.map.params[0]: + ret_list[i] = node + # check if all map entries are found + assert all([x is not None for x in ret_list]) + + # unpack if only one map entry is found + if len(ret_list) == 1: + return ret_list[0] + else: + return tuple(ret_list) + + +def test_gpu_grid_stride_tiling(): + + M = dace.symbol('M') + N = dace.symbol('N') + K = dace.symbol('K') + nnz_A = dace.symbol('nnz_A') + nnz_D = dace.symbol('nnz_D') + + @dace.program + def sddmm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.int32[M + 1], + A_vals: dace.float32[nnz_A], B: dace.float32[M, K], C: dace.float32[K, N]): + for i in dace.map[0:M]: + for j in dace.map[A2_pos[i]:A2_pos[i + 1]]: + for k in dace.map[0:K]: + D_vals[j] += A_vals[j] * B[i, k] * C[k, A2_crd[j]] + + sdfg = sddmm.to_sdfg() + + sdfg.simplify() + + ime, jme, _ = find_map_entry(sdfg, ["i", "j", "k"]) + + sdfg.apply_transformations_repeated(TrivialTaskletElimination) + + copy_to_gpu(sdfg) + GPUGridStridedTiling.apply_to(sdfg, outer_map_entry=ime, inner_map_entry=jme) + sdfg.view() + + for e, _ in sdfg.all_edges_recursive(): + if isinstance(e.data, dace.Memlet) and e.data.wcr: + e.data.wcr_nonatomic = True + + sdfg.validate() + + +if __name__ == '__main__': + test_gpu_grid_stride_tiling() From a2b86051aa9dc4b331964cb9aee3d09e2f63ba8a Mon Sep 17 00:00:00 2001 From: C-TC Date: Tue, 2 May 2023 12:54:51 +0200 Subject: [PATCH 060/392] fix missing connectors, fix memlet, add dummy test --- .../dataflow/gpu_grid_stride_tiling.py | 25 ++++++++------- .../gpu_grid_stride_tiling_test.py | 31 +++++++++++++++---- 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/dace/transformation/dataflow/gpu_grid_stride_tiling.py b/dace/transformation/dataflow/gpu_grid_stride_tiling.py index 644e3dca49..1ecbae9762 100644 --- a/dace/transformation/dataflow/gpu_grid_stride_tiling.py +++ b/dace/transformation/dataflow/gpu_grid_stride_tiling.py @@ -13,6 +13,7 @@ from dace.transformation import transformation from dace.transformation.dataflow import MapInterchange from dace.transformation.dataflow.strip_mining import calc_set_image, calc_set_union +from dace.sdfg.propagation import propagate_memlet import sympy @@ -250,36 +251,38 @@ def apply(self, graph: SDFGState, sdfg: SDFG): # from tile_inner map entry to inner map entry to facilitate MapInterchange. # Because we brute-forcely did sdutil.change_edge_dest(graph, i_entry, tile_i_entry) # TODO: what about map exit connectors? 
- data_dict: Dict[str, dace.Memlet] = {} # map data array to memlet + data_dict = {} # map data array to memlet for e in graph.edges_between(o_entry, tile_i_entry): if e.dst_conn is not None and e.dst_conn[:3] != 'IN_' and e.src_conn[:4] == 'OUT_': # trim edges graph.remove_edge(e) # add edges between tile_i_entry and i_entry + tile_i_entry.add_out_connector(e.src_conn) + i_entry.add_in_connector(e.dst_conn) graph.add_edge(tile_i_entry, e.src_conn, i_entry, e.dst_conn, dcpy(e.data)) # add edges between o_entry and tile_i_entry if e.data.data not in data_dict.keys(): # new edge data, add to data_dict - data_dict[e.data.data] = dcpy(e.data) in_conn = 'IN_' + e.src_conn[4:] assert e.src_conn[4:] == e.data.data - graph.add_edge(o_entry, e.src_conn, tile_i_entry, in_conn, data_dict[e.data.data]) - else: - # already added edge data, just add edge volume - # TODO: how to add subset? - data_dict[e.data.data].volume += e.data.volume + o_entry.add_out_connector(e.src_conn) + tile_i_entry.add_in_connector(in_conn) + data_dict[e.data.data] = graph.add_edge(o_entry, e.src_conn, tile_i_entry, in_conn, dcpy(e.data)) # trim connectors tile_i_entry.remove_in_connector(e.dst_conn) - # TODO: fix missing added connectors - - # sdfg.view() + for e in graph.edges_between(tile_i_entry, i_entry) + graph.edges_between(o_entry, tile_i_entry): + # propogate edge memlet + path = graph.memlet_path(e) + edge_to_propogate = next(edge for edge in path if e is edge) + if edge_to_propogate is not None: + e.data.subset = propagate_memlet(graph, edge_to_propogate.data, tile_i_entry, True).subset # Interchange middle two maps MapInterchange.apply_to(sdfg, outer_map_entry=o_entry, inner_map_entry=tile_i_entry) @staticmethod def annotates_memlets(): - return True + return False diff --git a/tests/transformations/gpu_grid_stride_tiling_test.py b/tests/transformations/gpu_grid_stride_tiling_test.py index e1bf8aa1da..d80932c773 100644 --- a/tests/transformations/gpu_grid_stride_tiling_test.py +++ b/tests/transformations/gpu_grid_stride_tiling_test.py @@ -39,6 +39,30 @@ def find_map_entry(sdfg: dace.SDFG, map_name_list: List[str]) -> Tuple[dace.sdfg def test_gpu_grid_stride_tiling(): + M = dace.symbol('M') + N = dace.symbol('N') + + @dace.program + def dummy(A: dace.float32[M, N], B: dace.float32[M, N]): + for i in dace.map[1:M:2]: + for j in dace.map[3:N:4]: + A[i, j] = B[i, j] + 1.0 + + sdfg = dummy.to_sdfg() + sdfg.simplify() + ime, jme = find_map_entry(sdfg, ["i", "j"]) + sdfg.apply_transformations_repeated(TrivialTaskletElimination) + copy_to_gpu(sdfg) + GPUGridStridedTiling.apply_to(sdfg, outer_map_entry=ime, inner_map_entry=jme) + for e, _ in sdfg.all_edges_recursive(): + if isinstance(e.data, dace.Memlet) and e.data.wcr: + e.data.wcr_nonatomic = True + + sdfg.validate() + sdfg.view() + + +def test_gpu_grid_stride_tiling_with_indirection(): M = dace.symbol('M') N = dace.symbol('N') @@ -55,17 +79,11 @@ def sddmm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i D_vals[j] += A_vals[j] * B[i, k] * C[k, A2_crd[j]] sdfg = sddmm.to_sdfg() - sdfg.simplify() - ime, jme, _ = find_map_entry(sdfg, ["i", "j", "k"]) - sdfg.apply_transformations_repeated(TrivialTaskletElimination) - copy_to_gpu(sdfg) GPUGridStridedTiling.apply_to(sdfg, outer_map_entry=ime, inner_map_entry=jme) - sdfg.view() - for e, _ in sdfg.all_edges_recursive(): if isinstance(e.data, dace.Memlet) and e.data.wcr: e.data.wcr_nonatomic = True @@ -75,3 +93,4 @@ def sddmm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i if __name__ 
== '__main__': test_gpu_grid_stride_tiling() + test_gpu_grid_stride_tiling_with_indirection() From b3495dd6053c34ccbb9da1633070ab943869d5ef Mon Sep 17 00:00:00 2001 From: C-TC Date: Tue, 2 May 2023 12:57:10 +0200 Subject: [PATCH 061/392] remove sdfg view() in test --- tests/transformations/gpu_grid_stride_tiling_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/transformations/gpu_grid_stride_tiling_test.py b/tests/transformations/gpu_grid_stride_tiling_test.py index d80932c773..27a81c5e44 100644 --- a/tests/transformations/gpu_grid_stride_tiling_test.py +++ b/tests/transformations/gpu_grid_stride_tiling_test.py @@ -59,7 +59,6 @@ def dummy(A: dace.float32[M, N], B: dace.float32[M, N]): e.data.wcr_nonatomic = True sdfg.validate() - sdfg.view() def test_gpu_grid_stride_tiling_with_indirection(): From 1d99109cc5cafe66e69c8f07474b05901ef690b8 Mon Sep 17 00:00:00 2001 From: C-TC Date: Tue, 2 May 2023 15:31:31 +0200 Subject: [PATCH 062/392] attempt to fix memlet, remove unused packages in test --- dace/transformation/dataflow/gpu_grid_stride_tiling.py | 8 +------- tests/transformations/gpu_grid_stride_tiling_test.py | 9 +++------ 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/dace/transformation/dataflow/gpu_grid_stride_tiling.py b/dace/transformation/dataflow/gpu_grid_stride_tiling.py index 1ecbae9762..1a4c379277 100644 --- a/dace/transformation/dataflow/gpu_grid_stride_tiling.py +++ b/dace/transformation/dataflow/gpu_grid_stride_tiling.py @@ -251,7 +251,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG): # from tile_inner map entry to inner map entry to facilitate MapInterchange. # Because we brute-forcely did sdutil.change_edge_dest(graph, i_entry, tile_i_entry) # TODO: what about map exit connectors? - data_dict = {} # map data array to memlet + data_dict = {} # map data array to new edge for e in graph.edges_between(o_entry, tile_i_entry): if e.dst_conn is not None and e.dst_conn[:3] != 'IN_' and e.src_conn[:4] == 'OUT_': # trim edges @@ -273,12 +273,6 @@ def apply(self, graph: SDFGState, sdfg: SDFG): # trim connectors tile_i_entry.remove_in_connector(e.dst_conn) - for e in graph.edges_between(tile_i_entry, i_entry) + graph.edges_between(o_entry, tile_i_entry): - # propogate edge memlet - path = graph.memlet_path(e) - edge_to_propogate = next(edge for edge in path if e is edge) - if edge_to_propogate is not None: - e.data.subset = propagate_memlet(graph, edge_to_propogate.data, tile_i_entry, True).subset # Interchange middle two maps MapInterchange.apply_to(sdfg, outer_map_entry=o_entry, inner_map_entry=tile_i_entry) diff --git a/tests/transformations/gpu_grid_stride_tiling_test.py b/tests/transformations/gpu_grid_stride_tiling_test.py index 27a81c5e44..f6ed5b5ef1 100644 --- a/tests/transformations/gpu_grid_stride_tiling_test.py +++ b/tests/transformations/gpu_grid_stride_tiling_test.py @@ -1,13 +1,8 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
"""Tests for GPU grid-strided tiling transformation.""" -from numpy.random import default_rng from typing import List, Tuple -from copy import deepcopy -import numpy as np -import cupy as cp import dace -from dace.transformation.dataflow import MapInterchange, StripMining, MapReduceFusion, MapExpansion, MapToForLoop, TrivialTaskletElimination, GPUGridStridedTiling -from dace.transformation.interstate import GPUTransformSDFG +from dace.transformation.dataflow import TrivialTaskletElimination, GPUGridStridedTiling def copy_to_gpu(sdfg): @@ -59,6 +54,7 @@ def dummy(A: dace.float32[M, N], B: dace.float32[M, N]): e.data.wcr_nonatomic = True sdfg.validate() + sdfg.compile() def test_gpu_grid_stride_tiling_with_indirection(): @@ -88,6 +84,7 @@ def sddmm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i e.data.wcr_nonatomic = True sdfg.validate() + sdfg.compile() if __name__ == '__main__': From 742e974e67556df79c54b4bbb93f13810e6f2d97 Mon Sep 17 00:00:00 2001 From: C-TC Date: Tue, 2 May 2023 18:35:42 +0200 Subject: [PATCH 063/392] add tests --- .../dataflow/gpu_grid_stride_tiling.py | 2 - .../gpu_grid_stride_tiling_test.py | 68 ++++++++++++------- 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/dace/transformation/dataflow/gpu_grid_stride_tiling.py b/dace/transformation/dataflow/gpu_grid_stride_tiling.py index 1a4c379277..e9475168c2 100644 --- a/dace/transformation/dataflow/gpu_grid_stride_tiling.py +++ b/dace/transformation/dataflow/gpu_grid_stride_tiling.py @@ -265,7 +265,6 @@ def apply(self, graph: SDFGState, sdfg: SDFG): if e.data.data not in data_dict.keys(): # new edge data, add to data_dict in_conn = 'IN_' + e.src_conn[4:] - assert e.src_conn[4:] == e.data.data o_entry.add_out_connector(e.src_conn) tile_i_entry.add_in_connector(in_conn) data_dict[e.data.data] = graph.add_edge(o_entry, e.src_conn, tile_i_entry, in_conn, dcpy(e.data)) @@ -273,7 +272,6 @@ def apply(self, graph: SDFGState, sdfg: SDFG): # trim connectors tile_i_entry.remove_in_connector(e.dst_conn) - # Interchange middle two maps MapInterchange.apply_to(sdfg, outer_map_entry=o_entry, inner_map_entry=tile_i_entry) diff --git a/tests/transformations/gpu_grid_stride_tiling_test.py b/tests/transformations/gpu_grid_stride_tiling_test.py index f6ed5b5ef1..4418054cc9 100644 --- a/tests/transformations/gpu_grid_stride_tiling_test.py +++ b/tests/transformations/gpu_grid_stride_tiling_test.py @@ -1,14 +1,11 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
"""Tests for GPU grid-strided tiling transformation.""" from typing import List, Tuple +import pytest import dace from dace.transformation.dataflow import TrivialTaskletElimination, GPUGridStridedTiling - - -def copy_to_gpu(sdfg): - for k, v in sdfg.arrays.items(): - if not v.transient and isinstance(v, dace.data.Array): - v.storage = dace.dtypes.StorageType.GPU_Global +import numpy as np +import scipy.sparse as sparse def find_map_entry(sdfg: dace.SDFG, map_name_list: List[str]) -> Tuple[dace.sdfg.nodes.MapEntry]: @@ -33,41 +30,52 @@ def find_map_entry(sdfg: dace.SDFG, map_name_list: List[str]) -> Tuple[dace.sdfg return tuple(ret_list) +@pytest.mark.gpu def test_gpu_grid_stride_tiling(): - M = dace.symbol('M') - N = dace.symbol('N') + M = 300 + N = 300 @dace.program def dummy(A: dace.float32[M, N], B: dace.float32[M, N]): - for i in dace.map[1:M:2]: - for j in dace.map[3:N:4]: + for i in dace.map[0:M]: + for j in dace.map[0:N]: A[i, j] = B[i, j] + 1.0 sdfg = dummy.to_sdfg() sdfg.simplify() ime, jme = find_map_entry(sdfg, ["i", "j"]) sdfg.apply_transformations_repeated(TrivialTaskletElimination) - copy_to_gpu(sdfg) + sdfg.apply_gpu_transformations() GPUGridStridedTiling.apply_to(sdfg, outer_map_entry=ime, inner_map_entry=jme) - for e, _ in sdfg.all_edges_recursive(): - if isinstance(e.data, dace.Memlet) and e.data.wcr: - e.data.wcr_nonatomic = True sdfg.validate() - sdfg.compile() + B = np.random.rand(M, N).astype(np.float32) + A_ref = np.zeros((M, N), dtype=np.float32) + A_test = np.zeros((M, N), dtype=np.float32) + A_ref = B + 1.0 + sdfg(A=A_test, B=B) + assert np.allclose(A_ref, A_test) + +@pytest.mark.gpu def test_gpu_grid_stride_tiling_with_indirection(): - M = dace.symbol('M') - N = dace.symbol('N') - K = dace.symbol('K') - nnz_A = dace.symbol('nnz_A') - nnz_D = dace.symbol('nnz_D') + M = 300 + N = 300 + K = 300 + density = 0.01 + dtype = np.float32 + A = sparse.random(M, N, density=density, format='csr', dtype=dtype) + nnz = A.nnz + B = np.random.rand(M, K).astype(dtype) + C = np.random.rand(K, N).astype(dtype) + D_test = np.zeros_like(A.data) + D_ref = np.zeros_like(A.data) @dace.program - def sddmm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.int32[M + 1], - A_vals: dace.float32[nnz_A], B: dace.float32[M, K], C: dace.float32[K, N]): + def sddmm(D_vals: dace.float32[nnz], A2_crd: dace.int32[nnz], A2_pos: dace.int32[M + 1], A_vals: dace.float32[nnz], + B: dace.float32[M, K], C: dace.float32[K, N]): for i in dace.map[0:M]: for j in dace.map[A2_pos[i]:A2_pos[i + 1]]: for k in dace.map[0:K]: @@ -77,14 +85,26 @@ def sddmm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i sdfg.simplify() ime, jme, _ = find_map_entry(sdfg, ["i", "j", "k"]) sdfg.apply_transformations_repeated(TrivialTaskletElimination) - copy_to_gpu(sdfg) + sdfg.apply_gpu_transformations() GPUGridStridedTiling.apply_to(sdfg, outer_map_entry=ime, inner_map_entry=jme) for e, _ in sdfg.all_edges_recursive(): if isinstance(e.data, dace.Memlet) and e.data.wcr: e.data.wcr_nonatomic = True sdfg.validate() - sdfg.compile() + + # reference + for i in range(M): + for j in range(A.indptr[i], A.indptr[i + 1]): + D_ref[j] += A.data[j] * (B[i, :] @ C[:, A.indices[j]]) + + sdfg(A_vals=np.copy(A.data), + A2_crd=np.copy(A.indices), + A2_pos=A.indptr, + B=B, + C=C, + D_vals=D_test) + assert np.allclose(D_ref, D_test) if __name__ == '__main__': From 46729664d8d1a363490f14daf4463e127521019a Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 3 May 2023 15:41:07 +0200 Subject: 
[PATCH 064/392] Reworked addition and naming of closure arrays to (nested) SDFGs. --- dace/frontend/python/newast.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 3b22f1649d..0e9ad41adf 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -3645,7 +3645,22 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no # If the symbol is a callback, but is not used in the nested SDFG, skip it continue - outer_name = self.sdfg.add_datadesc(aname, desc, find_new_name=True) + # First, we do an inverse lookup on the already added closure arrays for `arr`. + is_new_arr = True + for k, v in self.nested_closure_arrays.items(): + if arr is v[0]: + is_new_arr = False + break + # `arr` has not been added yet: add it with a (possibly) new name. + if is_new_arr: + outer_name = self.sdfg.add_datadesc(aname, desc, find_new_name=True) + # `arr` has already been added, but is not in the SDFG: add it with the same name. + # NOTE: This may occur when `arr` has already been added in a nested scope. + elif aname not in self.sdfg.arrays: + outer_name = self.sdfg.add_datadesc(aname, desc, find_new_name=False) + # `arr` has already been added, and is in the SDFG: use the same name but don't add it again. + else: + outer_name = aname if not desc.transient: self.nested_closure_arrays[outer_name] = (arr, desc) # Add closure arrays as function arguments From a63c5e47798a9e7c73ca8bffbce3f90d4b9d8d56 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 3 May 2023 15:41:37 +0200 Subject: [PATCH 065/392] Added test. --- .../fields_and_global_arrays_test.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/python_frontend/fields_and_global_arrays_test.py b/tests/python_frontend/fields_and_global_arrays_test.py index c50e2e19a9..b7f5e46ee9 100644 --- a/tests/python_frontend/fields_and_global_arrays_test.py +++ b/tests/python_frontend/fields_and_global_arrays_test.py @@ -687,6 +687,34 @@ def __call__(self, A): assert np.allclose(1.0, A) +def test_multiple_global_accesses(): + + A = np.ones((10, 10)) + + def get_A(): + return A + + def get_A2(): + return A + + @dace.program + def multiple_gets(): + inp0 = np.empty_like(A) + inp1 = np.empty_like(A) + inp2 = np.empty_like(A) + for i, j in dace.map[0:10, 0:10]: + A0 = get_A() + inp0[i, j] = A0[i, j] + A1 = get_A() + inp1[i, j] = A1[i, j] + A2 = get_A2() + inp2[i, j] = A2[i, j] + return inp0 + inp1 + inp2 + + val = multiple_gets() + assert np.array_equal(val, np.ones((10, 10)) * 3) + + if __name__ == '__main__': test_dynamic_closure() test_external_ndarray_readonly() @@ -718,3 +746,4 @@ def __call__(self, A): test_two_inner_methods() test_transient_field() test_nested_transient_field() + test_multiple_global_accesses() From a8601adb20ba24e59baac4659d0e745d16ae7ad9 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 9 May 2023 13:27:23 +0200 Subject: [PATCH 066/392] Append the id (memory address) of the context manager to its internal SDFG name. 
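Without the object id, context managers defined in different modules but entered on the same source line receive identical closure names, and the generated entries clash. A rough sketch of the naming scheme (the line number and id value below are illustrative, not taken from the diff):

    lineno = 32                       # illustrative line number of the with-statement
    ctxmgr = object()                 # stands in for the evaluated context manager
    old_name = f'__with_{lineno}'                # identical for two managers on the same line
    new_name = f'__with_{lineno}_{id(ctxmgr)}'   # differs per manager object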
--- dace/frontend/python/preprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 1efcb6d38e..e1c44f8a7b 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -936,7 +936,8 @@ def _add_entries(self, node: ast.With) -> List[ast.AST]: 'evaluatable context managers are supported.') # Create manager as part of closure - mgr_name = f'__with_{node.lineno}_{i}' if len(node.items) > 1 else f'__with_{node.lineno}' + mgr_id = id(ctxmgr) + mgr_name = f'__with_{node.lineno}_{i}_{mgr_id}' if len(node.items) > 1 else f'__with_{node.lineno}_{mgr_id}' mgr = self.resolver.global_value_to_node(ctxmgr, node, mgr_name, keep_object=True) ctx_mgr_names.append((mgr.id, ctxmgr)) From 6e380a990aa9ce7ce29a6fd810e3750ac7cab09b Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 9 May 2023 13:28:35 +0200 Subject: [PATCH 067/392] Added a test. --- tests/python_frontend/context_manager_test.py | 35 +++++++++++++++++++ .../context_managers/context_a.py | 33 +++++++++++++++++ .../context_managers/context_b.py | 33 +++++++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 tests/python_frontend/context_managers/context_a.py create mode 100644 tests/python_frontend/context_managers/context_b.py diff --git a/tests/python_frontend/context_manager_test.py b/tests/python_frontend/context_manager_test.py index c331e4c128..fddd536214 100644 --- a/tests/python_frontend/context_manager_test.py +++ b/tests/python_frontend/context_manager_test.py @@ -36,5 +36,40 @@ def prog(A: dace.float64[20]): assert ctx.should_pass +def test_ctxmgr_name_clash(): + + from context_managers.context_a import my_dace_ctxmgr_program as prog_a + from context_managers.context_b import my_dace_ctxmgr_program as prog_b + + rng = np.random.default_rng(42) + + def dace_blocker(f): + return f + + @dace_blocker + def randint(): + return rng.integers(0, 2) + + @dace.program(auto_optimize=True) + def ctxmgr_name_clashing(): + i: dace.int64 = randint() + if i == 0: + prog_a() + else: + prog_b() + return i + + a_count = 0 + b_count = 0 + for _ in range(100): + res = ctxmgr_name_clashing() + if res[0] == 0: + a_count += 1 + else: + b_count += 1 + assert a_count > 0 and b_count > 0 + + if __name__ == '__main__': test_context_manager_decorator() + test_ctxmgr_name_clash() diff --git a/tests/python_frontend/context_managers/context_a.py b/tests/python_frontend/context_managers/context_a.py new file mode 100644 index 0000000000..0e88229934 --- /dev/null +++ b/tests/python_frontend/context_managers/context_a.py @@ -0,0 +1,33 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+import dace +import numpy as np + + +def dace_blocker(f): + return f + + +class MyContextManager: + + def __init__(self, seed): + self.rng = np.random.default_rng(seed) + + @dace_blocker + def __enter__(self): + a = self.rng.integers(1, 10) + b = self.rng.integers(1, 10) + print(f'Computing LCM of {a} and {b}') + return np.lcm(a, b) + + @dace_blocker + def __exit__(self, exc_type, exc_value, traceback): + pass + + +ctx = MyContextManager(42) + + +@dace.program +def my_dace_ctxmgr_program(): + with ctx as c: + print(c) diff --git a/tests/python_frontend/context_managers/context_b.py b/tests/python_frontend/context_managers/context_b.py new file mode 100644 index 0000000000..233d32b7c2 --- /dev/null +++ b/tests/python_frontend/context_managers/context_b.py @@ -0,0 +1,33 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import numpy as np + + +def dace_blocker(f): + return f + + +class MyContextManager: + + def __init__(self, seed): + self.rng = np.random.default_rng(seed) + + @dace_blocker + def __enter__(self): + a = self.rng.integers(51, 100) + b = self.rng.integers(51, 100) + print(f'Computing GCD of {a} and {b}') + return np.gcd(a, b) + + @dace_blocker + def __exit__(self, exc_type, exc_value, traceback): + pass + + +ctx = MyContextManager(42) + + +@dace.program +def my_dace_ctxmgr_program(): + with ctx as c: + print(c) From eff83d12b0c6ff4f001a6c6e9a19406a9a369327 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 9 May 2023 13:54:44 +0200 Subject: [PATCH 068/392] Reworked test to showcase (lack of) issue. --- dace/frontend/python/preprocessing.py | 3 ++- tests/python_frontend/context_manager_test.py | 16 ++++++++++++---- .../context_managers/context_a.py | 2 +- .../context_managers/context_b.py | 2 +- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index e1c44f8a7b..b9ebd98c65 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -937,7 +937,8 @@ def _add_entries(self, node: ast.With) -> List[ast.AST]: # Create manager as part of closure mgr_id = id(ctxmgr) - mgr_name = f'__with_{node.lineno}_{i}_{mgr_id}' if len(node.items) > 1 else f'__with_{node.lineno}_{mgr_id}' + # mgr_name = f'__with_{node.lineno}_{i}_{mgr_id}' if len(node.items) > 1 else f'__with_{node.lineno}_{mgr_id}' + mgr_name = f'__with_{node.lineno}_{i}' if len(node.items) > 1 else f'__with_{node.lineno}' mgr = self.resolver.global_value_to_node(ctxmgr, node, mgr_name, keep_object=True) ctx_mgr_names.append((mgr.id, ctxmgr)) diff --git a/tests/python_frontend/context_manager_test.py b/tests/python_frontend/context_manager_test.py index fddd536214..e7a175b5ce 100644 --- a/tests/python_frontend/context_manager_test.py +++ b/tests/python_frontend/context_manager_test.py @@ -38,8 +38,8 @@ def prog(A: dace.float64[20]): def test_ctxmgr_name_clash(): - from context_managers.context_a import my_dace_ctxmgr_program as prog_a - from context_managers.context_b import my_dace_ctxmgr_program as prog_b + from context_managers.context_a import my_dace_ctxmgr_program as prog_a, ctx as ctx_a + from context_managers.context_b import my_dace_ctxmgr_program as prog_b, ctx as ctx_b rng = np.random.default_rng(42) @@ -50,7 +50,7 @@ def dace_blocker(f): def randint(): return rng.integers(0, 2) - @dace.program(auto_optimize=True) + @dace.program def ctxmgr_name_clashing(): i: dace.int64 = randint() if i == 0: @@ -61,8 +61,16 @@ def 
ctxmgr_name_clashing(): a_count = 0 b_count = 0 + sdfg = ctxmgr_name_clashing.to_sdfg(simplify=True) + func = sdfg.compile() for _ in range(100): - res = ctxmgr_name_clashing() + res = func(__with_32___enter__=ctx_a.__enter__, + __with_32___exit__=ctx_a.__exit__, + __with_32___enter___0=ctx_b.__enter__, + __with_32___exit___0=ctx_b.__exit__, + print=print, + print_0=print, + randint=randint) if res[0] == 0: a_count += 1 else: diff --git a/tests/python_frontend/context_managers/context_a.py b/tests/python_frontend/context_managers/context_a.py index 0e88229934..d2127c7423 100644 --- a/tests/python_frontend/context_managers/context_a.py +++ b/tests/python_frontend/context_managers/context_a.py @@ -20,7 +20,7 @@ def __enter__(self): return np.lcm(a, b) @dace_blocker - def __exit__(self, exc_type, exc_value, traceback): + def __exit__(self, exc_type=None, exc_value=None, traceback=None): pass diff --git a/tests/python_frontend/context_managers/context_b.py b/tests/python_frontend/context_managers/context_b.py index 233d32b7c2..2f31ba823b 100644 --- a/tests/python_frontend/context_managers/context_b.py +++ b/tests/python_frontend/context_managers/context_b.py @@ -20,7 +20,7 @@ def __enter__(self): return np.gcd(a, b) @dace_blocker - def __exit__(self, exc_type, exc_value, traceback): + def __exit__(self, exc_type=None, exc_value=None, traceback=None): pass From df4008614c82b543ca622e2e6c84fced62435f24 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 9 May 2023 20:51:18 +0200 Subject: [PATCH 069/392] Serialized dtypes.pyobject must deserialize as dtypes.pyobject, not dtypes.opaque. --- dace/dtypes.py | 1 + dace/serialize.py | 1 + 2 files changed, 2 insertions(+) diff --git a/dace/dtypes.py b/dace/dtypes.py index cb32ca8bfc..00192eeb9b 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -599,6 +599,7 @@ def from_json(json_obj, context=None): try: typeclass = json_to_typeclass(json_obj['ctype'], context) + return typeclass() except KeyError: typeclass = json_obj['ctype'] diff --git a/dace/serialize.py b/dace/serialize.py index c149e5ca92..cada479d0f 100644 --- a/dace/serialize.py +++ b/dace/serialize.py @@ -65,6 +65,7 @@ def to_json(obj): "DebugInfo": dace.dtypes.DebugInfo, "string": dace.dtypes.string, "bool_": dace.dtypes.bool, + "pyobject": dace.dtypes.pyobject, # All classes annotated with the make_properties decorator will register # themselves here. } From cacad3d766f423ec9a8286f37fa311f49e17a3a4 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 9 May 2023 20:52:17 +0200 Subject: [PATCH 070/392] Internal context manager names must depend only on the local program code (no line numbers) and be unique. 
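Deriving the name from the context-manager expression (rather than the source line) keeps the closure names stable for programs with the same body, while `data.find_new_name` keeps them unique within a program. A minimal sketch of the intended behavior, with illustrative names (the exact suffix chosen for the second occurrence is an assumption):

    from dace import data

    names = set()
    first = data.find_new_name('__with_ctx', names)   # expected to stay '__with_ctx' while unused
    names.add(first)
    second = data.find_new_name('__with_ctx', names)  # a fresh, distinct name, e.g. '__with_ctx_0'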
--- dace/frontend/python/preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index b9ebd98c65..48bf5383d7 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -878,6 +878,7 @@ def __init__(self, globals: Dict[str, Any], filename: str, closure_resolver: Glo self.globals: Dict[str, Any] = globals self.filename = filename self.resolver = closure_resolver + self.names: Set[str] = set() def _visit_node_with_body(self, node): node = self.generic_visit_filtered(node, {'body'}) @@ -936,11 +937,10 @@ def _add_entries(self, node: ast.With) -> List[ast.AST]: 'evaluatable context managers are supported.') # Create manager as part of closure - mgr_id = id(ctxmgr) - # mgr_name = f'__with_{node.lineno}_{i}_{mgr_id}' if len(node.items) > 1 else f'__with_{node.lineno}_{mgr_id}' - mgr_name = f'__with_{node.lineno}_{i}' if len(node.items) > 1 else f'__with_{node.lineno}' + mgr_name = data.find_new_name(f'__with_{item.context_expr.qualname if hasattr(item.context_expr, "qualname") else item.context_expr.id}', self.names) mgr = self.resolver.global_value_to_node(ctxmgr, node, mgr_name, keep_object=True) ctx_mgr_names.append((mgr.id, ctxmgr)) + self.names.add(mgr_name) # Call __enter__ enter_call = ast.copy_location(ast.parse(f'{mgr.id}.__enter__()').body[0], node) From 883a297b848dc6d2507ef6710e8c1d64492c0cf2 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 9 May 2023 20:52:44 +0200 Subject: [PATCH 071/392] Reworked test to catch reported issues. --- tests/python_frontend/context_manager_test.py | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/tests/python_frontend/context_manager_test.py b/tests/python_frontend/context_manager_test.py index e7a175b5ce..a28e7b56e9 100644 --- a/tests/python_frontend/context_manager_test.py +++ b/tests/python_frontend/context_manager_test.py @@ -38,8 +38,8 @@ def prog(A: dace.float64[20]): def test_ctxmgr_name_clash(): - from context_managers.context_a import my_dace_ctxmgr_program as prog_a, ctx as ctx_a - from context_managers.context_b import my_dace_ctxmgr_program as prog_b, ctx as ctx_b + from context_managers.context_a import my_dace_ctxmgr_program as prog_a + from context_managers.context_b import my_dace_ctxmgr_program as prog_b rng = np.random.default_rng(42) @@ -51,31 +51,39 @@ def randint(): return rng.integers(0, 2) @dace.program - def ctxmgr_name_clashing(): + def ctxmgr_name_clashing_0(): i: dace.int64 = randint() if i == 0: prog_a() else: prog_b() return i - - a_count = 0 - b_count = 0 - sdfg = ctxmgr_name_clashing.to_sdfg(simplify=True) - func = sdfg.compile() - for _ in range(100): - res = func(__with_32___enter__=ctx_a.__enter__, - __with_32___exit__=ctx_a.__exit__, - __with_32___enter___0=ctx_b.__enter__, - __with_32___exit___0=ctx_b.__exit__, - print=print, - print_0=print, - randint=randint) - if res[0] == 0: - a_count += 1 + + @dace.program + def ctxmgr_name_clashing_1(): + i: dace.int64 = randint() + if i == 0: + prog_a() else: - b_count += 1 - assert a_count > 0 and b_count > 0 + prog_b() + return i + + sdfg = ctxmgr_name_clashing_0.to_sdfg() + + for i, f in enumerate([ctxmgr_name_clashing_0, ctxmgr_name_clashing_1]): + + if i > 0: + f.load_precompiled_sdfg(sdfg.build_folder) + + a_count = 0 + b_count = 0 + for _ in range(100): + res = f() + if res[0] == 0: + a_count += 1 + else: + b_count += 1 + assert a_count > 0 and b_count > 0 if 
__name__ == '__main__': From 4497bbe0b74236762736b431fe0e21d7317ad7cf Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 17 May 2023 15:52:09 +0200 Subject: [PATCH 072/392] Before calling `subs` on a SymPy expression, convert strings to SymPy symbols and filter out Callables and Iterables. --- dace/frontend/python/newast.py | 11 +++++++++-- dace/symbolic.py | 7 ++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 0e9ad41adf..31672dc7d1 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -11,7 +11,7 @@ from os import path import warnings from numbers import Number -from typing import Any, Dict, List, Set, Tuple, Union, Callable, Optional +from typing import Any, Dict, Iterable, List, Set, Tuple, Union, Callable, Optional import operator import dace @@ -2164,7 +2164,14 @@ def _replace_with_global_symbols(self, expr: sympy.Expr) -> sympy.Expr: repldict = dict() for s in expr.free_symbols: if s.name in self.defined: - repldict[s] = self.defined[s.name] + repl = self.defined[s.name] + # Convert strings to SymPy symbols (for SymPy 1.12) + if isinstance(repl, str): + repl = sympy.Symbol(repl) + # Filter out callables and iterables (for SymPy 1.12) + elif isinstance(repl, (Callable, Iterable)): + continue + repldict[s] = repl return expr.subs(repldict) def visit_For(self, node: ast.For): diff --git a/dace/symbolic.py b/dace/symbolic.py index c6d484fa18..ea25349b87 100644 --- a/dace/symbolic.py +++ b/dace/symbolic.py @@ -4,7 +4,7 @@ import sympy import pickle import re -from typing import Any, Callable, Dict, Optional, Set, Tuple, Union +from typing import Any, Callable, Dict, Iterable, Optional, Set, Tuple, Union import warnings import numpy @@ -358,6 +358,11 @@ def evaluate(expr: Union[sympy.Basic, int, float], syms = {(sname if isinstance(sname, sympy.Symbol) else symbol(sname)): sval.get() if isinstance(sval, symbol) else sval for sname, sval in symbols.items()} + + # Filter out callables and iterables but not strings (for SymPy 1.12) + syms = {k: v for k, v in syms.items() if not isinstance(v, (Callable, Iterable)) or isinstance(v, str)} + # Convert strings to SymPy symbols (for SymPy 1.12) + syms = {k: sympy.Symbol(v) if isinstance(v, str) else v for k, v in syms.items()} return expr.subs(syms) From 59e552f769862f429a9187788893134235b77bc6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 17 May 2023 16:51:00 +0200 Subject: [PATCH 073/392] Also ignore None values. --- dace/frontend/python/newast.py | 6 ++++-- dace/symbolic.py | 9 ++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 31672dc7d1..ecf09417d8 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -254,7 +254,7 @@ def repl_callback(repldict): except Exception: # Print the offending line causing the exception li = visitor.current_lineinfo - print('Exception raised while parsing DaCe program:\n' f' in File "{li.filename}", line {li.start_line}') + print(f'Exception raised while parsing DaCe program:\n in File "{li.filename}", line {li.start_line}') lines = preprocessed_ast.src.split('\n') lineid = li.start_line - preprocessed_ast.src_line - 1 if lineid >= 0 and lineid < len(lines): @@ -601,6 +601,7 @@ class TaskletTransformer(ExtNodeTransformer): """ A visitor that traverses a data-centric tasklet, removes memlet annotations and returns input and output memlets. 
""" + def __init__(self, visitor, defined, @@ -2169,7 +2170,7 @@ def _replace_with_global_symbols(self, expr: sympy.Expr) -> sympy.Expr: if isinstance(repl, str): repl = sympy.Symbol(repl) # Filter out callables and iterables (for SymPy 1.12) - elif isinstance(repl, (Callable, Iterable)): + elif repl is None or isinstance(repl, (Callable, Iterable)): continue repldict[s] = repl return expr.subs(repldict) @@ -4826,6 +4827,7 @@ def _parse_subscript_slice(self, """ Parses the slice attribute of an ast.Subscript node. Scalar data are promoted to symbols. """ + def _promote(node: ast.AST) -> Union[Any, str, symbolic.symbol]: node_str = astutils.unparse(node) sym = None diff --git a/dace/symbolic.py b/dace/symbolic.py index ea25349b87..ec2c9806c2 100644 --- a/dace/symbolic.py +++ b/dace/symbolic.py @@ -358,9 +358,12 @@ def evaluate(expr: Union[sympy.Basic, int, float], syms = {(sname if isinstance(sname, sympy.Symbol) else symbol(sname)): sval.get() if isinstance(sval, symbol) else sval for sname, sval in symbols.items()} - - # Filter out callables and iterables but not strings (for SymPy 1.12) - syms = {k: v for k, v in syms.items() if not isinstance(v, (Callable, Iterable)) or isinstance(v, str)} + + # Filter out `None` values, callables, and iterables but not strings (for SymPy 1.12) + syms = { + k: v + for k, v in syms.items() if not (v is None or isinstance(v, (Callable, Iterable))) or isinstance(v, str) + } # Convert strings to SymPy symbols (for SymPy 1.12) syms = {k: sympy.Symbol(v) if isinstance(v, str) else v for k, v in syms.items()} From 1e71286da9f00e368bdfff1e10697b5133bcacea Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 17 May 2023 17:39:29 +0200 Subject: [PATCH 074/392] Increases the priority of the `views` connector. --- dace/sdfg/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 8c43fd237e..c58837fdff 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -825,6 +825,12 @@ def get_view_edge(state: SDFGState, view: nd.AccessNode) -> gr.MultiConnectorEdg in_edge = in_edges[0] out_edge = out_edges[0] + # Check if there is a 'views' connector + if in_edge.dst_conn and in_edge.dst_conn == 'views': + return in_edge + if out_edge.src_conn and out_edge.src_conn == 'views': + return out_edge + # If there is one incoming and one outgoing edge, and one leads to a code # node, the one that leads to an access node is the viewed data. inmpath = state.memlet_path(in_edge) @@ -851,12 +857,6 @@ def get_view_edge(state: SDFGState, view: nd.AccessNode) -> gr.MultiConnectorEdg if in_edge.data.data == view.data and out_edge.data.data == view.data: return None - # Check if there is a 'views' connector - if in_edge.dst_conn and in_edge.dst_conn == 'views': - return in_edge - if out_edge.src_conn and out_edge.src_conn == 'views': - return out_edge - # If both memlets' data are the respective access nodes, the access # node at the highest scope is the one that is viewed. 
if isinstance(in_edge.src, nd.EntryNode): From cdd517830ded0f8404f921314d334b0139e3845c Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 17 May 2023 18:18:54 +0200 Subject: [PATCH 075/392] Disables serialization --- tests/inlining_test.py | 6 ++++-- tests/npbench/polybench/correlation_test.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/inlining_test.py b/tests/inlining_test.py index 7135573b65..d207aa6c2c 100644 --- a/tests/inlining_test.py +++ b/tests/inlining_test.py @@ -4,6 +4,7 @@ from dace.libraries import blas from dace.library import change_default import numpy as np +import os import pytest W = dace.symbol('W') @@ -263,8 +264,6 @@ def inline_unsqueeze(A: dace.int32[2, 5], B: dace.int32[5, 3]): assert (np.array_equal(B[:, i], np.zeros((5, ), np.int32))) -# NOTE: Issue with serialization -@pytest.mark.skip def test_inline_unsqueeze4(): @dace.program @@ -281,7 +280,10 @@ def inline_unsqueeze(A: dace.int32[2, 5], B: dace.int32[5, 3]): A = np.arange(10, dtype=np.int32).reshape(2, 5).copy() B = np.zeros((5, 3), np.int32) + last_value = os.environ.get('DACE_testing_serialization', '0') + os.environ['DACE_testing_serialization'] = '0' sdfg(A, B) + os.environ['DACE_testing_serialization'] = last_value for i in range(3): if i < 2: assert (np.array_equal(B[i + 1:2 * i + 3, 1 - i], A[i, i:2 * i + 2])) diff --git a/tests/npbench/polybench/correlation_test.py b/tests/npbench/polybench/correlation_test.py index e428a99826..d1536d51c8 100644 --- a/tests/npbench/polybench/correlation_test.py +++ b/tests/npbench/polybench/correlation_test.py @@ -3,6 +3,7 @@ import dace.dtypes import numpy as np import dace as dc +import os import pytest import argparse from dace.transformation.auto.auto_optimize import auto_optimize @@ -71,7 +72,10 @@ def run_correlation(device_type: dace.dtypes.DeviceType): # Parse the SDFG and apply autopot sdfg = correlation_kernel.to_sdfg() sdfg = auto_optimize(sdfg, device_type) + last_value = os.environ.get('DACE_testing_serialization', '0') + os.environ['DACE_testing_serialization'] = '0' corr = sdfg(float_n, data, M=M, N=N) + os.environ['DACE_testing_serialization'] = last_value elif device_type == dace.dtypes.DeviceType.FPGA: pass # Not Yet Implemented From 666ccc2d35cf4ef9ba3480596971e7cb6affb287 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 17 May 2023 19:22:03 +0200 Subject: [PATCH 076/392] Disables serialization testing on general other tests. 
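Serialization testing is driven by the DACE_testing_serialization environment variable, so with the workflow-wide default switched off, individual tests can still opt in or out locally, roughly following the pattern used in inlining_test.py in the previous commit:

    import os

    last_value = os.environ.get('DACE_testing_serialization', '0')
    os.environ['DACE_testing_serialization'] = '0'
    # ... build and run the SDFG under test ...
    os.environ['DACE_testing_serialization'] = last_value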
--- .github/workflows/general-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/general-ci.yml b/.github/workflows/general-ci.yml index d656a10371..c52380d6ff 100644 --- a/.github/workflows/general-ci.yml +++ b/.github/workflows/general-ci.yml @@ -75,7 +75,7 @@ jobs: - name: Run other tests run: | export NOSTATUSBAR=1 - export DACE_testing_serialization=1 + export DACE_testing_serialization=0 export DACE_testing_deserialize_exception=1 export DACE_cache=single export DACE_optimizer_automatic_simplification=${{ matrix.simplify }} From 418cff5754655d336f1b949dc85944f56d8b6344 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lukas=20Tr=C3=BCmper?= Date: Sun, 21 May 2023 17:02:52 +0200 Subject: [PATCH 077/392] Added LIKWID environment (#1258) --- dace/codegen/instrumentation/likwid.py | 84 +++++++++++++++++++------- 1 file changed, 61 insertions(+), 23 deletions(-) diff --git a/dace/codegen/instrumentation/likwid.py b/dace/codegen/instrumentation/likwid.py index b40d27546f..b14a8166af 100644 --- a/dace/codegen/instrumentation/likwid.py +++ b/dace/codegen/instrumentation/likwid.py @@ -4,13 +4,63 @@ """ import dace -from dace import dtypes, registry +import os +import ctypes.util + +from pathlib import Path + +from dace import dtypes, registry, library from dace.codegen.instrumentation.provider import InstrumentationProvider from dace.config import Config - from dace.transformation import helpers as xfh -from pathlib import Path + +@library.environment +class LIKWID: + """ + An environment for LIKWID + """ + + cmake_minimum_version = None + cmake_packages = [] + cmake_variables = {} + cmake_compile_flags = [] + cmake_link_flags = [] + cmake_files = [] + + headers = ["likwid.h"] + state_fields = [] + init_code = "" + finalize_code = "" + dependencies = [] + + @staticmethod + def cmake_includes(): + # Anaconda + if 'CONDA_PREFIX' in os.environ: + base_path = os.environ['CONDA_PREFIX'] + # Anaconda on Windows + candpath = os.path.join(base_path, 'Library', 'include') + if os.path.isfile(os.path.join(candpath, 'likwid.h')): + return [candpath] + # Anaconda on other platforms + candpath = os.path.join(base_path, 'include') + if os.path.isfile(os.path.join(candpath, 'likwid.h')): + return [candpath] + + return [] + + @staticmethod + def cmake_libraries(): + path = ctypes.util.find_library('likwid') + if path: + return [path] + + return [] + + @staticmethod + def is_installed(): + return len(LIKWID.cmake_libraries()) > 0 @registry.autoregister_params(type=dtypes.InstrumentationType.LIKWID_CPU) @@ -25,29 +75,23 @@ def __init__(self): self._likwid_used = False self._regions = [] - def configure_likwid(self): - Config.append('compiler', 'cpu', 'args', value=' -DLIKWID_PERFMON -fopenmp ') - - # Link with liblikwid - Config.append('compiler', 'cpu', 'libs', value=' likwid ') - try: self._default_events = Config.get('instrumentation', 'likwid', 'default_events') except KeyError: self._default_events = "CLOCK" - self._likwid_used = True - def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen): if sdfg.parent is not None: return # Configure CMake project and counters - self.configure_likwid() - + self._likwid_used = LIKWID.is_installed() if not self._likwid_used: return + codegen.dispatcher.used_environments.add(LIKWID.full_class_path()) + Config.append('compiler', 'cpu', 'args', value=' -DLIKWID_PERFMON -fopenmp ') + self.codegen = codegen likwid_marker_file = Path(sdfg.build_folder) / "perf" / "likwid_marker.out" @@ -276,29 +320,23 @@ def __init__(self): 
self._likwid_used = False self._regions = [] - def configure_likwid(self): - Config.append('compiler', 'cpu', 'args', value=' -DLIKWID_NVMON ') - - # Link with liblikwid - Config.append('compiler', 'cpu', 'libs', value=' likwid ') - try: self._default_events = Config.get('instrumentation', 'likwid', 'default_events') except KeyError: self._default_events = "FLOPS_SP" - self._likwid_used = True - def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen): if sdfg.parent is not None: return # Configure CMake project and counters - self.configure_likwid() - + self._likwid_used = LIKWID.is_installed() if not self._likwid_used: return + codegen.dispatcher.used_environments.add(LIKWID.full_class_path()) + Config.append('compiler', 'cpu', 'args', value=' -DLIKWID_NVMON ') + self.codegen = codegen likwid_marker_file_gpu = Path(sdfg.build_folder) / "perf" / "likwid_marker_gpu.out" From f535eb38f5e52fc79ecdc1b87959bc22a65a9dcf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 May 2023 16:17:12 +0000 Subject: [PATCH 078/392] Bump mpmath from 1.1.0 to 1.3.0 Bumps [mpmath](https://github.com/fredrik-johansson/mpmath) from 1.1.0 to 1.3.0. - [Release notes](https://github.com/fredrik-johansson/mpmath/releases) - [Changelog](https://github.com/mpmath/mpmath/blob/master/CHANGES) - [Commits](https://github.com/fredrik-johansson/mpmath/compare/1.1.0...1.3.0) --- updated-dependencies: - dependency-name: mpmath dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4ac426d6f1..285374dccb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ idna==2.10 itsdangerous==2.0.0a1 Jinja2==3.0.0a1 MarkupSafe==2.0.0a1 -mpmath==1.1.0 +mpmath==1.3.0 networkx==2.5 numpy>=1.21 packaging==20.4 From edf2ee292e4a27b62a88243a1827667cf991261a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 May 2023 00:52:19 +0000 Subject: [PATCH 079/392] Bump requests from 2.25.1 to 2.31.0 Bumps [requests](https://github.com/psf/requests) from 2.25.1 to 2.31.0. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.25.1...v2.31.0) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4ac426d6f1..5c1ee58dda 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ packaging==20.4 ply==3.11 pyparsing==3.0.0a2 PyYAML==5.4 -requests==2.25.1 +requests==2.31.0 scikit-build==0.11.1 six==1.15.0 sympy==1.7 From cb85ef2c1427d75932a7222714bb9073b1d5ec93 Mon Sep 17 00:00:00 2001 From: edopao Date: Tue, 23 May 2023 08:40:52 +0200 Subject: [PATCH 080/392] Fix compile errors (#1259) - Some import paths were not compatible with python3.6 - The g++ compiler on MacOS installed through Homebrew seems to rely on MacOS SDK built with clang, for which __unused is a reserved word (an attribute), therefore variables cannot be named __unused. 
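For reference, a minimal sketch of the import change applied to the touched modules (using reduce.py as the example):

    # original form, reported as incompatible with Python 3.6:
    import dace.libraries.standard.reduction_planner as red_planner

    # replacement form:
    from dace.libraries.standard import reduction_planner as red_planner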
--- dace/codegen/targets/snitch.py | 2 -- dace/libraries/blas/nodes/dot.py | 2 +- dace/libraries/standard/nodes/reduce.py | 2 +- dace/transformation/subgraph/composite.py | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/dace/codegen/targets/snitch.py b/dace/codegen/targets/snitch.py index 0950bb5476..389d906b36 100644 --- a/dace/codegen/targets/snitch.py +++ b/dace/codegen/targets/snitch.py @@ -1090,8 +1090,6 @@ def gen_code_snitch(sdfg): code._code = code._code.replace('dace::float64', '(double)') code._code = code._code.replace('dace::int64', '(int64_t)') code._code = code._code.replace('dace::math::pow', 'pow') - # __unused is reserved in C - code._code = code._code.replace('__unused', '_unused_var') # change new/delete to malloc/free code._code = re.sub(r"new (.+) \[(\d*)\];", r"(\1*)malloc(\2*sizeof(\1));", code._code) diff --git a/dace/libraries/blas/nodes/dot.py b/dace/libraries/blas/nodes/dot.py index bbbd4fa0a1..c994504048 100644 --- a/dace/libraries/blas/nodes/dot.py +++ b/dace/libraries/blas/nodes/dot.py @@ -44,7 +44,7 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): state = sdfg.add_state_after(init_state, node.label + "_state") # Initialization map - init_state.add_mapped_tasklet("_i_dotnit", {"__unused": "0:1"}, {}, + init_state.add_mapped_tasklet("_i_dotnit", {"__i_unused": "0:1"}, {}, "_out = 0", {"_out": dace.Memlet("_result[0]")}, external_edges=True) diff --git a/dace/libraries/standard/nodes/reduce.py b/dace/libraries/standard/nodes/reduce.py index bfc0520dcc..0f76c7e252 100644 --- a/dace/libraries/standard/nodes/reduce.py +++ b/dace/libraries/standard/nodes/reduce.py @@ -27,7 +27,7 @@ from dace.symbolic import symstr, issymbolic from dace.libraries.standard.environments.cuda import CUDA -import dace.libraries.standard.reduction_planner as red_planner +from dace.libraries.standard import reduction_planner as red_planner @dace.library.expansion diff --git a/dace/transformation/subgraph/composite.py b/dace/transformation/subgraph/composite.py index d8f0b8e12e..fd1824f4a0 100644 --- a/dace/transformation/subgraph/composite.py +++ b/dace/transformation/subgraph/composite.py @@ -9,7 +9,7 @@ import dace.transformation.transformation as transformation from dace.transformation.subgraph import SubgraphFusion, MultiExpansion from dace.transformation.subgraph.stencil_tiling import StencilTiling -import dace.transformation.subgraph.helpers as helpers +from dace.transformation.subgraph import helpers from dace import dtypes, registry, symbolic, subsets, data from dace.properties import EnumProperty, make_properties, Property, ShapeProperty From 84510d27ce1e4ea08145e665dc4aa6b579ac37c2 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 24 May 2023 05:55:20 -0700 Subject: [PATCH 081/392] Make check more sympy-version-robust --- dace/transformation/auto/auto_optimize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/transformation/auto/auto_optimize.py b/dace/transformation/auto/auto_optimize.py index f0fe22e181..6177e9e38e 100644 --- a/dace/transformation/auto/auto_optimize.py +++ b/dace/transformation/auto/auto_optimize.py @@ -627,7 +627,7 @@ def auto_optimize(sdfg: SDFG, if s in sdfg.free_symbols: if isinstance(v, (int, float)): known_symbols[s] = v - if isinstance(v, sympy.core.numbers.Integer): + if isinstance(v, sympy.Integer): try: known_symbols[s] = int(v) except TypeError: From 8900e523663a862c46ccdd2d91c18e8e5fb8adf8 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 24 May 2023 07:48:27 -0700 Subject: 
[PATCH 082/392] Determine schedule type based on surrounding storage types --- dace/dtypes.py | 12 + dace/sdfg/infer_types.py | 387 ++++++++++++++++---------- tests/sdfg/schedule_inference_test.py | 180 ++++++++++++ 3 files changed, 439 insertions(+), 140 deletions(-) create mode 100644 tests/sdfg/schedule_inference_test.py diff --git a/dace/dtypes.py b/dace/dtypes.py index 00192eeb9b..a86a746884 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -217,6 +217,18 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore } +# Maps from StorageType to a preferred ScheduleType for helping determine schedules. +# If mapped to None or does not exist in this dictionary, does not affect decision. +# Scalar data containers also do not affect this decision. +STORAGEDEFAULT_SCHEDULE = { + StorageType.CPU_Heap: ScheduleType.CPU_Multicore, + StorageType.CPU_ThreadLocal: ScheduleType.CPU_Multicore, + StorageType.GPU_Global: ScheduleType.GPU_Device, + StorageType.GPU_Shared: ScheduleType.GPU_ThreadBlock, + StorageType.FPGA_Global: ScheduleType.FPGA_Device, + StorageType.SVE_Register: ScheduleType.SVE_Map, +} + # Translation of types to C types _CTYPES = { None: "void", diff --git a/dace/sdfg/infer_types.py b/dace/sdfg/infer_types.py index 5cf1f5f7a4..dd89b1dee8 100644 --- a/dace/sdfg/infer_types.py +++ b/dace/sdfg/infer_types.py @@ -3,11 +3,11 @@ from dace import data, dtypes from dace.codegen.tools import type_inference from dace.memlet import Memlet -from dace.sdfg import SDFG, SDFGState, nodes +from dace.sdfg import SDFG, SDFGState, nodes, validation from dace.sdfg import nodes -from dace.sdfg.graph import Edge +from dace.sdfg.graph import Edge, SubgraphView from dace.sdfg.utils import dfs_topological_sort -from typing import Callable, Dict, List, Optional, Set +from typing import Callable, Dict, List, Optional, Set, Union ############################################################################# # Connector type inference @@ -123,156 +123,263 @@ def infer_connector_types(sdfg: SDFG): # Default schedule and storage type inference -def set_default_schedule_and_storage_types(sdfg: SDFG, toplevel_schedule: dtypes.ScheduleType): +def set_default_schedule_and_storage_types(scope: Union[SDFG, SDFGState, nodes.EntryNode], + parent_schedules: List[dtypes.ScheduleType] = None, + use_parent_schedule: bool = False, + state: SDFGState = None, + child_nodes: Dict[nodes.Node, List[nodes.Node]] = None): """ Sets default storage and schedule types throughout SDFG in-place. - Replaces `ScheduleType.Default` and `StorageType.Default` + Replaces ``ScheduleType.Default`` and ``StorageType.Default`` with the corresponding types according to the parent scope's schedule. The defaults for storage types are determined by the ``dtypes.SCOPEDEFAULT_STORAGE`` dictionary (for example, a GPU device - schedule, by default, will allocate containers on the shared memory); and - similarly for schedules by ``dtypes.SCOPEDEFAULT_SCHEDULE`` (e.g., a map - nested in a CPU multi-core map will by default run within a single thread). - - :param sdfg: The SDFG to infer. - :param toplevel_schedule: The default top-level schedule for "global" nodes - (without parent scope nodes). + schedule, by default, will allocate containers on the shared memory). 
+ Following storage type inference for a scope, nested scopes (e.g., map entry, nested SDFG) + are evaluated using the ``dtypes.STORAGEDEFAULT_SCHEDULE`` dictionary (for example, a + default map with only GPU arrays connected to it will execute on the GPU). This decision + is superseded if the schedule is specified in ``dtypes.SCOPEDEFAULT_SCHEDULE`` (e.g., + a map nested in a CPU multi-core map will by default run within a single thread). + If no default schedule is found while traversing the parent scopes, the chosen schedule will be + determined based on the SDFG's device, as specified in ``dtypes.DEFAULT_TOPLEVEL_STORAGE`` and + ``dtypes.DEFAULT_TOPLEVEL_SCHEDULE``. + May raise ``InvalidSDFGNodeError`` if a default scope is ambiguous based on surrounding + storage types. + :param scope: The SDFG, state, or scope to infer. + :param parent_schedules: A list of ScheduleType elements representing + an ordered list of schedules, from the global schedule + on the top-level SDFG (usually ``None``), up to this + point. + :param use_parent_schedule: If True, uses the parent scope's schedule type + directly, instead of the default schedule type. + Used when expanding nested SDFGs to preserve their + top-level schedule. + :param state: (Use when working with a single scope) The parent state. + :param child_nodes: (Use when working with a single scope) A mapping of each scope entry + node to its children. """ - _set_default_schedule_types(sdfg, toplevel_schedule) - _set_default_storage_types(sdfg, toplevel_schedule) - - -def _scopes_with_tbmaps(state: SDFGState, scopes: List[nodes.EntryNode]): - """ Returns a set of scopes where a thread-block (or dynamic thread-block) - sub-scopes exist. Used, e.g., to modify storage defaults. """ - scopes_with_tbmaps = set() - for scope_entry in scopes: - subgraph = state.scope_subgraph(scope_entry) - has_tb_map = False - # Append thread-block maps from subgraph and nested SDFGs - for node in subgraph.nodes(): - if isinstance(node, nodes.EntryNode) and node.schedule in (dtypes.ScheduleType.GPU_ThreadBlock, - dtypes.ScheduleType.GPU_ThreadBlock_Dynamic): - has_tb_map = True - break - elif isinstance(node, nodes.NestedSDFG): - for n in node.sdfg.all_nodes_recursive(): - if isinstance(node, - nodes.EntryNode) and node.schedule in (dtypes.ScheduleType.GPU_ThreadBlock, - dtypes.ScheduleType.GPU_ThreadBlock_Dynamic): - has_tb_map = True - break - if has_tb_map: - break - if has_tb_map: - scopes_with_tbmaps.add(scope_entry) - return scopes_with_tbmaps - - -def _set_default_schedule_in_scope(parent_node: nodes.Node, - parent_schedule: dtypes.ScheduleType, - reverse_scope_dict: Dict[nodes.Node, List[nodes.Node]], - use_parent_schedule: bool = False): - for node in reverse_scope_dict[parent_node]: - if use_parent_schedule: - child_schedule = parent_schedule - if parent_schedule in (dtypes.ScheduleType.Default, dtypes.ScheduleType.GPU_Default): - child_schedule = dtypes.SCOPEDEFAULT_SCHEDULE[parent_schedule] + parent_schedules = parent_schedules or [None] + if isinstance(scope, SDFG): + # Set device for default top-level schedules and storages + for state in scope.nodes(): + set_default_schedule_and_storage_types(state, + parent_schedules, + use_parent_schedule=use_parent_schedule, + state=state, + child_nodes=state.scope_children()) + + # Take care of remaining scalars without access nodes + for aname, desc in scope.arrays.items(): + # If not transient in a nested SDFG, take storage from parent, regardless of current type + if not desc.transient and scope.parent_sdfg is 
not None: + desc.storage = _get_storage_from_parent(aname, scope) + elif ((desc.transient or scope.parent_sdfg is None) and desc.storage is dtypes.StorageType.Default): + # Indeterminate storage type, set to register + desc.storage = dtypes.StorageType.Register + return + + # Setup arguments + parent_node = None if isinstance(scope, SDFGState) else scope + if state is None: + if isinstance(scope, SDFGState): + state = scope else: - child_schedule = dtypes.SCOPEDEFAULT_SCHEDULE[parent_schedule] - # Set default schedule type - if isinstance(node, nodes.MapEntry): - if node.map.schedule is dtypes.ScheduleType.Default: - node.map.schedule = child_schedule - # Also traverse children (recursively) - _set_default_schedule_in_scope(node, node.map.schedule, reverse_scope_dict) - elif isinstance(node, nodes.ConsumeEntry): - if node.consume.schedule is dtypes.ScheduleType.Default: - node.consume.schedule = child_schedule - - # Also traverse children (recursively) - _set_default_schedule_in_scope(node, node.consume.schedule, reverse_scope_dict) - elif isinstance(node, nodes.NestedSDFG): - # Nested SDFGs retain same schedule as their parent scope - if node.schedule is dtypes.ScheduleType.Default: - node.schedule = parent_schedule - _set_default_schedule_types(node.sdfg, node.schedule) - elif getattr(node, 'schedule', False): - if node.schedule is dtypes.ScheduleType.Default: - node.schedule = (child_schedule - if isinstance(node, nodes.EntryNode) or parent_schedule is None else parent_schedule) - - -def _set_default_schedule_types(sdfg: SDFG, toplevel_schedule: dtypes.ScheduleType, use_parent_schedule: bool = False): - for state in sdfg.nodes(): - reverse_scope_dict = state.scope_children() + raise ValueError('SDFG state cannot be None when inferring a scope') + if child_nodes is None: + child_nodes = state.scope_children() - # Start with top-level nodes and call recursively - _set_default_schedule_in_scope(None, toplevel_schedule, reverse_scope_dict, use_parent_schedule) + ############################################ + # Set default storage types in this scope + _set_default_storage_in_scope(state, parent_node, parent_schedules, child_nodes) -def _set_default_storage_types(sdfg: SDFG, toplevel_schedule: dtypes.ScheduleType): - for state in sdfg.nodes(): - scope_dict = state.scope_dict() - scopes_with_tbmaps = _scopes_with_tbmaps(state, [ - n - for n in state.nodes() if isinstance(n, nodes.MapEntry) and n.schedule in [dtypes.ScheduleType.GPU_Device] - ]) - - for node in state.nodes(): - if not isinstance(node, nodes.AccessNode): - continue - desc = node.desc(sdfg) - # Only set transients if nested - if ((desc.transient or sdfg.parent_sdfg is None) and desc.storage is dtypes.StorageType.Default): - # Special cases - parent_node = scope_dict[node] - if parent_node is None: - parent_schedule = toplevel_schedule - else: - parent_schedule = parent_node.map.schedule - # Skip sequential maps to determine storage - while parent_schedule == dtypes.ScheduleType.Sequential: - parent_node = scope_dict[parent_node] - if parent_node is None: - parent_schedule = toplevel_schedule - break - parent_schedule = parent_node.map.schedule - # Determine default GPU schedule based on existence of - # thread-block maps - if parent_schedule == dtypes.ScheduleType.GPU_Device: - if parent_node not in scopes_with_tbmaps: - parent_schedule = dtypes.ScheduleType.GPU_ThreadBlock - # End of special cases - - # Set default storage type - desc.storage = dtypes.SCOPEDEFAULT_STORAGE[parent_schedule] - - # Take care of remaining 
arrays/scalars, e.g., code->code edges - for desc in sdfg.arrays.values(): - if ((desc.transient or sdfg.parent_sdfg is None) and desc.storage is dtypes.StorageType.Default): - desc.storage = dtypes.StorageType.Register + # Set default schedules in this scope based on parent schedule and inferred storage types + nested_scopes = _set_default_schedule_in_scope(state, parent_node, parent_schedules, child_nodes, + use_parent_schedule) - for state in sdfg.nodes(): - # Loop again after all default storages have been set to set nested - # SDFGs - for node in state.nodes(): - if not isinstance(node, nodes.NestedSDFG): - continue - for name, desc in node.sdfg.arrays.items(): - if (not desc.transient and desc.storage is dtypes.StorageType.Default): - # Find connector and ensure storage types match - for e in state.in_edges(node): - if e.dst_conn == name: - desc.storage = sdfg.arrays[e.data.data].storage - break - for e in state.out_edges(node): - if e.src_conn == name: - desc.storage = sdfg.arrays[e.data.data].storage - break - _set_default_storage_types(node.sdfg, node.schedule) + # Loop over internal nested SDFGs and scope entry nodes + for nnode in nested_scopes: + # Continue through nested SDFGs + if isinstance(nnode, nodes.NestedSDFG): + nscope = nnode.sdfg + child_nodes = None + else: + nscope = nnode + set_default_schedule_and_storage_types(nscope, + parent_schedules + + ([nnode.schedule] if not isinstance(nnode, nodes.NestedSDFG) else []), + use_parent_schedule=False, + state=state, + child_nodes=child_nodes) + + +def _determine_child_schedule(parent_schedules: List[dtypes.ScheduleType]) -> Optional[dtypes.ScheduleType]: + for sched in reversed(parent_schedules): + if sched is not None and sched in dtypes.SCOPEDEFAULT_SCHEDULE: + child_sched = dtypes.SCOPEDEFAULT_SCHEDULE[sched] + if child_sched is not None: + return child_sched + return None + + +def _determine_child_storage(parent_schedules: List[dtypes.ScheduleType]) -> Optional[dtypes.StorageType]: + for sched in reversed(parent_schedules): + if sched is not None and sched in dtypes.SCOPEDEFAULT_STORAGE: + child_sched = dtypes.SCOPEDEFAULT_STORAGE[sched] + if child_sched is not None: + return child_sched + return None + + +def _determine_schedule_from_storage(state: SDFGState, node: nodes.Node) -> Optional[dtypes.ScheduleType]: + child_schedule = None + memlets: Set[str] = set() + if node is None or isinstance(node, nodes.NestedSDFG): # State or nested SDFG + pass + elif isinstance(node, nodes.EntryNode): + # Test for storage of the scope by collecting all neighboring memlets + memlets = set(e.data.data for e in state.out_edges(node) if not e.data.is_empty()) + exit_node = state.exit_node(node) + memlets.update(e.data.data for e in state.in_edges(exit_node) if not e.data.is_empty()) + else: + # Other nodes only need neighboring memlets + memlets = set(e.data.data for e in state.all_edges(node) if not e.data.is_empty()) + + # From memlets, use non-scalar data descriptors for decision + constraints: Set[dtypes.ScheduleType] = set() + sdfg = state.parent + for dname in memlets: + if isinstance(sdfg.arrays[dname], data.Scalar): + continue # Skip scalars + + storage = sdfg.arrays[dname].storage + if storage not in dtypes.STORAGEDEFAULT_SCHEDULE: + continue + sched = dtypes.STORAGEDEFAULT_SCHEDULE[storage] + if sched is None: + continue + constraints.add(sched) + + if not constraints: # No constraints found + child_schedule = None + elif len(constraints) > 1: + raise validation.InvalidSDFGNodeError( + f'Cannot determine default schedule for 
node {node}. ' + 'Multiple arrays that point to it say that it should be the following schedules: ' + f'{constraints}', state.parent, state.parent.node_id(state), state.node_id(node)) + else: + child_schedule = next(iter(constraints)) + # If no valid schedules are found and there are no conflicts with storage, use default top-level schedule + if child_schedule is None: + child_schedule = dtypes.SCOPEDEFAULT_SCHEDULE[None] + + return child_schedule + + +def _set_default_schedule_in_scope(state: SDFGState, + parent_node: nodes.Node, + parent_schedules: List[dtypes.ScheduleType], + child_nodes: Dict[nodes.Node, List[nodes.Node]], + use_parent_schedule: bool = False) -> List[Union[nodes.EntryNode, nodes.NestedSDFG]]: + nested_scopes: List[Union[nodes.EntryNode, nodes.NestedSDFG]] = [] + + # Try to determine schedule based on parent schedule(s) + if use_parent_schedule: + child_schedule = parent_schedules[-1] + else: + child_schedule = _determine_child_schedule(parent_schedules) + + # Set child schedule type in scope + for node in child_nodes[parent_node]: + # Set default schedule types + if isinstance(node, (nodes.EntryNode, nodes.NestedSDFG)): + nested_scopes.append(node) + if node.schedule == dtypes.ScheduleType.Default: + # If parent schedules do not determine child schedule, + # test for storage of the scope by collecting all neighboring memlets + if child_schedule is None: + local_child_schedule = _determine_schedule_from_storage(state, node) + else: + local_child_schedule = child_schedule + node.schedule = local_child_schedule + elif getattr(node, 'schedule', False) and not isinstance(node, nodes.ExitNode): + if node.schedule == dtypes.ScheduleType.Default: + if child_schedule is None: + local_child_schedule = _determine_schedule_from_storage(state, node) + else: + local_child_schedule = child_schedule + node.schedule = local_child_schedule + + return nested_scopes + + +def _set_default_storage_in_scope(state: SDFGState, parent_node: Optional[nodes.Node], + parent_schedules: List[dtypes.ScheduleType], child_nodes: Dict[nodes.Node, + List[nodes.Node]]): + # Special case for GPU maps without explicit thread-block assignment + if (dtypes.ScheduleType.GPU_Device in parent_schedules + and dtypes.ScheduleType.GPU_ThreadBlock not in parent_schedules + and dtypes.ScheduleType.GPU_ThreadBlock_Dynamic not in parent_schedules): + from dace.transformation.helpers import gpu_map_has_explicit_threadblocks # Avoid import loops + # Find GPU scopes without thread-block maps + if not gpu_map_has_explicit_threadblocks(state, parent_node): + # Do not modify external list + parent_schedules = parent_schedules + [dtypes.ScheduleType.GPU_ThreadBlock] + # End of special case + + sdfg = state.parent + child_storage = _determine_child_storage(parent_schedules) + if child_storage is None: + child_storage = dtypes.SCOPEDEFAULT_STORAGE[None] + + exit_nodes = [state.exit_node(n) for n in child_nodes[parent_node] if isinstance(n, nodes.EntryNode)] + scope_subgraph = SubgraphView(state, child_nodes[parent_node] + exit_nodes) + + # Loop over access nodes + for node in scope_subgraph.nodes(): + if not isinstance(node, nodes.AccessNode): + continue + desc = node.desc(sdfg) + # If not transient in a nested SDFG, take storage from parent, regardless of current type + if not desc.transient and sdfg.parent is not None: + desc.storage = _get_storage_from_parent(node.data, sdfg) + elif desc.storage == dtypes.StorageType.Default: + desc.storage = child_storage + + # Take care of code->code edges that do not have access nodes + 
for edge in scope_subgraph.edges(): + if not edge.data.is_empty(): + desc = sdfg.arrays[edge.data.data] + # If not transient in a nested SDFG, take storage from parent, regardless of current type + if not desc.transient and sdfg.parent is not None: + desc.storage = _get_storage_from_parent(edge.data.data, sdfg) + elif desc.storage == dtypes.StorageType.Default: + desc.storage = child_storage + + +def _get_storage_from_parent(data_name: str, sdfg: SDFG) -> dtypes.StorageType: + """ + Retrieves the storage type of an array from its parent SDFG. + + :param data_name: The name of the data descriptor. + :param sdfg: The parent SDFG. + :return: The storage type of the data descriptor. + """ + nsdfg_node = sdfg.parent_nsdfg_node + parent_state = sdfg.parent + parent_sdfg = parent_state.parent + + # Find data descriptor in parent SDFG + if data_name in nsdfg_node.in_connectors: + e = next(iter(parent_state.in_edges_by_connector(nsdfg_node, data_name))) + return parent_sdfg.arrays[e.data.data].storage + elif data_name in nsdfg_node.out_connectors: + e = next(iter(parent_state.out_edges_by_connector(nsdfg_node, data_name))) + return parent_sdfg.arrays[e.data.data].storage + + raise ValueError(f'Could not find data descriptor {data_name} in parent SDFG') def infer_aliasing(node: nodes.NestedSDFG, sdfg: SDFG, state: SDFGState) -> None: """ diff --git a/tests/sdfg/schedule_inference_test.py b/tests/sdfg/schedule_inference_test.py new file mode 100644 index 0000000000..1b1b3422d8 --- /dev/null +++ b/tests/sdfg/schedule_inference_test.py @@ -0,0 +1,180 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Tests for default storage/schedule inference. """ +import dace +from dace.sdfg.validation import InvalidSDFGNodeError +from dace.sdfg.infer_types import set_default_schedule_and_storage_types +from dace.transformation.helpers import get_parent_map +import pytest + + +def test_default_schedule_autodetect(): + + @dace.program + def add(a: dace.float32[10, 10], b: dace.float32[10, 10]): + return a + b @ b + + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, (dace.nodes.LibraryNode, dace.nodes.MapEntry)): + assert node.schedule == dace.ScheduleType.CPU_Multicore + + +def test_gpu_schedule_autodetect(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, + b: dace.float32[10, 10] @ dace.StorageType.GPU_Global): + return a + b @ b + + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, (dace.nodes.LibraryNode, dace.nodes.MapEntry)): + assert node.schedule == dace.ScheduleType.GPU_Device + + +def test_gpu_schedule_scalar_autodetect(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, + b: dace.float32[10, 10] @ dace.StorageType.GPU_Global, c: dace.float32[10] @ dace.StorageType.CPU_Heap): + return a + b @ b + c[0] + + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, (dace.nodes.LibraryNode, dace.nodes.MapEntry)): + assert node.schedule == dace.ScheduleType.GPU_Device + + +def test_gpu_schedule_scalar_autodetect_2(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, b: dace.float32): + return a + b + + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + for node, _ in 
sdfg.all_nodes_recursive(): + if isinstance(node, (dace.nodes.LibraryNode, dace.nodes.MapEntry)): + assert node.schedule == dace.ScheduleType.GPU_Device + + +def test_nested_map_in_loop_schedule(): + + @dace.program + def top(a: dace.float64[20, 20], b: dace.float64[20, 20], c: dace.float64[20, 20]): + for i in dace.map[0:20] @ dace.ScheduleType.GPU_Device: + for _ in range(5): + c[i] += a[i] + b[i] + + sdfg = top.to_sdfg(simplify=False) + + set_default_schedule_and_storage_types(sdfg, None) + for node, state in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry): + if get_parent_map(state, node) is None: + assert node.schedule == dace.ScheduleType.GPU_Device + else: + assert node.schedule == dace.ScheduleType.GPU_ThreadBlock + + +def test_nested_storage(): + + @dace.program + def nested(a: dace.float64[20, 20], b: dace.float64[20, 20]): + tmp = dace.define_local([20, 20], dace.float64) + tmp[:] = a + b[:] = tmp + + @dace.program + def top(a: dace.float64[20, 20], b: dace.float64[20, 20]): + nested(a, b) + + sdfg = top.to_sdfg(simplify=False) + + set_default_schedule_and_storage_types(sdfg, None) + for node, state in sdfg.all_nodes_recursive(): + nsdfg = state.parent + if isinstance(node, dace.nodes.AccessNode): + assert node.desc(nsdfg).storage == dace.StorageType.CPU_Heap + + +def test_nested_storage_equivalence(): + + @dace.program + def nested(a: dace.float64[20, 20], b: dace.float64[20, 20]): + b[:] = a + + @dace.program + def top(a: dace.float64[20, 20] @ dace.StorageType.CPU_Heap, b: dace.float64[20, 20] @ dace.StorageType.CPU_Pinned): + nested(a, b) + + sdfg = top.to_sdfg(simplify=False) + + set_default_schedule_and_storage_types(sdfg, None) + for node, state in sdfg.all_nodes_recursive(): + nsdfg = state.parent + if isinstance(node, dace.nodes.AccessNode): + if state.out_degree(node) > 0: # Check for a in external and internal scopes + assert node.desc(nsdfg).storage == dace.StorageType.CPU_Heap + elif state.in_degree(node) > 0: # Check for b in external and internal scopes + assert node.desc(nsdfg).storage == dace.StorageType.CPU_Pinned + + +def test_ambiguous_schedule(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, b: dace.float32[10, 10]): + return a + b + + with pytest.raises(InvalidSDFGNodeError): + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + + +def test_ambiguous_schedule_2(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, + b: dace.float32[10, 10] @ dace.StorageType.GPU_Global, c: dace.float32[10] @ dace.StorageType.CPU_Heap): + return a + b @ b + c + + with pytest.raises(InvalidSDFGNodeError): + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + + +def test_semi_ambiguous_schedule(): + + @dace.program + def add(a: dace.float32[10, 10] @ dace.StorageType.GPU_Global, + b: dace.float32[10, 10] @ dace.StorageType.GPU_Global): + for i in dace.map[0:10] @ dace.ScheduleType.GPU_Device: + shared = dace.define_local([10], dace.float32) + for j in dace.map[0:10]: # Should be inferred as thread-block + b[i, j] = a[i, j] + shared[j] + + sdfg = add.to_sdfg() + set_default_schedule_and_storage_types(sdfg, None) + for node, state in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry): + if get_parent_map(state, node) is None: + assert node.schedule == dace.ScheduleType.GPU_Device + else: + assert node.schedule == dace.ScheduleType.GPU_ThreadBlock + + +if __name__ == '__main__': + test_default_schedule_autodetect() + 
test_gpu_schedule_autodetect() + test_gpu_schedule_scalar_autodetect() + test_gpu_schedule_scalar_autodetect_2() + test_nested_kernel_computation() + test_nested_map_in_loop_schedule() + test_nested_storage() + test_nested_storage_equivalence() + test_ambiguous_schedule() + test_ambiguous_schedule_2() + test_semi_ambiguous_schedule() From 1a7fe454dffafb331a4208b5b9be9de08e40a2e3 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 25 May 2023 01:10:35 -0700 Subject: [PATCH 083/392] Fix use of internal API in expansion --- dace/transformation/transformation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dace/transformation/transformation.py b/dace/transformation/transformation.py index 6634276f46..75e591cb1e 100644 --- a/dace/transformation/transformation.py +++ b/dace/transformation/transformation.py @@ -627,15 +627,15 @@ def apply(self, state, sdfg, *args, **kwargs): else: raise TypeError("Node expansion must be a CodeNode or an SDFG") - # Fix nested schedules - if isinstance(expansion, nd.NestedSDFG): - infer_types._set_default_schedule_types(expansion.sdfg, expansion.schedule, True) - infer_types._set_default_storage_types(expansion.sdfg, expansion.schedule) - expansion.environments = copy.copy(set(map(lambda a: a.full_class_path(), type(self).environments))) sdutil.change_edge_dest(state, node, expansion) sdutil.change_edge_src(state, node, expansion) state.remove_node(node) + + # Fix nested schedules + if isinstance(expansion, nd.NestedSDFG): + infer_types.set_default_schedule_and_storage_types(expansion.sdfg, [expansion.schedule], True) + type(self).postprocessing(sdfg, state, expansion) def to_json(self, parent=None) -> Dict[str, Any]: From e173d9cc1951c9c5fcc9c555d6026e73c4f92097 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 25 May 2023 01:50:12 -0700 Subject: [PATCH 084/392] Fix use of connectors in matmul library nodes --- dace/libraries/blas/nodes/gemm.py | 4 ++-- dace/libraries/blas/nodes/matmul.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dace/libraries/blas/nodes/gemm.py b/dace/libraries/blas/nodes/gemm.py index 4a49397255..767cd53429 100644 --- a/dace/libraries/blas/nodes/gemm.py +++ b/dace/libraries/blas/nodes/gemm.py @@ -87,7 +87,7 @@ def make_sdfg(node, parent_state, parent_sdfg): init_state = sdfg.add_state(node.label + "_initstate") state = sdfg.add_state_after(init_state, node.label + "_state") - if node.beta != 0: + if '_cin' in node.in_connectors: sdfg.add_array("_cin", shape_c, dtype_c, strides=cdata[-1], storage=cdata[1].storage) mul_out, mul_out_array = "_c", array_c @@ -1050,7 +1050,7 @@ def validate(self, sdfg, state): # Numpy replacement @oprepo.replaces('dace.libraries.blas.gemm') @oprepo.replaces('dace.libraries.blas.Gemm') -def gemv_libnode(pv: 'ProgramVisitor', +def gemm_libnode(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, A, diff --git a/dace/libraries/blas/nodes/matmul.py b/dace/libraries/blas/nodes/matmul.py index a937af0a81..185beee1a0 100644 --- a/dace/libraries/blas/nodes/matmul.py +++ b/dace/libraries/blas/nodes/matmul.py @@ -143,6 +143,8 @@ def expansion(node, state, sdfg): from dace.libraries.blas.nodes.gemm import Gemm beta = node.beta cin = True + if '_cin' not in node.in_connectors: + cin = False if c[0].data.wcr: from dace.frontend import operations redtype = operations.detect_reduction_type(c[0].data.wcr) From 6ad811fbb41337513669d45407d784a69dcbbf57 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 25 May 2023 01:55:52 -0700 Subject: [PATCH 085/392] Ignore 
Sequential schedule when determining storage --- dace/sdfg/infer_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/sdfg/infer_types.py b/dace/sdfg/infer_types.py index dd89b1dee8..a02471342a 100644 --- a/dace/sdfg/infer_types.py +++ b/dace/sdfg/infer_types.py @@ -225,7 +225,7 @@ def _determine_child_schedule(parent_schedules: List[dtypes.ScheduleType]) -> Op def _determine_child_storage(parent_schedules: List[dtypes.ScheduleType]) -> Optional[dtypes.StorageType]: for sched in reversed(parent_schedules): - if sched is not None and sched in dtypes.SCOPEDEFAULT_STORAGE: + if (sched is not None and sched in dtypes.SCOPEDEFAULT_STORAGE and sched != dtypes.ScheduleType.Sequential): child_sched = dtypes.SCOPEDEFAULT_STORAGE[sched] if child_sched is not None: return child_sched From fc7e7ae4c449121289a3f47cf6d65fb4e2e4a113 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 25 May 2023 02:35:58 -0700 Subject: [PATCH 086/392] Fix API usage --- dace/codegen/targets/framecode.py | 1 - dace/transformation/interstate/multistate_inline.py | 4 ++-- dace/transformation/interstate/sdfg_nesting.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 808ba26d6a..c556da3e6b 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -20,7 +20,6 @@ from dace.sdfg import SDFG, ScopeSubgraphView, SDFGState, nodes from dace.sdfg import scope as sdscope from dace.sdfg import utils -from dace.sdfg.infer_types import set_default_schedule_and_storage_types from dace.transformation.passes.analysis import StateReachability diff --git a/dace/transformation/interstate/multistate_inline.py b/dace/transformation/interstate/multistate_inline.py index 4c20be1568..74dd51a483 100644 --- a/dace/transformation/interstate/multistate_inline.py +++ b/dace/transformation/interstate/multistate_inline.py @@ -143,8 +143,8 @@ def apply(self, outer_state: SDFGState, sdfg: SDFG): nsdfg_node = self.nested_sdfg nsdfg: SDFG = nsdfg_node.sdfg - if nsdfg_node.schedule is not dtypes.ScheduleType.Default: - infer_types.set_default_schedule_and_storage_types(nsdfg, nsdfg_node.schedule) + if nsdfg_node.schedule != dtypes.ScheduleType.Default: + infer_types.set_default_schedule_and_storage_types(nsdfg, [nsdfg_node.schedule]) ####################################################### # Collect and update top-level SDFG metadata diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index b85877120b..a63b37aa19 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -248,8 +248,8 @@ def apply(self, state: SDFGState, sdfg: SDFG): nsdfg: SDFG = nsdfg_node.sdfg nstate: SDFGState = nsdfg.nodes()[0] - if nsdfg_node.schedule is not dtypes.ScheduleType.Default: - infer_types.set_default_schedule_and_storage_types(nsdfg, nsdfg_node.schedule) + if nsdfg_node.schedule != dtypes.ScheduleType.Default: + infer_types.set_default_schedule_and_storage_types(nsdfg, [nsdfg_node.schedule]) nsdfg_scope_entry = state.entry_node(nsdfg_node) nsdfg_scope_exit = (state.exit_node(nsdfg_scope_entry) if nsdfg_scope_entry is not None else None) From aaa5043faced19500ee8fe439a05c7334ad1722d Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 25 May 2023 03:21:50 -0700 Subject: [PATCH 087/392] Better error handling --- dace/codegen/compiled_sdfg.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 8896a191fe..8821628000 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -218,6 +218,8 @@ def get_state_struct(self) -> ctypes.Structure: :return: the ctypes.Structure representation of the state struct. """ + if not self._libhandle: + raise ValueError('Library was not initialized') return ctypes.cast(self._libhandle, ctypes.POINTER(self._try_parse_state_struct())).contents From 99c5eb552519c937ef356f3fbb464449991f0de2 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 25 May 2023 03:22:33 -0700 Subject: [PATCH 088/392] Special cases for GPU_Default --- dace/sdfg/infer_types.py | 15 ++++++++++++--- tests/parse_state_struct_test.py | 7 ++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/dace/sdfg/infer_types.py b/dace/sdfg/infer_types.py index a02471342a..105e1d12e9 100644 --- a/dace/sdfg/infer_types.py +++ b/dace/sdfg/infer_types.py @@ -160,6 +160,11 @@ def set_default_schedule_and_storage_types(scope: Union[SDFG, SDFGState, nodes.E node to its children. """ parent_schedules = parent_schedules or [None] + + # TODO(later): Remove GPU_Default + if parent_schedules[-1] == dtypes.ScheduleType.GPU_Default and use_parent_schedule: + use_parent_schedule = False + if isinstance(scope, SDFG): # Set device for default top-level schedules and storages for state in scope.nodes(): @@ -174,7 +179,7 @@ def set_default_schedule_and_storage_types(scope: Union[SDFG, SDFGState, nodes.E # If not transient in a nested SDFG, take storage from parent, regardless of current type if not desc.transient and scope.parent_sdfg is not None: desc.storage = _get_storage_from_parent(aname, scope) - elif ((desc.transient or scope.parent_sdfg is None) and desc.storage is dtypes.StorageType.Default): + elif ((desc.transient or scope.parent_sdfg is None) and desc.storage == dtypes.StorageType.Default): # Indeterminate storage type, set to register desc.storage = dtypes.StorageType.Register return @@ -204,11 +209,15 @@ def set_default_schedule_and_storage_types(scope: Union[SDFG, SDFGState, nodes.E if isinstance(nnode, nodes.NestedSDFG): nscope = nnode.sdfg child_nodes = None + extra_parent_schedules = [] + # TODO(later): Remove GPU_Default + if nnode.schedule == dtypes.ScheduleType.GPU_Default: + extra_parent_schedules.append(nnode.schedule) else: nscope = nnode + extra_parent_schedules = [nnode.schedule] set_default_schedule_and_storage_types(nscope, - parent_schedules + - ([nnode.schedule] if not isinstance(nnode, nodes.NestedSDFG) else []), + parent_schedules + extra_parent_schedules, use_parent_schedule=False, state=state, child_nodes=child_nodes) diff --git a/tests/parse_state_struct_test.py b/tests/parse_state_struct_test.py index 969420d693..89bb2550f8 100644 --- a/tests/parse_state_struct_test.py +++ b/tests/parse_state_struct_test.py @@ -13,9 +13,11 @@ from dace import dtypes from dace.codegen import codeobject, targets, compiler, compiled_sdfg - @pytest.fixture def cuda_helper(): + return _cuda_helper() + +def _cuda_helper(): helper_code = """ #include @@ -89,3 +91,6 @@ def persistent_transient(A: dace.float32[3, 3]): compiledsdfg(A=A, __return=result) assert np.allclose(result, A @ B) + +if __name__ =='__main__': + test_preallocate_transients_in_state_struct(_cuda_helper()) From dbfc944049a27fc30d1bdf629024e25488cd202b Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 25 May 2023 05:23:07 -0700 Subject: [PATCH 089/392] Fix persistent fusion transformation nested SDFG schedule --- 
dace/transformation/subgraph/gpu_persistent_fusion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dace/transformation/subgraph/gpu_persistent_fusion.py b/dace/transformation/subgraph/gpu_persistent_fusion.py index 6096ff9572..1cf93469bb 100644 --- a/dace/transformation/subgraph/gpu_persistent_fusion.py +++ b/dace/transformation/subgraph/gpu_persistent_fusion.py @@ -246,6 +246,7 @@ def apply(self, sdfg: SDFG): kernel_args_read, kernel_args_write, ) + nested_sdfg.schedule = ScheduleType.GPU_Persistent # Create and connect read only data access nodes for arg in kernel_args_read: From 8b77a8611fe6968831edbd29d84035bedec0a0e6 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 25 May 2023 06:02:10 -0700 Subject: [PATCH 090/392] Fix FPGA dispatching with scalars --- dace/codegen/targets/fpga.py | 8 ++------ dace/sdfg/utils.py | 14 +++++++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/dace/codegen/targets/fpga.py b/dace/codegen/targets/fpga.py index 31cfc6e13f..b920b0e9d5 100644 --- a/dace/codegen/targets/fpga.py +++ b/dace/codegen/targets/fpga.py @@ -1110,12 +1110,8 @@ def generate_nested_state(self, sdfg, state, nest_name, subgraphs, function_stre def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_stream): if not self._in_device_code: - # If we're not already generating kernel code we need to set up the - # kernel launch - subgraphs = [dfg_scope] - return self.generate_kernel(sdfg, sdfg.node(state_id), - dfg_scope.source_nodes()[0].map.label.replace(" ", "_"), subgraphs, - function_stream, callsite_stream) + # If we're not already generating kernel code, fail + raise cgx.CodegenError('FPGA kernel needs to be generated inside a device state.') self.generate_node(sdfg, dfg_scope, state_id, dfg_scope.source_nodes()[0], function_stream, callsite_stream) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index c58837fdff..3918b8b1e1 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1501,13 +1501,17 @@ def is_fpga_kernel(sdfg, state): if ("is_FPGA_kernel" in state.location and state.location["is_FPGA_kernel"] == False): return False data_nodes = state.data_nodes() - if len(data_nodes) == 0: - return False + at_least_one_fpga_array = False for n in data_nodes: - if n.desc(sdfg).storage not in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, - dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): + desc = n.desc(sdfg) + if isinstance(desc, dt.Scalar): + continue + if desc.storage not in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, + dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): return False - return True + at_least_one_fpga_array = True + + return at_least_one_fpga_array def postdominators( From 68b64494e50efb20e88fa58a085761c511d28e00 Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Thu, 1 Jun 2023 15:55:14 -0400 Subject: [PATCH 091/392] Feature/log10 (#1265) --- dace/frontend/python/replacements.py | 7 +++++++ dace/runtime/include/dace/cuda/cudacommon.cuh | 10 ++++++++++ dace/runtime/include/dace/math.h | 5 +++++ tests/numpy/math_test.py | 8 +++++++- 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 3e0ff554d5..3586d40374 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -692,6 +692,13 @@ def _log(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, input: str): return _simple_call(sdfg, state, input, 'log') 
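For context, this change makes `log10` usable from DaCe Python programs in the same way `log` already is. A minimal usage sketch, mirroring the case added to tests/numpy/math_test.py further down in this patch (the program name and data sizes here are illustrative, not part of the change):

    import numpy as np
    from numpy import log10
    import dace

    @dace.program
    def log10_prog(A: dace.float64[24, 24]):
        # Resolves through the bare-name 'log10' replacement registered below
        # and lowers to dace::math::log10 (or the CUDA vector overload on GPU).
        return log10(A)

    A = np.random.rand(24, 24) + 0.1  # keep inputs positive and non-zero
    assert np.allclose(log10_prog(A), np.log10(A))
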
+@oprepo.replaces('log10') +@oprepo.replaces('dace.log10') +@oprepo.replaces('math.log10') +def _log10(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, input: str): + return _simple_call(sdfg, state, input, 'log10') + + @oprepo.replaces('math.floor') def _floor(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, input: str): return _simple_call(sdfg, state, input, 'floor', restype=dtypes.typeclass(int)) diff --git a/dace/runtime/include/dace/cuda/cudacommon.cuh b/dace/runtime/include/dace/cuda/cudacommon.cuh index ae34da918b..3c050f9d75 100644 --- a/dace/runtime/include/dace/cuda/cudacommon.cuh +++ b/dace/runtime/include/dace/cuda/cudacommon.cuh @@ -151,6 +151,15 @@ DACE_DFI dace::vec log(dace::vec v) { return result; } +DACE_DFI dace::vec log10(dace::vec v) { + dace::vec result; + result.x = log10(v.x); + result.y = log10(v.y); + result.z = log10(v.z); + result.w = log10(v.w); + return result; +} + DACE_DFI dace::vec tanh(dace::vec v) { dace::vec result; result.x = tanh(v.x); @@ -171,6 +180,7 @@ DACE_DFI dace::vec heaviside(const dace::vec& a) { } } // namespace dace::math using dace::math::exp; using dace::math::log; +using dace::math::log10; using dace::math::tanh; using dace::math::heaviside; #endif diff --git a/dace/runtime/include/dace/math.h b/dace/runtime/include/dace/math.h index a1f501e4da..aa4dcb358d 100644 --- a/dace/runtime/include/dace/math.h +++ b/dace/runtime/include/dace/math.h @@ -569,6 +569,11 @@ namespace dace { return std::log(a); } + template + DACE_CONSTEXPR DACE_HDFI T log10(const T& a) + { + return std::log10(a); + } } namespace cmath diff --git a/tests/numpy/math_test.py b/tests/numpy/math_test.py index e96e1dbeae..5e241ebb48 100644 --- a/tests/numpy/math_test.py +++ b/tests/numpy/math_test.py @@ -3,7 +3,7 @@ import dace from common import compare_numpy_output import math -from numpy import exp, sin, cos, sqrt, log, conj, real, imag +from numpy import exp, sin, cos, sqrt, log, log10, conj, real, imag import pytest M, N = 24, 24 @@ -34,6 +34,11 @@ def test_logarithm(A: dace.complex64[M, N]): return log(A) +@compare_numpy_output(non_zero=True, positive=True) +def test_log10(A: dace.complex64[M, N]): + return log10(A) + + @compare_numpy_output() def test_conjugate(A: dace.complex64[M, N]): return conj(A) @@ -159,6 +164,7 @@ def func(): test_cosine() test_square_root() test_logarithm() + test_log10() test_conjugate() test_real_part() test_imag_part() From 6a471648055cfe173bd0cd53083022956a3129f9 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Sat, 3 Jun 2023 19:06:58 -0700 Subject: [PATCH 092/392] Ignore nested SDFG schedule in codegen allocation, FPGA dispatcher works for all-scalar kernels --- dace/codegen/targets/framecode.py | 8 +++++++- dace/sdfg/utils.py | 4 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index c556da3e6b..091c893d5b 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -424,7 +424,13 @@ def _get_schedule(self, scope: Union[nodes.EntryNode, SDFGState, SDFG]) -> dtype sdfg: SDFG = (scope if isinstance(scope, SDFG) else scope.parent) if sdfg.parent_nsdfg_node is None: return TOP_SCHEDULE - return (sdfg.parent_nsdfg_node.schedule or TOP_SCHEDULE) + + # Go one SDFG up + pstate = sdfg.parent + pscope = pstate.entry_node(sdfg.parent_nsdfg_node) + if pscope is not None: + return self._get_schedule(pscope) + return self._get_schedule(pstate) else: raise TypeError diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 
3918b8b1e1..0b62c96c0b 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1504,12 +1504,14 @@ def is_fpga_kernel(sdfg, state): at_least_one_fpga_array = False for n in data_nodes: desc = n.desc(sdfg) + if desc.storage in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, + dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): + at_least_one_fpga_array = True if isinstance(desc, dt.Scalar): continue if desc.storage not in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): return False - at_least_one_fpga_array = True return at_least_one_fpga_array From 466550986623bc437e2c56e7202b20b235cbd928 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Tue, 6 Jun 2023 20:21:07 +0200 Subject: [PATCH 093/392] initial try --- dace/sdfg/nodes.py | 144 +++++++++++++++++++++++++++++++++++++++++++++ dace/sdfg/sdfg.py | 5 ++ dace/sdfg/state.py | 69 ++++++++++++++++++++++ 3 files changed, 218 insertions(+) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 866d77bed6..4bb9f8f6b2 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -645,6 +645,150 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None): # ------------------------------------------------------------------------------ +@make_properties +class ExternalNestedSDFG(CodeNode): + """ An SDFG state node that will contain an SDFG of its own. It has outside connectors, but lacks the nestedSDFG. + This node is used to represent a nested SDFG that is not yet defined, but will be defined later. + + :note: A nested SDFG cannot create recursion (one of its parent SDFGs). + """ + + # NOTE: We cannot use SDFG as the type because of an import loop + sdfg = SDFGReferenceProperty(desc="The SDFG", allow_none=True) + schedule = EnumProperty(dtype=dtypes.ScheduleType, + desc="SDFG schedule", + allow_none=True, + default=dtypes.ScheduleType.Default) + symbol_mapping = DictProperty(key_type=str, + value_type=dace.symbolic.pystr_to_symbolic, + desc="Mapping between internal symbols and their values, expressed as " + "symbolic expressions") + debuginfo = DebugInfoProperty() + is_collapsed = Property(dtype=bool, desc="Show this node/scope/state as collapsed", default=False) + + instrument = EnumProperty(dtype=dtypes.InstrumentationType, + desc="Measure execution statistics with given method", + default=dtypes.InstrumentationType.No_Instrumentation) + + no_inline = Property(dtype=bool, + desc="If True, this nested SDFG will not be inlined during " + "simplification", + default=False) + + unique_name = Property(dtype=str, desc="Unique name of the SDFG", default="") + + def __init__(self, + label, + sdfg, + inputs: Set[str], + outputs: Set[str], + symbol_mapping: Dict[str, Any] = None, + schedule=dtypes.ScheduleType.Default, + location=None, + debuginfo=None): + from dace.sdfg import SDFG + super(ExternalNestedSDFG, self).__init__(label, location, inputs, outputs) + + # Properties + self.sdfg: SDFG = sdfg + self.symbol_mapping = symbol_mapping or {} + self.schedule = schedule + self.debuginfo = debuginfo + + def __deepcopy__(self, memo): + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + setattr(result, k, dcpy(v, memo)) + if result._sdfg is not None: + result._sdfg.parent_nsdfg_node = result + return result + + @staticmethod + def from_json(json_obj, context=None): + from dace import SDFG # Avoid import loop + + # We have to load the SDFG 
first. + ret = NestedSDFG("nolabel", SDFG('nosdfg'), {}, {}) + + dace.serialize.set_properties_from_json(ret, json_obj, context) + + if context and 'sdfg_state' in context: + ret.sdfg.parent = context['sdfg_state'] + if context and 'sdfg' in context: + ret.sdfg.parent_sdfg = context['sdfg'] + + ret.sdfg.parent_nsdfg_node = ret + + ret.sdfg.update_sdfg_list([]) + + return ret + + @property + def free_symbols(self) -> Set[str]: + return set().union(*(map(str, + pystr_to_symbolic(v).free_symbols) for v in self.symbol_mapping.values()), + *(map(str, + pystr_to_symbolic(v).free_symbols) for v in self.location.values())) + + def infer_connector_types(self, sdfg, state): + # Avoid import loop + from dace.sdfg.infer_types import infer_connector_types, infer_aliasing + + # Propagate aliasing information into SDFG + infer_aliasing(self, sdfg, state) + + # Infer internal connector types + infer_connector_types(self.sdfg) + + def __str__(self): + if not self.label: + return "SDFG" + else: + return self.label + + def validate(self, sdfg, state, references: Optional[Set[int]] = None): + if not dtypes.validate_name(self.label): + raise NameError('Invalid nested SDFG name "%s"' % self.label) + for in_conn in self.in_connectors: + if not dtypes.validate_name(in_conn): + raise NameError('Invalid input connector "%s"' % in_conn) + for out_conn in self.out_connectors: + if not dtypes.validate_name(out_conn): + raise NameError('Invalid output connector "%s"' % out_conn) + connectors = self.in_connectors.keys() | self.out_connectors.keys() + for conn in connectors: + if conn not in self.sdfg.arrays: + raise NameError( + f'Connector "{conn}" was given but is not a registered data descriptor in the nested SDFG. ' + 'Example: parameter passed to a function without a matching array within it.') + for dname, desc in self.sdfg.arrays.items(): + # TODO(later): Disallow scalars without access nodes (so that this + # check passes for them too). + if isinstance(desc, data.Scalar): + continue + if not desc.transient and dname not in connectors: + raise NameError('Data descriptor "%s" not found in nested SDFG connectors' % dname) + if dname in connectors and desc.transient: + raise NameError('"%s" is a connector but its corresponding array is transient' % dname) + + # Validate undefined symbols + symbols = set(k for k in self.sdfg.free_symbols if k not in connectors) + missing_symbols = [s for s in symbols if s not in self.symbol_mapping] + if missing_symbols: + raise ValueError('Missing symbols on nested SDFG: %s' % (missing_symbols)) + extra_symbols = self.symbol_mapping.keys() - symbols + if len(extra_symbols) > 0: + # TODO: Elevate to an error? + warnings.warn(f"{self.label} maps to unused symbol(s): {extra_symbols}") + + # Recursively validate nested SDFG + self.sdfg.validate(references) + +# ------------------------------------------------------------------------------ + + # Scope entry class class EntryNode(Node): """ A type of node that opens a scope (e.g., Map or Consume). """ diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index bee601e7b1..f0d38b2081 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2692,3 +2692,8 @@ def make_array_memlet(self, array: str): :return: a Memlet that fully transfers array """ return dace.Memlet.from_array(array, self.data(array)) + +@make_properties +class SDFGShell(SDFG): + """ A shell SDFG that allows inputs, outputs and SDFG properties but does not contain the actual SDFG. Can be transformed into an SDFG by loading in the actual content. 
+ """ diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 0796bf00d0..bd5a5f2205 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -1174,6 +1174,75 @@ def add_nested_sdfg( sdfg.add_symbol(sym, infer_expr_type(symval, self.parent.symbols) or dtypes.typeclass(int)) return s + + def add_external_nested_sdfg( + self, + sdfg: 'dace.sdfg.SDFG', + parent, + inputs: Union[Set[str], Dict[str, dtypes.typeclass]], + outputs: Union[Set[str], Dict[str, dtypes.typeclass]], + symbol_mapping: Dict[str, Any] = None, + name=None, + schedule=dtypes.ScheduleType.Default, + location=None, + debuginfo=None, + ): + """ Adds an external nested SDFG to the SDFG state. """ + if name is None: + name = sdfg.label + debuginfo = _getdebuginfo(debuginfo or self._default_lineinfo) + + sdfg.parent = self + sdfg.parent_sdfg = self.parent + + sdfg.update_sdfg_list([]) + + # Make dictionary of autodetect connector types from set + if isinstance(inputs, (set, collections.abc.KeysView)): + inputs = {k: None for k in inputs} + if isinstance(outputs, (set, collections.abc.KeysView)): + outputs = {k: None for k in outputs} + + s = nd.ExternalNestedSDFG( + name, + sdfg, + inputs, + outputs, + symbol_mapping=symbol_mapping, + schedule=schedule, + location=location, + debuginfo=debuginfo, + ) + self.add_node(s) + + sdfg.parent_nsdfg_node = s + + # Add "default" undefined symbols if None are given + symbols = sdfg.free_symbols + if symbol_mapping is None: + symbol_mapping = {s: s for s in symbols} + s.symbol_mapping = symbol_mapping + + # Validate missing symbols + missing_symbols = [s for s in symbols if s not in symbol_mapping] + if missing_symbols and parent: + # If symbols are missing, try to get them from the parent SDFG + parent_mapping = {s: s for s in missing_symbols if s in parent.symbols} + symbol_mapping.update(parent_mapping) + s.symbol_mapping = symbol_mapping + missing_symbols = [s for s in symbols if s not in symbol_mapping] + if missing_symbols: + raise ValueError('Missing symbols on nested SDFG "%s": %s' % (name, missing_symbols)) + + # Add new global symbols to nested SDFG + from dace.codegen.tools.type_inference import infer_expr_type + for sym, symval in s.symbol_mapping.items(): + if sym not in sdfg.symbols: + # TODO: Think of a better way to avoid calling + # symbols_defined_at in this moment + sdfg.add_symbol(sym, infer_expr_type(symval, self.parent.symbols) or dtypes.typeclass(int)) + + return s def add_map( self, From 9290fd20b409ce0f865351280a44ba215e3a0eba Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Tue, 6 Jun 2023 21:06:53 +0200 Subject: [PATCH 094/392] Bugfix in taskletfusion --- .../transformation/dataflow/tasklet_fusion.py | 3 +++ tests/transformations/tasklet_fusion_test.py | 23 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/dace/transformation/dataflow/tasklet_fusion.py b/dace/transformation/dataflow/tasklet_fusion.py index 8179ead457..398af7c8b9 100644 --- a/dace/transformation/dataflow/tasklet_fusion.py +++ b/dace/transformation/dataflow/tasklet_fusion.py @@ -199,6 +199,9 @@ def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG): repldict = {} for in_edge in graph.in_edges(t1): old_value = in_edge.dst_conn + if old_value is None: + continue + # Check if there is a conflict. if in_edge.dst_conn in inputs: # Conflicts are ok if the Memlets are the same. 
diff --git a/tests/transformations/tasklet_fusion_test.py b/tests/transformations/tasklet_fusion_test.py index a65d218d98..8c5e06ed58 100644 --- a/tests/transformations/tasklet_fusion_test.py +++ b/tests/transformations/tasklet_fusion_test.py @@ -178,6 +178,28 @@ def test_tasklet_fusion_multiline(A: datatype): assert (result[0] == 11) +def test_map_param(): + @dace.program + def map_uses_param(A: dace.float32[10], B: dace.float32[10], C: dace.float32[10]): + for i in dace.map[0:10]: + a = i - A[i] + b = B[i] * i + C[i] = a + b + + sdfg = map_uses_param.to_sdfg(simplify=True) + + num_tasklet_fusions = sdfg.apply_transformations(TaskletFusion) + assert (num_tasklet_fusions == 1) + + A = np.zeros([10], dtype=np.float32) + B = np.ones([10], dtype=np.float32) + C = np.empty([10], dtype=np.float32) + sdfg(A=A, B=B, C=C) + + ref = np.array(range(0, 10, 1)) * 2.0 + assert (C == ref).all() + + @pytest.mark.parametrize('with_data', [pytest.param(True), pytest.param(False)]) @pytest.mark.parametrize('language', [pytest.param('CPP'), pytest.param('Python')]) def test_map_with_tasklets(language: str, with_data: bool): @@ -200,6 +222,7 @@ def test_map_with_tasklets(language: str, with_data: bool): test_same_name() test_same_name_different_memlet() test_tasklet_fusion_multiline() + test_map_param() test_map_with_tasklets(language='Python', with_data=False) test_map_with_tasklets(language='Python', with_data=True) test_map_with_tasklets(language='CPP', with_data=False) From c3f5548aacdc0ccec921a9657a70cf85ed1e95f1 Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Tue, 6 Jun 2023 21:07:31 +0200 Subject: [PATCH 095/392] yapf in taskletfusion classes --- .../transformation/dataflow/tasklet_fusion.py | 25 +++++----------- tests/transformations/tasklet_fusion_test.py | 29 +++++++++---------- 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/dace/transformation/dataflow/tasklet_fusion.py b/dace/transformation/dataflow/tasklet_fusion.py index 398af7c8b9..99f8f625be 100644 --- a/dace/transformation/dataflow/tasklet_fusion.py +++ b/dace/transformation/dataflow/tasklet_fusion.py @@ -32,7 +32,6 @@ def visit_Name(self, node: ast.Name) -> Any: class CPPConnectorRenamer(): - def __init__(self, repl_dict: Dict[str, str]) -> None: self.repl_dict = repl_dict @@ -44,7 +43,6 @@ def rename(self, code: str) -> str: class PythonInliner(ast.NodeTransformer): - def __init__(self, target_id, target_ast): self.target_id = target_id self.target_ast = target_ast @@ -57,7 +55,6 @@ def visit_Name(self, node: ast.AST): class CPPInliner(): - def __init__(self, inline_target, inline_val): self.inline_target = inline_target self.inline_val = inline_val @@ -144,10 +141,7 @@ class TaskletFusion(pm.SingleStateTransformation): @classmethod def expressions(cls): - return [ - sdutil.node_path_graph(cls.t1, cls.data, cls.t2), - sdutil.node_path_graph(cls.t1, cls.t2) - ] + return [sdutil.node_path_graph(cls.t1, cls.data, cls.t2), sdutil.node_path_graph(cls.t1, cls.t2)] def can_be_applied(self, graph: dace.SDFGState, expr_index: int, sdfg: dace.SDFG, permissive: bool = False) -> bool: t1 = self.t1 @@ -191,9 +185,7 @@ def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG): t2_in_edge = graph.out_edges(data if data is not None else t1)[0] # Remove the connector from the second Tasklet. - inputs = { - k: v for k, v in t2.in_connectors.items() if k != t2_in_edge.dst_conn - } + inputs = {k: v for k, v in t2.in_connectors.items() if k != t2_in_edge.dst_conn} # Copy the first Tasklet's in connectors. 
repldict = {} @@ -214,8 +206,8 @@ def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG): break else: t2edge = conflict_edges[0] - if t2edge is not None and (in_edge.data != t2edge.data or in_edge.data.data != t2edge.data.data or - in_edge.data is None or in_edge.data.data is None): + if t2edge is not None and (in_edge.data != t2edge.data or in_edge.data.data != t2edge.data.data + or in_edge.data is None or in_edge.data.data is None): in_edge.dst_conn = dace.data.find_new_name(in_edge.dst_conn, set(inputs)) repldict[old_value] = in_edge.dst_conn else: @@ -231,9 +223,7 @@ def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG): if repldict: assigned_value = PythonConnectorRenamer(repldict).visit(assigned_value) - new_code = [ - PythonInliner(t2_in_edge.dst_conn, assigned_value).visit(line) for line in t2.code.code - ] + new_code = [PythonInliner(t2_in_edge.dst_conn, assigned_value).visit(line) for line in t2.code.code] new_code_str = '\n'.join(astunparse.unparse(line) for line in new_code) elif t1.language == Language.CPP: assigned_value = t1.code.as_string @@ -255,9 +245,8 @@ def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG): else: return - new_tasklet = graph.add_tasklet( - t1.label + '_fused_' + t2.label, inputs, t2.out_connectors, new_code_str, t1.language - ) + new_tasklet = graph.add_tasklet(t1.label + '_fused_' + t2.label, inputs, t2.out_connectors, new_code_str, + t1.language) for in_edge in graph.in_edges(t1): graph.add_edge(in_edge.src, in_edge.src_conn, new_tasklet, in_edge.dst_conn, in_edge.data) diff --git a/tests/transformations/tasklet_fusion_test.py b/tests/transformations/tasklet_fusion_test.py index 8c5e06ed58..1e10759753 100644 --- a/tests/transformations/tasklet_fusion_test.py +++ b/tests/transformations/tasklet_fusion_test.py @@ -10,6 +10,7 @@ M = 10 N = 2 * M + @dace.program def map_with_tasklets(A: datatype[N], B: datatype[M]): C = np.zeros_like(B) @@ -42,15 +43,11 @@ def _make_sdfg(language: str, with_data: bool = False): outputs = { '__out': datatype, } - ta = state.add_tasklet( - 'a', inputs, { - '__out1': datatype, - '__out2': datatype, - '__out3': datatype, - }, - f'__out1 = __inp1 + __inp2{endl}__out2 = __out1{endl}__out3 = __out1{endl}', - lang - ) + ta = state.add_tasklet('a', inputs, { + '__out1': datatype, + '__out2': datatype, + '__out3': datatype, + }, f'__out1 = __inp1 + __inp2{endl}__out2 = __out1{endl}__out3 = __out1{endl}', lang) tb = state.add_tasklet('b', inputs, outputs, f'__out = __inp1 * __inp2{endl}', lang) tc = state.add_tasklet('c', inputs, outputs, f'__out = __inp1 + __inp2{endl}', lang) td = state.add_tasklet('d', inputs, outputs, f'__out = __inp1 / __inp2{endl}', lang) @@ -60,12 +57,12 @@ def _make_sdfg(language: str, with_data: bool = False): state.add_memlet_path(A, me, tb, memlet=dace.Memlet('A[2*i]'), dst_conn='__inp2') state.add_memlet_path(B, me, tc, memlet=dace.Memlet('B[i]'), dst_conn='__inp2') if with_data: - sdfg.add_array('tmp1', (1,), datatype, dtypes.StorageType.Default, None, True) - sdfg.add_array('tmp2', (1,), datatype, dtypes.StorageType.Default, None, True) - sdfg.add_array('tmp3', (1,), datatype, dtypes.StorageType.Default, None, True) - sdfg.add_array('tmp4', (1,), datatype, dtypes.StorageType.Default, None, True) - sdfg.add_array('tmp5', (1,), datatype, dtypes.StorageType.Default, None, True) - sdfg.add_array('tmp6', (1,), datatype, dtypes.StorageType.Default, None, True) + sdfg.add_array('tmp1', (1, ), datatype, dtypes.StorageType.Default, None, True) + sdfg.add_array('tmp2', (1, ), datatype, 
dtypes.StorageType.Default, None, True) + sdfg.add_array('tmp3', (1, ), datatype, dtypes.StorageType.Default, None, True) + sdfg.add_array('tmp4', (1, ), datatype, dtypes.StorageType.Default, None, True) + sdfg.add_array('tmp5', (1, ), datatype, dtypes.StorageType.Default, None, True) + sdfg.add_array('tmp6', (1, ), datatype, dtypes.StorageType.Default, None, True) atemp1 = state.add_access('tmp1') atemp2 = state.add_access('tmp2') atemp3 = state.add_access('tmp3') @@ -101,7 +98,7 @@ def test_basic(): def test_basic_tf(A: datatype[5, 5]): B = A + 1 return B * 2 - + sdfg = test_basic_tf.to_sdfg(simplify=True) num_map_fusions = sdfg.apply_transformations(MapFusion) From 642bf2a2a4be9f3273af1306990fb6f4cca346de Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Tue, 6 Jun 2023 21:11:21 +0200 Subject: [PATCH 096/392] Corrected taskletfusion test case --- tests/transformations/tasklet_fusion_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/transformations/tasklet_fusion_test.py b/tests/transformations/tasklet_fusion_test.py index 1e10759753..c7fd6802d5 100644 --- a/tests/transformations/tasklet_fusion_test.py +++ b/tests/transformations/tasklet_fusion_test.py @@ -185,8 +185,8 @@ def map_uses_param(A: dace.float32[10], B: dace.float32[10], C: dace.float32[10] sdfg = map_uses_param.to_sdfg(simplify=True) - num_tasklet_fusions = sdfg.apply_transformations(TaskletFusion) - assert (num_tasklet_fusions == 1) + num_tasklet_fusions = sdfg.apply_transformations_repeated(TaskletFusion) + assert (num_tasklet_fusions == 3) A = np.zeros([10], dtype=np.float32) B = np.ones([10], dtype=np.float32) From c00b00cf2e14af68f21d5ee6edef54d6adb8a6eb Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Tue, 6 Jun 2023 22:24:32 +0200 Subject: [PATCH 097/392] State Fusion Extension with happens before dependency edge --- AUTHORS | 1 + dace/transformation/interstate/__init__.py | 1 + .../state_fusion_with_happens_before.py | 589 ++++++++++++++++++ .../state_fusion_extended_test.py | 65 ++ 4 files changed, 656 insertions(+) create mode 100644 dace/transformation/interstate/state_fusion_with_happens_before.py create mode 100644 tests/transformations/state_fusion_extended_test.py diff --git a/AUTHORS b/AUTHORS index 3275288476..9b7763593e 100644 --- a/AUTHORS +++ b/AUTHORS @@ -34,5 +34,6 @@ Cliff Hodel Tiancheng Chen Reid Wahl Yihang Luo +Alexandru Calotoiu and other contributors listed in https://github.com/spcl/dace/graphs/contributors diff --git a/dace/transformation/interstate/__init__.py b/dace/transformation/interstate/__init__.py index 8af9e901ab..0bd168751c 100644 --- a/dace/transformation/interstate/__init__.py +++ b/dace/transformation/interstate/__init__.py @@ -2,6 +2,7 @@ """ This module initializes the inter-state transformations package.""" from .state_fusion import StateFusion +from .state_fusion_with_happens_before import StateFusionExtended from .state_elimination import (EndStateElimination, StartStateElimination, StateAssignElimination, SymbolAliasPromotion, HoistState) from .fpga_transform_state import FPGATransformState diff --git a/dace/transformation/interstate/state_fusion_with_happens_before.py b/dace/transformation/interstate/state_fusion_with_happens_before.py new file mode 100644 index 0000000000..a3d7d421ab --- /dev/null +++ b/dace/transformation/interstate/state_fusion_with_happens_before.py @@ -0,0 +1,589 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+""" State fusion transformation """ + +from typing import Dict, List, Set + +import networkx as nx + +from dace import data as dt, dtypes, registry, sdfg, subsets, memlet +from dace.config import Config +from dace.sdfg import nodes +from dace.sdfg import utils as sdutil +from dace.sdfg.state import SDFGState +from dace.transformation import transformation + + +# Helper class for finding connected component correspondences +class CCDesc: + def __init__(self, first_input_nodes: Set[nodes.AccessNode], first_output_nodes: Set[nodes.AccessNode], + second_input_nodes: Set[nodes.AccessNode], second_output_nodes: Set[nodes.AccessNode]) -> None: + self.first_inputs = {n.data for n in first_input_nodes} + self.first_input_nodes = first_input_nodes + self.first_outputs = {n.data for n in first_output_nodes} + self.first_output_nodes = first_output_nodes + self.second_inputs = {n.data for n in second_input_nodes} + self.second_input_nodes = second_input_nodes + self.second_outputs = {n.data for n in second_output_nodes} + self.second_output_nodes = second_output_nodes + + +def top_level_nodes(state: SDFGState): + return state.scope_children()[None] + + +class StateFusionExtended(transformation.MultiStateTransformation): + """ Implements the state-fusion transformation extended to fuse states with RAW and WAW dependencies. + An empty memlet is used to represent a dependency between two subgraphs with RAW and WAW dependencies. + The merge is made by identifying the source in the first state and the sink in the second state, and linking the bottom of the appropriate source subgraph in the first state with the top of the appropriate sink subgraph in the second state. + + State-fusion takes two states that are connected through a single edge, + and fuses them into one state. If permissive, also applies if potential memory + access hazards are created. + """ + connections_to_make = [] + first_state = transformation.PatternNode(sdfg.SDFGState) + second_state = transformation.PatternNode(sdfg.SDFGState) + + @staticmethod + def annotates_memlets(): + return False + + @classmethod + def expressions(cls): + return [sdutil.node_path_graph(cls.first_state, cls.second_state)] + + @staticmethod + def find_fused_components(first_cc_input, first_cc_output, second_cc_input, second_cc_output) -> List[CCDesc]: + # Make a bipartite graph out of the first and second components + g = nx.DiGraph() + g.add_nodes_from((0, i) for i in range(len(first_cc_output))) + g.add_nodes_from((1, i) for i in range(len(second_cc_output))) + # Find matching nodes in second state + for i, cc1 in enumerate(first_cc_output): + outnames1 = {n.data for n in cc1} + for j, cc2 in enumerate(second_cc_input): + inpnames2 = {n.data for n in cc2} + if len(outnames1 & inpnames2) > 0: + g.add_edge((0, i), (1, j)) + + # Construct result out of connected components of the bipartite graph + result = [] + for cc in nx.weakly_connected_components(g): + input1, output1, input2, output2 = set(), set(), set(), set() + for gind, cind in cc: + if gind == 0: + input1 |= first_cc_input[cind] + output1 |= first_cc_output[cind] + else: + input2 |= second_cc_input[cind] + output2 |= second_cc_output[cind] + result.append(CCDesc(input1, output1, input2, output2)) + + return result + + @staticmethod + def memlets_intersect(graph_a: SDFGState, group_a: List[nodes.AccessNode], inputs_a: bool, graph_b: SDFGState, + group_b: List[nodes.AccessNode], inputs_b: bool) -> bool: + """ + Performs an all-pairs check for subset intersection on two + groups of nodes. 
If group intersects or result is indeterminate, + returns True as a precaution. + + :param graph_a: The graph in which the first set of nodes reside. + :param group_a: The first set of nodes to check. + :param inputs_a: If True, checks inputs of the first group. + :param graph_b: The graph in which the second set of nodes reside. + :param group_b: The second set of nodes to check. + :param inputs_b: If True, checks inputs of the second group. + :return: True if subsets intersect or result is indeterminate. + """ + # Set traversal functions + src_subset = lambda e: (e.data.src_subset if e.data.src_subset is not None else e.data.dst_subset) + dst_subset = lambda e: (e.data.dst_subset if e.data.dst_subset is not None else e.data.src_subset) + if inputs_a: + edges_a = [e for n in group_a for e in graph_a.out_edges(n)] + subset_a = src_subset + else: + edges_a = [e for n in group_a for e in graph_a.in_edges(n)] + subset_a = dst_subset + if inputs_b: + edges_b = [e for n in group_b for e in graph_b.out_edges(n)] + subset_b = src_subset + else: + edges_b = [e for n in group_b for e in graph_b.in_edges(n)] + subset_b = dst_subset + + # Simple all-pairs check + for ea in edges_a: + for eb in edges_b: + result = subsets.intersects(subset_a(ea), subset_b(eb)) + if result is True or result is None: + return True + return False + + def has_path(self, first_state: SDFGState, second_state: SDFGState, + match_nodes: Dict[nodes.AccessNode, nodes.AccessNode], node_a: nodes.Node, node_b: nodes.Node) -> bool: + """ Check for paths between the two states if they are fused. """ + for match_a, match_b in match_nodes.items(): + if nx.has_path(first_state._nx, node_a, match_a) and nx.has_path(second_state._nx, match_b, node_b): + return True + return False + + def _check_all_paths(self, first_state: SDFGState, second_state: SDFGState, + match_nodes: Dict[nodes.AccessNode, nodes.AccessNode], nodes_first: List[nodes.AccessNode], + nodes_second: List[nodes.AccessNode], first_read: bool, second_read: bool) -> bool: + for node_a in nodes_first: + succ_a = first_state.successors(node_a) + for node_b in nodes_second: + if all(self.has_path(first_state, second_state, match_nodes, sa, node_b) for sa in succ_a): + return True + # Path not found, check memlets + if StateFusionExtended.memlets_intersect(first_state, nodes_first, first_read, second_state, nodes_second, + second_read): + return False + return True + + def _check_paths(self, first_state: SDFGState, second_state: SDFGState, match_nodes: Dict[nodes.AccessNode, + nodes.AccessNode], + nodes_first: List[nodes.AccessNode], nodes_second: List[nodes.AccessNode], + second_input: Set[nodes.AccessNode], first_read: bool, second_read: bool) -> bool: + fail = False + path_found = False + for match in match_nodes: + for node in nodes_first: + path_to = nx.has_path(first_state._nx, node, match) + if not path_to: + continue + path_found = True + node2 = next(n for n in second_input if n.data == match.data) + if not all(nx.has_path(second_state._nx, node2, n) for n in nodes_second): + fail = True + break + if fail or path_found: + break + + # Check for intersection (if None, fusion is ok) + if fail or not path_found: + if StateFusionExtended.memlets_intersect(first_state, nodes_first, first_read, second_state, nodes_second, + second_read): + return False + return True + + def can_be_applied(self, graph, expr_index, sdfg, permissive=False): + first_state: SDFGState = self.first_state + second_state: SDFGState = self.second_state + + out_edges = graph.out_edges(first_state) + 
in_edges = graph.in_edges(first_state) + + # First state must have only one output edge (with dst the second + # state). + if len(out_edges) != 1: + return False + # If both states have more than one incoming edge, some control flow + # may become ambiguous + if len(in_edges) > 1 and graph.in_degree(second_state) > 1: + return False + # The interstate edge must not have a condition. + if not out_edges[0].data.is_unconditional(): + return False + # The interstate edge may have assignments, as long as there are input + # edges to the first state that can absorb them. + if out_edges[0].data.assignments: + if not in_edges: + return False + # Fail if symbol is set before the state to fuse + new_assignments = set(out_edges[0].data.assignments.keys()) + if any((new_assignments & set(e.data.assignments.keys())) for e in in_edges): + return False + # Fail if symbol is used in the dataflow of that state + if len(new_assignments & first_state.free_symbols) > 0: + return False + # Fail if assignments have free symbols that are updated in the + # first state + freesyms = out_edges[0].data.free_symbols + if freesyms and any(n.data in freesyms for n in first_state.nodes() + if isinstance(n, nodes.AccessNode) and first_state.in_degree(n) > 0): + return False + # Fail if symbols assigned on the first edge are free symbols on the + # second edge + symbols_used = set(out_edges[0].data.free_symbols) + for e in in_edges: + if e.data.assignments.keys() & symbols_used: + return False + # Also fail in the inverse; symbols assigned on the second edge are free symbols on the first edge + if new_assignments & set(e.data.free_symbols): + return False + + # There can be no state that have output edges pointing to both the + # first and the second state. Such a case will produce a multi-graph. + for src, _, _ in in_edges: + for _, dst, _ in graph.out_edges(src): + if dst == second_state: + return False + + if not permissive: + # Strict mode that inhibits state fusion if Python callbacks are involved + if Config.get_bool('frontend', 'dont_fuse_callbacks'): + for node in (first_state.data_nodes() + second_state.data_nodes()): + if node.data == '__pystate': + return False + + # NOTE: This is quick fix for MPI Waitall (probably also needed for + # Wait), until we have a better SDFG representation of the buffer + # dependencies. + try: + next(node for node in first_state.nodes() + if (isinstance(node, nodes.LibraryNode) and type(node).__name__ == 'Waitall') + or node.label == '_Waitall_') + return False + except StopIteration: + pass + try: + next(node for node in second_state.nodes() + if (isinstance(node, nodes.LibraryNode) and type(node).__name__ == 'Waitall') + or node.label == '_Waitall_') + return False + except StopIteration: + pass + + # If second state has other input edges, there might be issues + # Exceptions are when none of the states contain dataflow, unless + # the first state is an initial state (in which case the new initial + # state would be ambiguous). + first_in_edges = graph.in_edges(first_state) + second_in_edges = graph.in_edges(second_state) + if ((not second_state.is_empty() or not first_state.is_empty() or len(first_in_edges) == 0) + and len(second_in_edges) != 1): + return False + + # Get connected components. 
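# Aside (not part of this patch): the interstate-edge preconditions checked above can be
# seen on a bare InterstateEdge. Sketch only, assuming an installed dace; names are made up.
from dace import InterstateEdge

plain = InterstateEdge()                                # no condition, no assignments: fusable
assigning = InterstateEdge(assignments={'k': 'i + 1'})  # must be absorbable by an incoming edge
guarded = InterstateEdge(condition='i < N')             # a non-trivial condition blocks fusion

print(plain.is_unconditional())    # True
print(guarded.is_unconditional())  # False
print(assigning.assignments)       # {'k': 'i + 1'}
print(assigning.free_symbols)      # the symbols the assignment reads (here 'i') must not be
                                   # written inside the first state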
+ first_cc = [cc_nodes for cc_nodes in nx.weakly_connected_components(first_state._nx)] + second_cc = [cc_nodes for cc_nodes in nx.weakly_connected_components(second_state._nx)] + + # Find source/sink (data) nodes + first_input = {node for node in first_state.source_nodes() if isinstance(node, nodes.AccessNode)} + first_output = { + node + for node in first_state.scope_children()[None] + if isinstance(node, nodes.AccessNode) and node not in first_input + } + second_input = {node for node in second_state.source_nodes() if isinstance(node, nodes.AccessNode)} + second_output = { + node + for node in second_state.scope_children()[None] + if isinstance(node, nodes.AccessNode) and node not in second_input + } + + # Find source/sink (data) nodes by connected component + first_cc_input = [cc.intersection(first_input) for cc in first_cc] + first_cc_output = [cc.intersection(first_output) for cc in first_cc] + second_cc_input = [cc.intersection(second_input) for cc in second_cc] + second_cc_output = [cc.intersection(second_output) for cc in second_cc] + + # Apply transformation in case all paths to the second state's + # nodes go through the same access node, which implies sequential + # behavior in SDFG semantics. + first_output_names = {node.data for node in first_output} + second_input_names = {node.data for node in second_input} + + # If any second input appears more than once, fail + if len(second_input) > len(second_input_names): + return False + + # If any first output that is an input to the second state + # appears in more than one CC, fail + matches = first_output_names & second_input_names + for match in matches: + cc_appearances = 0 + for cc in first_cc_output: + if len([n for n in cc if n.data == match]) > 0: + cc_appearances += 1 + if cc_appearances > 1: + return False + + # Recreate fused connected component correspondences, and then + # check for hazards + resulting_ccs: List[CCDesc] = StateFusionExtended.find_fused_components(first_cc_input, first_cc_output, + second_cc_input, second_cc_output) + + # Check for data races + for fused_cc in resulting_ccs: + # Write-Write hazard - data is output of both first and second + # states, without a read in between + write_write_candidates = ((fused_cc.first_outputs & fused_cc.second_outputs) - fused_cc.second_inputs) + + # Find the leaf (topological) instances of the matches + order = [ + x for x in reversed(list(nx.topological_sort(first_state._nx))) + if isinstance(x, nodes.AccessNode) and x.data in fused_cc.first_outputs + ] + # Those nodes will be the connection points upon fusion + match_nodes: Dict[nodes.AccessNode, nodes.AccessNode] = { + next(n for n in order + if n.data == match): next(n for n in fused_cc.second_input_nodes if n.data == match) + for match in (fused_cc.first_outputs + & fused_cc.second_inputs) + } + + # If we have potential candidates, check if there is a + # path from the first write to the second write (in that + # case, there is no hazard): + for cand in write_write_candidates: + nodes_first = [n for n in first_output if n.data == cand] + nodes_second = [n for n in second_output if n.data == cand] + + # If there is a path for the candidate that goes through + # the match nodes in both states, there is no conflict + if not self._check_paths(first_state, second_state, match_nodes, nodes_first, nodes_second, + second_input, False, False): + return False + # End of write-write hazard check + + first_inout = fused_cc.first_inputs | fused_cc.first_outputs + for other_cc in resulting_ccs: + # NOTE: Special handling for 
`other_cc is fused_cc` + if other_cc is fused_cc: + # Checking for potential Read-Write data races + for d in first_inout: + if d in other_cc.second_outputs: + nodes_second = [n for n in second_output if n.data == d] + # Read-Write race + if d in fused_cc.first_inputs: + nodes_first = [n for n in first_input if n.data == d] + else: + nodes_first = [] + for n2 in nodes_second: + for e in second_state.in_edges(n2): + path = second_state.memlet_path(e) + src = path[0].src + if src in second_input and src.data in fused_cc.first_outputs: + for n1 in fused_cc.first_output_nodes: + if n1.data == src.data: + for n0 in nodes_first: + if not nx.has_path(first_state._nx, n0, n1): + return False + # Read-write hazard where an access node is connected + # to more than one output at once: (a) -> (b) | (d) -> [code] -> (d) + # \-> (c) | + # in the first state, and the same memory is inout in the second state + # All paths need to lead to `src` + if not self._check_all_paths(first_state, second_state, match_nodes, nodes_first, + nodes_second, True, False): + return False + + continue + # If an input/output of a connected component in the first + # state is an output of another connected component in the + # second state, we have a potential data race (Read-Write + # or Write-Write) + for d in first_inout: + if d in other_cc.second_outputs: + # Check for intersection (if None, fusion is ok) + nodes_second = [n for n in second_output if n.data == d] + # Read-Write race + if d in fused_cc.first_inputs: + nodes_first = [n for n in first_input if n.data == d] + if StateFusionExtended.memlets_intersect(first_state, nodes_first, True, second_state, + nodes_second, False): + self.connections_to_make.append([nodes_first, nodes_second]) + #return False + # Write-Write race + if d in fused_cc.first_outputs: + nodes_first = [n for n in first_output if n.data == d] + if StateFusionExtended.memlets_intersect(first_state, nodes_first, False, second_state, + nodes_second, False): + self.connections_to_make.append([nodes_first, nodes_second]) + #return False + # End of data race check + + # Read-after-write dependencies: if there is an output of the + # second state that is an input of the first, ensure all paths + # from the input of the first state lead to the output. + # Otherwise, there may be a RAW due to topological sort or + # concurrency. 
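# Aside (not part of this patch): the pairs recorded in `connections_to_make` are later
# materialized in `apply` as empty memlets. A self-contained sketch of that happens-before
# device on a toy SDFG, assuming an installed dace:
import dace

sdfg = dace.SDFG('happens_before_sketch')
sdfg.add_array('A', [10], dace.float64)
state = sdfg.add_state()

# Subgraph standing in for the first state: writes A[0].
producer = state.add_tasklet('producer', {}, {'out'}, 'out = 1.0')
first_write = state.add_write('A')
state.add_edge(producer, 'out', first_write, None, dace.Memlet('A[0]'))

# Subgraph standing in for the second state: reads A[0] and writes A[1].
second_read = state.add_read('A')
consumer = state.add_tasklet('consumer', {'inp'}, {'res'}, 'res = inp + 1')
second_write = state.add_write('A')
state.add_edge(second_read, None, consumer, 'inp', dace.Memlet('A[0]'))
state.add_edge(consumer, 'res', second_write, None, dace.Memlet('A[1]'))

# The empty memlet moves no data; it only forces the producer subgraph to execute first.
state.add_nedge(first_write, second_read, dace.Memlet())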
+ second_inout = ((fused_cc.first_inputs | fused_cc.first_outputs) & fused_cc.second_outputs) + for inout in second_inout: + nodes_first = [n for n in match_nodes if n.data == inout] + if any(first_state.out_degree(n) > 0 for n in nodes_first): + return False + + # If we have potential candidates, check if there is a + # path from the first read to the second write (in that + # case, there is no hazard): + nodes_first = { + n + for n in fused_cc.first_input_nodes + | fused_cc.first_output_nodes if n.data == inout + } + nodes_second = {n for n in fused_cc.second_output_nodes if n.data == inout} + + # If there is a path for the candidate that goes through + # the match nodes in both states, there is no conflict + if not self._check_paths(first_state, second_state, match_nodes, nodes_first, nodes_second, + second_input, True, False): + return False + + # End of read-write hazard check + + # Read-after-write dependencies: if there is more than one first + # output with the same data, make sure it can be unambiguously + # connected to the second state + if (len(fused_cc.first_output_nodes) > len(fused_cc.first_outputs)): + for inpnode in fused_cc.second_input_nodes: + found = None + for outnode in fused_cc.first_output_nodes: + if outnode.data != inpnode.data: + continue + if StateFusionExtended.memlets_intersect(first_state, [outnode], False, second_state, [inpnode], + True): + # If found more than once, either there is a + # path from one to another or it is ambiguous + if found is not None: + if nx.has_path(first_state.nx, outnode, found): + # Found is a descendant, continue + continue + elif nx.has_path(first_state.nx, found, outnode): + # New node is a descendant, set as found + found = outnode + else: + # No path: ambiguous match + return False + found = outnode + + # Do not fuse FPGA and NON-FPGA states (unless one of them is empty) + if first_state.number_of_nodes() > 0 and second_state.number_of_nodes() > 0 and sdutil.is_fpga_kernel( + sdfg, first_state) != sdutil.is_fpga_kernel(sdfg, second_state): + return False + + return True + + def apply(self, _, sdfg): + first_state: SDFGState = self.first_state + second_state: SDFGState = self.second_state + + # Remove interstate edge(s) + edges = sdfg.edges_between(first_state, second_state) + for edge in edges: + if edge.data.assignments: + for src, dst, other_data in sdfg.in_edges(first_state): + other_data.assignments.update(edge.data.assignments) + sdfg.remove_edge(edge) + + # Special case 1: first state is empty + if first_state.is_empty(): + sdutil.change_edge_dest(sdfg, first_state, second_state) + sdfg.remove_node(first_state) + if sdfg.start_state == first_state: + sdfg.start_state = sdfg.node_id(second_state) + return + + # Special case 2: second state is empty + if second_state.is_empty(): + sdutil.change_edge_src(sdfg, second_state, first_state) + sdutil.change_edge_dest(sdfg, second_state, first_state) + sdfg.remove_node(second_state) + if sdfg.start_state == second_state: + sdfg.start_state = sdfg.node_id(first_state) + return + + # Normal case: both states are not empty + + # Find source/sink (data) nodes + first_input = [node for node in first_state.source_nodes() if isinstance(node, nodes.AccessNode)] + first_output = [node for node in first_state.sink_nodes() if isinstance(node, nodes.AccessNode)] + second_input = [node for node in second_state.source_nodes() if isinstance(node, nodes.AccessNode)] + + top2 = top_level_nodes(second_state) + + # first input = first input - first output + first_input = [ + node for node in 
first_input if next((x for x in first_output if x.data == node.data), None) is None + ] + + # NOTE: We exclude Views from the process of merging common data nodes because it may lead to double edges. + second_mid = [ + x for x in list(nx.topological_sort(second_state._nx)) if isinstance(x, nodes.AccessNode) + and second_state.out_degree(x) > 0 and not isinstance(sdfg.arrays[x.data], dt.View) + ] + + # Merge second state to first state + # First keep a backup of the topological sorted order of the nodes + sdict = first_state.scope_dict() + order = [ + x for x in reversed(list(nx.topological_sort(first_state._nx))) + if isinstance(x, nodes.AccessNode) and sdict[x] is None + ] + for node in second_state.nodes(): + if isinstance(node, nodes.NestedSDFG): + # update parent information + node.sdfg.parent = first_state + #we only want to add the node once! + try: + first_state.add_node(node) + except: + pass + + for conn in self.connections_to_make: + if node in conn[1]: + for i in top2: + if i not in [nodex for nodex in second_state.source_nodes()]: + continue + paths = second_state.all_nodes_between(i, node) + direct_edges = second_state.edges_between(i, node) + + if ((paths != None and len(paths) > 0) or len(direct_edges) > 0): + for j in conn[0]: + if j in first_output: + first_state.add_nedge(j, i, memlet.Memlet()) + for src, src_conn, dst, dst_conn, data in second_state.edges(): + first_state.add_edge(src, src_conn, dst, dst_conn, data) + + top = top_level_nodes(first_state) + + # Merge common (data) nodes + merged_nodes = set() + for node in second_mid: + + # merge only top level nodes, skip everything else + if node not in top2: + continue + + candidates = [x for x in order if x.data == node.data and x in top and x not in merged_nodes] + source_node = first_state.in_degree(node) == 0 + + # If not source node, try to connect every memlet-intersecting candidate + if not source_node: + for cand in candidates: + if StateFusionExtended.memlets_intersect(first_state, [cand], False, second_state, [node], True): + if nx.has_path(first_state._nx, cand, node): # Do not create cycles + continue + sdutil.change_edge_src(first_state, cand, node) + sdutil.change_edge_dest(first_state, cand, node) + first_state.remove_node(cand) + continue + + if len(candidates) == 0: + continue + elif len(candidates) == 1: + n = candidates[0] + else: + # Choose first candidate that intersects memlets + for cand in candidates: + if StateFusionExtended.memlets_intersect(first_state, [cand], False, second_state, [node], True): + n = cand + break + else: + # No node intersects, use topologically-last node + n = candidates[0] + + sdutil.change_edge_src(first_state, node, n) + sdutil.change_edge_dest(first_state, node, n) + first_state.remove_node(node) + merged_nodes.add(n) + + # Redirect edges and remove second state + sdutil.change_edge_src(sdfg, second_state, first_state) + sdfg.remove_node(second_state) + if sdfg.start_state == second_state: + sdfg.start_state = sdfg.node_id(first_state) diff --git a/tests/transformations/state_fusion_extended_test.py b/tests/transformations/state_fusion_extended_test.py new file mode 100644 index 0000000000..97ba8da2b9 --- /dev/null +++ b/tests/transformations/state_fusion_extended_test.py @@ -0,0 +1,65 @@ +from dace import SDFG, InterstateEdge,Memlet +from dace import dtypes +from dace.transformation.interstate import StateFusionExtended + + +def test_extended_fusion(): + """ + Test the extended state fusion transformation. 
+ It should fuse the two states into one and add a dependency between the two uses of tmp. + """ + sdfg=SDFG('extended_state_fusion_test') + sdfg.add_array('A', [20, 20], dtypes.float64) + sdfg.add_array('B', [20, 20], dtypes.float64) + sdfg.add_array('C', [20, 20], dtypes.float64) + sdfg.add_array('D', [20, 20], dtypes.float64) + sdfg.add_array('E', [20, 20], dtypes.float64) + sdfg.add_array('F', [20, 20], dtypes.float64) + + sdfg.add_scalar('tmp', dtypes.float64) + + strt = sdfg.add_state("start") + mid = sdfg.add_state("middle") + + sdfg.add_edge(strt, mid, InterstateEdge()) + + acc_a = strt.add_read('A') + acc_b = strt.add_read('B') + acc_c = strt.add_write('C') + acc_tmp = strt.add_access('tmp') + + acc2_d = mid.add_read('D') + acc2_e = mid.add_read('E') + acc2_f = mid.add_write('F') + acc2_tmp = mid.add_access('tmp') + + t1 = strt.add_tasklet('t1', {'a', 'b'}, { + 'c', + }, 'c[1,1] = a[1,1] + b[1,1]') + t2 = strt.add_tasklet('t2', {}, { + 'tmpa', + }, 'tmpa=4') + + t3 = mid.add_tasklet('t3', {'d', 'e'}, { + 'f', + }, 'f[1,1] = e[1,1] + d[1,1]') + t4 = mid.add_tasklet('t4', {}, { + 'tmpa', + }, 'tmpa=7') + + strt.add_edge(acc_a, None, t1, 'a', Memlet.simple('A', '1,1')) + strt.add_edge(acc_b, None, t1, 'b', Memlet.simple('B', '1,1')) + strt.add_edge(t1, 'c', acc_c, None, Memlet.simple('C', '1,1')) + strt.add_edge(t2, 'tmpa', acc_tmp, None, Memlet.simple('tmp', '0')) + + mid.add_edge(acc2_d, None, t3, 'd', Memlet.simple('D', '1,1')) + mid.add_edge(acc2_e, None, t3, 'e', Memlet.simple('E', '1,1')) + mid.add_edge(t3, 'f', acc2_f, None, Memlet.simple('F', '1,1')) + mid.add_edge(t4, 'tmpa', acc2_tmp, None, Memlet.simple('tmp', '0')) + sdfg.simplify() + sdfg.apply_transformations_repeated(StateFusionExtended) + assert sdfg.number_of_nodes()==1 + + +if __name__ == '__main__': + test_extended_fusion() From e7aadae52e4d94d3403bcc63fc592db65a4bee4b Mon Sep 17 00:00:00 2001 From: Philipp Schaad Date: Thu, 8 Jun 2023 18:15:17 +0200 Subject: [PATCH 098/392] Fix and update requirements.txt (#1270) --- dace/version.py | 2 +- requirements.txt | 41 ++++++++++++++++++++--------------------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/dace/version.py b/dace/version.py index c41af0b6a1..62ea085b13 100644 --- a/dace/version.py +++ b/dace/version.py @@ -1 +1 @@ -__version__ = '0.14.2' +__version__ = '0.14.3' diff --git a/requirements.txt b/requirements.txt index 5e3b265a26..d75a38ac75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,26 +1,25 @@ -aenum==3.1.11 +aenum==3.1.12 astunparse==1.6.3 -chardet==3.0.4 -click==7.1.2 -cmake==3.18.2.post1 -decorator==4.4.2 -distro==1.5.0 +blinker==1.6.2 +certifi==2023.5.7 +charset-normalizer==3.1.0 +click==8.1.3 +dill==0.3.6 Flask==2.3.2 -idna==2.10 -itsdangerous==2.0.0a1 -Jinja2==3.0.0a1 -MarkupSafe==2.0.0a1 +idna==3.4 +importlib-metadata==6.6.0 +itsdangerous==2.1.2 +Jinja2==3.1.2 +MarkupSafe==2.1.3 mpmath==1.3.0 -networkx==2.5 -numpy>=1.21 -packaging==20.4 +networkx==3.1 +numpy==1.24.3 ply==3.11 -pyparsing==3.0.0a2 -PyYAML==5.4 +PyYAML==6.0 requests==2.31.0 -scikit-build==0.11.1 -six==1.15.0 -sympy==1.7 -urllib3==1.26.5 -websockets==9.1 -Werkzeug==2.2.3 +six==1.16.0 +sympy==1.9 +urllib3==2.0.3 +websockets==11.0.3 +Werkzeug==2.3.5 +zipp==3.15.0 From 3dcc04c36933c4f60be0be7ef6884cc268cd75af Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Fri, 9 Jun 2023 13:38:12 +0200 Subject: [PATCH 099/392] yapf --- .../interstate/state_fusion_with_happens_before.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/dace/transformation/interstate/state_fusion_with_happens_before.py b/dace/transformation/interstate/state_fusion_with_happens_before.py index a3d7d421ab..ecaeb988b4 100644 --- a/dace/transformation/interstate/state_fusion_with_happens_before.py +++ b/dace/transformation/interstate/state_fusion_with_happens_before.py @@ -34,7 +34,9 @@ def top_level_nodes(state: SDFGState): class StateFusionExtended(transformation.MultiStateTransformation): """ Implements the state-fusion transformation extended to fuse states with RAW and WAW dependencies. An empty memlet is used to represent a dependency between two subgraphs with RAW and WAW dependencies. - The merge is made by identifying the source in the first state and the sink in the second state, and linking the bottom of the appropriate source subgraph in the first state with the top of the appropriate sink subgraph in the second state. + The merge is made by identifying the source in the first state and the sink in the second state, + and linking the bottom of the appropriate source subgraph in the first state with the top of the + appropriate sink subgraph in the second state. State-fusion takes two states that are connected through a single edge, and fuses them into one state. If permissive, also applies if potential memory From 4d3031b74372391dc4a79631064fb249deeb9645 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Fri, 9 Jun 2023 14:59:22 +0200 Subject: [PATCH 100/392] removing redundant try/catch --- .../interstate/state_fusion_with_happens_before.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dace/transformation/interstate/state_fusion_with_happens_before.py b/dace/transformation/interstate/state_fusion_with_happens_before.py index ecaeb988b4..a5fbd814ac 100644 --- a/dace/transformation/interstate/state_fusion_with_happens_before.py +++ b/dace/transformation/interstate/state_fusion_with_happens_before.py @@ -520,11 +520,8 @@ def apply(self, _, sdfg): if isinstance(node, nodes.NestedSDFG): # update parent information node.sdfg.parent = first_state - #we only want to add the node once! 
- try: - first_state.add_node(node) - except: - pass + + first_state.add_node(node) for conn in self.connections_to_make: if node in conn[1]: From 5e3b3022aa994e9926755d0d140984004f514773 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Sun, 11 Jun 2023 12:47:10 +0200 Subject: [PATCH 101/392] fix for adding duplicate nodes --- .../state_fusion_with_happens_before.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dace/transformation/interstate/state_fusion_with_happens_before.py b/dace/transformation/interstate/state_fusion_with_happens_before.py index a5fbd814ac..4c6ad3c992 100644 --- a/dace/transformation/interstate/state_fusion_with_happens_before.py +++ b/dace/transformation/interstate/state_fusion_with_happens_before.py @@ -141,7 +141,7 @@ def _check_all_paths(self, first_state: SDFGState, second_state: SDFGState, return True # Path not found, check memlets if StateFusionExtended.memlets_intersect(first_state, nodes_first, first_read, second_state, nodes_second, - second_read): + second_read): return False return True @@ -167,7 +167,7 @@ def _check_paths(self, first_state: SDFGState, second_state: SDFGState, match_no # Check for intersection (if None, fusion is ok) if fail or not path_found: if StateFusionExtended.memlets_intersect(first_state, nodes_first, first_read, second_state, nodes_second, - second_read): + second_read): return False return True @@ -307,7 +307,7 @@ def can_be_applied(self, graph, expr_index, sdfg, permissive=False): # Recreate fused connected component correspondences, and then # check for hazards resulting_ccs: List[CCDesc] = StateFusionExtended.find_fused_components(first_cc_input, first_cc_output, - second_cc_input, second_cc_output) + second_cc_input, second_cc_output) # Check for data races for fused_cc in resulting_ccs: @@ -387,14 +387,14 @@ def can_be_applied(self, graph, expr_index, sdfg, permissive=False): if d in fused_cc.first_inputs: nodes_first = [n for n in first_input if n.data == d] if StateFusionExtended.memlets_intersect(first_state, nodes_first, True, second_state, - nodes_second, False): + nodes_second, False): self.connections_to_make.append([nodes_first, nodes_second]) #return False # Write-Write race if d in fused_cc.first_outputs: nodes_first = [n for n in first_output if n.data == d] if StateFusionExtended.memlets_intersect(first_state, nodes_first, False, second_state, - nodes_second, False): + nodes_second, False): self.connections_to_make.append([nodes_first, nodes_second]) #return False # End of data race check @@ -437,8 +437,8 @@ def can_be_applied(self, graph, expr_index, sdfg, permissive=False): for outnode in fused_cc.first_output_nodes: if outnode.data != inpnode.data: continue - if StateFusionExtended.memlets_intersect(first_state, [outnode], False, second_state, [inpnode], - True): + if StateFusionExtended.memlets_intersect(first_state, [outnode], False, second_state, + [inpnode], True): # If found more than once, either there is a # path from one to another or it is ambiguous if found is not None: @@ -520,9 +520,11 @@ def apply(self, _, sdfg): if isinstance(node, nodes.NestedSDFG): # update parent information node.sdfg.parent = first_state - - first_state.add_node(node) - + + #The node could have been added when adding connections by add_nedge hence the need to check + if node not in first_state.nodes(): + first_state.add_node(node) + for conn in self.connections_to_make: if node in conn[1]: for i in top2: From 87c2cf7c5170efceee5eceea4be047d2a545d400 Mon Sep 17 00:00:00 2001 
From: Tal Ben-Nun Date: Sun, 11 Jun 2023 08:29:55 -0700 Subject: [PATCH 102/392] Add support for Python 3.11 --- .github/workflows/general-ci.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/general-ci.yml b/.github/workflows/general-ci.yml index c52380d6ff..138726ef1d 100644 --- a/.github/workflows/general-ci.yml +++ b/.github/workflows/general-ci.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7,'3.10'] + python-version: [3.7,'3.11'] simplify: [0,1,autoopt] steps: diff --git a/setup.py b/setup.py index 12562c2a85..10bce83184 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", ], - python_requires='>=3.6, <3.11', + python_requires='>=3.6, <3.12', packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), package_data={ '': [ From 57521150d27b21af96250f70254d4545d9684318 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Mon, 12 Jun 2023 00:39:58 +0200 Subject: [PATCH 103/392] initial commit with commented, formatted core of the fortran frontend --- dace/frontend/fortran/ast_components.py | 1034 ++++++++++++++++ dace/frontend/fortran/ast_internal_classes.py | 358 ++++++ dace/frontend/fortran/ast_transforms.py | 905 ++++++++++++++ dace/frontend/fortran/ast_utils.py | 363 ++++++ dace/frontend/fortran/fortran_parser.py | 1061 +++++++++++++++++ 5 files changed, 3721 insertions(+) create mode 100644 dace/frontend/fortran/ast_components.py create mode 100644 dace/frontend/fortran/ast_internal_classes.py create mode 100644 dace/frontend/fortran/ast_transforms.py create mode 100644 dace/frontend/fortran/ast_utils.py create mode 100644 dace/frontend/fortran/fortran_parser.py diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py new file mode 100644 index 0000000000..6e83d6c477 --- /dev/null +++ b/dace/frontend/fortran/ast_components.py @@ -0,0 +1,1034 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +from fparser.two import Fortran2008 +from fparser.two import Fortran2003 +from fparser.two import symbol_table + +import copy +from dace.frontend.fortran import ast_internal_classes +from dace.frontend.fortran.ast_internal_classes import FNode, Name_Node +from typing import Any, List, Tuple, Type, TypeVar, Union, overload + +#We rely on fparser to provide an initial AST and convert to a version that is more suitable for our purposes + +# The following class is used to translate the fparser AST to our own AST of Fortran +# the supported_fortran_types dictionary is used to determine which types are supported by our compiler +# for each entry in the dictionary, the key is the name of the class in the fparser AST and the value is the name of the function that will be used to translate the fparser AST to our AST +# the functions return an object of the class that is the name of the key in the dictionary with _Node appended to it to ensure it is diferentietated from the fparser AST +FASTNode = Any +T = TypeVar('T') + + +@overload +def get_child(node: Union[FASTNode, List[FASTNode]], child_type: str) -> FASTNode: + ... + + +@overload +def get_child(node: Union[FASTNode, List[FASTNode]], child_type: Type[T]) -> T: + ... 
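# Aside (not part of this patch): the `@overload` stubs above exist purely for type
# checkers. A tiny dace-independent sketch of the same pattern, with made-up names:
from typing import List, Type, TypeVar, overload

_T = TypeVar('_T')

class _Name:  # stand-in for an fparser node class
    pass

@overload
def _first_of(children: List[object], kind: str) -> object: ...
@overload
def _first_of(children: List[object], kind: Type[_T]) -> _T: ...

def _first_of(children, kind):
    # Accept either a class or its name, just like get_child below.
    wanted = kind if isinstance(kind, str) else kind.__name__
    return next(c for c in children if type(c).__name__ == wanted)

node = _first_of([_Name()], _Name)  # a type checker infers _Name here rather than object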
+ + +def get_child(node: Union[FASTNode, List[FASTNode]], child_type: Union[str, Type[T], List[Type[T]]]): + if isinstance(node, list): + children = node + else: + children = node.children + + if not isinstance(child_type, str) and not isinstance(child_type, list): + child_type = child_type.__name__ + children_of_type = list(filter(lambda child: child.__class__.__name__ == child_type, children)) + + elif isinstance(child_type, list): + if all(isinstance(i, str) for i in child_type): + child_types = [i for i in child_type] + else: + child_types = [i.__name__ for i in child_type] + children_of_type = list(filter(lambda child: child.__class__.__name__ in child_types, children)) + + if len(children_of_type) == 1: + return children_of_type[0] + raise ValueError('Expected only one child of type {} but found {}'.format(child_type, children_of_type)) + + +@overload +def get_children(node: Union[FASTNode, List[FASTNode]], child_type: str) -> List[FASTNode]: + ... + + +@overload +def get_children(node: Union[FASTNode, List[FASTNode]], child_type: Type[T]) -> List[T]: + ... + + +def get_children(node: Union[FASTNode, List[FASTNode]], child_type: Union[str, Type[T], List[Type[T]]]): + if isinstance(node, list): + children = node + else: + children = node.children + + if not isinstance(child_type, str) and not isinstance(child_type, list): + child_type = child_type.__name__ + children_of_type = list(filter(lambda child: child.__class__.__name__ == child_type, children)) + + elif isinstance(child_type, list): + child_types = [i.__name__ for i in child_type] + children_of_type = list(filter(lambda child: child.__class__.__name__ in child_types, children)) + + elif isinstance(child_type, str): + children_of_type = list(filter(lambda child: child.__class__.__name__ == child_type, children)) + + return children_of_type + + +def get_line(node: FASTNode): + line = None + if node.item is not None and hasattr(node.item, "span"): + line = node.item.span + else: + tmp = node + while tmp.parent is not None: + tmp = tmp.parent + if tmp.item is not None and hasattr(tmp.item, "span"): + line = tmp.item.span + break + return line + + +class InternalFortranAst: + """ + This class is used to translate the fparser AST to our own AST of Fortran + the supported_fortran_types dictionary is used to determine which types are supported by our compiler + for each entry in the dictionary, the key is the name of the class in the fparser AST and the value + is the name of the function that will be used to translate the fparser AST to our AST + """ + def __init__(self, ast: Fortran2003.Program, tables: symbol_table.SymbolTables): + """ + Initialization of the AST converter + :param ast: the fparser AST + :param tables: the symbol table of the fparser AST + + """ + self.ast = ast + self.tables = tables + self.functions_and_subroutines = [] + self.symbols = {} + self.types = { + "LOGICAL": "BOOL", + "CHARACTER": "CHAR", + "INTEGER": "INTEGER", + "INTEGER4": "INTEGER", + "REAL4": "REAL", + "REAL8": "DOUBLE", + "DOUBLE PRECISION": "DOUBLE", + "REAL": "REAL", + } + self.supported_fortran_syntax = { + "str": self.str_node, + "tuple": self.tuple_node, + "Program": self.program, + "Main_Program": self.main_program, + "Program_Stmt": self.program_stmt, + "End_Program_Stmt": self.end_program_stmt, + "Subroutine_Subprogram": self.subroutine_subprogram, + "Function_Subprogram": self.function_subprogram, + "Subroutine_Stmt": self.subroutine_stmt, + "Function_Stmt": self.function_stmt, + "End_Subroutine_Stmt": self.end_subroutine_stmt, + 
"End_Function_Stmt": self.end_function_stmt, + "Module": self.module, + "Module_Stmt": self.module_stmt, + "End_Module_Stmt": self.end_module_stmt, + "Use_Stmt": self.use_stmt, + "Implicit_Part": self.implicit_part, + "Implicit_Stmt": self.implicit_stmt, + "Implicit_None_Stmt": self.implicit_none_stmt, + "Implicit_Part_Stmt": self.implicit_part_stmt, + "Declaration_Construct": self.declaration_construct, + "Declaration_Type_Spec": self.declaration_type_spec, + "Type_Declaration_Stmt": self.type_declaration_stmt, + "Entity_Decl": self.entity_decl, + "Array_Spec": self.array_spec, + "Ac_Value_List": self.ac_value_list, + "Array_Constructor": self.array_constructor, + "Loop_Control": self.loop_control, + "Block_Nonlabel_Do_Construct": self.block_nonlabel_do_construct, + "Real_Literal_Constant": self.real_literal_constant, + "Subscript_Triplet": self.subscript_triplet, + "Section_Subscript_List": self.section_subscript_list, + "Explicit_Shape_Spec_List": self.explicit_shape_spec_list, + "Explicit_Shape_Spec": self.explicit_shape_spec, + "Type_Attr_Spec": self.type_attr_spec, + "Attr_Spec": self.attr_spec, + "Intent_Spec": self.intent_spec, + "Access_Spec": self.access_spec, + "Allocatable_Stmt": self.allocatable_stmt, + "Asynchronous_Stmt": self.asynchronous_stmt, + "Bind_Stmt": self.bind_stmt, + "Common_Stmt": self.common_stmt, + "Data_Stmt": self.data_stmt, + "Dimension_Stmt": self.dimension_stmt, + "External_Stmt": self.external_stmt, + "Intent_Stmt": self.intent_stmt, + "Intrinsic_Stmt": self.intrinsic_stmt, + "Optional_Stmt": self.optional_stmt, + "Parameter_Stmt": self.parameter_stmt, + "Pointer_Stmt": self.pointer_stmt, + "Protected_Stmt": self.protected_stmt, + "Save_Stmt": self.save_stmt, + "Target_Stmt": self.target_stmt, + "Value_Stmt": self.value_stmt, + "Volatile_Stmt": self.volatile_stmt, + "Execution_Part": self.execution_part, + "Execution_Part_Construct": self.execution_part_construct, + "Action_Stmt": self.action_stmt, + "Assignment_Stmt": self.assignment_stmt, + "Pointer_Assignment_Stmt": self.pointer_assignment_stmt, + "Where_Stmt": self.where_stmt, + "Forall_Stmt": self.forall_stmt, + "Where_Construct": self.where_construct, + "Where_Construct_Stmt": self.where_construct_stmt, + "Masked_Elsewhere_Stmt": self.masked_elsewhere_stmt, + "Elsewhere_Stmt": self.elsewhere_stmt, + "End_Where_Stmt": self.end_where_stmt, + "Forall_Construct": self.forall_construct, + "Forall_Header": self.forall_header, + "Forall_Triplet_Spec": self.forall_triplet_spec, + "Forall_Stmt": self.forall_stmt, + "End_Forall_Stmt": self.end_forall_stmt, + "Arithmetic_If_Stmt": self.arithmetic_if_stmt, + "If_Construct": self.if_construct, + "If_Stmt": self.if_stmt, + "If_Then_Stmt": self.if_then_stmt, + "Else_If_Stmt": self.else_if_stmt, + "Else_Stmt": self.else_stmt, + "End_If_Stmt": self.end_if_stmt, + "Case_Construct": self.case_construct, + "Select_Case_Stmt": self.select_case_stmt, + "Case_Stmt": self.case_stmt, + "End_Select_Stmt": self.end_select_stmt, + "Do_Construct": self.do_construct, + "Label_Do_Stmt": self.label_do_stmt, + "Nonlabel_Do_Stmt": self.nonlabel_do_stmt, + "End_Do_Stmt": self.end_do_stmt, + "Interface_Block": self.interface_block, + "Interface_Stmt": self.interface_stmt, + "End_Interface_Stmt": self.end_interface_stmt, + "Generic_Spec": self.generic_spec, + "Name": self.name, + "Type_Name": self.type_name, + "Specification_Part": self.specification_part, + "Intrinsic_Type_Spec": self.intrinsic_type_spec, + "Entity_Decl_List": self.entity_decl_list, + "Int_Literal_Constant": 
self.int_literal_constant, + "Logical_Literal_Constant": self.logical_literal_constant, + "Actual_Arg_Spec_List": self.actual_arg_spec_list, + "Attr_Spec_List": self.attr_spec_list, + "Initialization": self.initialization, + "Procedure_Declaration_Stmt": self.procedure_declaration_stmt, + "Type_Bound_Procedure_Part": self.type_bound_procedure_part, + "Contains_Stmt": self.contains_stmt, + "Call_Stmt": self.call_stmt, + "Return_Stmt": self.return_stmt, + "Stop_Stmt": self.stop_stmt, + "Dummy_Arg_List": self.dummy_arg_list, + "Part_Ref": self.part_ref, + "Level_2_Expr": self.level_2_expr, + "Equiv_Operand": self.level_2_expr, + "Level_3_Expr": self.level_2_expr, + "Level_4_Expr": self.level_2_expr, + "Add_Operand": self.level_2_expr, + "Or_Operand": self.level_2_expr, + "And_Operand": self.level_2_expr, + "Level_2_Unary_Expr": self.level_2_expr, + "Mult_Operand": self.power_expr, + "Parenthesis": self.parenthesis_expr, + "Intrinsic_Name": self.intrinsic_name, + "Intrinsic_Function_Reference": self.intrinsic_function_reference, + "Only_List": self.only_list, + "Structure_Constructor": self.structure_constructor, + "Component_Spec_List": self.component_spec_list, + "Write_Stmt": self.write_stmt, + } + + def list_tables(self): + for i in self.tables._symbol_tables: + print(i) + + def create_children(self, node: FASTNode): + return [self.create_ast(child) + for child in node] if isinstance(node, + (list, + tuple)) else [self.create_ast(child) for child in node.children] + + + def create_ast(self, node=None): + """ + Creates an AST from a FASTNode + :param node: FASTNode + :note: this is a recursive function, and relies on the dictionary of supported syntax to call the correct converter functions + """ + if node is not None: + if isinstance(node, (list, tuple)): + return [self.create_ast(child) for child in node] + return self.supported_fortran_syntax[type(node).__name__](node) + return None + + def write_stmt(self, node: FASTNode): + children = self.create_children(node.children[1]) + line = get_line(node) + return ast_internal_classes.Write_Stmt_Node(args=children, line_number=line) + + def program(self, node: FASTNode): + children = self.create_children(node) + + main_program = get_child(children, ast_internal_classes.Main_Program_Node) + + function_definitions = [i for i in children if isinstance(i, ast_internal_classes.Function_Subprogram_Node)] + + subroutine_definitions = [i for i in children if isinstance(i, ast_internal_classes.Subroutine_Subprogram_Node)] + modules = [node for node in children if isinstance(node, ast_internal_classes.Module_Node)] + + return ast_internal_classes.Program_Node(main_program=main_program, + function_definitions=function_definitions, + subroutine_definitions=subroutine_definitions, + modules=modules) + + def main_program(self, node: FASTNode): + children = self.create_children(node) + + name = get_child(children, ast_internal_classes.Program_Stmt_Node) + specification_part = get_child(children, ast_internal_classes.Specification_Part_Node) + execution_part = get_child(children, ast_internal_classes.Execution_Part_Node) + + return ast_internal_classes.Main_Program_Node(name=name, + specification_part=specification_part, + execution_part=execution_part) + + def program_stmt(self, node: FASTNode): + children = self.create_children(node) + name = get_child(children, Name_Node) + return ast_internal_classes.Program_Stmt_Node(name=name, line_number=node.item.span) + + def subroutine_subprogram(self, node: FASTNode): + children = self.create_children(node) + + name 
= get_child(children, ast_internal_classes.Subroutine_Stmt_Node) + specification_part = get_child(children, ast_internal_classes.Specification_Part_Node) + execution_part = get_child(children, ast_internal_classes.Execution_Part_Node) + return_type = ast_internal_classes.Void + return ast_internal_classes.Subroutine_Subprogram_Node( + name=name.name, + args=name.args, + specification_part=specification_part, + execution_part=execution_part, + type=return_type, + line_number=name.line_number, + ) + + def end_program_stmt(self, node: FASTNode): + return node + + def only_list(self, node: FASTNode): + children = self.create_children(node) + names = [i for i in children if isinstance(i, ast_internal_classes.Name_Node)] + return ast_internal_classes.Only_List_Node(names=names) + + def function_subprogram(self, node: FASTNode): + raise NotImplementedError("Function subprograms are not supported yet") + + def subroutine_stmt(self, node: FASTNode): + children = self.create_children(node) + name = get_child(children, ast_internal_classes.Name_Node) + args = get_child(children, ast_internal_classes.Arg_List_Node) + return ast_internal_classes.Subroutine_Stmt_Node(name=name, args=args.args, line_number=node.item.span) + + def ac_value_list(self, node: FASTNode): + children = self.create_children(node) + return ast_internal_classes.Ac_Value_List_Node(value_list=children) + + def power_expr(self, node: FASTNode): + children = self.create_children(node) + line = get_line(node) + #child 0 is the base, child 2 is the exponent + #child 1 is "**" + return ast_internal_classes.Call_Expr_Node(name=ast_internal_classes.Name_Node(name="pow"), + args=[children[0], children[2]], + line_number=line) + + def array_constructor(self, node: FASTNode): + children = self.create_children(node) + value_list = get_child(children, ast_internal_classes.Ac_Value_List_Node) + return ast_internal_classes.Array_Constructor_Node(value_list=value_list.value_list) + + def structure_constructor(self, node: FASTNode): + children = self.create_children(node) + name = get_child(children, ast_internal_classes.Type_Name_Node) + args = get_child(children, ast_internal_classes.Component_Spec_List_Node) + return ast_internal_classes.Structure_Constructor_Node(name=name, args=args.args, type=None) + + def intrinsic_name(self, node: FASTNode): + name = node.string + replacements = { + "INT": "__dace_int", + "DBLE": "__dace_dble", + "SQRT": "sqrt", + "COSH": "cosh", + "ABS": "abs", + "MIN": "min", + "MAX": "max", + "EXP": "exp", + "EPSILON": "__dace_epsilon", + "TANH": "tanh", + "SUM": "__dace_sum", + "SIGN": "__dace_sign", + "EXP": "exp", + "SELECTED_INT_KIND": "__dace_selected_int_kind", + "SELECTED_REAL_KIND": "__dace_selected_real_kind", + } + return ast_internal_classes.Name_Node(name=replacements[name]) + + def intrinsic_function_reference(self, node: FASTNode): + children = self.create_children(node) + line = get_line(node) + name = get_child(children, ast_internal_classes.Name_Node) + args = get_child(children, ast_internal_classes.Arg_List_Node) + if name.name == "__dace_selected_int_kind": + import math + return ast_internal_classes.Int_Literal_Node(value=str( + math.ceil((math.log2(math.pow(10, int(args.args[0].value))) + 1) / 8)), + line_number=line) + # TODO This needs a better translation + elif name.name == "__dace_selected_real_kind": + if args.args[0].value == '13' and args.args[1].value == '300': + return ast_internal_classes.Int_Literal_Node(value="8", line_number=line) + elif args.args[0].value == '2' and args.args[1].value 
== '1': + return ast_internal_classes.Int_Literal_Node(value="4", line_number=line) + elif args.args[0].value == '4' and args.args[1].value == '2': + return ast_internal_classes.Int_Literal_Node(value="4", line_number=line) + elif args.args[0].value == '6' and args.args[1].value == '37': + return ast_internal_classes.Int_Literal_Node(value="4", line_number=line) + else: + raise NotImplementedError("Only real*8 is supported") + func_types = { + "__dace_int": "INT", + "__dace_dble": "DOUBLE", + "sqrt": "DOUBLE", + "cosh": "DOUBLE", + "abs": "DOUBLE", + "min": "DOUBLE", + "max": "DOUBLE", + "exp": "DOUBLE", + "__dace_epsilon": "DOUBLE", + "tanh": "DOUBLE", + "__dace_sum": "DOUBLE", + "__dace_sign": "DOUBLE", + "exp": "DOUBLE", + "__dace_selected_int_kind": "INT", + "__dace_selected_real_kind": "INT", + } + call_type = func_types[name.name] + return ast_internal_classes.Call_Expr_Node(name=name, type=call_type, args=args.args, line_number=line) + + def function_stmt(self, node: FASTNode): + raise NotImplementedError( + "Function statements are not supported yet - at least not if defined this way. Not encountered in code yet." + ) + + def end_subroutine_stmt(self, node: FASTNode): + return node + + def end_function_stmt(self, node: FASTNode): + return node + + def parenthesis_expr(self, node: FASTNode): + children = self.create_children(node) + return ast_internal_classes.Parenthesis_Expr_Node(expr=children[1]) + + def module(self, node: FASTNode): + children = self.create_children(node) + name = get_child(children, ast_internal_classes.Module_Stmt_Node) + specification_part = get_child(children, ast_internal_classes.Specification_Part_Node) + + function_definitions = [i for i in children if isinstance(i, ast_internal_classes.Function_Subprogram_Node)] + + subroutine_definitions = [i for i in children if isinstance(i, ast_internal_classes.Subroutine_Subprogram_Node)] + return ast_internal_classes.Module_Node( + name=name.name, + specification_part=specification_part, + function_definitions=function_definitions, + subroutine_definitions=subroutine_definitions, + line_number=name.line_number, + ) + + def module_stmt(self, node: FASTNode): + children = self.create_children(node) + name = get_child(children, ast_internal_classes.Name_Node) + return ast_internal_classes.Module_Stmt_Node(name=name, line_number=node.item.span) + + def end_module_stmt(self, node: FASTNode): + return node + + def use_stmt(self, node: FASTNode): + children = self.create_children(node) + name = get_child(children, ast_internal_classes.Name_Node) + only_list = get_child(children, ast_internal_classes.Only_List_Node) + return ast_internal_classes.Use_Stmt_Node(name=name.name, list=only_list.names) + + def implicit_part(self, node: FASTNode): + return node + + def implicit_stmt(self, node: FASTNode): + return node + + def implicit_none_stmt(self, node: FASTNode): + return node + + def implicit_part_stmt(self, node: FASTNode): + return node + + def declaration_construct(self, node: FASTNode): + raise NotImplementedError("Declaration constructs are not supported yet") + return node + + def declaration_type_spec(self, node: FASTNode): + raise NotImplementedError("Declaration type spec is not supported yet") + return node + + def type_declaration_stmt(self, node: FASTNode): + + #decide if its a intrinsic variable type or a derived type + + type_of_node = get_child(node, [Fortran2003.Intrinsic_Type_Spec, Fortran2003.Declaration_Type_Spec]) + + if isinstance(type_of_node, Fortran2003.Intrinsic_Type_Spec): + derived_type = False + 
basetype = type_of_node.items[0] + elif isinstance(type_of_node, Fortran2003.Declaration_Type_Spec): + derived_type = True + basetype = type_of_node.items[1].string + else: + raise TypeError("Type of node must be either Intrinsic_Type_Spec or Declaration_Type_Spec") + kind = None + if len(type_of_node.items) >= 2: + if type_of_node.items[1] is not None: + if not derived_type: + kind = type_of_node.items[1].items[1].string + if self.symbols[kind] is not None: + if basetype == "REAL": + if self.symbols[kind].value == "8": + basetype = "REAL8" + elif basetype == "INTEGER": + if self.symbols[kind].value == "4": + basetype = "INTEGER" + else: + raise TypeError("Derived type not supported") + else: + raise TypeError("Derived type not supported") + if derived_type: + raise TypeError("Derived type not supported") + if not derived_type: + testtype = self.types[basetype] + else: + + testtype = basetype + + # get the names of the variables being defined + names_list = get_child(node, ["Entity_Decl_List", "Component_Decl_List"]) + + #get the names out of the name list + names = get_children(names_list, [Fortran2003.Entity_Decl, Fortran2003.Component_Decl]) + + #get the attributes of the variables being defined + # alloc relates to whether it is statically (False) or dynamically (True) allocated + # parameter means its a constant, so we should transform it into a symbol + attributes = get_children(node, "Attr_Spec_List") + + alloc = False + symbol = False + for i in attributes: + if i.string.lower() == "allocatable": + alloc = True + if i.string.lower() == "parameter": + symbol = True + + vardecls = [] + + for var in names: + #first handle dimensions + size = None + var_components = self.create_children(var) + array_sizes = get_children(var, "Explicit_Shape_Spec_List") + actual_name = get_child(var_components, ast_internal_classes.Name_Node) + if len(array_sizes) == 1: + array_sizes = array_sizes[0] + size = [] + for dim in array_sizes.children: + #sanity check + if isinstance(dim, Fortran2003.Explicit_Shape_Spec): + dim_expr = [i for i in dim.children if i is not None] + if len(dim_expr) == 1: + dim_expr = dim_expr[0] + #now to add the dimension to the size list after processing it if necessary + size.append(self.create_ast(dim_expr)) + else: + raise TypeError("Array dimension must be a single expression") + #handle initializiation + init = None + + initialization = get_children(var, Fortran2003.Initialization) + if len(initialization) == 1: + initialization = initialization[0] + #if there is an initialization, the actual expression is in the second child, with the first being the equals sign + if len(initialization.children) < 2: + raise ValueError("Initialization must have an expression") + raw_init = initialization.children[1] + init = self.create_ast(raw_init) + + if symbol == False: + + vardecls.append( + ast_internal_classes.Var_Decl_Node(name=actual_name.name, + type=testtype, + alloc=alloc, + sizes=size, + kind=kind, + line_number=node.item.span)) + else: + if size is None: + self.symbols[actual_name.name] = init + vardecls.append( + ast_internal_classes.Symbol_Decl_Node(name=actual_name.name, + type=testtype, + alloc=alloc, + init=init, + line_number=node.item.span)) + else: + vardecls.append( + ast_internal_classes.Symbol_Array_Decl_Node(name=actual_name.name, + type=testtype, + alloc=alloc, + sizes=size, + kind=kind, + init=init, + line_number=node.item.span)) + + return ast_internal_classes.Decl_Stmt_Node(vardecl=vardecls, line_number=node.item.span) + + def entity_decl(self, node: 
FASTNode): + raise NotImplementedError("Entity decl is not supported yet") + + def array_spec(self, node: FASTNode): + raise NotImplementedError("Array spec is not supported yet") + return node + + def explicit_shape_spec_list(self, node: FASTNode): + return node + + def explicit_shape_spec(self, node: FASTNode): + return node + + def type_attr_spec(self, node: FASTNode): + return node + + def attr_spec(self, node: FASTNode): + return node + + def intent_spec(self, node: FASTNode): + raise NotImplementedError("Intent spec is not supported yet") + return node + + def access_spec(self, node: FASTNode): + raise NotImplementedError("Access spec is not supported yet") + return node + + def allocatable_stmt(self, node: FASTNode): + raise NotImplementedError("Allocatable stmt is not supported yet") + return node + + def asynchronous_stmt(self, node: FASTNode): + raise NotImplementedError("Asynchronous stmt is not supported yet") + return node + + def bind_stmt(self, node: FASTNode): + raise NotImplementedError("Bind stmt is not supported yet") + return node + + def common_stmt(self, node: FASTNode): + raise NotImplementedError("Common stmt is not supported yet") + return node + + def data_stmt(self, node: FASTNode): + raise NotImplementedError("Data stmt is not supported yet") + return node + + def dimension_stmt(self, node: FASTNode): + raise NotImplementedError("Dimension stmt is not supported yet") + return node + + def external_stmt(self, node: FASTNode): + raise NotImplementedError("External stmt is not supported yet") + return node + + def intent_stmt(self, node: FASTNode): + return node + + def intrinsic_stmt(self, node: FASTNode): + return node + + def optional_stmt(self, node: FASTNode): + return node + + def parameter_stmt(self, node: FASTNode): + return node + + def pointer_stmt(self, node: FASTNode): + return node + + def protected_stmt(self, node: FASTNode): + return node + + def save_stmt(self, node: FASTNode): + return node + + def target_stmt(self, node: FASTNode): + return node + + def value_stmt(self, node: FASTNode): + return node + + def volatile_stmt(self, node: FASTNode): + return node + + def execution_part(self, node: FASTNode): + children = self.create_children(node) + return ast_internal_classes.Execution_Part_Node(execution=children) + + def execution_part_construct(self, node: FASTNode): + return node + + def action_stmt(self, node: FASTNode): + return node + + def level_2_expr(self, node: FASTNode): + children = self.create_children(node) + line = get_line(node) + if len(children) == 3: + return ast_internal_classes.BinOp_Node(lval=children[0], op=children[1], rval=children[2], line_number=line) + else: + return ast_internal_classes.UnOp_Node(lval=children[1], op=children[0], line_number=line) + + def assignment_stmt(self, node: FASTNode): + children = self.create_children(node) + line = get_line(node) + + if len(children) == 3: + return ast_internal_classes.BinOp_Node(lval=children[0], op=children[1], rval=children[2], line_number=line) + else: + return ast_internal_classes.UnOp_Node(lval=children[1], op=children[0], line_number=line) + + def pointer_assignment_stmt(self, node: FASTNode): + return node + + def where_stmt(self, node: FASTNode): + return node + + def forall_stmt(self, node: FASTNode): + return node + + def where_construct(self, node: FASTNode): + return node + + def where_construct_stmt(self, node: FASTNode): + return node + + def masked_elsewhere_stmt(self, node: FASTNode): + return node + + def elsewhere_stmt(self, node: FASTNode): + return node + + 
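# Aside (not part of this patch): the tree that `assignment_stmt` and `level_2_expr` above
# produce for a Fortran statement such as `c = a + b`, sketched with the node classes this
# patch introduces (the real nodes additionally carry a line_number):
from dace.frontend.fortran import ast_internal_classes as ia

assign = ia.BinOp_Node(
    lval=ia.Name_Node(name='c'),
    op='=',
    rval=ia.BinOp_Node(lval=ia.Name_Node(name='a'),
                       op='+',
                       rval=ia.Name_Node(name='b')),
)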
def end_where_stmt(self, node: FASTNode): + return node + + def forall_construct(self, node: FASTNode): + return node + + def forall_header(self, node: FASTNode): + return node + + def forall_triplet_spec(self, node: FASTNode): + return node + + def forall_stmt(self, node: FASTNode): + return node + + def end_forall_stmt(self, node: FASTNode): + return node + + def arithmetic_if_stmt(self, node: FASTNode): + return node + + def if_stmt(self, node: FASTNode): + children = self.create_children(node) + line = get_line(node) + cond = children[0] + body = children[1:] + return ast_internal_classes.If_Stmt_Node(cond=cond, + body=ast_internal_classes.Execution_Part_Node(execution=body), + body_else=ast_internal_classes.Execution_Part_Node(execution=[]), + line_number=line) + + def if_construct(self, node: FASTNode): + children = self.create_children(node) + cond = children[0] + body = [] + body_else = [] + else_mode = False + line = get_line(node) + if line is None: + line = cond.line_number + toplevelIf = ast_internal_classes.If_Stmt_Node(cond=cond, line_number=line) + currentIf = toplevelIf + for i in children[1:-1]: + if isinstance(i, ast_internal_classes.Else_If_Stmt_Node): + newif = ast_internal_classes.If_Stmt_Node(cond=i.cond, line_number=i.line_number) + currentIf.body = ast_internal_classes.Execution_Part_Node(execution=body) + currentIf.body_else = ast_internal_classes.Execution_Part_Node(execution=[newif]) + currentIf = newif + body = [] + continue + if isinstance(i, ast_internal_classes.Else_Separator_Node): + else_mode = True + continue + if else_mode: + body_else.append(i) + else: + body.append(i) + currentIf.body = ast_internal_classes.Execution_Part_Node(execution=body) + currentIf.body_else = ast_internal_classes.Execution_Part_Node(execution=body_else) + return toplevelIf + + def if_then_stmt(self, node: FASTNode): + children = self.create_children(node) + if len(children) != 1: + raise ValueError("If statement must have a condition") + return_value = children[0] + return_value.line_number = node.item.span + return return_value + + def else_if_stmt(self, node: FASTNode): + children = self.create_children(node) + return ast_internal_classes.Else_If_Stmt_Node(cond=children[0], line_number=get_line(node)) + + def else_stmt(self, node: FASTNode): + return ast_internal_classes.Else_Separator_Node(line_number=node.item.span) + + def end_if_stmt(self, node: FASTNode): + return node + + def case_construct(self, node: FASTNode): + return node + + def select_case_stmt(self, node: FASTNode): + return node + + def case_stmt(self, node: FASTNode): + return node + + def end_select_stmt(self, node: FASTNode): + return node + + def do_construct(self, node: FASTNode): + return node + + def label_do_stmt(self, node: FASTNode): + return node + + def nonlabel_do_stmt(self, node: FASTNode): + children = self.create_children(node) + loop_control = get_child(children, ast_internal_classes.Loop_Control_Node) + return ast_internal_classes.Nonlabel_Do_Stmt_Node(iter=loop_control.iter, + cond=loop_control.cond, + init=loop_control.init, + line_number=node.item.span) + + def end_do_stmt(self, node: FASTNode): + return node + + def interface_block(self, node: FASTNode): + return node + + def interface_stmt(self, node: FASTNode): + return node + + def end_interface_stmt(self, node: FASTNode): + return node + + def generic_spec(self, node: FASTNode): + return node + + def procedure_declaration_stmt(self, node: FASTNode): + return node + + def type_bound_procedure_part(self, node: FASTNode): + return node + + 
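# Aside (not part of this patch): the keys of `supported_fortran_syntax` are fparser2 class
# names. A short, patch-independent sketch of obtaining and inspecting such a tree,
# assuming fparser is installed:
from fparser.common.readfortran import FortranStringReader
from fparser.two.parser import ParserFactory

src = '''PROGRAM demo
  INTEGER :: i
  i = 1 + 2
END PROGRAM demo
'''
parser = ParserFactory().create(std='f2008')
ast = parser(FortranStringReader(src))

# Class names such as 'Main_Program' or 'Assignment_Stmt' are what the dispatch
# dictionary above keys on.
for child in ast.children:
    print(type(child).__name__)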
def contains_stmt(self, node: FASTNode): + return node + + def call_stmt(self, node: FASTNode): + children = self.create_children(node) + name = get_child(children, ast_internal_classes.Name_Node) + args = get_child(children, ast_internal_classes.Arg_List_Node) + return ast_internal_classes.Call_Expr_Node(name=name, args=args.args, type=None, line_number=node.item.span) + + def return_stmt(self, node: FASTNode): + return node + + def stop_stmt(self, node: FASTNode): + return node + + def dummy_arg_list(self, node: FASTNode): + children = self.create_children(node) + return ast_internal_classes.Arg_List_Node(args=children) + + def component_spec_list(self, node: FASTNode): + children = self.create_children(node) + return ast_internal_classes.Component_Spec_List_Node(args=children) + + def attr_spec_list(self, node: FASTNode): + return node + + def part_ref(self, node: FASTNode): + children = self.create_children(node) + line = get_line(node) + name = get_child(children, ast_internal_classes.Name_Node) + args = get_child(children, ast_internal_classes.Section_Subscript_List_Node) + return ast_internal_classes.Call_Expr_Node( + name=name, + args=args.list, + line=line, + ) + + def loop_control(self, node: FASTNode): + children = self.create_children(node) + #Structure of loop control is: + # child[1]. Loop control variable + # child[1][0] Loop start + # child[1][1] Loop end + iteration_variable = children[1][0] + loop_start = children[1][1][0] + loop_end = children[1][1][1] + if len(children[1][1]) == 3: + loop_step = children[1][1][2] + else: + loop_step = ast_internal_classes.Int_Literal_Node(value="1") + init_expr = ast_internal_classes.BinOp_Node(lval=iteration_variable, op="=", rval=loop_start) + if isinstance(loop_step, ast_internal_classes.UnOp_Node): + if loop_step.op == "-": + cond_expr = ast_internal_classes.BinOp_Node(lval=iteration_variable, op=">=", rval=loop_end) + else: + cond_expr = ast_internal_classes.BinOp_Node(lval=iteration_variable, op="<=", rval=loop_end) + iter_expr = ast_internal_classes.BinOp_Node(lval=iteration_variable, + op="=", + rval=ast_internal_classes.BinOp_Node(lval=iteration_variable, + op="+", + rval=loop_step)) + return ast_internal_classes.Loop_Control_Node(init=init_expr, cond=cond_expr, iter=iter_expr) + + def block_nonlabel_do_construct(self, node: FASTNode): + children = self.create_children(node) + do = get_child(children, ast_internal_classes.Nonlabel_Do_Stmt_Node) + body = children[1:-1] + return ast_internal_classes.For_Stmt_Node(init=do.init, + cond=do.cond, + iter=do.iter, + body=ast_internal_classes.Execution_Part_Node(execution=body), + line_number=do.line_number) + + def real_literal_constant(self, node: FASTNode): + return node + + def subscript_triplet(self, node: FASTNode): + if node.string == ":": + return ast_internal_classes.ParDecl_Node(type="ALL") + children = self.create_children(node) + return ast_internal_classes.ParDecl_Node(type="RANGE", range=children) + + def section_subscript_list(self, node: FASTNode): + children = self.create_children(node) + return ast_internal_classes.Section_Subscript_List_Node(list=children) + + def specification_part(self, node: FASTNode): + #TODO this can be refactored to consider more fortran declaration options. Currently limited to what is encountered in code. 
+ others = [self.create_ast(i) for i in node.children if not isinstance(i, Fortran2008.Type_Declaration_Stmt)] + + decls = [self.create_ast(i) for i in node.children if isinstance(i, Fortran2008.Type_Declaration_Stmt)] + + uses = [self.create_ast(i) for i in node.children if isinstance(i, Fortran2008.Use_Stmt)] + tmp = [self.create_ast(i) for i in node.children] + typedecls = [i for i in tmp if isinstance(i, ast_internal_classes.Type_Decl_Node)] + symbols = [] + for i in others: + if isinstance(i, list): + symbols.extend(j for j in i if isinstance(j, ast_internal_classes.Symbol_Array_Decl_Node)) + if isinstance(i, ast_internal_classes.Decl_Stmt_Node): + symbols.extend(j for j in i.vardecl if isinstance(j, ast_internal_classes.Symbol_Array_Decl_Node)) + for i in decls: + if isinstance(i, list): + symbols.extend(j for j in i if isinstance(j, ast_internal_classes.Symbol_Array_Decl_Node)) + if isinstance(i, ast_internal_classes.Decl_Stmt_Node): + symbols.extend(j for j in i.vardecl if isinstance(j, ast_internal_classes.Symbol_Array_Decl_Node)) + names_filtered = [] + for j in symbols: + for i in decls: + names_filtered.extend(ii.name for ii in i.vardecl if j.name == ii.name) + decl_filtered = [] + for i in decls: + # NOTE: Assignment/named expressions (walrus operator) works with Python 3.8 and later. + # if vardecl_filtered := [ii for ii in i.vardecl if ii.name not in names_filtered]: + vardecl_filtered = [ii for ii in i.vardecl if ii.name not in names_filtered] + if vardecl_filtered: + decl_filtered.append(ast_internal_classes.Decl_Stmt_Node(vardecl=vardecl_filtered)) + return ast_internal_classes.Specification_Part_Node(specifications=decl_filtered, + symbols=symbols, + uses=uses, + typedecls=typedecls) + + def intrinsic_type_spec(self, node: FASTNode): + return node + + def entity_decl_list(self, node: FASTNode): + return node + + def int_literal_constant(self, node: FASTNode): + return ast_internal_classes.Int_Literal_Node(value=node.string) + + def logical_literal_constant(self, node: FASTNode): + if node.string in [".TRUE.", ".true.", ".True."]: + return ast_internal_classes.Bool_Literal_Node(value="True") + if node.string in [".FALSE.", ".false.", ".False."]: + return ast_internal_classes.Bool_Literal_Node(value="False") + raise ValueError("Unknown logical literal constant") + + def real_literal_constant(self, node: FASTNode): + return ast_internal_classes.Real_Literal_Node(value=node.string) + + def actual_arg_spec_list(self, node: FASTNode): + children = self.create_children(node) + return ast_internal_classes.Arg_List_Node(args=children) + + def initialization(self, node: FASTNode): + return node + + def name(self, node: FASTNode): + return ast_internal_classes.Name_Node(name=node.string) + + def type_name(self, node: FASTNode): + return ast_internal_classes.Type_Name_Node(name=node.string) + + def tuple_node(self, node: FASTNode): + return node + + def str_node(self, node: FASTNode): + return node diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py new file mode 100644 index 0000000000..f4dba68fb4 --- /dev/null +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -0,0 +1,358 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, List, Tuple, Type, TypeVar, Union, overload + +# The node class is the base class for all nodes in the AST. It provides attributes including the line number and fields. 
+# Attributes are not used when walking the tree, but are useful for debugging and for code generation. +# The fields attribute is a list of the names of the attributes that are children of the node. + + +class FNode(object): + def __init__(self, *args, **kwargs): # real signature unknown + self.integrity_exceptions = [] + self.read_vars = [] + self.written_vars = [] + for k, v in kwargs.items(): + setattr(self, k, v) + + _attributes = ("line_number", ) + _fields = () + integrity_exceptions: List + read_vars: List + written_vars: List + + def __eq__(self, o: object) -> bool: + if type(self) is type(o): + # check that all fields and attributes match + self_field_vals = list(map(lambda name: getattr(self, name, None), self._fields)) + self_attr_vals = list(map(lambda name: getattr(self, name, None), self._attributes)) + o_field_vals = list(map(lambda name: getattr(o, name, None), o._fields)) + o_attr_vals = list(map(lambda name: getattr(o, name, None), o._attributes)) + + return self_field_vals == o_field_vals and self_attr_vals == o_attr_vals + return False + + +class Program_Node(FNode): + _attributes = () + _fields = ( + "main_program", + "function_definitions", + "subroutine_definitions", + "modules", + ) + + +class BinOp_Node(FNode): + _attributes = ( + 'op', + 'type', + ) + _fields = ( + 'lval', + 'rval', + ) + + +class UnOp_Node(FNode): + _attributes = ( + 'op', + 'postfix', + 'type', + ) + _fields = ('lval', ) + + +class Main_Program_Node(FNode): + _attributes = ("name", ) + _fields = ("execution_part", "specification_part") + + +class Module_Node(FNode): + _attributes = ('name', ) + _fields = ( + 'specification_part', + 'subroutine_definitions', + 'function_definitions', + ) + + +class Function_Subprogram_Node(FNode): + _attributes = ('name', 'type', 'ret_name') + _fields = ( + 'args', + 'specification_part', + 'execution_part', + ) + + +class Subroutine_Subprogram_Node(FNode): + _attributes = ('name', 'type') + _fields = ( + 'args', + 'specification_part', + 'execution_part', + ) + + +class Module_Stmt_Node(FNode): + _attributes = ('name', ) + _fields = () + + +class Program_Stmt_Node(FNode): + _attributes = ('name', ) + _fields = () + + +class Subroutine_Stmt_Node(FNode): + _attributes = ('name', ) + _fields = ('args', ) + + +class Function_Stmt_Node(FNode): + _attributes = ('name', ) + _fields = ('args', 'return') + + +class Name_Node(FNode): + _attributes = ('name', 'type') + _fields = () + + +class Name_Range_Node(FNode): + _attributes = ('name', 'type', 'arrname', 'pos') + _fields = () + + +class Type_Name_Node(FNode): + _attributes = ('name', 'type') + _fields = () + + +class Specification_Part_Node(FNode): + _fields = ('specifications', 'symbols', 'typedecls') + + +class Execution_Part_Node(FNode): + _fields = ('execution', ) + + +class Statement_Node(FNode): + _attributes = ('col_offset', ) + _fields = () + + +class Array_Subscript_Node(FNode): + _attributes = ( + 'name', + 'type', + ) + _fields = ('indices', ) + + +class Type_Decl_Node(Statement_Node): + _attributes = ( + 'name', + 'type', + ) + _fields = () + + +class Symbol_Decl_Node(Statement_Node): + _attributes = ( + 'name', + 'type', + 'alloc', + ) + _fields = ( + 'sizes', + 'typeref', + 'init', + ) + + +class Symbol_Array_Decl_Node(Statement_Node): + _attributes = ( + 'name', + 'type', + 'alloc', + ) + _fields = ( + 'sizes', + 'typeref', + 'init', + ) + + +class Var_Decl_Node(Statement_Node): + _attributes = ( + 'name', + 'type', + 'alloc', + 'kind', + ) + _fields = ( + 'sizes', + 'typeref', + 'init', + ) + + +class 
Arg_List_Node(FNode): + _fields = ('args', ) + + +class Component_Spec_List_Node(FNode): + _fields = ('args', ) + + +class Decl_Stmt_Node(Statement_Node): + _attributes = () + _fields = ('vardecl', ) + + +class VarType: + _attributes = () + + +class Void(VarType): + _attributes = () + + +class Literal(FNode): + _attributes = ('value', ) + _fields = () + + +class Int_Literal_Node(Literal): + _attributes = () + _fields = () + + +class Real_Literal_Node(Literal): + _attributes = () + _fields = () + + +class Bool_Literal_Node(Literal): + _attributes = () + _fields = () + + +class String_Literal_Node(Literal): + _attributes = () + _fields = () + + +class Char_Literal_Node(Literal): + _attributes = () + _fields = () + + +class Call_Expr_Node(FNode): + _attributes = ('type', 'subroutine') + _fields = ( + 'name', + 'args', + ) + + +class Array_Constructor_Node(FNode): + _attributes = () + _fields = ('value_list', ) + + +class Ac_Value_List_Node(FNode): + _attributes = () + _fields = ('value_list', ) + + +class Section_Subscript_List_Node(FNode): + _fields = ('list') + + +class For_Stmt_Node(FNode): + _attributes = () + _fields = ( + 'init', + 'cond', + 'body', + 'iter', + ) + + +class Map_Stmt_Node(For_Stmt_Node): + _attributes = () + _fields = ( + 'init', + 'cond', + 'body', + 'iter', + ) + + +class If_Stmt_Node(FNode): + _attributes = () + _fields = ( + 'cond', + 'body', + 'body_else', + ) + + +class Else_Separator_Node(FNode): + _attributes = () + _fields = () + + +class Parenthesis_Expr_Node(FNode): + _attributes = () + _fields = ('expr', ) + + +class Nonlabel_Do_Stmt_Node(FNode): + _attributes = () + _fields = ( + 'init', + 'cond', + 'iter', + ) + + +class Loop_Control_Node(FNode): + _attributes = () + _fields = ( + 'init', + 'cond', + 'iter', + ) + + +class Else_If_Stmt_Node(FNode): + _attributes = () + _fields = ('cond', ) + + +class Only_List_Node(FNode): + _attributes = () + _fields = ('names', ) + + +class ParDecl_Node(FNode): + _attributes = ('type', ) + _fields = ('range', ) + + +class Structure_Constructor_Node(FNode): + _attributes = ('type', ) + _fields = ('name', 'args') + + +class Use_Stmt_Node(FNode): + _attributes = ('name', ) + _fields = ('list', ) + + +class Write_Stmt_Node(FNode): + _attributes = () + _fields = ('args', ) diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py new file mode 100644 index 0000000000..d8d83ab8c9 --- /dev/null +++ b/dace/frontend/fortran/ast_transforms.py @@ -0,0 +1,905 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. + +from dace.frontend.fortran import ast_components, ast_internal_classes +from typing import List, Tuple, Set +import copy + + +def iter_fields(node: ast_internal_classes.FNode): + """ + Yield a tuple of ``(fieldname, value)`` for each field in ``node._fields`` + that is present on *node*. + """ + if not hasattr(node, "_fields"): + a = 1 + for field in node._fields: + try: + yield field, getattr(node, field) + except AttributeError: + pass + + +def iter_child_nodes(node: ast_internal_classes.FNode): + """ + Yield all direct child nodes of *node*, that is, all fields that are nodes + and all items of fields that are lists of nodes. 
+ """ + #print("CLASS: ",node.__class__) + #if isinstance(node,DeclRefExpr): + #print("NAME: ", node.name) + + for name, field in iter_fields(node): + #print("NASME:",name) + if isinstance(field, ast_internal_classes.FNode): + yield field + elif isinstance(field, list): + for item in field: + if isinstance(item, ast_internal_classes.FNode): + yield item + + +class NodeVisitor(object): + """ + A base node visitor class for Fortran ASTs. + Subclass it and define your own visit_XXX methods, where + XXX is the class name you want to visit with these + methods. + """ + def visit(self, node: ast_internal_classes.FNode): + method = 'visit_' + node.__class__.__name__ + visitor = getattr(self, method, self.generic_visit) + return visitor(node) + + def generic_visit(self, node: ast_internal_classes.FNode): + """Called if no explicit visitor function exists for a node.""" + for field, value in iter_fields(node): + if isinstance(value, list): + for item in value: + if isinstance(item, ast_internal_classes.FNode): + self.visit(item) + elif isinstance(value, ast_internal_classes.FNode): + self.visit(value) + + +class NodeTransformer(NodeVisitor): + """ + A base node visitor that walks the abstract syntax tree and allows + modification of nodes. + The `NodeTransformer` will walk the AST and use the return value of the + visitor methods to replace old nodes. + """ + def as_list(self, x): + if isinstance(x, list): + return x + if x is None: + return [] + return [x] + + def generic_visit(self, node: ast_internal_classes.FNode): + for field, old_value in iter_fields(node): + if isinstance(old_value, list): + new_values = [] + for value in old_value: + if isinstance(value, ast_internal_classes.FNode): + value = self.visit(value) + if value is None: + continue + elif not isinstance(value, ast_internal_classes.FNode): + new_values.extend(value) + continue + new_values.append(value) + old_value[:] = new_values + elif isinstance(old_value, ast_internal_classes.FNode): + new_node = self.visit(old_value) + if new_node is None: + delattr(node, field) + else: + setattr(node, field, new_node) + return node + + +class FindFunctionAndSubroutines(NodeVisitor): + """ + Finds all function and subroutine names in the AST + :return: List of names + """ + def __init__(self): + self.nodes: List[ast_internal_classes.Name_Node] = [] + + def visit_Subroutine_Subprogram_Node(self, node: ast_internal_classes.Subroutine_Subprogram_Node): + self.nodes.append(node.name) + + def visit_Function_Subprogram_Node(self, node: ast_internal_classes.Function_Subprogram_Node): + self.nodes.append(node.name) + + +class FindInputs(NodeVisitor): + """ + Finds all inputs (reads) in the AST node and its children + :return: List of names + """ + def __init__(self): + self.nodes: List[ast_internal_classes.Name_Node] = [] + + def visit_Name_Node(self, node: ast_internal_classes.Name_Node): + self.nodes.append(node) + + def visit_Array_Subscript_Node(self, node: ast_internal_classes.Array_Subscript_Node): + self.nodes.append(node.name) + for i in node.indices: + self.visit(i) + + def visit_BinOp_Node(self, node: ast_internal_classes.BinOp_Node): + if node.op == "=": + if isinstance(node.lval, ast_internal_classes.Name_Node): + pass + elif isinstance(node.lval, ast_internal_classes.Array_Subscript_Node): + for i in node.lval.indices: + self.visit(i) + + else: + self.visit(node.lval) + self.visit(node.rval) + + +class FindOutputs(NodeVisitor): + """ + Finds all outputs (writes) in the AST node and its children + :return: List of names + """ + def 
__init__(self): + self.nodes: List[ast_internal_classes.Name_Node] = [] + + def visit_BinOp_Node(self, node: ast_internal_classes.BinOp_Node): + if node.op == "=": + if isinstance(node.lval, ast_internal_classes.Name_Node): + self.nodes.append(node.lval) + elif isinstance(node.lval, ast_internal_classes.Array_Subscript_Node): + self.nodes.append(node.lval.name) + self.visit(node.rval) + + +class FindFunctionCalls(NodeVisitor): + """ + Finds all function calls in the AST node and its children + :return: List of names + """ + def __init__(self): + self.nodes: List[ast_internal_classes.Name_Node] = [] + + def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + self.nodes.append(node) + for i in node.args: + self.visit(i) + + +class CallToArray(NodeTransformer): + """ + Fortran does not differentiate between arrays and functions. + We need to go over and convert all function calls to arrays. + So, we create a closure of all math and defined functions and + create array expressions for the others. + """ + def __init__(self, funcs=None): + if funcs is None: + funcs = [] + self.funcs = funcs + self.excepted_funcs = [ + "malloc", "exp", "pow", "sqrt", "cbrt", "max", "abs", "min", "__dace_sum", "__dace_sign", "tanh", + "__dace_epsilon" + ] + + def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + if isinstance(node.name, str): + return node + if node.name.name in self.excepted_funcs or node.name in self.funcs: + processed_args = [] + for i in node.args: + arg = CallToArray(self.funcs).visit(i) + processed_args.append(arg) + node.args = processed_args + return node + indices = [CallToArray(self.funcs).visit(i) for i in node.args] + return ast_internal_classes.Array_Subscript_Node(name=node.name, indices=indices) + + +class CallExtractorNodeLister(NodeVisitor): + """ + Finds all function calls in the AST node and its children that have to be extracted into independent expressions + """ + def __init__(self): + self.nodes: List[ast_internal_classes.Call_Expr_Node] = [] + + def visit_For_Stmt_Node(self, node: ast_internal_classes.For_Stmt_Node): + return + + def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + stop = False + if hasattr(node, "subroutine"): + if node.subroutine is True: + stop = True + if not stop and node.name.name not in [ + "malloc", "exp", "pow", "sqrt", "cbrt", "max", "min", "abs", "tanh", "__dace_epsilon" + ]: + self.nodes.append(node) + return self.generic_visit(node) + + def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): + return + + +class CallExtractor(NodeTransformer): + """ + Uses the CallExtractorNodeLister to find all function calls + in the AST node and its children that have to be extracted into independent expressions + It then creates a new temporary variable for each of them and replaces the call with the variable. 
+ """ + def __init__(self, count=0): + self.count = count + + def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + + if node.name.name in ["malloc", "exp", "pow", "sqrt", "cbrt", "max", "min", "abs", "tanh", "__dace_epsilon"]: + return self.generic_visit(node) + if hasattr(node, "subroutine"): + if node.subroutine is True: + return self.generic_visit(node) + if not hasattr(self, "count"): + self.count = 0 + else: + self.count = self.count + 1 + tmp = self.count + return ast_internal_classes.Name_Node(name="tmp_call_" + str(tmp - 1)) + + def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): + newbody = [] + + for child in node.execution: + lister = CallExtractorNodeLister() + lister.visit(child) + res = lister.nodes + for i in res: + if i == child: + res.pop(res.index(i)) + temp = self.count + if res is not None: + for i in range(0, len(res)): + + if (res[i].name.name == "__dace_sum"): + newbody.append( + ast_internal_classes.Decl_Stmt_Node(vardecl=[ + ast_internal_classes.Var_Decl_Node( + name="tmp_call_" + str(temp), + type=res[i].type, + sizes=None, + ) + ])) + newbody.append( + ast_internal_classes.BinOp_Node(lval=ast_internal_classes.Name_Node(name="tmp_call_" + + str(temp)), + op="=", + rval=ast_internal_classes.Int_Literal_Node(value="0"), + line_number=child.line_number)) + else: + + newbody.append( + ast_internal_classes.Decl_Stmt_Node(vardecl=[ + ast_internal_classes.Var_Decl_Node( + name="tmp_call_" + str(temp), + type=res[i].type, + sizes=None, + ) + ])) + newbody.append( + ast_internal_classes.BinOp_Node(op="=", + lval=ast_internal_classes.Name_Node(name="tmp_call_" + + str(temp), + type=res[i].type), + rval=res[i], + line_number=child.line_number)) + temp = temp + 1 + if isinstance(child, ast_internal_classes.Call_Expr_Node): + new_args = [] + if hasattr(child, "args"): + for i in child.args: + new_args.append(self.visit(i)) + new_child = ast_internal_classes.Call_Expr_Node(type=child.type, + name=child.name, + args=new_args, + line_number=child.line_number) + newbody.append(new_child) + else: + newbody.append(self.visit(child)) + + return ast_internal_classes.Execution_Part_Node(execution=newbody) + + +class IndexExtractorNodeLister(NodeVisitor): + """ + Finds all array subscript expressions in the AST node and its children that have to be extracted into independent expressions + """ + def __init__(self): + self.nodes: List[ast_internal_classes.Array_Subscript_Node] = [] + + def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + if node.name.name in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh"]: + return self.generic_visit(node) + else: + return + + def visit_Array_Subscript_Node(self, node: ast_internal_classes.Array_Subscript_Node): + self.nodes.append(node) + + def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): + return + + +class IndexExtractor(NodeTransformer): + """ + Uses the IndexExtractorNodeLister to find all array subscript expressions + in the AST node and its children that have to be extracted into independent expressions + It then creates a new temporary variable for each of them and replaces the index expression with the variable. 
+ """ + def __init__(self, count=0): + self.count = count + + def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + if node.name.name in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh"]: + return self.generic_visit(node) + else: + return node + + def visit_Array_Subscript_Node(self, node: ast_internal_classes.Array_Subscript_Node): + + tmp = self.count + new_indices = [] + for i in node.indices: + if isinstance(i, ast_internal_classes.ParDecl_Node): + new_indices.append(i) + else: + new_indices.append(ast_internal_classes.Name_Node(name="tmp_index_" + str(tmp))) + tmp = tmp + 1 + self.count = tmp + return ast_internal_classes.Array_Subscript_Node( + name=node.name, + indices=new_indices, + ) + + def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): + newbody = [] + + for child in node.execution: + lister = IndexExtractorNodeLister() + lister.visit(child) + res = lister.nodes + temp = self.count + if res is not None: + for j in res: + for i in j.indices: + if isinstance(i, ast_internal_classes.ParDecl_Node): + continue + else: + tmp_name = "tmp_index_" + str(temp) + temp = temp + 1 + newbody.append( + ast_internal_classes.Decl_Stmt_Node(vardecl=[ + ast_internal_classes.Var_Decl_Node(name=tmp_name, + type="INTEGER", + sizes=None, + line_number=child.line_number) + ], + line_number=child.line_number)) + newbody.append( + ast_internal_classes.BinOp_Node( + op="=", + lval=ast_internal_classes.Name_Node(name=tmp_name), + rval=ast_internal_classes.BinOp_Node( + op="-", + lval=i, + rval=ast_internal_classes.Int_Literal_Node(value="1"), + line_number=child.line_number), + line_number=child.line_number)) + newbody.append(self.visit(child)) + return ast_internal_classes.Execution_Part_Node(execution=newbody) + + +class SignToIf(NodeTransformer): + """ + Transforms all sign expressions into if statements + """ + def visit_BinOp_Node(self, node: ast_internal_classes.BinOp_Node): + if isinstance(node.rval, ast_internal_classes.Call_Expr_Node) and node.rval.name.name == "__dace_sign": + args = node.rval.args + lval = node.lval + cond = ast_internal_classes.BinOp_Node(op=">=", + rval=ast_internal_classes.Real_Literal_Node(value="0.0"), + lval=args[1], + line_number=node.line_number) + body_if = ast_internal_classes.Execution_Part_Node(execution=[ + ast_internal_classes.BinOp_Node(lval=copy.deepcopy(lval), + op="=", + rval=ast_internal_classes.Call_Expr_Node( + name=ast_internal_classes.Name_Node(name="abs"), + type="DOUBLE", + args=[copy.deepcopy(args[0])], + line_number=node.line_number), + line_number=node.line_number) + ]) + body_else = ast_internal_classes.Execution_Part_Node(execution=[ + ast_internal_classes.BinOp_Node(lval=copy.deepcopy(lval), + op="=", + rval=ast_internal_classes.UnOp_Node( + op="-", + lval=ast_internal_classes.Call_Expr_Node( + name=ast_internal_classes.Name_Node(name="abs"), + type="DOUBLE", + args=[copy.deepcopy(args[0])], + line_number=node.line_number), + line_number=node.line_number), + line_number=node.line_number) + ]) + return (ast_internal_classes.If_Stmt_Node(cond=cond, + body=body_if, + body_else=body_else, + line_number=node.line_number)) + + else: + return self.generic_visit(node) + + +class RenameArguments(NodeTransformer): + """ + Renames all arguments of a function to the names of the arguments of the function call + Used when eliminating function statements + """ + def __init__(self, node_args: list, call_args: list): + self.node_args = node_args + self.call_args = call_args + + def visit_Name_Node(self, 
node: ast_internal_classes.Name_Node): + for i, j in zip(self.node_args, self.call_args): + if node.name == j.name: + return copy.deepcopy(i) + return node + + +class ReplaceFunctionStatement(NodeTransformer): + """ + Replaces a function statement with its content, similar to propagating a macro + """ + def __init__(self, statement, replacement): + self.name = statement.name + self.content = replacement + + def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + if node.name == self.name: + return ast_internal_classes.Parenthesis_Expr_Node(expr=copy.deepcopy(self.content)) + else: + return self.generic_visit(node) + + +class ReplaceFunctionStatementPass(NodeTransformer): + """ + Replaces a function statement with its content, similar to propagating a macro + """ + def __init__(self, statefunc: list): + self.funcs = statefunc + + def visit_Structure_Constructor_Node(self, node: ast_internal_classes.Structure_Constructor_Node): + for i in self.funcs: + if node.name.name == i[0].name.name: + ret_node = copy.deepcopy(i[1]) + ret_node = RenameArguments(node.args, i[0].args).visit(ret_node) + return ast_internal_classes.Parenthesis_Expr_Node(expr=ret_node) + return self.generic_visit(node) + + def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + for i in self.funcs: + if node.name.name == i[0].name.name: + ret_node = copy.deepcopy(i[1]) + ret_node = RenameArguments(node.args, i[0].args).visit(ret_node) + return ast_internal_classes.Parenthesis_Expr_Node(expr=ret_node) + return self.generic_visit(node) + + +def functionStatementEliminator(node=ast_internal_classes.Program_Node): + """ + Eliminates function statements from the AST + :param node: The AST to be transformed + :return: The transformed AST + :note Should only be used on the program node + """ + main_program = localFunctionStatementEliminator(node.main_program) + function_definitions = [localFunctionStatementEliminator(i) for i in node.function_definitions] + subroutine_definitions = [localFunctionStatementEliminator(i) for i in node.subroutine_definitions] + modules = [] + for i in node.modules: + module_function_definitions = [localFunctionStatementEliminator(j) for j in i.function_definitions] + module_subroutine_definitions = [localFunctionStatementEliminator(j) for j in i.subroutine_definitions] + modules.append( + ast_internal_classes.Module_Node( + name=i.name, + specification_part=i.specification_part, + subroutine_definitions=module_subroutine_definitions, + function_definitions=module_function_definitions, + )) + return ast_internal_classes.Program_Node(main_program=main_program, + function_definitions=function_definitions, + subroutine_definitions=subroutine_definitions, + modules=modules) + + +def localFunctionStatementEliminator(node: ast_internal_classes.FNode): + """ + Eliminates function statements from the AST + :param node: The AST to be transformed + :return: The transformed AST + """ + spec = node.specification_part.specifications + exec = node.execution_part.execution + new_exec = exec.copy() + to_change = [] + for i in exec: + if isinstance(i, ast_internal_classes.BinOp_Node): + if i.op == "=": + if isinstance(i.lval, ast_internal_classes.Call_Expr_Node) or isinstance( + i.lval, ast_internal_classes.Structure_Constructor_Node): + function_statement_name = i.lval.name + is_actually_function_statement = False + #In Fortran, function statement are defined as scalar values, but called as arrays, so by identifiying that it is called as a call_expr or structure_constructor, we 
also need to match the specification part and see that it is scalar rather than an array. + found = False + for j in spec: + if found: + break + for k in j.vardecl: + if k.name == function_statement_name.name: + if k.sizes is None: + is_actually_function_statement = True + function_statement_type = k.type + j.vardecl.remove(k) + found = True + break + if is_actually_function_statement: + to_change.append([i.lval, i.rval]) + new_exec.remove(i) + print("Function statement found and removed: ", function_statement_name) + else: + #There are no function statements after the first one that isn't + break + still_changing = True + while still_changing: + still_changing = False + for i in to_change: + rval = i[1] + calls = FindFunctionCalls() + calls.visit(rval) + for j in to_change: + for k in calls.nodes: + if k.name == j[0].name: + calls_to_replace = FindFunctionCalls() + calls_to_replace.visit(j[1]) + #must check if it is recursive and contains other function statements + it_is_simple = True + for l in calls_to_replace.nodes: + for m in to_change: + if l.name == m[0].name: + it_is_simple = False + if it_is_simple: + still_changing = True + i[1] = ReplaceFunctionStatement(j[0], j[1]).visit(rval) + final_exec = [] + for i in new_exec: + final_exec.append(ReplaceFunctionStatementPass(to_change).visit(i)) + node.execution_part.execution = final_exec + node.specification_part.specifications = spec + return node + + +class ArrayLoopNodeLister(NodeVisitor): + """ + Finds all array operations that have to be transformed to loops in the AST + """ + def __init__(self): + self.nodes: List[ast_internal_classes.FNode] = [] + self.range_nodes: List[ast_internal_classes.FNode] = [] + + def visit_BinOp_Node(self, node: ast_internal_classes.BinOp_Node): + rval_pardecls = [i for i in mywalk(node.rval) if isinstance(i, ast_internal_classes.ParDecl_Node)] + lval_pardecls = [i for i in mywalk(node.lval) if isinstance(i, ast_internal_classes.ParDecl_Node)] + if len(lval_pardecls) > 0: + if len(rval_pardecls) == 1: + self.range_nodes.append(node) + self.nodes.append(node) + return + elif len(rval_pardecls) > 1: + for i in rval_pardecls: + if i != rval_pardecls[0]: + raise NotImplementedError("Only supporting one range in right expression") + + self.range_nodes.append(node) + self.nodes.append(node) + return + else: + self.nodes.append(node) + return + + def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): + return + + +class SumLoopNodeLister(NodeVisitor): + """ + Finds all sum operations that have to be transformed to loops in the AST + """ + def __init__(self): + self.nodes: List[ast_internal_classes.FNode] = [] + + def visit_BinOp_Node(self, node: ast_internal_classes.BinOp_Node): + + if isinstance(node.rval, ast_internal_classes.Call_Expr_Node): + if node.rval.name.name == "__dace_sum": + self.nodes.append(node) + + def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): + return + + +def par_Decl_Range_Finder(node: ast_internal_classes.Array_Subscript_Node, + ranges: list, + rangepos: list, + count: int, + newbody: list, + declaration=True, + is_sum_to_loop=False): + """ + Helper function for the transformation of array operations and sums to loops + :param node: The AST to be transformed + :param ranges: The ranges of the loop + :param rangepos: The positions of the ranges + :param count: The current count of the loop + :param newbody: The new basic block that will contain the loop + :param declaration: Whether the declaration of the loop variable is 
needed + :param is_sum_to_loop: Whether the transformation is for a sum to loop + :return: Ranges, rangepos, newbody + """ + + currentindex = 0 + indices = [] + for i in node.indices: + if isinstance(i, ast_internal_classes.ParDecl_Node): + if i.type == "ALL": + ranges.append([ + ast_internal_classes.Int_Literal_Node(value="1"), + ast_internal_classes.Name_Range_Node(name="f2dace_MAX", + type="INTEGER", + arrname=node.name, + pos=currentindex) + ]) + else: + ranges.append([i.range[0], i.range[1]]) + rangepos.append(currentindex) + if declaration: + newbody.append( + ast_internal_classes.Decl_Stmt_Node(vardecl=[ + ast_internal_classes.Symbol_Decl_Node( + name="tmp_parfor_" + str(count + len(rangepos) - 1), type="INTEGER", sizes=None, init=None) + ])) + indices.append(ast_internal_classes.Name_Node(name="tmp_parfor_" + str(count + len(rangepos) - 1))) + else: + indices.append(i) + currentindex += 1 + + node.indices = indices + + +class ArrayToLoop(NodeTransformer): + """ + Transforms the AST by removing array expressions and replacing them with loops + """ + def __init__(self): + self.count = 0 + + def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): + newbody = [] + for child in node.execution: + lister = ArrayLoopNodeLister() + lister.visit(child) + res = lister.nodes + res_range = lister.range_nodes + if res is not None and len(res) > 0: + + current = child.lval + val = child.rval + ranges = [] + rangepos = [] + par_Decl_Range_Finder(current, ranges, rangepos, self.count, newbody, True) + + if res_range is not None and len(res_range) > 0: + rvals = [i for i in mywalk(val) if isinstance(i, ast_internal_classes.Array_Subscript_Node)] + for i in rvals: + rangeposrval = [] + rangesrval = [] + + par_Decl_Range_Finder(i, rangesrval, rangeposrval, self.count, newbody, False) + + for i, j in zip(ranges, rangesrval): + if i != j: + if isinstance(i, list) and isinstance(j, list) and len(i) == len(j): + for k, l in zip(i, j): + if k != l: + if isinstance(k, ast_internal_classes.Name_Range_Node) and isinstance( + l, ast_internal_classes.Name_Range_Node): + if k.name != l.name: + raise NotImplementedError("Ranges must be the same") + else: + raise NotImplementedError("Ranges must be the same") + else: + raise NotImplementedError("Ranges must be identical") + + range_index = 0 + body = ast_internal_classes.BinOp_Node(lval=current, op="=", rval=val, line_number=child.line_number) + for i in ranges: + initrange = i[0] + finalrange = i[1] + init = ast_internal_classes.BinOp_Node( + lval=ast_internal_classes.Name_Node(name="tmp_parfor_" + str(self.count + range_index)), + op="=", + rval=initrange, + line_number=child.line_number) + cond = ast_internal_classes.BinOp_Node( + lval=ast_internal_classes.Name_Node(name="tmp_parfor_" + str(self.count + range_index)), + op="<=", + rval=finalrange, + line_number=child.line_number) + iter = ast_internal_classes.BinOp_Node( + lval=ast_internal_classes.Name_Node(name="tmp_parfor_" + str(self.count + range_index)), + op="=", + rval=ast_internal_classes.BinOp_Node( + lval=ast_internal_classes.Name_Node(name="tmp_parfor_" + str(self.count + range_index)), + op="+", + rval=ast_internal_classes.Int_Literal_Node(value="1")), + line_number=child.line_number) + current_for = ast_internal_classes.Map_Stmt_Node( + init=init, + cond=cond, + iter=iter, + body=ast_internal_classes.Execution_Part_Node(execution=[body]), + line_number=child.line_number) + body = current_for + range_index += 1 + + newbody.append(body) + + self.count = self.count + 
range_index + else: + newbody.append(self.visit(child)) + return ast_internal_classes.Execution_Part_Node(execution=newbody) + + +def mywalk(node): + """ + Recursively yield all descendant nodes in the tree starting at *node* + (including *node* itself), in no specified order. This is useful if you + only want to modify nodes in place and don't care about the context. + """ + from collections import deque + todo = deque([node]) + while todo: + node = todo.popleft() + todo.extend(iter_child_nodes(node)) + yield node + + +class SumToLoop(NodeTransformer): + """ + Transforms the AST by removing array sums and replacing them with loops + """ + def __init__(self): + self.count = 0 + + def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): + newbody = [] + for child in node.execution: + lister = SumLoopNodeLister() + lister.visit(child) + res = lister.nodes + if res is not None and len(res) > 0: + + current = child.lval + val = child.rval + rvals = [i for i in mywalk(val) if isinstance(i, ast_internal_classes.Array_Subscript_Node)] + if len(rvals) != 1: + raise NotImplementedError("Only one array can be summed") + val = rvals[0] + rangeposrval = [] + rangesrval = [] + + par_Decl_Range_Finder(val, rangesrval, rangeposrval, self.count, newbody, False, True) + + range_index = 0 + body = ast_internal_classes.BinOp_Node(lval=current, + op="=", + rval=ast_internal_classes.BinOp_Node( + lval=current, + op="+", + rval=val, + line_number=child.line_number), + line_number=child.line_number) + for i in rangesrval: + initrange = i[0] + finalrange = i[1] + init = ast_internal_classes.BinOp_Node( + lval=ast_internal_classes.Name_Node(name="tmp_parfor_" + str(self.count + range_index)), + op="=", + rval=initrange, + line_number=child.line_number) + cond = ast_internal_classes.BinOp_Node( + lval=ast_internal_classes.Name_Node(name="tmp_parfor_" + str(self.count + range_index)), + op="<=", + rval=finalrange, + line_number=child.line_number) + iter = ast_internal_classes.BinOp_Node( + lval=ast_internal_classes.Name_Node(name="tmp_parfor_" + str(self.count + range_index)), + op="=", + rval=ast_internal_classes.BinOp_Node( + lval=ast_internal_classes.Name_Node(name="tmp_parfor_" + str(self.count + range_index)), + op="+", + rval=ast_internal_classes.Int_Literal_Node(value="1")), + line_number=child.line_number) + current_for = ast_internal_classes.Map_Stmt_Node( + init=init, + cond=cond, + iter=iter, + body=ast_internal_classes.Execution_Part_Node(execution=[body]), + line_number=child.line_number) + body = current_for + range_index += 1 + + newbody.append(body) + + self.count = self.count + range_index + else: + newbody.append(self.visit(child)) + return ast_internal_classes.Execution_Part_Node(execution=newbody) + + +class RenameVar(NodeTransformer): + def __init__(self, oldname: str, newname: str): + self.oldname = oldname + self.newname = newname + + def visit_Name_Node(self, node: ast_internal_classes.Name_Node): + return ast_internal_classes.Name_Node(name=self.newname) if node.name == self.oldname else node + + +class ForDeclarer(NodeTransformer): + """ + Ensures that each loop iterator is unique by extracting the actual iterator and assigning it to a uniquely named local variable + """ + def __init__(self): + self.count = 0 + + def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): + newbody = [] + for child in node.execution: + if isinstance(child, ast_internal_classes.Map_Stmt_Node): + newbody.append(self.visit(child)) + continue + if 
isinstance(child, ast_internal_classes.For_Stmt_Node): + newbody.append( + ast_internal_classes.Decl_Stmt_Node(vardecl=[ + ast_internal_classes.Symbol_Decl_Node( + name="_for_it_" + str(self.count), type="INTEGER", sizes=None, init=None) + ])) + final_assign = ast_internal_classes.BinOp_Node(lval=child.init.lval, + op="=", + rval=child.cond.rval, + line_number=child.line_number) + newfor = RenameVar(child.init.lval.name, "_for_it_" + str(self.count)).visit(child) + self.count += 1 + newfor = self.visit(newfor) + newbody.append(newfor) + + else: + newbody.append(self.visit(child)) + return ast_internal_classes.Execution_Part_Node(execution=newbody) diff --git a/dace/frontend/fortran/ast_utils.py b/dace/frontend/fortran/ast_utils.py new file mode 100644 index 0000000000..64988b01d6 --- /dev/null +++ b/dace/frontend/fortran/ast_utils.py @@ -0,0 +1,363 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. + +from fparser.api import parse +import os +import sys +from fparser.common.readfortran import FortranStringReader, FortranFileReader + +#dace imports +from dace import subsets +from dace.data import Scalar +from dace.sdfg import SDFG, SDFGState, InterstateEdge +from dace import Memlet +from dace.sdfg.nodes import Tasklet +from dace import dtypes +from dace import symbolic as sym +from dace import DebugInfo as di +from dace import Language as lang +from dace.properties import CodeBlock +from numpy import finfo as finf +from numpy import float64 as fl + +from dace.frontend.fortran import ast_internal_classes +from typing import List, Set + +fortrantypes2dacetypes = { + "DOUBLE": dtypes.float64, + "REAL": dtypes.float32, + "INTEGER": dtypes.int32, + "BOOL": dtypes.int32, #This is a hack to allow fortran to pass through external C + #"BOOL": dtypes.int32, +} + + +def add_tasklet(substate: SDFGState, name: str, vars_in: Set[str], vars_out: Set[str], code: str, debuginfo: list, + source: str): + tasklet = substate.add_tasklet(name="T" + name, + inputs=vars_in, + outputs=vars_out, + code=code, + debuginfo=di(start_line=debuginfo[0], start_column=debuginfo[1], filename=source), + language=lang.Python) + return tasklet + + +def add_memlet_read(substate: SDFGState, var_name: str, tasklet: Tasklet, dest_conn: str, memlet_range: str): + src = substate.add_access(var_name) + if memlet_range != "": + substate.add_memlet_path(src, tasklet, dst_conn=dest_conn, memlet=Memlet(expr=var_name, subset=memlet_range)) + else: + substate.add_memlet_path(src, tasklet, dst_conn=dest_conn, memlet=Memlet(expr=var_name)) + + +def add_memlet_write(substate: SDFGState, var_name: str, tasklet: Tasklet, source_conn: str, memlet_range: str): + dst = substate.add_write(var_name) + if memlet_range != "": + substate.add_memlet_path(tasklet, dst, src_conn=source_conn, memlet=Memlet(expr=var_name, subset=memlet_range)) + else: + substate.add_memlet_path(tasklet, dst, src_conn=source_conn, memlet=Memlet(expr=var_name)) + + +def add_simple_state_to_sdfg(state: SDFGState, top_sdfg: SDFG, state_name: str): + if state.last_sdfg_states.get(top_sdfg) is not None: + substate = top_sdfg.add_state(state_name) + else: + substate = top_sdfg.add_state(state_name, is_start_state=True) + finish_add_state_to_sdfg(state, top_sdfg, substate) + return substate + + +def finish_add_state_to_sdfg(state: SDFGState, top_sdfg: SDFG, substate: SDFGState): + if state.last_sdfg_states.get(top_sdfg) is not None: + top_sdfg.add_edge(state.last_sdfg_states[top_sdfg], substate, InterstateEdge()) + state.last_sdfg_states[top_sdfg] = substate 
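A short usage sketch for the two state-chaining helpers above (illustrative, not part of the patch). Their first argument only needs to expose a last_sdfg_states mapping keyed by SDFG, which is how AST_translator later passes itself in; a SimpleNamespace stands in for it here, and the SDFG name is made up for the example.

from types import SimpleNamespace

import dace
from dace.frontend.fortran.ast_utils import add_simple_state_to_sdfg

translator = SimpleNamespace(last_sdfg_states={})
sdfg = dace.SDFG("state_chain_example")
for label in ("init", "compute", "finalize"):
    # The first call creates the start state; each later call appends a new
    # state plus an unconditional InterstateEdge from the previously recorded one.
    add_simple_state_to_sdfg(translator, sdfg, label)

# sdfg now holds three states chained init -> compute -> finalize, and
# translator.last_sdfg_states[sdfg] points at "finalize".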
+ + +def get_name(node: ast_internal_classes.FNode): + if isinstance(node, ast_internal_classes.Name_Node): + return node.name + elif isinstance(node, ast_internal_classes.Array_Subscript_Node): + return node.name.name + else: + raise NameError("Name not found") + + +class TaskletWriter: + """ + Class that writes a python tasklet from a node + :param outputs: list of output variables + :param outputs_changes: list of names output variables should be changed to + :param input: list of input variables + :param input_changes: list of names input variables should be changed to + :param sdfg: sdfg the tasklet will be part of + :param name_mapping: mapping of names in the code to names in the sdfg + :return: python code for a tasklet, as a string + """ + def __init__(self, + outputs: List[str], + outputs_changes: List[str], + sdfg: SDFG = None, + name_mapping=None, + input: List[str] = None, + input_changes: List[str] = None): + self.outputs = outputs + self.outputs_changes = outputs_changes + self.sdfg = sdfg + self.mapping = name_mapping + self.input = input + self.input_changes = input_changes + + self.ast_elements = { + ast_internal_classes.BinOp_Node: self.binop2string, + ast_internal_classes.Name_Node: self.name2string, + ast_internal_classes.Name_Range_Node: self.name2string, + ast_internal_classes.Int_Literal_Node: self.intlit2string, + ast_internal_classes.Real_Literal_Node: self.floatlit2string, + ast_internal_classes.Bool_Literal_Node: self.boollit2string, + ast_internal_classes.UnOp_Node: self.unop2string, + ast_internal_classes.Array_Subscript_Node: self.arraysub2string, + ast_internal_classes.Parenthesis_Expr_Node: self.parenthesis2string, + ast_internal_classes.Call_Expr_Node: self.call2string, + ast_internal_classes.ParDecl_Node: self.pardecl2string, + } + + def pardecl2string(self, node: ast_internal_classes.ParDecl_Node): + #At this point in the process, the should not be any ParDecl nodes left in the AST - they should have been replaced by the appropriate ranges + return f"ERROR{node.type}" + + def write_code(self, node: ast_internal_classes.FNode): + """ + :param node: node to write code for + :return: python code for the node, as a string + :note Main function of the class, writes the code for a node + :note If the node is a string, it is returned as is + :note If the node is not a string, it is checked if it is in the ast_elements dictionary + :note If it is, the appropriate function is called with the node as an argument, leading to a recursive traversal of the tree spanned by the node + :note If it not, an error is raised + + """ + if node.__class__ in self.ast_elements: + text = self.ast_elements[node.__class__](node) + if text is None: + raise NameError("Error in code generation") + + return text + elif isinstance(node, str): + return node + else: + raise NameError("Error in code generation" + node.__class__.__name__) + + def arraysub2string(self, node: ast_internal_classes.Array_Subscript_Node): + str_to_return = self.write_code(node.name) + "[" + self.write_code(node.indices[0]) + for i in node.indices[1:]: + str_to_return += ", " + self.write_code(i) + str_to_return += "]" + return str_to_return + + def name2string(self, node): + if isinstance(node, str): + return node + + return_value = node.name + name = node.name + for i in self.sdfg.arrays: + sdfg_name = self.mapping.get(self.sdfg).get(name) + if sdfg_name == i: + name = i + break + + if len(self.outputs) > 0: + if name == self.outputs[0]: + if self.outputs[0] != self.outputs_changes[0]: + name = 
self.outputs_changes[0] + self.outputs.pop(0) + self.outputs_changes.pop(0) + + if self.input is not None and len(self.input) > 0: + if name == self.input[0]: + if self.input[0] != self.input_changes[0]: + name = self.input_changes[0] + else: + pass + self.input.pop(0) + self.input_changes.pop(0) + return name + + def intlit2string(self, node: ast_internal_classes.Int_Literal_Node): + + return "".join(map(str, node.value)) + + def floatlit2string(self, node: ast_internal_classes.Real_Literal_Node): + + return "".join(map(str, node.value)) + + def boollit2string(self, node: ast_internal_classes.Bool_Literal_Node): + + return str(node.value) + + def unop2string(self, node: ast_internal_classes.UnOp_Node): + op = node.op + if op == ".NOT.": + op = "not " + return op + self.write_code(node.lval) + + def parenthesis2string(self, node: ast_internal_classes.Parenthesis_Expr_Node): + return "(" + self.write_code(node.expr) + ")" + + def call2string(self, node: ast_internal_classes.Call_Expr_Node): + # This is a replacement for the epsilon function in fortran + if node.name.name == "__dace_epsilon": + return str(finf(fl).eps) + if node.name.name == "pow": + return " ( " + self.write_code(node.args[0]) + " ** " + self.write_code(node.args[1]) + " ) " + return_str = self.write_code(node.name) + "(" + self.write_code(node.args[0]) + for i in node.args[1:]: + return_str += ", " + self.write_code(i) + return_str += ")" + return return_str + + def binop2string(self, node: ast_internal_classes.BinOp_Node): + + op = node.op + if op == ".EQ.": + op = "==" + if op == ".AND.": + op = " and " + if op == ".OR.": + op = " or " + if op == ".NE.": + op = "!=" + if op == "/=": + op = "!=" + if op == ".NOT.": + op = "!" + if op == ".LE.": + op = "<=" + if op == ".GE.": + op = ">=" + if op == ".LT.": + op = "<" + if op == ".GT.": + op = ">" + #TODO Add list of missing operators + + left = self.write_code(node.lval) + right = self.write_code(node.rval) + if op != "=": + return "(" + left + op + right + ")" + else: + return left + op + right + + +def generate_memlet(op, top_sdfg, state): + if state.name_mapping.get(top_sdfg).get(get_name(op)) is not None: + shape = top_sdfg.arrays[state.name_mapping[top_sdfg][get_name(op)]].shape + elif state.name_mapping.get(state.globalsdfg).get(get_name(op)) is not None: + shape = state.globalsdfg.arrays[state.name_mapping[state.globalsdfg][get_name(op)]].shape + else: + raise NameError("Variable name not found: ", get_name(op)) + indices = [] + if isinstance(op, ast_internal_classes.Array_Subscript_Node): + for i in op.indices: + tw = TaskletWriter([], [], top_sdfg, state.name_mapping) + text = tw.write_code(i) + #This might need to be replaced with the name in the context of the top/current sdfg + indices.append(sym.pystr_to_symbolic(text)) + memlet = '0' + if len(shape) == 1: + if shape[0] == 1: + return memlet + + all_indices = indices + [None] * (len(shape) - len(indices)) + subset = subsets.Range([(i, i, 1) if i is not None else (1, s, 1) for i, s in zip(all_indices, shape)]) + return subset + + +class ProcessedWriter(TaskletWriter): + """ + This class is derived from the TaskletWriter class and is used to write the code of a tasklet that's on an interstate edge rather than a computational tasklet. + :note The only differences are in that the names for the sdfg mapping are used, and that the indices are considered to be one-bases rather than zero-based. 
+ """ + def __init__(self, sdfg: SDFG, mapping): + self.sdfg = sdfg + self.mapping = mapping + self.ast_elements = { + ast_internal_classes.BinOp_Node: self.binop2string, + ast_internal_classes.Name_Node: self.name2string, + ast_internal_classes.Name_Range_Node: self.namerange2string, + ast_internal_classes.Int_Literal_Node: self.intlit2string, + ast_internal_classes.Real_Literal_Node: self.floatlit2string, + ast_internal_classes.Bool_Literal_Node: self.boollit2string, + ast_internal_classes.UnOp_Node: self.unop2string, + ast_internal_classes.Array_Subscript_Node: self.arraysub2string, + ast_internal_classes.Parenthesis_Expr_Node: self.parenthesis2string, + ast_internal_classes.Call_Expr_Node: self.call2string, + ast_internal_classes.ParDecl_Node: self.pardecl2string, + } + + def name2string(self, node: ast_internal_classes.Name_Node): + name = node.name + for i in self.sdfg.arrays: + sdfg_name = self.mapping.get(self.sdfg).get(name) + if sdfg_name == i: + name = i + break + return name + + def arraysub2string(self, node: ast_internal_classes.Array_Subscript_Node): + str_to_return = self.write_code(node.name) + "[(" + self.write_code(node.indices[0]) + "+1)" + for i in node.indices[1:]: + str_to_return += ",( " + self.write_code(i) + "+1)" + str_to_return += "]" + return str_to_return + + def namerange2string(self, node: ast_internal_classes.Name_Range_Node): + name = node.name + if name == "f2dace_MAX": + arr = self.sdfg.arrays.get(self.mapping[self.sdfg][node.arrname.name]) + name = str(arr.shape[node.pos]) + return name + else: + return self.name2string(node) + + +class Context: + def __init__(self, name): + self.name = name + self.constants = {} + self.symbols = [] + self.containers = [] + self.read_vars = [] + self.written_vars = [] + + +class NameMap(dict): + def __getitem__(self, k): + assert isinstance(k, SDFG) + if k not in self: + self[k] = {} + + return super().__getitem__(k) + + def get(self, k): + return self[k] + + def __setitem__(self, k, v) -> None: + assert isinstance(k, SDFG) + return super().__setitem__(k, v) + + +class ModuleMap(dict): + def __getitem__(self, k): + assert isinstance(k, ast_internal_classes.Module_Node) + if k not in self: + self[k] = {} + + return super().__getitem__(k) + + def get(self, k): + return self[k] + + def __setitem__(self, k, v) -> None: + assert isinstance(k, ast_internal_classes.Module_Node) + return super().__setitem__(k, v) diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py new file mode 100644 index 0000000000..311840a62a --- /dev/null +++ b/dace/frontend/fortran/fortran_parser.py @@ -0,0 +1,1061 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. 
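An illustrative sketch (not part of the patch) of the dispatch pattern that AST_translator.translate, defined further down in this file, relies on: handlers are looked up by the concrete node class, and lists are translated element by element. The MiniTranslator name and the int/str handlers are invented for the example.

from typing import Any, Callable, Dict

class MiniTranslator:
    # Hypothetical stand-in for AST_translator's ast_elements table.
    def __init__(self) -> None:
        self.handlers: Dict[type, Callable[[Any], str]] = {
            int: self._handle_int,
            str: self._handle_str,
        }

    def translate(self, node: Any):
        if node.__class__ in self.handlers:
            return self.handlers[node.__class__](node)
        if isinstance(node, list):
            return [self.translate(item) for item in node]
        raise NotImplementedError(node.__class__.__name__)

    def _handle_int(self, node: int) -> str:
        return f"int({node})"

    def _handle_str(self, node: str) -> str:
        return f"str({node})"

# MiniTranslator().translate([1, "a"]) evaluates to ['int(1)', 'str(a)'].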
+ +from venv import create +import warnings + +from dace.data import Scalar + +import dace.frontend.fortran.ast_components as ast_components +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_utils as ast_utils +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes +from typing import List, Tuple, Set +from dace import dtypes +from dace import Language as lang +from dace import data as dat +from dace import SDFG, InterstateEdge, Memlet, pointer, nodes +from dace import symbolic as sym +from copy import deepcopy as dpcp + +from dace.properties import CodeBlock +from fparser.two.parser import ParserFactory as pf +from fparser.common.readfortran import FortranStringReader as fsr +from fparser.common.readfortran import FortranFileReader as ffr +from fparser.two.symbol_table import SymbolTable + + +class AST_translator: + """ + This class is responsible for translating the internal AST into a SDFG. + """ + def __init__(self, ast: ast_components.InternalFortranAst, source: str): + """ + :ast: The internal fortran AST to be used for translation + :source: The source file name from which the AST was generated + """ + self.tables = ast.tables + self.top_level = None + self.globalsdfg = None + self.functions_and_subroutines = ast.functions_and_subroutines + self.name_mapping = ast_utils.NameMap() + self.contexts = {} + self.views = 0 + self.libstates = [] + self.file_name = source + self.all_array_names = [] + self.last_sdfg_states = {} + self.last_loop_continues = {} + self.last_loop_breaks = {} + self.last_returns = {} + self.module_vars = [] + self.libraries = {} + self.last_call_expression = {} + self.ast_elements = { + ast_internal_classes.If_Stmt_Node: self.ifstmt2sdfg, + ast_internal_classes.For_Stmt_Node: self.forstmt2sdfg, + ast_internal_classes.Map_Stmt_Node: self.forstmt2sdfg, + ast_internal_classes.Execution_Part_Node: self.basicblock2sdfg, + ast_internal_classes.Subroutine_Subprogram_Node: self.subroutine2sdfg, + ast_internal_classes.BinOp_Node: self.binop2sdfg, + ast_internal_classes.Decl_Stmt_Node: self.declstmt2sdfg, + ast_internal_classes.Var_Decl_Node: self.vardecl2sdfg, + ast_internal_classes.Symbol_Decl_Node: self.symbol2sdfg, + ast_internal_classes.Symbol_Array_Decl_Node: self.symbolarray2sdfg, + ast_internal_classes.Call_Expr_Node: self.call2sdfg, + ast_internal_classes.Program_Node: self.ast2sdfg, + ast_internal_classes.Write_Stmt_Node: self.write2sdfg, + } + + + def get_dace_type(self, type): + """ + This function matches the fortran type to the corresponding dace type + by referencing the ast_utils.fortrantypes2dacetypes dictionary. + """ + if isinstance(type, str): + return ast_utils.fortrantypes2dacetypes[type] + + def get_name_mapping_in_context(self, sdfg: SDFG): + """ + This function returns a copy of the name mapping union + for the given sdfg and the top-level sdfg. + """ + a = self.name_mapping[self.globalsdfg].copy() + if sdfg is not self.globalsdfg: + a.update(self.name_mapping[sdfg]) + return a + + def get_arrays_in_context(self, sdfg: SDFG): + """ + This function returns a copy of the union of arrays + for the given sdfg and the top-level sdfg. + """ + a = self.globalsdfg.arrays.copy() + if sdfg is not self.globalsdfg: + a.update(sdfg.arrays) + return a + + def get_memlet_range(self, sdfg: SDFG, variables: List[ast_internal_classes.FNode], var_name: str, + var_name_tasklet: str) -> str: + """ + This function returns the memlet range for the given variable. 
+ :param sdfg: The sdfg in which the variable is used + :param variables: The list of variables in the current context + :param var_name: The name of the variable for which the memlet range should be returned + :param var_name_tasklet: The name of the variable in the tasklet + :return: The memlet range for the given variable + """ + var = self.get_arrays_in_context(sdfg).get(var_name) + + if len(var.shape) == 0: + return "" + + if (len(var.shape) == 1 and var.shape[0] == 1): + return "0" + + for o_v in variables: + if o_v.name == var_name_tasklet: + return ast_utils.generate_memlet(o_v, sdfg, self) + + def translate(self, node: ast_internal_classes.FNode, sdfg: SDFG): + """ + This function is responsible for translating the AST into a SDFG. + :param node: The node to be translated + :param sdfg: The SDFG to which the node should be translated + :note: This function is recursive and will call itself for all child nodes + :note: This function will call the appropriate function for the node type + :note: The dictionary ast_elements, part of the class itself contains all functions that are called for the different node types + """ + if node.__class__ in self.ast_elements: + self.ast_elements[node.__class__](node, sdfg) + elif isinstance(node, list): + for i in node: + self.translate(i, sdfg) + else: + warnings.warn("WARNING:", node.__class__.__name__) + + def ast2sdfg(self, node: ast_internal_classes.Program_Node, sdfg: SDFG): + """ + This function is responsible for translating the Fortran AST into a SDFG. + :param node: The node to be translated + :param sdfg: The SDFG to which the node should be translated + :note: This function is recursive and will call itself for all child nodes + :note: This function will call the appropriate function for the node type + :note: The dictionary ast_elements, part of the class itself contains all functions that are called for the different node types + """ + self.globalsdfg = sdfg + for i in node.modules: + for j in i.specification_part.typedecls: + self.translate(j, sdfg) + for k in j.vardecl: + self.module_vars.append((k.name, i.name)) + for j in i.specification_part.symbols: + self.translate(j, sdfg) + for k in j.vardecl: + self.module_vars.append((k.name, i.name)) + for j in i.specification_part.specifications: + self.translate(j, sdfg) + for k in j.vardecl: + self.module_vars.append((k.name, i.name)) + + for i in node.main_program.specification_part.typedecls: + self.translate(i, sdfg) + for i in node.main_program.specification_part.symbols: + self.translate(i, sdfg) + for i in node.main_program.specification_part.specifications: + self.translate(i, sdfg) + self.translate(node.main_program.execution_part.execution, sdfg) + + def basicblock2sdfg(self, node: ast_internal_classes.Execution_Part_Node, sdfg: SDFG): + """ + This function is responsible for translating Fortran basic blocks into a SDFG. + :param node: The node to be translated + :param sdfg: The SDFG to which the node should be translated + """ + + for i in node.execution: + self.translate(i, sdfg) + + def write2sdfg(self, node: ast_internal_classes.Write_Stmt_Node, sdfg: SDFG): + #TODO implement + raise NotImplementedError("Fortran write statements are not implemented yet") + + def ifstmt2sdfg(self, node: ast_internal_classes.If_Stmt_Node, sdfg: SDFG): + """ + This function is responsible for translating Fortran if statements into a SDFG. 
+ :param node: The node to be translated + :param sdfg: The SDFG to which the node should be translated + """ + + name = f"If_l_{str(node.line_number[0])}_c_{str(node.line_number[1])}" + begin_state = ast_utils.add_simple_state_to_sdfg(self, sdfg, f"Begin{name}") + guard_substate = sdfg.add_state(f"Guard{name}") + sdfg.add_edge(begin_state, guard_substate, InterstateEdge()) + + condition = ast_utils.ProcessedWriter(sdfg, self.name_mapping).write_code(node.cond) + + body_ifstart_state = sdfg.add_state(f"BodyIfStart{name}") + self.last_sdfg_states[sdfg] = body_ifstart_state + self.translate(node.body, sdfg) + final_substate = sdfg.add_state(f"MergeState{name}") + + sdfg.add_edge(guard_substate, body_ifstart_state, InterstateEdge(condition)) + + if self.last_sdfg_states[sdfg] not in [ + self.last_loop_breaks.get(sdfg), + self.last_loop_continues.get(sdfg), + self.last_returns.get(sdfg) + ]: + body_ifend_state = ast_utils.add_simple_state_to_sdfg(self, sdfg, f"BodyIfEnd{name}") + sdfg.add_edge(body_ifend_state, final_substate, InterstateEdge()) + + if len(node.body_else.execution) > 0: + name_else = f"Else_l_{str(node.line_number[0])}_c_{str(node.line_number[1])}" + body_elsestart_state = sdfg.add_state("BodyElseStart" + name_else) + self.last_sdfg_states[sdfg] = body_elsestart_state + self.translate(node.body_else, sdfg) + body_elseend_state = ast_utils.add_simple_state_to_sdfg(self, sdfg, f"BodyElseEnd{name_else}") + sdfg.add_edge(guard_substate, body_elsestart_state, InterstateEdge("not (" + condition + ")")) + sdfg.add_edge(body_elseend_state, final_substate, InterstateEdge()) + else: + sdfg.add_edge(guard_substate, final_substate, InterstateEdge("not (" + condition + ")")) + self.last_sdfg_states[sdfg] = final_substate + + def forstmt2sdfg(self, node: ast_internal_classes.For_Stmt_Node, sdfg: SDFG): + """ + This function is responsible for translating Fortran for statements into a SDFG. 
+ :param node: The node to be translated + :param sdfg: The SDFG to which the node should be translated + """ + + declloop = False + name = "FOR_l_" + str(node.line_number[0]) + "_c_" + str(node.line_number[1]) + begin_state = ast_utils.add_simple_state_to_sdfg(self, sdfg, "Begin" + name) + guard_substate = sdfg.add_state("Guard" + name) + final_substate = sdfg.add_state("Merge" + name) + self.last_sdfg_states[sdfg] = final_substate + decl_node = node.init + entry = {} + if isinstance(decl_node, ast_internal_classes.BinOp_Node): + if sdfg.symbols.get(decl_node.lval.name) is not None: + iter_name = decl_node.lval.name + elif self.name_mapping[sdfg].get(decl_node.lval.name) is not None: + iter_name = self.name_mapping[sdfg][decl_node.lval.name] + else: + raise ValueError("Unknown variable " + decl_node.lval.name) + entry[iter_name] = ast_utils.ProcessedWriter(sdfg, self.name_mapping).write_code(decl_node.rval) + + sdfg.add_edge(begin_state, guard_substate, InterstateEdge(assignments=entry)) + + condition = ast_utils.ProcessedWriter(sdfg, self.name_mapping).write_code(node.cond) + + increment = "i+0+1" + if isinstance(node.iter, ast_internal_classes.BinOp_Node): + increment = ast_utils.ProcessedWriter(sdfg, self.name_mapping).write_code(node.iter.rval) + entry = {iter_name: increment} + + begin_loop_state = sdfg.add_state("BeginLoop" + name) + end_loop_state = sdfg.add_state("EndLoop" + name) + self.last_sdfg_states[sdfg] = begin_loop_state + self.last_loop_continues[sdfg] = end_loop_state + self.translate(node.body, sdfg) + + sdfg.add_edge(self.last_sdfg_states[sdfg], end_loop_state, InterstateEdge()) + sdfg.add_edge(guard_substate, begin_loop_state, InterstateEdge(condition)) + sdfg.add_edge(end_loop_state, guard_substate, InterstateEdge(assignments=entry)) + sdfg.add_edge(guard_substate, final_substate, InterstateEdge(f"not ({condition})")) + self.last_sdfg_states[sdfg] = final_substate + + def symbol2sdfg(self, node: ast_internal_classes.Symbol_Decl_Node, sdfg: SDFG): + """ + This function is responsible for translating Fortran symbol declarations into a SDFG. + :param node: The node to be translated + :param sdfg: The SDFG to which the node should be translated + """ + + if self.contexts.get(sdfg.name) is None: + self.contexts[sdfg.name] = ast_utils.Context(name=sdfg.name) + if self.contexts[sdfg.name].constants.get(node.name) is None: + if isinstance(node.init, ast_internal_classes.Int_Literal_Node) or isinstance( + node.init, ast_internal_classes.Real_Literal_Node): + self.contexts[sdfg.name].constants[node.name] = node.init.value + if isinstance(node.init, ast_internal_classes.Name_Node): + self.contexts[sdfg.name].constants[node.name] = self.contexts[sdfg.name].constants[node.init.name] + datatype = self.get_dace_type(node.type) + if node.name not in sdfg.symbols: + sdfg.add_symbol(node.name, datatype) + if self.last_sdfg_states.get(sdfg) is None: + bstate = sdfg.add_state("SDFGbegin", is_start_state=True) + self.last_sdfg_states[sdfg] = bstate + if node.init is not None: + substate = sdfg.add_state(f"Dummystate_{node.name}") + increment = ast_utils.TaskletWriter([], [], sdfg, self.name_mapping).write_code(node.init) + + entry = {node.name: increment} + sdfg.add_edge(self.last_sdfg_states[sdfg], substate, InterstateEdge(assignments=entry)) + self.last_sdfg_states[sdfg] = substate + + def symbolarray2sdfg(self, node: ast_internal_classes.Symbol_Array_Decl_Node, sdfg: SDFG): + + return NotImplementedError( + "Symbol_Decl_Node not implemented. 
This should be done via a transformation that itemizes the constant array." + ) + + def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, sdfg: SDFG): + """ + This function is responsible for translating Fortran subroutine declarations into a SDFG. + :param node: The node to be translated + :param sdfg: The SDFG to which the node should be translated + """ + + if node.execution_part is None: + return + + # First get the list of read and written variables + inputnodefinder = ast_transforms.FindInputs() + inputnodefinder.visit(node) + input_vars = inputnodefinder.nodes + outputnodefinder = ast_transforms.FindOutputs() + outputnodefinder.visit(node) + output_vars = outputnodefinder.nodes + write_names = list(dict.fromkeys([i.name for i in output_vars])) + read_names = list(dict.fromkeys([i.name for i in input_vars])) + + + # Collect the parameters and the function signature to comnpare and link + parameters = node.args.copy() + + new_sdfg = SDFG(node.name.name) + substate = ast_utils.add_simple_state_to_sdfg(self, sdfg, "state" + node.name.name) + variables_in_call = [] + if self.last_call_expression.get(sdfg) is not None: + variables_in_call = self.last_call_expression[sdfg] + + # Sanity check to make sure the parameter numbers match + if not ((len(variables_in_call) == len(parameters)) or + (len(variables_in_call) == len(parameters) + 1 + and not isinstance(node.result_type, ast_internal_classes.Void))): + for i in variables_in_call: + print("VAR CALL: ", i.name) + for j in parameters: + print("LOCAL TO UPDATE: ", j.name) + raise ValueError("number of parameters does not match the function signature") + + # creating new arrays for nested sdfg + ins_in_new_sdfg = [] + outs_in_new_sdfg = [] + + views = [] + ind_count = 0 + + var2 = [] + literals = [] + literal_values = [] + par2 = [] + + symbol_arguments = [] + + # First we need to check if the parameters are literals or variables + for arg_i, variable in enumerate(variables_in_call): + # print(i.__class__) + if isinstance(variable, ast_internal_classes.Name_Node): + varname = variable.name + elif isinstance(variable, ast_internal_classes.Array_Subscript_Node): + varname = variable.name.name + if isinstance(variable, ast_internal_classes.Literal) or varname == "LITERAL": + literals.append(parameters[arg_i]) + literal_values.append(variable) + continue + elif varname in sdfg.symbols: + symbol_arguments.append((parameters[arg_i], variable)) + continue + + par2.append(parameters[arg_i]) + var2.append(variable) + + #This handles the case where the function is called with literals + variables_in_call = var2 + parameters = par2 + assigns = [] + for lit, litval in zip(literals, literal_values): + local_name = lit + assigns.append( + ast_internal_classes.BinOp_Node(lval=ast_internal_classes.Name_Node(name=local_name.name), + rval=litval, + op="=", + line_number=node.line_number)) + + # This handles the case where the function is called with symbols + for parameter, symbol in symbol_arguments: + if parameter.name != symbol.name: + assigns.append( + ast_internal_classes.BinOp_Node(lval=ast_internal_classes.Name_Node(name=parameter.name), + rval=ast_internal_classes.Name_Node(name=symbol.name), + op="=", + line_number=node.line_number)) + + # This handles the case where the function is called with variables starting with the case that the variable is local to the calling SDFG + for variable_in_call in variables_in_call: + all_arrays = self.get_arrays_in_context(sdfg) + + sdfg_name = 
self.name_mapping.get(sdfg).get(ast_utils.get_name(variable_in_call)) + globalsdfg_name = self.name_mapping.get(self.globalsdfg).get(ast_utils.get_name(variable_in_call)) + matched = False + for array_name, array in all_arrays.items(): + if array_name in [sdfg_name]: + matched = True + local_name = parameters[variables_in_call.index(variable_in_call)] + self.name_mapping[new_sdfg][local_name.name] = new_sdfg._find_new_name(local_name.name) + self.all_array_names.append(self.name_mapping[new_sdfg][local_name.name]) + if local_name.name in read_names: + ins_in_new_sdfg.append(self.name_mapping[new_sdfg][local_name.name]) + if local_name.name in write_names: + outs_in_new_sdfg.append(self.name_mapping[new_sdfg][local_name.name]) + + indices = 0 + index_list = [] + shape = [] + tmp_node = variable_in_call + strides = list(array.strides) + offsets = list(array.offset) + mysize = 1 + + if isinstance(variable_in_call, ast_internal_classes.Array_Subscript_Node): + changed_indices = 0 + for i in variable_in_call.indices: + if isinstance(i, ast_internal_classes.ParDecl_Node): + if i.type == "ALL": + shape.append(array.shape[indices]) + mysize = mysize * array.shape[indices] + else: + raise NotImplementedError("Index in ParDecl should be ALL") + else: + text = ast_utils.ProcessedWriter(sdfg, self.name_mapping).write_code(i) + index_list.append(sym.pystr_to_symbolic(text)) + strides.pop(indices - changed_indices) + offsets.pop(indices - changed_indices) + changed_indices += 1 + indices = indices + 1 + + if isinstance(variable_in_call, ast_internal_classes.Name_Node): + shape = list(array.shape) + # Functionally, this identifies the case where the array is in fact a scalar + if shape == () or shape == (1, ) or shape == [] or shape == [1]: + new_sdfg.add_scalar(self.name_mapping[new_sdfg][local_name.name], array.dtype, array.storage) + else: + # This is the case where the array is not a scalar and we need to create a view + if not isinstance(variable_in_call, ast_internal_classes.Name_Node): + offsets_zero = [] + for index in offsets: + offsets_zero.append(0) + viewname, view = sdfg.add_view(array_name + "_view_" + str(self.views), + shape, + array.dtype, + storage=array.storage, + strides=strides, + offset=offsets_zero) + from dace import subsets + + all_indices = [None] * (len(array.shape) - len(index_list)) + index_list + subset = subsets.Range([(i, i, 1) if i is not None else (1, s, 1) + for i, s in zip(all_indices, array.shape)]) + smallsubset = subsets.Range([(0, s - 1, 1) for s in shape]) + + memlet = Memlet(f'{array_name}[{subset}]->{smallsubset}') + memlet2 = Memlet(f'{viewname}[{smallsubset}]->{subset}') + wv = None + rv = None + if local_name.name in read_names: + r = substate.add_read(array_name) + wv = substate.add_write(viewname) + substate.add_edge(r, None, wv, 'views', dpcp(memlet)) + if local_name.name in write_names: + rv = substate.add_read(viewname) + w = substate.add_write(array_name) + substate.add_edge(rv, 'views2', w, None, dpcp(memlet2)) + + self.views = self.views + 1 + views.append([array_name, wv, rv, variables_in_call.index(variable_in_call)]) + + new_sdfg.add_array(self.name_mapping[new_sdfg][local_name.name], + shape, + array.dtype, + array.storage, + strides=strides, + offset=offsets) + if not matched: + # This handles the case where the function is called with global variables + for array_name, array in all_arrays.items(): + if array_name in [globalsdfg_name]: + local_name = parameters[variables_in_call.index(variable_in_call)] + 
self.name_mapping[new_sdfg][local_name.name] = new_sdfg._find_new_name(local_name.name) + self.all_array_names.append(self.name_mapping[new_sdfg][local_name.name]) + if local_name.name in read_names: + ins_in_new_sdfg.append(self.name_mapping[new_sdfg][local_name.name]) + if local_name.name in write_names: + outs_in_new_sdfg.append(self.name_mapping[new_sdfg][local_name.name]) + #inouts_in_new_sdfg.append(self.name_mapping[new_sdfg][local_name.name]) + + indices = 0 + if isinstance(variable_in_call, ast_internal_classes.Array_Subscript_Node): + indices = len(variable_in_call.indices) + + shape = array.shape[indices:] + + if shape == () or shape == (1, ): + new_sdfg.add_scalar(self.name_mapping[new_sdfg][local_name.name], array.dtype, + array.storage) + else: + new_sdfg.add_array(self.name_mapping[new_sdfg][local_name.name], + shape, + array.dtype, + array.storage, + strides=array.strides, + offset=array.offset) + + # Preparing symbol dictionary for nested sdfg + sym_dict = {} + for i in sdfg.symbols: + sym_dict[i] = i + + not_found_write_names = [] + not_found_read_names = [] + for i in write_names: + if self.name_mapping[new_sdfg].get(i) is None: + not_found_write_names.append(i) + for i in read_names: + if self.name_mapping[new_sdfg].get(i) is None: + not_found_read_names.append(i) + + # This handles the library states that are needed to inject dataflow to prevent library calls from being reordered + # Currently not sufficient for all cases + for i in self.libstates: + self.name_mapping[new_sdfg][i] = new_sdfg._find_new_name(i) + self.all_array_names.append(self.name_mapping[new_sdfg][i]) + if i in read_names: + ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + if i in write_names: + outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], dtypes.int32, transient=False) + addedmemlets = [] + globalmemlets = [] + # This handles the case where the function is called with read variables found in a module + for i in not_found_read_names: + if i in [a[0] for a in self.module_vars]: + if self.name_mapping[sdfg].get(i) is not None: + self.name_mapping[new_sdfg][i] = new_sdfg._find_new_name(i) + addedmemlets.append(i) + self.all_array_names.append(self.name_mapping[new_sdfg][i]) + if i in read_names: + ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + if i in write_names: + outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + array_in_global = sdfg.arrays[self.name_mapping[sdfg][i]] + if isinstance(array_in_global, Scalar): + new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], array_in_global.dtype, transient=False) + elif array_in_global.type == "Array": + new_sdfg.add_array(self.name_mapping[new_sdfg][i], + array_in_global.shape, + array_in_global.dtype, + array_in_global.storage, + transient=False, + strides=array_in_global.strides, + offset=array_in_global.offset) + elif self.name_mapping[self.globalsdfg].get(i) is not None: + self.name_mapping[new_sdfg][i] = new_sdfg._find_new_name(i) + globalmemlets.append(i) + self.all_array_names.append(self.name_mapping[new_sdfg][i]) + if i in read_names: + ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + if i in write_names: + outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + #inouts_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + array_in_global = self.globalsdfg.arrays[self.name_mapping[self.globalsdfg][i]] + if isinstance(array_in_global, Scalar): + new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], array_in_global.dtype, transient=False) + elif 
array_in_global.type == "Array": + new_sdfg.add_array(self.name_mapping[new_sdfg][i], + array_in_global.shape, + array_in_global.dtype, + array_in_global.storage, + transient=False, + strides=array_in_global.strides, + offset=array_in_global.offset) + # This handles the case where the function is called with wrriten but not read variables found in a module + for i in not_found_write_names: + if i in not_found_read_names: + continue + if i in [a[0] for a in self.module_vars]: + if self.name_mapping[sdfg].get(i) is not None: + self.name_mapping[new_sdfg][i] = new_sdfg._find_new_name(i) + addedmemlets.append(i) + self.all_array_names.append(self.name_mapping[new_sdfg][i]) + if i in read_names: + ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + if i in write_names: + outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + #inouts_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + array = sdfg.arrays[self.name_mapping[sdfg][i]] + if isinstance(array_in_global, Scalar): + new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], array_in_global.dtype, transient=False) + elif array_in_global.type == "Array": + new_sdfg.add_array(self.name_mapping[new_sdfg][i], + array_in_global.shape, + array_in_global.dtype, + array_in_global.storage, + transient=False, + strides=array_in_global.strides, + offset=array_in_global.offset) + elif self.name_mapping[self.globalsdfg].get(i) is not None: + self.name_mapping[new_sdfg][i] = new_sdfg._find_new_name(i) + globalmemlets.append(i) + self.all_array_names.append(self.name_mapping[new_sdfg][i]) + if i in read_names: + ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + if i in write_names: + outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + #inouts_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + array = self.globalsdfg.arrays[self.name_mapping[self.globalsdfg][i]] + if isinstance(array_in_global, Scalar): + new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], array_in_global.dtype, transient=False) + elif array_in_global.type == "Array": + new_sdfg.add_array(self.name_mapping[new_sdfg][i], + array_in_global.shape, + array_in_global.dtype, + array_in_global.storage, + transient=False, + strides=array_in_global.strides, + offset=array_in_global.offset) + + internal_sdfg = substate.add_nested_sdfg(new_sdfg, + sdfg, + ins_in_new_sdfg, + outs_in_new_sdfg, + symbol_mapping=sym_dict) + + # Now adding memlets + for i in self.libstates: + memlet = "0" + if i in write_names: + ast_utils.add_memlet_write(substate, self.name_mapping[sdfg][i], internal_sdfg, + self.name_mapping[new_sdfg][i], memlet) + if i in read_names: + ast_utils.add_memlet_read(substate, self.name_mapping[sdfg][i], internal_sdfg, + self.name_mapping[new_sdfg][i], memlet) + + for i in variables_in_call: + + local_name = parameters[variables_in_call.index(i)] + if self.name_mapping.get(sdfg).get(ast_utils.get_name(i)) is not None: + var = sdfg.arrays.get(self.name_mapping[sdfg][ast_utils.get_name(i)]) + mapped_name = self.name_mapping[sdfg][ast_utils.get_name(i)] + # TODO: FIx symbols in function calls + elif ast_utils.get_name(i) in sdfg.symbols: + var = ast_utils.get_name(i) + mapped_name = ast_utils.get_name(i) + elif self.name_mapping.get(self.globalsdfg).get(ast_utils.get_name(i)) is not None: + var = self.globalsdfg.arrays.get(self.name_mapping[self.globalsdfg][ast_utils.get_name(i)]) + mapped_name = self.name_mapping[self.globalsdfg][ast_utils.get_name(i)] + else: + raise NameError("Variable name not found: " + ast_utils.get_name(i)) + + # print("Context change:",i.name," 
",var.shape) + if not hasattr(var, "shape") or len(var.shape) == 0: + memlet = "" + elif (len(var.shape) == 1 and var.shape[0] == 1): + memlet = "0" + else: + memlet = ast_utils.generate_memlet(i, sdfg, self) + + found = False + for elem in views: + if mapped_name == elem[0] and elem[3] == variables_in_call.index(i): + found = True + + if local_name.name in write_names: + memlet = subsets.Range([(0, s - 1, 1) for s in sdfg.arrays[elem[2].label].shape]) + substate.add_memlet_path(internal_sdfg, + elem[2], + src_conn=self.name_mapping[new_sdfg][local_name.name], + memlet=Memlet(expr=elem[2].label, subset=memlet)) + if local_name.name in read_names: + memlet = subsets.Range([(0, s - 1, 1) for s in sdfg.arrays[elem[1].label].shape]) + substate.add_memlet_path(elem[1], + internal_sdfg, + dst_conn=self.name_mapping[new_sdfg][local_name.name], + memlet=Memlet(expr=elem[1].label, subset=memlet)) + + if not found: + if local_name.name in write_names: + ast_utils.add_memlet_write(substate, mapped_name, internal_sdfg, + self.name_mapping[new_sdfg][local_name.name], memlet) + if local_name.name in read_names: + ast_utils.add_memlet_read(substate, mapped_name, internal_sdfg, + self.name_mapping[new_sdfg][local_name.name], memlet) + + for i in addedmemlets: + + memlet = ast_utils.generate_memlet(ast_internal_classes.Name_Node(name=i), sdfg, self) + if local_name.name in write_names: + ast_utils.add_memlet_write(substate, self.name_mapping[sdfg][i], internal_sdfg, + self.name_mapping[new_sdfg][i], memlet) + if local_name.name in read_names: + ast_utils.add_memlet_read(substate, self.name_mapping[sdfg][i], internal_sdfg, + self.name_mapping[new_sdfg][i], memlet) + for i in globalmemlets: + + memlet = ast_utils.generate_memlet(ast_internal_classes.Name_Node(name=i), sdfg, self) + if local_name.name in write_names: + ast_utils.add_memlet_write(substate, self.name_mapping[self.globalsdfg][i], internal_sdfg, + self.name_mapping[new_sdfg][i], memlet) + if local_name.name in read_names: + ast_utils.add_memlet_read(substate, self.name_mapping[self.globalsdfg][i], internal_sdfg, + self.name_mapping[new_sdfg][i], memlet) + + #Finally, now that the nested sdfg is built and the memlets are added, we can parse the internal of the subroutine and add it to the SDFG. + + if node.execution_part is not None: + for j in node.specification_part.uses: + for k in j.list: + if self.contexts.get(new_sdfg.name) is None: + self.contexts[new_sdfg.name] = ast_utils.Context(name=new_sdfg.name) + if self.contexts[new_sdfg.name].constants.get( + ast_utils.get_name(k)) is None and self.contexts[self.globalsdfg.name].constants.get( + ast_utils.get_name(k)) is not None: + self.contexts[new_sdfg.name].constants[ast_utils.get_name(k)] = self.contexts[ + self.globalsdfg.name].constants[ast_utils.get_name(k)] + + pass + for j in node.specification_part.specifications: + self.declstmt2sdfg(j, new_sdfg) + for i in assigns: + self.translate(i, new_sdfg) + self.translate(node.execution_part, new_sdfg) + + + def binop2sdfg(self, node: ast_internal_classes.BinOp_Node, sdfg: SDFG): + """ + This parses binary operations to tasklets in a new state or creates + a function call with a nested SDFG if the operation is a function + call rather than a simple assignment. 
+ :param node: The node to be translated + :param sdfg: The SDFG to which the node should be translated + """ + + calls = ast_transforms.FindFunctionCalls() + calls.visit(node) + if len(calls.nodes) == 1: + augmented_call = calls.nodes[0] + if augmented_call.name.name not in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh", "__dace_epsilon"]: + augmented_call.args.append(node.lval) + augmented_call.hasret = True + self.call2sdfg(augmented_call, sdfg) + return + + outputnodefinder = ast_transforms.FindOutputs() + outputnodefinder.visit(node) + output_vars = outputnodefinder.nodes + output_names = [] + output_names_tasklet = [] + + for i in output_vars: + mapped_name = self.get_name_mapping_in_context(sdfg).get(i.name) + arrays = self.get_arrays_in_context(sdfg) + + if mapped_name in arrays and mapped_name not in output_names: + output_names.append(mapped_name) + output_names_tasklet.append(i.name) + + inputnodefinder = ast_transforms.FindInputs() + inputnodefinder.visit(node) + input_vars = inputnodefinder.nodes + input_names = [] + input_names_tasklet = [] + + for i in input_vars: + mapped_name = self.get_name_mapping_in_context(sdfg).get(i.name) + arrays = self.get_arrays_in_context(sdfg) + if i.name in sdfg.symbols: + continue + if mapped_name in arrays: # and mapped_name not in input_names: + count = input_names.count(mapped_name) + input_names.append(mapped_name) + input_names_tasklet.append(i.name + "_" + str(count) + "_in") + + substate = ast_utils.add_simple_state_to_sdfg( + self, sdfg, "_state_l" + str(node.line_number[0]) + "_c" + str(node.line_number[1])) + + #input_names_tasklet = [i_t + "_in" for i_t in input_names] + output_names_changed = [o_t + "_out" for o_t in output_names] + #output_names_changed = [o_t for o_t in output_names_tasklet] + #output_names_dict = {on: dace.pointer(dace.int32) for on in output_names_changed} + + tasklet = ast_utils.add_tasklet(substate, "_l" + str(node.line_number[0]) + "_c" + str(node.line_number[1]), + input_names_tasklet, output_names_changed, "text", node.line_number, + self.file_name) + + for i, j in zip(input_names, input_names_tasklet): + memlet_range = self.get_memlet_range(sdfg, input_vars, i, j) + ast_utils.add_memlet_read(substate, i, tasklet, j, memlet_range) + + for i, j, k in zip(output_names, output_names_tasklet, output_names_changed): + + memlet_range = self.get_memlet_range(sdfg, output_vars, i, j) + ast_utils.add_memlet_write(substate, i, tasklet, k, memlet_range) + tw = ast_utils.TaskletWriter(output_names, output_names_changed, sdfg, self.name_mapping, input_names, + input_names_tasklet) + + text = tw.write_code(node) + tasklet.code = CodeBlock(text, lang.Python) + + + def call2sdfg(self, node: ast_internal_classes.Call_Expr_Node, sdfg: SDFG): + """ + This parses function calls to a nested SDFG + or creates a tasklet with an external library call. 
+ :param node: The node to be translated + :param sdfg: The SDFG to which the node should be translated + """ + + self.last_call_expression[sdfg] = node.args + match_found = False + rettype = "INTEGER" + hasret = False + if node.name in self.functions_and_subroutines: + for i in self.top_level.function_definitions: + if i.name == node.name: + self.function2sdfg(i, sdfg) + return + for i in self.top_level.subroutine_definitions: + if i.name == node.name: + self.subroutine2sdfg(i, sdfg) + return + for j in self.top_level.modules: + for i in j.function_definitions: + if i.name == node.name: + self.function2sdfg(i, sdfg) + return + for i in j.subroutine_definitions: + if i.name == node.name: + self.subroutine2sdfg(i, sdfg) + return + else: + # This part handles the case that it's an external library call + libstate = self.libraries.get(node.name.name) + if not isinstance(rettype, ast_internal_classes.Void) and hasattr(node, "hasret"): + if node.hasret: + hasret = True + retval = node.args.pop(len(node.args) - 1) + if node.name == "free": + return + input_names_tasklet = {} + output_names_tasklet = [] + input_names = [] + output_names = [] + special_list_in = {} + special_list_out = [] + if libstate is not None: + special_list_in[self.name_mapping[sdfg][libstate] + "_task"] = dtypes.pointer( + sdfg.arrays.get(self.name_mapping[sdfg][libstate]).dtype) + special_list_out.append(self.name_mapping[sdfg][libstate] + "_task_out") + used_vars = [ + node for node in ast_transforms.mywalk(node) if isinstance(node, ast_internal_classes.Name_Node) + ] + + for i in used_vars: + for j in sdfg.arrays: + if self.name_mapping.get(sdfg).get(i.name) == j and j not in input_names: + elem = sdfg.arrays.get(j) + scalar = False + if len(elem.shape) == 0: + scalar = True + elif (len(elem.shape) == 1 and elem.shape[0] == 1): + scalar = True + if not scalar and not node.name.name in ["fprintf", "printf"]: + output_names.append(j) + output_names_tasklet.append(i.name) + + input_names_tasklet[i.name] = dtypes.pointer(elem.dtype) + input_names.append(j) + + output_names_changed = [] + for o, o_t in zip(output_names, output_names_tasklet): + output_names_changed.append(o_t + "_out") + + tw = ast_utils.TaskletWriter(output_names_tasklet.copy(), output_names_changed.copy(), sdfg, + self.name_mapping) + if not isinstance(rettype, ast_internal_classes.Void) and hasret: + special_list_in[retval.name] = pointer(self.get_dace_type(rettype)) + special_list_out.append(retval.name + "_out") + text = tw.write_code( + ast_internal_classes.BinOp_Node(lval=retval, op="=", rval=node, line_number=node.line_number)) + + else: + text = tw.write_code(node) + substate = ast_utils.add_simple_state_to_sdfg(self, sdfg, "_state" + str(node.line_number[0])) + + tasklet = ast_utils.add_tasklet(substate, str(node.line_number[0]), { + **input_names_tasklet, + **special_list_in + }, output_names_changed + special_list_out, "text", node.line_number, self.file_name) + if libstate is not None: + ast_utils.add_memlet_read(substate, self.name_mapping[sdfg][libstate], tasklet, + self.name_mapping[sdfg][libstate] + "_task", "0") + + ast_utils.add_memlet_write(substate, self.name_mapping[sdfg][libstate], tasklet, + self.name_mapping[sdfg][libstate] + "_task_out", "0") + if not isinstance(rettype, ast_internal_classes.Void) and hasret: + ast_utils.add_memlet_read(substate, self.name_mapping[sdfg][retval.name], tasklet, retval.name, "0") + + ast_utils.add_memlet_write(substate, self.name_mapping[sdfg][retval.name], tasklet, + retval.name + "_out", "0") + + for 
i, j in zip(input_names, input_names_tasklet): + memlet_range = self.get_memlet_range(sdfg, used_vars, i, j) + ast_utils.add_memlet_read(substate, i, tasklet, j, memlet_range) + + for i, j, k in zip(output_names, output_names_tasklet, output_names_changed): + + memlet_range = self.get_memlet_range(sdfg, used_vars, i, j) + ast_utils.add_memlet_write(substate, i, tasklet, k, memlet_range) + + setattr(tasklet, "code", CodeBlock(text, lang.Python)) + + def declstmt2sdfg(self, node: ast_internal_classes.Decl_Stmt_Node, sdfg: SDFG): + """ + This function translates a variable declaration statement to an access node on the sdfg + :param node: The node to translate + :param sdfg: The sdfg to attach the access node to + :note This function is the top level of the declaration, most implementation is in vardecl2sdfg + """ + for i in node.vardecl: + self.translate(i, sdfg) + + def vardecl2sdfg(self, node: ast_internal_classes.Var_Decl_Node, sdfg: SDFG): + """ + This function translates a variable declaration to an access node on the sdfg + :param node: The node to translate + :param sdfg: The sdfg to attach the access node to + + """ + #if the sdfg is the toplevel-sdfg, the variable is a global variable + transient = True + # find the type + datatype = self.get_dace_type(node.type) + # get the dimensions + if node.sizes is not None: + sizes = [] + offset = [] + offset_value = -1 + for i in node.sizes: + tw = ast_utils.TaskletWriter([], [], sdfg, self.name_mapping) + text = tw.write_code(i) + sizes.append(sym.pystr_to_symbolic(text)) + offset.append(offset_value) + + else: + sizes = None + # create and check name - if variable is already defined (function argument and defined in declaration part) simply stop + if self.name_mapping[sdfg].get(node.name) is not None: + return + + if node.name in sdfg.symbols: + return + + self.name_mapping[sdfg][node.name] = sdfg._find_new_name(node.name) + + if sizes is None: + sdfg.add_scalar(self.name_mapping[sdfg][node.name], dtype=datatype, transient=transient) + else: + strides = [dat._prod(sizes[:i]) for i in range(len(sizes))] + sdfg.add_array(self.name_mapping[sdfg][node.name], + shape=sizes, + dtype=datatype, + offset=offset, + strides=strides, + transient=transient) + + self.all_array_names.append(self.name_mapping[sdfg][node.name]) + if self.contexts.get(sdfg.name) is None: + self.contexts[sdfg.name] = ast_utils.Context(name=sdfg.name) + if node.name not in self.contexts[sdfg.name].containers: + self.contexts[sdfg.name].containers.append(node.name) + + +def create_sdfg_from_string( + source_string: str, + sdfg_name: str, +): + """ + Creates an SDFG from a fortran file in a string + :param source_string: The fortran file as a string + :param sdfg_name: The name to be given to the resulting SDFG + :return: The resulting SDFG + + """ + parser = pf().create(std="f2008") + reader = fsr(source_string) + ast = parser(reader) + tables = SymbolTable + own_ast = ast_components.InternalFortranAst(ast, tables) + program = own_ast.create_ast(ast) + functions_and_subroutines_builder = ast_transforms.FindFunctionAndSubroutines() + functions_and_subroutines_builder.visit(program) + own_ast.functions_and_subroutines = functions_and_subroutines_builder.nodes + program = ast_transforms.functionStatementEliminator(program) + program = ast_transforms.CallToArray(functions_and_subroutines_builder.nodes).visit(program) + program = ast_transforms.CallExtractor().visit(program) + program = ast_transforms.SignToIf().visit(program) + program = ast_transforms.ArrayToLoop().visit(program) 
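    # The remaining passes below appear to finish normalizing the AST before translation:
    # SumToLoop presumably lowers SUM intrinsics into explicit loops, ForDeclarer makes the
    # DO-loop iteration variables explicit declarations, and IndexExtractor hoists array
    # index expressions into separate statements so the translator can emit simple memlets.
    # These descriptions are inferred from the pass names, not from their implementations.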
+ program = ast_transforms.SumToLoop().visit(program) + program = ast_transforms.ForDeclarer().visit(program) + program = ast_transforms.IndexExtractor().visit(program) + ast2sdfg = AST_translator(own_ast, __file__) + sdfg = SDFG(sdfg_name) + ast2sdfg.top_level = program + ast2sdfg.globalsdfg = sdfg + ast2sdfg.translate(program, sdfg) + + for node, parent in sdfg.all_nodes_recursive(): + if isinstance(node, nodes.NestedSDFG): + if 'test_function' in node.sdfg.name: + sdfg = node.sdfg + break + sdfg.parent = None + sdfg.parent_sdfg = None + sdfg.parent_nsdfg_node = None + sdfg.reset_sdfg_list() + return sdfg + + +def create_sdfg_from_fortran_file(source_string: str): + """ + Creates an SDFG from a fortran file + :param source_string: The fortran file name + :return: The resulting SDFG + + """ + parser = pf().create(std="f2008") + reader = ffr(source_string) + ast = parser(reader) + tables = SymbolTable + own_ast = ast_components.InternalFortranAst(ast, tables) + program = own_ast.create_ast(ast) + functions_and_subroutines_builder = ast_transforms.FindFunctionAndSubroutines() + functions_and_subroutines_builder.visit(program) + own_ast.functions_and_subroutines = functions_and_subroutines_builder.nodes + program = ast_transforms.functionStatementEliminator(program) + program = ast_transforms.CallToArray(functions_and_subroutines_builder.nodes).visit(program) + program = ast_transforms.CallExtractor().visit(program) + program = ast_transforms.SignToIf().visit(program) + program = ast_transforms.ArrayToLoop().visit(program) + program = ast_transforms.SumToLoop().visit(program) + program = ast_transforms.ForDeclarer().visit(program) + program = ast_transforms.IndexExtractor().visit(program) + ast2sdfg = AST_translator(own_ast, __file__) + sdfg = SDFG(source_string) + ast2sdfg.top_level = program + ast2sdfg.globalsdfg = sdfg + ast2sdfg.translate(program, sdfg) + + return sdfg From b7a8b5f001f93f717659a8e011cf8f06e745c526 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 12 Jun 2023 00:22:37 -0700 Subject: [PATCH 104/392] Bump version --- dace/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/version.py b/dace/version.py index 62ea085b13..9b67b07d2f 100644 --- a/dace/version.py +++ b/dace/version.py @@ -1 +1 @@ -__version__ = '0.14.3' +__version__ = '0.14.4' From 49e8dde46a8a887f4004d09911d9d2b6bd3b3dec Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Mon, 12 Jun 2023 22:45:31 +0200 Subject: [PATCH 105/392] reverting incorrect extSDFG addition --- dace/sdfg/nodes.py | 144 --------------------------------------------- dace/sdfg/sdfg.py | 5 -- dace/sdfg/state.py | 69 ---------------------- 3 files changed, 218 deletions(-) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 4bb9f8f6b2..866d77bed6 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -645,150 +645,6 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None): # ------------------------------------------------------------------------------ -@make_properties -class ExternalNestedSDFG(CodeNode): - """ An SDFG state node that will contain an SDFG of its own. It has outside connectors, but lacks the nestedSDFG. - This node is used to represent a nested SDFG that is not yet defined, but will be defined later. - - :note: A nested SDFG cannot create recursion (one of its parent SDFGs). 
- """ - - # NOTE: We cannot use SDFG as the type because of an import loop - sdfg = SDFGReferenceProperty(desc="The SDFG", allow_none=True) - schedule = EnumProperty(dtype=dtypes.ScheduleType, - desc="SDFG schedule", - allow_none=True, - default=dtypes.ScheduleType.Default) - symbol_mapping = DictProperty(key_type=str, - value_type=dace.symbolic.pystr_to_symbolic, - desc="Mapping between internal symbols and their values, expressed as " - "symbolic expressions") - debuginfo = DebugInfoProperty() - is_collapsed = Property(dtype=bool, desc="Show this node/scope/state as collapsed", default=False) - - instrument = EnumProperty(dtype=dtypes.InstrumentationType, - desc="Measure execution statistics with given method", - default=dtypes.InstrumentationType.No_Instrumentation) - - no_inline = Property(dtype=bool, - desc="If True, this nested SDFG will not be inlined during " - "simplification", - default=False) - - unique_name = Property(dtype=str, desc="Unique name of the SDFG", default="") - - def __init__(self, - label, - sdfg, - inputs: Set[str], - outputs: Set[str], - symbol_mapping: Dict[str, Any] = None, - schedule=dtypes.ScheduleType.Default, - location=None, - debuginfo=None): - from dace.sdfg import SDFG - super(ExternalNestedSDFG, self).__init__(label, location, inputs, outputs) - - # Properties - self.sdfg: SDFG = sdfg - self.symbol_mapping = symbol_mapping or {} - self.schedule = schedule - self.debuginfo = debuginfo - - def __deepcopy__(self, memo): - cls = self.__class__ - result = cls.__new__(cls) - memo[id(self)] = result - for k, v in self.__dict__.items(): - setattr(result, k, dcpy(v, memo)) - if result._sdfg is not None: - result._sdfg.parent_nsdfg_node = result - return result - - @staticmethod - def from_json(json_obj, context=None): - from dace import SDFG # Avoid import loop - - # We have to load the SDFG first. 
- ret = NestedSDFG("nolabel", SDFG('nosdfg'), {}, {}) - - dace.serialize.set_properties_from_json(ret, json_obj, context) - - if context and 'sdfg_state' in context: - ret.sdfg.parent = context['sdfg_state'] - if context and 'sdfg' in context: - ret.sdfg.parent_sdfg = context['sdfg'] - - ret.sdfg.parent_nsdfg_node = ret - - ret.sdfg.update_sdfg_list([]) - - return ret - - @property - def free_symbols(self) -> Set[str]: - return set().union(*(map(str, - pystr_to_symbolic(v).free_symbols) for v in self.symbol_mapping.values()), - *(map(str, - pystr_to_symbolic(v).free_symbols) for v in self.location.values())) - - def infer_connector_types(self, sdfg, state): - # Avoid import loop - from dace.sdfg.infer_types import infer_connector_types, infer_aliasing - - # Propagate aliasing information into SDFG - infer_aliasing(self, sdfg, state) - - # Infer internal connector types - infer_connector_types(self.sdfg) - - def __str__(self): - if not self.label: - return "SDFG" - else: - return self.label - - def validate(self, sdfg, state, references: Optional[Set[int]] = None): - if not dtypes.validate_name(self.label): - raise NameError('Invalid nested SDFG name "%s"' % self.label) - for in_conn in self.in_connectors: - if not dtypes.validate_name(in_conn): - raise NameError('Invalid input connector "%s"' % in_conn) - for out_conn in self.out_connectors: - if not dtypes.validate_name(out_conn): - raise NameError('Invalid output connector "%s"' % out_conn) - connectors = self.in_connectors.keys() | self.out_connectors.keys() - for conn in connectors: - if conn not in self.sdfg.arrays: - raise NameError( - f'Connector "{conn}" was given but is not a registered data descriptor in the nested SDFG. ' - 'Example: parameter passed to a function without a matching array within it.') - for dname, desc in self.sdfg.arrays.items(): - # TODO(later): Disallow scalars without access nodes (so that this - # check passes for them too). - if isinstance(desc, data.Scalar): - continue - if not desc.transient and dname not in connectors: - raise NameError('Data descriptor "%s" not found in nested SDFG connectors' % dname) - if dname in connectors and desc.transient: - raise NameError('"%s" is a connector but its corresponding array is transient' % dname) - - # Validate undefined symbols - symbols = set(k for k in self.sdfg.free_symbols if k not in connectors) - missing_symbols = [s for s in symbols if s not in self.symbol_mapping] - if missing_symbols: - raise ValueError('Missing symbols on nested SDFG: %s' % (missing_symbols)) - extra_symbols = self.symbol_mapping.keys() - symbols - if len(extra_symbols) > 0: - # TODO: Elevate to an error? - warnings.warn(f"{self.label} maps to unused symbol(s): {extra_symbols}") - - # Recursively validate nested SDFG - self.sdfg.validate(references) - -# ------------------------------------------------------------------------------ - - # Scope entry class class EntryNode(Node): """ A type of node that opens a scope (e.g., Map or Consume). """ diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index f0d38b2081..bee601e7b1 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2692,8 +2692,3 @@ def make_array_memlet(self, array: str): :return: a Memlet that fully transfers array """ return dace.Memlet.from_array(array, self.data(array)) - -@make_properties -class SDFGShell(SDFG): - """ A shell SDFG that allows inputs, outputs and SDFG properties but does not contain the actual SDFG. Can be transformed into an SDFG by loading in the actual content. 
- """ diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index bd5a5f2205..0796bf00d0 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -1174,75 +1174,6 @@ def add_nested_sdfg( sdfg.add_symbol(sym, infer_expr_type(symval, self.parent.symbols) or dtypes.typeclass(int)) return s - - def add_external_nested_sdfg( - self, - sdfg: 'dace.sdfg.SDFG', - parent, - inputs: Union[Set[str], Dict[str, dtypes.typeclass]], - outputs: Union[Set[str], Dict[str, dtypes.typeclass]], - symbol_mapping: Dict[str, Any] = None, - name=None, - schedule=dtypes.ScheduleType.Default, - location=None, - debuginfo=None, - ): - """ Adds an external nested SDFG to the SDFG state. """ - if name is None: - name = sdfg.label - debuginfo = _getdebuginfo(debuginfo or self._default_lineinfo) - - sdfg.parent = self - sdfg.parent_sdfg = self.parent - - sdfg.update_sdfg_list([]) - - # Make dictionary of autodetect connector types from set - if isinstance(inputs, (set, collections.abc.KeysView)): - inputs = {k: None for k in inputs} - if isinstance(outputs, (set, collections.abc.KeysView)): - outputs = {k: None for k in outputs} - - s = nd.ExternalNestedSDFG( - name, - sdfg, - inputs, - outputs, - symbol_mapping=symbol_mapping, - schedule=schedule, - location=location, - debuginfo=debuginfo, - ) - self.add_node(s) - - sdfg.parent_nsdfg_node = s - - # Add "default" undefined symbols if None are given - symbols = sdfg.free_symbols - if symbol_mapping is None: - symbol_mapping = {s: s for s in symbols} - s.symbol_mapping = symbol_mapping - - # Validate missing symbols - missing_symbols = [s for s in symbols if s not in symbol_mapping] - if missing_symbols and parent: - # If symbols are missing, try to get them from the parent SDFG - parent_mapping = {s: s for s in missing_symbols if s in parent.symbols} - symbol_mapping.update(parent_mapping) - s.symbol_mapping = symbol_mapping - missing_symbols = [s for s in symbols if s not in symbol_mapping] - if missing_symbols: - raise ValueError('Missing symbols on nested SDFG "%s": %s' % (name, missing_symbols)) - - # Add new global symbols to nested SDFG - from dace.codegen.tools.type_inference import infer_expr_type - for sym, symval in s.symbol_mapping.items(): - if sym not in sdfg.symbols: - # TODO: Think of a better way to avoid calling - # symbols_defined_at in this moment - sdfg.add_symbol(sym, infer_expr_type(symval, self.parent.symbols) or dtypes.typeclass(int)) - - return s def add_map( self, From 58bc67c04130056098a2b428f4ce2407491fb668 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Tue, 13 Jun 2023 00:38:54 +0200 Subject: [PATCH 106/392] adding tests for fortran frontend --- dace/cli/fcdc.py | 62 +++++++ dace/frontend/fortran/__init__.py | 0 dace/sdfg/utils.py | 175 ++++++++++++++++-- tests/fortran/array_test.py | 162 +++++++++++++++++ tests/fortran/dace_support_test.py | 51 ++++++ tests/fortran/fortran_language_test.py | 240 +++++++++++++++++++++++++ tests/fortran/view_test.py | 180 +++++++++++++++++++ 7 files changed, 851 insertions(+), 19 deletions(-) create mode 100644 dace/cli/fcdc.py create mode 100644 dace/frontend/fortran/__init__.py create mode 100644 tests/fortran/array_test.py create mode 100644 tests/fortran/dace_support_test.py create mode 100644 tests/fortran/fortran_language_test.py create mode 100644 tests/fortran/view_test.py diff --git a/dace/cli/fcdc.py b/dace/cli/fcdc.py new file mode 100644 index 0000000000..04f835d815 --- /dev/null +++ b/dace/cli/fcdc.py @@ -0,0 +1,62 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. 
All rights reserved. +""" Simple Fortran SDFG command-line compiler. """ + +import dace +import os +import sys +import argparse +import shutil +from dace.frontend.fortran import fortran_parser + +def main(): + # Command line options parser + parser = argparse.ArgumentParser(description='Fortran to SDFG command-line transpiler.') + + # Required argument for Fortran file path + parser.add_argument('filepath', help='', type=str) + + # Optional argument for output location + parser.add_argument('-o', + '--out', + type=str, + help='If provided, saves library as the given file or in the specified path, ' + 'together with a header file.') + + parser.add_argument('-O', + '--optimize', + dest='optimize', + action='store_true', + help="If set, invokes the command-line optimization" + " interface", + default=False) + + args = parser.parse_args() + + filepath = args.filepath + if not os.path.isfile(filepath): + print('Fortran file', filepath, 'not found') + exit(1) + + outpath = args.out + + # Load SDFG + sdfg = fortran_parser.create_sdfg_from_fortran_file(filepath) + + if args.optimize: + sdfg.optimize() + + # Compile SDFG + sdfg.compile(outpath) + + # Copying header file to optional path + if outpath is not None: + source = os.path.join(sdfg.build_folder, 'include', sdfg.name + '.h') + if os.path.isdir(outpath): + outpath = os.path.join(outpath, sdfg.name + '.h') + else: + outpath = os.path.join(os.path.dirname(outpath), sdfg.name + '.h') + shutil.copyfile(source, outpath) + + +if __name__ == '__main__': + main() diff --git a/dace/frontend/fortran/__init__.py b/dace/frontend/fortran/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 0b62c96c0b..93ff2c79e8 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -15,7 +15,7 @@ from dace.sdfg.nodes import Node, NestedSDFG from dace.sdfg.state import SDFGState, StateSubgraphView from dace.sdfg.scope import ScopeSubgraphView -from dace.sdfg import nodes as nd, graph as gr +from dace.sdfg import nodes as nd, graph as gr, propagation from dace import config, data as dt, dtypes, memlet as mm, subsets as sbs, symbolic from dace.cli.progress import optional_progressbar from string import ascii_uppercase @@ -461,7 +461,7 @@ def merge_maps( inner_map_exit: nd.MapExit, param_merge: Callable[[ParamsType, ParamsType], ParamsType] = lambda p1, p2: p1 + p2, range_merge: Callable[[RangesType, RangesType], RangesType] = lambda r1, r2: type(r1)(r1.ranges + r2.ranges) -) -> (nd.MapEntry, nd.MapExit): +) -> Tuple[nd.MapEntry, nd.MapExit]: """ Merges two maps (their entries and exits). It is assumed that the operation is valid. """ @@ -825,12 +825,6 @@ def get_view_edge(state: SDFGState, view: nd.AccessNode) -> gr.MultiConnectorEdg in_edge = in_edges[0] out_edge = out_edges[0] - # Check if there is a 'views' connector - if in_edge.dst_conn and in_edge.dst_conn == 'views': - return in_edge - if out_edge.src_conn and out_edge.src_conn == 'views': - return out_edge - # If there is one incoming and one outgoing edge, and one leads to a code # node, the one that leads to an access node is the viewed data. 
inmpath = state.memlet_path(in_edge) @@ -857,6 +851,12 @@ def get_view_edge(state: SDFGState, view: nd.AccessNode) -> gr.MultiConnectorEdg if in_edge.data.data == view.data and out_edge.data.data == view.data: return None + # Check if there is a 'views' connector + if in_edge.dst_conn and in_edge.dst_conn == 'views': + return in_edge + if out_edge.src_conn and out_edge.src_conn == 'views': + return out_edge + # If both memlets' data are the respective access nodes, the access # node at the highest scope is the one that is viewed. if isinstance(in_edge.src, nd.EntryNode): @@ -1501,19 +1501,13 @@ def is_fpga_kernel(sdfg, state): if ("is_FPGA_kernel" in state.location and state.location["is_FPGA_kernel"] == False): return False data_nodes = state.data_nodes() - at_least_one_fpga_array = False + if len(data_nodes) == 0: + return False for n in data_nodes: - desc = n.desc(sdfg) - if desc.storage in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, - dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): - at_least_one_fpga_array = True - if isinstance(desc, dt.Scalar): - continue - if desc.storage not in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, - dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): + if n.desc(sdfg).storage not in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, + dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): return False - - return at_least_one_fpga_array + return True def postdominators( @@ -1651,3 +1645,146 @@ def check_sdfg(sdfg: SDFG): assert node.sdfg.parent_sdfg is sdfg assert node.sdfg.parent.parent is sdfg check_sdfg(node.sdfg) + + +def normalize_offsets(sdfg: SDFG): + """ + Normalizes descriptor offsets to 0 and adjusts the Memlet subsets accordingly. This operation is done in-place. + + :param sdfg: The SDFG to be normalized. + """ + + import ast + from dace.frontend.python import astutils + + for sd in sdfg.all_sdfgs_recursive(): + offsets = dict() + for arrname, arrdesc in sd.arrays.items(): + if not isinstance(arrdesc, dt.Array): # NOTE: Does this work with Views properly? + continue + if any(o != 0 for o in arrdesc.offset): + offsets[arrname] = arrdesc.offset + arrdesc.offset = [0] * len(arrdesc.shape) + if offsets: + for e in sd.edges(): + memlets = e.data.get_read_memlets(sd.arrays) + for m in memlets: + if m.data in offsets: + m.subset.offset(offsets[m.data], False) + for node in ast.walk(e.data.condition.code[0]): + if isinstance(node, ast.Subscript): + m = memlets.pop(0) + subscript: ast.Subscript = ast.parse(str(m)).body[0].value + assert isinstance(node.value, ast.Name) and node.value.id == m.data + node.slice = ast.copy_location(subscript.slice, node.slice) + e.data._cond_sympy = None + for k, v in e.data.assignments.items(): + vast = ast.parse(v) + for node in ast.walk(vast): + if isinstance(node, ast.Subscript): + m = memlets.pop(0) + subscript: ast.Subscript = ast.parse(str(m)).body[0].value + assert isinstance(node.value, ast.Name) and node.value.id == m.data + node.slice = ast.copy_location(subscript.slice, node.slice) + newv = astutils.unparse(vast) + e.data.assignments[k] = newv + assert not memlets + for state in sd.states(): + # NOTE: Ideally, here we just want to iterate over the edges. However, we need to handle both the + # subset and the other subset. Therefore, it is safer to traverse the Memlet paths. 
+ for node in state.nodes(): + if isinstance(node, nd.AccessNode) and node.data in offsets: + off = offsets[node.data] + visited = set() + for e0 in state.all_edges(node): + for e1 in state.memlet_tree(e0): + if e1 in visited: + continue + visited.add(e1) + if e1.data.data == node.data: + e1.data.subset.offset(off, False) + else: + e1.data.other_subset.offset(off, False) + + +def prune_symbols(sdfg: SDFG): + """ + Prunes unused symbols from the SDFG and the NestedSDFG symbol mappings. This operation is done in place. See also + `dace.transformation.interstate.PruneSymbols`. + + :param sdfg: The SDFG to have its symbols pruned. + """ + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, nd.NestedSDFG): + prune_symbols(node.sdfg) + declared_symbols = set(node.sdfg.symbols.keys()) + free_symbols = node.sdfg.free_symbols + defined_symbols = declared_symbols - free_symbols + for s in defined_symbols: + del node.sdfg.symbols[s] + if s in node.symbol_mapping: + del node.symbol_mapping[s] + + +def make_dynamic_map_inputs_unique(sdfg: SDFG): + for sd in sdfg.all_sdfgs_recursive(): + dynamic_map_inputs = set(sd.arrays.keys()) + for state in sd.states(): + for node in state.nodes(): + repl_dict = {} + if isinstance(node, nd.MapEntry): + # Find all dynamic map inputs + for e in state.in_edges(node): + if not e.dst_conn.startswith('IN_'): + if e.dst_conn in dynamic_map_inputs: + new_name = dt.find_new_name(e.dst_conn, dynamic_map_inputs) + dynamic_map_inputs.add(new_name) + repl_dict[e.dst_conn] = new_name + e._dst_conn = new_name + else: + dynamic_map_inputs.add(e.dst_conn) + if repl_dict: + in_connectors = {repl_dict[n] if n in repl_dict else n: t for n, t in node.in_connectors.items()} + node.in_connectors = in_connectors + node.map.range.replace(repl_dict) + state.scope_subgraph(node).replace_dict(repl_dict) + propagation.propagate_memlets_scope(sd, state, state.scope_tree()[node]) + + +def get_thread_local_data(sdfg: SDFG) -> List[str]: + """ Returns a list of all data that are thread-local in the SDFG. + + This method DOES NOT apply recursively to nested SDFGs. It is also does not take into account outer Maps. + + :param sdfg: The SDFG to check. + :return: A list of the names of all data that are thread-local in the SDFG. + """ + # NOTE: We could exclude non-transient data here, but it is interesting to see if we find any non-transient data + # only inside a Map. + data_to_check = {name: None for name in sdfg.arrays.keys()} + for state in sdfg.nodes(): + scope_dict = state.scope_dict() + for node in state.nodes(): + if isinstance(node, nd.AccessNode): + # If the data was already removed from the candidated, continue + if node.data not in data_to_check: + continue + # If the data is not in a scope, i.e., cannot be thread-local, remove it from the candidates + if scope_dict[node] is None: + del data_to_check[node.data] + continue + # If the data is in a Map ... + if isinstance(scope_dict[node], nd.MapEntry): + # ... if we haven't seen the data yet, note down the scope + if data_to_check[node.data] is None: + data_to_check[node.data] = scope_dict[node] + # ... 
if we have seen the data before, but in a different scope, remove it from the candidates + elif data_to_check[node.data] != scope_dict[node]: + del data_to_check[node.data] + + result = list(data_to_check.keys()) + for name in result: + if not sdfg.arrays[name].transient: + warnings.warn(f'Found thread-local data "{name}" that is not transient.') + return result diff --git a/tests/fortran/array_test.py b/tests/fortran/array_test.py new file mode 100644 index 0000000000..0e286d0295 --- /dev/null +++ b/tests/fortran/array_test.py @@ -0,0 +1,162 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. + +from fparser.common.readfortran import FortranStringReader +from fparser.common.readfortran import FortranFileReader +from fparser.two.parser import ParserFactory +import sys, os +import numpy as np +import pytest + + +from dace import SDFG, SDFGState, nodes, dtypes, data, subsets, symbolic +from dace.frontend.fortran import fortran_parser +from fparser.two.symbol_table import SymbolTable +from dace.sdfg import utils as sdutil + +import dace.frontend.fortran.ast_components as ast_components +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_utils as ast_utils +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + + +def test_fortran_frontend_array_access(): + test_string = """ + PROGRAM access_test + implicit none + double precision d(4) + CALL array_access_test_function(d) + end + + SUBROUTINE array_access_test_function(d) + double precision d(4) + + d(2)=5.5 + + END SUBROUTINE array_access_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "array_access_test") + sdfg.simplify(verbose=True) + a = np.full([4], 42, order="F", dtype=np.float64) + sdfg(d=a) + assert (a[0] == 42) + assert (a[1] == 5.5) + assert (a[2] == 42) + + +def test_fortran_frontend_array_ranges(): + test_string = """ + PROGRAM ranges_test + implicit none + double precision d(3,4,5) + CALL array_ranges_test_function(d) + end + + SUBROUTINE array_ranges_test_function(d) + double precision d(3,4,5),e(3,4,5),f(3,4,5) + + e(:,:,:)=1.0 + f(:,:,:)=2.0 + f(:,2:4,:)=3.0 + f(1,1,:)=4.0 + d(:,:,:)=e(:,:,:)+f(:,:,:) + d(1,2:4,1)=e(1,2:4,1)*10.0 + d(1,1,1)=SUM(e(:,1,:)) + + END SUBROUTINE array_ranges_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "array_access_test") + sdfg.simplify(verbose=True) + d = np.full([3, 4, 5], 42, order="F", dtype=np.float64) + sdfg(d=d) + assert (d[0, 0, 0] == 15) + assert (d[0, 1, 0] == 10) + assert (d[1, 0, 0] == 3) + assert (d[2, 3, 3] == 4) + assert (d[0, 0, 2] == 5) + + +def test_fortran_frontend_array_3dmap(): + test_string = """ + PROGRAM array_3dmap_test + implicit none + double precision d(4,4,4) + CALL array_3dmap_test_function(d) + end + + SUBROUTINE array_3dmap_test_function(d) + double precision d(4,4,4) + + d(:,:,:)=7 + + END SUBROUTINE array_3dmap_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "array_3dmap_test") + sdfg.simplify(verbose=True) + sdutil.normalize_offsets(sdfg) + from dace.transformation.auto import auto_optimize as aopt + aopt.auto_optimize(sdfg, dtypes.DeviceType.CPU) + a = np.full([4, 4, 4], 42, order="F", dtype=np.float64) + sdfg(d=a) + assert (a[0, 0, 0] == 7) + assert (a[3, 3, 3] == 7) + + +def test_fortran_frontend_twoconnector(): + test_string = """ + PROGRAM twoconnector_test + implicit none + double precision d(4) + CALL twoconnector_test_function(d) + end + + SUBROUTINE 
twoconnector_test_function(d) + double precision d(4) + + d(2)=d(1)+d(3) + + END SUBROUTINE twoconnector_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "twoconnector_test") + sdfg.simplify(verbose=True) + a = np.full([4], 42, order="F", dtype=np.float64) + sdfg(d=a) + assert (a[0] == 42) + assert (a[1] == 84) + assert (a[2] == 42) + + +def test_fortran_frontend_input_output_connector(): + test_string = """ + PROGRAM ioc_test + implicit none + double precision d(2,3) + CALL ioc_test_function(d) + end + + SUBROUTINE ioc_test_function(d) + double precision d(2,3) + integer a,b + + a=1 + b=2 + d(:,:)=0.0 + d(a,b)=d(1,1)+5 + + END SUBROUTINE ioc_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "ioc_test") + sdfg.simplify(verbose=True) + a = np.full([2, 3], 42, order="F", dtype=np.float64) + sdfg(d=a) + assert (a[0, 0] == 0) + assert (a[0, 1] == 5) + assert (a[1, 2] == 0) + + +if __name__ == "__main__": + + test_fortran_frontend_array_3dmap() + test_fortran_frontend_array_access() + test_fortran_frontend_input_output_connector() + test_fortran_frontend_array_ranges() + test_fortran_frontend_twoconnector() diff --git a/tests/fortran/dace_support_test.py b/tests/fortran/dace_support_test.py new file mode 100644 index 0000000000..0763d2c43c --- /dev/null +++ b/tests/fortran/dace_support_test.py @@ -0,0 +1,51 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. + +from fparser.common.readfortran import FortranStringReader +from fparser.common.readfortran import FortranFileReader +from fparser.two.parser import ParserFactory +import sys, os +import numpy as np +import pytest + + +from dace import SDFG, SDFGState, nodes, dtypes, data, subsets, symbolic +from dace.frontend.fortran import fortran_parser +from fparser.two.symbol_table import SymbolTable +from dace.sdfg import utils as sdutil + +import dace.frontend.fortran.ast_components as ast_components +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_utils as ast_utils +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + + +def test_fortran_frontend_simplify(): + test_string = """ + PROGRAM symbol_test + implicit none + double precision d(2,3) + CALL symbol_test_function(d) + end + + SUBROUTINE symbol_test_function(d) + double precision d(2,3) + integer a,b + + a=1 + b=2 + d(:,:)=0.0 + d(a,b)=5 + + END SUBROUTINE symbol_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "symbol_test") + sdfg.simplify(verbose=True) + a = np.full([2, 3], 42, order="F", dtype=np.float64) + sdfg(d=a) + assert (a[0, 0] == 0) + assert (a[0, 1] == 5) + assert (a[1, 2] == 0) + + +if __name__ == "__main__": + test_fortran_frontend_simplify() diff --git a/tests/fortran/fortran_language_test.py b/tests/fortran/fortran_language_test.py new file mode 100644 index 0000000000..f8624b1cdc --- /dev/null +++ b/tests/fortran/fortran_language_test.py @@ -0,0 +1,240 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. 
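+# Each test below builds an SDFG from a small Fortran program via fortran_parser.create_sdfg_from_string,
+# simplifies it, executes it, and checks the numerical results. The covered constructs are kind selectors,
+# IF/ELSE branches, DO loops, statement functions, and the power and SIGN intrinsics.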
+ +from fparser.common.readfortran import FortranStringReader +from fparser.common.readfortran import FortranFileReader +from fparser.two.parser import ParserFactory +import sys, os +import numpy as np +import pytest + + +from dace import SDFG, SDFGState, nodes, dtypes, data, subsets, symbolic +from dace.frontend.fortran import fortran_parser +from fparser.two.symbol_table import SymbolTable +from dace.sdfg import utils as sdutil + +import dace.frontend.fortran.ast_components as ast_components +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_utils as ast_utils +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + + +def test_fortran_frontend_real_kind_selector(): + test_string = """ + PROGRAM real_kind_selector_test + implicit none + INTEGER, PARAMETER :: JPRB = SELECTED_REAL_KIND(13,300) + INTEGER, PARAMETER :: JPIM = SELECTED_INT_KIND(9) + REAL(KIND=JPRB) d(4) + CALL real_kind_selector_test_function(d) + end + + SUBROUTINE real_kind_selector_test_function(d) + REAL(KIND=JPRB) d(4) + INTEGER(KIND=JPIM) i + + i=7 + d(2)=5.5+i + + END SUBROUTINE real_kind_selector_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "real_kind_selector_test") + sdfg.simplify(verbose=True) + a = np.full([4], 42, order="F", dtype=np.float64) + sdfg(d=a) + assert (a[0] == 42) + assert (a[1] == 12.5) + assert (a[2] == 42) + + +def test_fortran_frontend_if1(): + test_string = """ + PROGRAM if1_test + implicit none + double precision d(3,4,5) + CALL if1_test_function(d) + end + + SUBROUTINE if1_test_function(d) + double precision d(3,4,5),ZFAC(10) + integer JK,JL,RTT,NSSOPT + integer ZTP1(10,10) + JL=1 + JK=1 + ZTP1(JL,JK)=1.0 + RTT=2 + NSSOPT=1 + + IF (ZTP1(JL,JK)>=RTT .OR. NSSOPT==0) THEN + ZFAC(1) = 1.0 + ELSE + ZFAC(1) = 2.0 + ENDIF + d(1,1,1)=ZFAC(1) + + END SUBROUTINE if1_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "if1_test") + sdfg.simplify(verbose=True) + d = np.full([3, 4, 5], 42, order="F", dtype=np.float64) + sdfg(d=d) + assert (d[0, 0, 0] == 2) + + +def test_fortran_frontend_loop1(): + test_string = """ + PROGRAM loop1_test + implicit none + double precision d(3,4,5) + CALL loop1_test_function(d) + end + + SUBROUTINE loop1_test_function(d) + double precision d(3,4,5),ZFAC(10) + INTEGER :: a, JK, JL,JM + INTEGER, PARAMETER :: KLEV=10, N=10,NCLV=3 + + double precision :: RLMIN,ZVQX(NCLV) + LOGICAL :: LLCOOLJ,LLFALL(NCLV) + LLFALL(:)= .FALSE. + ZVQX(:)= 0.0 + ZVQX(2)= 1.0 + DO JM=1,NCLV + IF (ZVQX(JM)>0.0) LLFALL(JM)=.TRUE. ! 
falling species + ENDDO + + d(1,1,1)=LLFALL(1) + d(1,1,2)=LLFALL(2) + END SUBROUTINE loop1_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "loop1_test") + sdfg.simplify(verbose=True) + d = np.full([3, 4, 5], 42, order="F", dtype=np.float64) + sdfg(d=d) + assert (d[0, 0, 0] == 0) + assert (d[0, 0, 1] == 1) + + +def test_fortran_frontend_function_statement1(): + test_string = """ + PROGRAM function_statement1_test + implicit none + double precision d(3,4,5) + CALL function_statement1_test_function(d) + end + + SUBROUTINE function_statement1_test_function(d) + double precision d(3,4,5) + double precision :: PTARE,RTT(2),FOEDELTA,FOELDCP + double precision :: RALVDCP(2),RALSDCP(2),RES + + FOEDELTA (PTARE) = MAX (0.0,SIGN(1.0,PTARE-RTT(1))) + FOELDCP ( PTARE ) = FOEDELTA(PTARE)*RALVDCP(1) + (1.0-FOEDELTA(PTARE))*RALSDCP(1) + + RTT(1)=4.5 + RALVDCP(1)=4.9 + RALSDCP(1)=5.1 + d(1,1,1)=FOELDCP(3.0) + RES=FOELDCP(3.0) + d(1,1,2)=RES + END SUBROUTINE function_statement1_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "function_statement1_test") + sdfg.simplify(verbose=True) + d = np.full([3, 4, 5], 42, order="F", dtype=np.float64) + sdfg(d=d) + assert (d[0, 0, 0] == 5.1) + assert (d[0, 0, 1] == 5.1) + + +def test_fortran_frontend_pow1(): + test_string = """ + PROGRAM pow1_test + implicit none + double precision d(3,4,5) + CALL pow1_test_function(d) + end + + SUBROUTINE pow1_test_function(d) + double precision d(3,4,5) + double precision :: ZSIGK(2), ZHRC(2),RAMID(2) + + ZSIGK(1)=4.8 + RAMID(1)=0.0 + ZHRC(1)=12.34 + IF(ZSIGK(1) > 0.8) THEN + ZHRC(1)=RAMID(1)+(1.0-RAMID(1))*((ZSIGK(1)-0.8)/0.2)**2 + ENDIF + d(1,1,2)=ZHRC(1) + END SUBROUTINE pow1_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "pow1_test") + sdfg.simplify(verbose=True) + d = np.full([3, 4, 5], 42, order="F", dtype=np.float64) + sdfg(d=d) + assert (d[0, 0, 1] == 400) + + +def test_fortran_frontend_pow2(): + test_string = """ + PROGRAM pow2_test + implicit none + double precision d(3,4,5) + CALL pow2_test_function(d) + end + + SUBROUTINE pow2_test_function(d) + double precision d(3,4,5) + double precision :: ZSIGK(2), ZHRC(2),RAMID(2) + + ZSIGK(1)=4.8 + RAMID(1)=0.0 + ZHRC(1)=12.34 + IF(ZSIGK(1) > 0.8) THEN + ZHRC(1)=RAMID(1)+(1.0-RAMID(1))*((ZSIGK(1)-0.8)/0.01)**1.5 + ENDIF + d(1,1,2)=ZHRC(1) + END SUBROUTINE pow2_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "pow2_test") + sdfg.simplify(verbose=True) + d = np.full([3, 4, 5], 42, order="F", dtype=np.float64) + sdfg(d=d) + assert (d[0, 0, 1] == 8000) + + +def test_fortran_frontend_sign1(): + test_string = """ + PROGRAM sign1_test + implicit none + double precision d(3,4,5) + CALL sign1_test_function(d) + end + + SUBROUTINE sign1_test_function(d) + double precision d(3,4,5) + double precision :: ZSIGK(2), ZHRC(2),RAMID(2) + + ZSIGK(1)=4.8 + RAMID(1)=0.0 + ZHRC(1)=-12.34 + d(1,1,2)=SIGN(ZSIGK(1),ZHRC(1)) + END SUBROUTINE sign1_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "sign1_test") + sdfg.simplify(verbose=True) + d = np.full([3, 4, 5], 42, order="F", dtype=np.float64) + sdfg(d=d) + assert (d[0, 0, 1] == -4.8) + + +if __name__ == "__main__": + + test_fortran_frontend_real_kind_selector() + test_fortran_frontend_if1() + test_fortran_frontend_loop1() + test_fortran_frontend_function_statement1() + + test_fortran_frontend_pow1() + test_fortran_frontend_pow2() + test_fortran_frontend_sign1() diff --git 
a/tests/fortran/view_test.py b/tests/fortran/view_test.py new file mode 100644 index 0000000000..bc9336af7a --- /dev/null +++ b/tests/fortran/view_test.py @@ -0,0 +1,180 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. + +from fparser.common.readfortran import FortranStringReader +from fparser.common.readfortran import FortranFileReader +from fparser.two.parser import ParserFactory +import sys, os +import numpy as np +import pytest + +from dace import SDFG, SDFGState, nodes, dtypes, data, subsets, symbolic +from dace.frontend.fortran import fortran_parser +from fparser.two.symbol_table import SymbolTable +from dace.sdfg import utils as sdutil + +import dace.frontend.fortran.ast_components as ast_components +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_utils as ast_utils +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + + +def test_fortran_frontend_view_test(): + test_name = "view_test" + test_string = """ + PROGRAM """ + test_name + """_program +implicit none +double precision a(10,11,12) +double precision res(1,1,2) + +CALL """ + test_name + """_function(a,res) + +end + +SUBROUTINE """ + test_name + """_function(aa,res) + +double precision aa(10,11,12) +double precision res(1,1,2) + +call viewlens(aa(:,:,1),res) + +end SUBROUTINE """ + test_name + """_function + +SUBROUTINE viewlens(aa,res) + +IMPLICIT NONE + +double precision :: aa(10,11,23) +double precision :: res(1,1,2) + +INTEGER :: JK, JL + +res(1,1,1)=0.0 +DO JK=1,10 + DO JL=1,11 + res(1,1,1)=res(1,1,1)+aa(JK,JL) + ENDDO +ENDDO +aa(1,1)=res(1,1,1) + + +END SUBROUTINE viewlens + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, test_name) + sdfg.simplify(verbose=True) + a = np.full([10, 11, 12], 42, order="F", dtype=np.float64) + b = np.full([1, 1, 2], 42, order="F", dtype=np.float64) + b[0, 0, 0] = 1 + sdfg(aa=a, res=b) + assert (a[0, 0, 1] == 42) + assert (a[0, 0, 0] == 4620) + assert (b[0, 0, 0] == 4620) + + +def test_fortran_frontend_view_test_2(): + test_name = "view2_test" + test_string = """ + PROGRAM """ + test_name + """_program +implicit none +integer, parameter :: n=10 +double precision a(n,11,12),b(n,11,12),c(n,11,12) + +CALL """ + test_name + """_function(a,b,c,n) + +end + +SUBROUTINE """ + test_name + """_function(aa,bb,cc,n) + +integer, parameter :: n=10 +double precision a(n,11,12),b(n,11,12),c(n,11,12) +integer j,k + +j=1 + call viewlens(aa(:,:,j),bb(:,:,j),cc(:,:,j)) +k=2 + call viewlens(aa(:,:,k),bb(:,:,k),cc(:,:,k)) + +end SUBROUTINE """ + test_name + """_function + +SUBROUTINE viewlens(aa,bb,cc) + +IMPLICIT NONE + +double precision :: aa(10,11),bb(10,11),cc(10,11) + +INTEGER :: JK, JL + +DO JK=1,10 + DO JL=1,11 + cc(JK,JL)=bb(JK,JL)+aa(JK,JL) + ENDDO +ENDDO + +END SUBROUTINE viewlens + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, test_name) + sdfg.simplify(verbose=True) + a = np.full([10, 11, 12], 42, order="F", dtype=np.float64) + b = np.full([10, 11, 12], 42, order="F", dtype=np.float64) + c = np.full([10, 11, 12], 42, order="F", dtype=np.float64) + + b[0, 0, 0] = 1 + sdfg(aa=a, bb=b, cc=c, n=10) + assert (c[0, 0, 0] == 43) + assert (c[1, 1, 1] == 84) + + +def test_fortran_frontend_view_test_3(): + test_name = "view3_test" + test_string = """ + PROGRAM """ + test_name + """_program +implicit none +integer, parameter :: n=10 +double precision a(n,n+1,12),b(n,n+1,12) + +CALL """ + test_name + """_function(a,b,n) + +end + +SUBROUTINE """ + test_name + """_function(aa,bb,n) + 
+integer, parameter :: n=10 +double precision a(n,n+1,12),b(n,n+1,12) +integer j,k + +j=1 + call viewlens(aa(:,:,j),bb(:,:,j),bb(:,:,j+1)) + +end SUBROUTINE """ + test_name + """_function + +SUBROUTINE viewlens(aa,bb,cc) + +IMPLICIT NONE + +double precision :: aa(10,11),bb(10,11),cc(10,11) + +INTEGER :: JK, JL + +DO JK=1,10 + DO JL=1,11 + cc(JK,JL)=bb(JK,JL)+aa(JK,JL) + ENDDO +ENDDO + +END SUBROUTINE viewlens + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, test_name) + sdfg.simplify(verbose=True) + a = np.full([10, 11, 12], 42, order="F", dtype=np.float64) + b = np.full([10, 11, 12], 42, order="F", dtype=np.float64) + + b[0, 0, 0] = 1 + sdfg(aa=a, bb=b, n=10) + assert (b[0, 0, 0] == 1) + assert (b[0, 0, 1] == 43) + + +if __name__ == "__main__": + + test_fortran_frontend_view_test() + test_fortran_frontend_view_test_2() + test_fortran_frontend_view_test_3() From 5c49dc9db3a19ae36387eb88a7dedeb6c0627a40 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Tue, 13 Jun 2023 01:42:24 +0200 Subject: [PATCH 107/392] fixing setup --- setup.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 12562c2a85..9e68ca8036 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", ], - python_requires='>=3.6, <3.11', + python_requires='>=3.6, <3.12', packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), package_data={ '': [ @@ -73,9 +73,9 @@ }, include_package_data=True, install_requires=[ - 'numpy', 'networkx >= 2.5', 'astunparse', 'sympy>=1.9', 'pyyaml', 'ply', 'websockets', 'requests', 'flask', - 'aenum >= 3.1', 'dataclasses; python_version < "3.7"', 'dill', 'pyreadline;platform_system=="Windows"', - 'typing-compat; python_version < "3.8"' + 'numpy', 'networkx >= 2.5', 'astunparse', 'sympy<=1.9', 'pyyaml', 'ply', 'websockets', 'requests', 'flask', + 'fparser', 'aenum >= 3.1', 'dataclasses; python_version < "3.7"', 'dill', + 'pyreadline;platform_system=="Windows"', 'typing-compat; python_version < "3.8"' ] + cmake_requires, extras_require={ 'testing': ['coverage', 'pytest-cov', 'scipy', 'absl-py', 'opt_einsum', 'pymlir', 'click'], @@ -86,6 +86,7 @@ 'dacelab = dace.cli.dacelab:main', 'sdfv = dace.cli.sdfv:main', 'sdfgcc = dace.cli.sdfgcc:main', + 'fcfd = dace.cli.fcdc:main', 'daceprof = dace.cli.daceprof:main', ], }) From da4add0fba7665c05684debcbc547975a1129e45 Mon Sep 17 00:00:00 2001 From: Philipp Schaad Date: Tue, 13 Jun 2023 22:13:02 +0200 Subject: [PATCH 108/392] Fix temporal vectorization crashing dace when enumerating matches --- dace/transformation/subgraph/temporal_vectorization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/transformation/subgraph/temporal_vectorization.py b/dace/transformation/subgraph/temporal_vectorization.py index 32fb98ec2d..19abd2c74c 100644 --- a/dace/transformation/subgraph/temporal_vectorization.py +++ b/dace/transformation/subgraph/temporal_vectorization.py @@ -46,7 +46,7 @@ def can_be_applied(self, sdfg: SDFG, subgraph: SubgraphView) -> bool: src_nodes = subgraph.source_nodes() dst_nodes = subgraph.sink_nodes() srcdst_nodes = src_nodes + dst_nodes - srcdst_arrays = [sdfg.arrays[node.data] for node in srcdst_nodes] + srcdst_arrays = [sdfg.arrays[node.data]for node in srcdst_nodes if isinstance(node, nodes.AccessNode)] access_nodes = [ node for node in subgraph.nodes() if isinstance(node, nodes.AccessNode) and not node in srcdst_nodes ] From 
03b95fc07100d0c23a45611f676a6e3e530c35c0 Mon Sep 17 00:00:00 2001 From: Philipp Schaad Date: Tue, 13 Jun 2023 22:15:53 +0200 Subject: [PATCH 109/392] Fix typo --- dace/transformation/subgraph/temporal_vectorization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/transformation/subgraph/temporal_vectorization.py b/dace/transformation/subgraph/temporal_vectorization.py index 19abd2c74c..0ed87f56b3 100644 --- a/dace/transformation/subgraph/temporal_vectorization.py +++ b/dace/transformation/subgraph/temporal_vectorization.py @@ -46,7 +46,7 @@ def can_be_applied(self, sdfg: SDFG, subgraph: SubgraphView) -> bool: src_nodes = subgraph.source_nodes() dst_nodes = subgraph.sink_nodes() srcdst_nodes = src_nodes + dst_nodes - srcdst_arrays = [sdfg.arrays[node.data]for node in srcdst_nodes if isinstance(node, nodes.AccessNode)] + srcdst_arrays = [sdfg.arrays[node.data] for node in srcdst_nodes if isinstance(node, nodes.AccessNode)] access_nodes = [ node for node in subgraph.nodes() if isinstance(node, nodes.AccessNode) and not node in srcdst_nodes ] From e08f5bc4d62ef36173f8c4d538ebf16d84ae2822 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Tue, 13 Jun 2023 22:50:42 +0200 Subject: [PATCH 110/392] added test descriptions --- tests/fortran/array_test.py | 16 +++++++++++++++- tests/fortran/dace_support_test.py | 3 +++ tests/fortran/fortran_language_test.py | 24 ++++++++++++++++++++++++ tests/fortran/view_test.py | 8 ++++++++ 4 files changed, 50 insertions(+), 1 deletion(-) diff --git a/tests/fortran/array_test.py b/tests/fortran/array_test.py index 0e286d0295..8685628012 100644 --- a/tests/fortran/array_test.py +++ b/tests/fortran/array_test.py @@ -7,7 +7,6 @@ import numpy as np import pytest - from dace import SDFG, SDFGState, nodes, dtypes, data, subsets, symbolic from dace.frontend.fortran import fortran_parser from fparser.two.symbol_table import SymbolTable @@ -20,6 +19,9 @@ def test_fortran_frontend_array_access(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. + """ test_string = """ PROGRAM access_test implicit none @@ -44,6 +46,9 @@ def test_fortran_frontend_array_access(): def test_fortran_frontend_array_ranges(): + """ + Tests that the Fortran frontend can parse multidimenstional arrays with vectorized ranges and that the accessed indices are correct. + """ test_string = """ PROGRAM ranges_test implicit none @@ -76,6 +81,9 @@ def test_fortran_frontend_array_ranges(): def test_fortran_frontend_array_3dmap(): + """ + Tests that the normalization of multidimensional array indices works correctly. + """ test_string = """ PROGRAM array_3dmap_test implicit none @@ -102,6 +110,9 @@ def test_fortran_frontend_array_3dmap(): def test_fortran_frontend_twoconnector(): + """ + Tests that the multiple connectors to one array are handled correctly. + """ test_string = """ PROGRAM twoconnector_test implicit none @@ -126,6 +137,9 @@ def test_fortran_frontend_twoconnector(): def test_fortran_frontend_input_output_connector(): + """ + Tests that the presence of input and output connectors for the same array is handled correctly. 
+ """ test_string = """ PROGRAM ioc_test implicit none diff --git a/tests/fortran/dace_support_test.py b/tests/fortran/dace_support_test.py index 0763d2c43c..096ea25a18 100644 --- a/tests/fortran/dace_support_test.py +++ b/tests/fortran/dace_support_test.py @@ -20,6 +20,9 @@ def test_fortran_frontend_simplify(): + """ + Test that the DaCe simplify works with the input SDFG provided by the Fortran frontend. + """ test_string = """ PROGRAM symbol_test implicit none diff --git a/tests/fortran/fortran_language_test.py b/tests/fortran/fortran_language_test.py index f8624b1cdc..32ab23714b 100644 --- a/tests/fortran/fortran_language_test.py +++ b/tests/fortran/fortran_language_test.py @@ -20,6 +20,9 @@ def test_fortran_frontend_real_kind_selector(): + """ + Tests that the size intrinsics are correctly parsed and translated to DaCe. + """ test_string = """ PROGRAM real_kind_selector_test implicit none @@ -48,6 +51,9 @@ def test_fortran_frontend_real_kind_selector(): def test_fortran_frontend_if1(): + """ + Tests that the if/else construct is correctly parsed and translated to DaCe. + """ test_string = """ PROGRAM if1_test implicit none @@ -82,6 +88,10 @@ def test_fortran_frontend_if1(): def test_fortran_frontend_loop1(): + """ + Tests that the loop construct is correctly parsed and translated to DaCe. + """ + test_string = """ PROGRAM loop1_test implicit none @@ -116,6 +126,10 @@ def test_fortran_frontend_loop1(): def test_fortran_frontend_function_statement1(): + """ + Tests that the function statement are correctly removed recursively. + """ + test_string = """ PROGRAM function_statement1_test implicit none @@ -148,6 +162,9 @@ def test_fortran_frontend_function_statement1(): def test_fortran_frontend_pow1(): + """ + Tests that the power intrinsic is correctly parsed and translated to DaCe. (should become a*a) + """ test_string = """ PROGRAM pow1_test implicit none @@ -176,6 +193,10 @@ def test_fortran_frontend_pow1(): def test_fortran_frontend_pow2(): + """ + Tests that the power intrinsic is correctly parsed and translated to DaCe (this time it's p sqrt p). + """ + test_string = """ PROGRAM pow2_test implicit none @@ -204,6 +225,9 @@ def test_fortran_frontend_pow2(): def test_fortran_frontend_sign1(): + """ + Tests that the sign intrinsic is correctly parsed and translated to DaCe. + """ test_string = """ PROGRAM sign1_test implicit none diff --git a/tests/fortran/view_test.py b/tests/fortran/view_test.py index bc9336af7a..8c00d47e98 100644 --- a/tests/fortran/view_test.py +++ b/tests/fortran/view_test.py @@ -19,6 +19,9 @@ def test_fortran_frontend_view_test(): + """ + Tests to check whether Fortran array slices are correctly translates to DaCe views. + """ test_name = "view_test" test_string = """ PROGRAM """ + test_name + """_program @@ -71,6 +74,9 @@ def test_fortran_frontend_view_test(): def test_fortran_frontend_view_test_2(): + """ + Tests to check whether Fortran array slices are correctly translates to DaCe views. This case necessitates multiple views per array in the same context. + """ test_name = "view2_test" test_string = """ PROGRAM """ + test_name + """_program @@ -124,6 +130,8 @@ def test_fortran_frontend_view_test_2(): def test_fortran_frontend_view_test_3(): + """ + Tests to check whether Fortran array slices are correctly translates to DaCe views. This test generates multiple views from the same array in the same context. 
""" test_name = "view3_test" test_string = """ PROGRAM """ + test_name + """_program From 9f7e18137da9570ca28cbf22150b709608450b60 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Tue, 13 Jun 2023 23:34:03 +0200 Subject: [PATCH 111/392] resolving comments --- dace/frontend/fortran/ast_components.py | 14 +++++--------- dace/frontend/fortran/ast_transforms.py | 12 ++++++------ dace/frontend/fortran/ast_utils.py | 2 +- dace/frontend/fortran/fortran_parser.py | 5 +---- 4 files changed, 13 insertions(+), 20 deletions(-) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index 6e83d6c477..8bb4e96936 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -260,7 +260,6 @@ def create_children(self, node: FASTNode): (list, tuple)) else [self.create_ast(child) for child in node.children] - def create_ast(self, node=None): """ Creates an AST from a FASTNode @@ -397,18 +396,15 @@ def intrinsic_function_reference(self, node: FASTNode): return ast_internal_classes.Int_Literal_Node(value=str( math.ceil((math.log2(math.pow(10, int(args.args[0].value))) + 1) / 8)), line_number=line) - # TODO This needs a better translation + # This selects the smallest kind that can hold the given number of digits (fp64,fp32 or fp16) elif name.name == "__dace_selected_real_kind": - if args.args[0].value == '13' and args.args[1].value == '300': + if int(args.args[0].value) >= 9 or int(args.args[1].value) > 126: return ast_internal_classes.Int_Literal_Node(value="8", line_number=line) - elif args.args[0].value == '2' and args.args[1].value == '1': - return ast_internal_classes.Int_Literal_Node(value="4", line_number=line) - elif args.args[0].value == '4' and args.args[1].value == '2': - return ast_internal_classes.Int_Literal_Node(value="4", line_number=line) - elif args.args[0].value == '6' and args.args[1].value == '37': + elif int(args.args[0].value) >= 3 or int(args.args[1].value) > 14: return ast_internal_classes.Int_Literal_Node(value="4", line_number=line) else: - raise NotImplementedError("Only real*8 is supported") + return ast_internal_classes.Int_Literal_Node(value="2", line_number=line) + func_types = { "__dace_int": "INT", "__dace_dble": "DOUBLE", diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index d8d83ab8c9..62c5ad0c7e 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -24,9 +24,6 @@ def iter_child_nodes(node: ast_internal_classes.FNode): Yield all direct child nodes of *node*, that is, all fields that are nodes and all items of fields that are lists of nodes. """ - #print("CLASS: ",node.__class__) - #if isinstance(node,DeclRefExpr): - #print("NAME: ", node.name) for name, field in iter_fields(node): #print("NASME:",name) @@ -546,7 +543,10 @@ def localFunctionStatementEliminator(node: ast_internal_classes.FNode): i.lval, ast_internal_classes.Structure_Constructor_Node): function_statement_name = i.lval.name is_actually_function_statement = False - #In Fortran, function statement are defined as scalar values, but called as arrays, so by identifiying that it is called as a call_expr or structure_constructor, we also need to match the specification part and see that it is scalar rather than an array. 
+ # In Fortran, function statement are defined as scalar values, + # but called as arrays, so by identifiying that it is called as + # a call_expr or structure_constructor, we also need to match + # the specification part and see that it is scalar rather than an array. found = False for j in spec: if found: @@ -562,9 +562,9 @@ def localFunctionStatementEliminator(node: ast_internal_classes.FNode): if is_actually_function_statement: to_change.append([i.lval, i.rval]) new_exec.remove(i) - print("Function statement found and removed: ", function_statement_name) + else: - #There are no function statements after the first one that isn't + #There are no function statements after the first one that isn't a function statement break still_changing = True while still_changing: diff --git a/dace/frontend/fortran/ast_utils.py b/dace/frontend/fortran/ast_utils.py index 64988b01d6..41cbeff1f9 100644 --- a/dace/frontend/fortran/ast_utils.py +++ b/dace/frontend/fortran/ast_utils.py @@ -27,7 +27,7 @@ "REAL": dtypes.float32, "INTEGER": dtypes.int32, "BOOL": dtypes.int32, #This is a hack to allow fortran to pass through external C - #"BOOL": dtypes.int32, + } diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index 311840a62a..03e0faae38 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -659,7 +659,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, else: raise NameError("Variable name not found: " + ast_utils.get_name(i)) - # print("Context change:",i.name," ",var.shape) + if not hasattr(var, "shape") or len(var.shape) == 0: memlet = "" elif (len(var.shape) == 1 and var.shape[0] == 1): @@ -785,10 +785,7 @@ def binop2sdfg(self, node: ast_internal_classes.BinOp_Node, sdfg: SDFG): substate = ast_utils.add_simple_state_to_sdfg( self, sdfg, "_state_l" + str(node.line_number[0]) + "_c" + str(node.line_number[1])) - #input_names_tasklet = [i_t + "_in" for i_t in input_names] output_names_changed = [o_t + "_out" for o_t in output_names] - #output_names_changed = [o_t for o_t in output_names_tasklet] - #output_names_dict = {on: dace.pointer(dace.int32) for on in output_names_changed} tasklet = ast_utils.add_tasklet(substate, "_l" + str(node.line_number[0]) + "_c" + str(node.line_number[1]), input_names_tasklet, output_names_changed, "text", node.line_number, From 7ca527ca13d7e9fe62b6d73d1e002f19d402d157 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 19 Jun 2023 22:37:00 +0800 Subject: [PATCH 112/392] Updated mpi_allgather_test.py for coding style consistency --- tests/library/mpi/mpi_allgather_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/library/mpi/mpi_allgather_test.py b/tests/library/mpi/mpi_allgather_test.py index 1f0a30a4d1..1eebcd5676 100644 --- a/tests/library/mpi/mpi_allgather_test.py +++ b/tests/library/mpi/mpi_allgather_test.py @@ -22,7 +22,10 @@ def make_sdfg(dtype): outA = state.add_access("outA") allgather_node = mpi.nodes.allgather.Allgather("allgather") - state.add_memlet_path(inA, allgather_node, dst_conn="_inbuffer", memlet=Memlet.simple(inA, "0:n", num_accesses=n)) + state.add_memlet_path(inA, + allgather_node, + dst_conn="_inbuffer", + memlet=Memlet.simple(inA, "0:n", num_accesses=n)) state.add_memlet_path(allgather_node, outA, src_conn="_outbuffer", From 38be7493036c6401bba1d39c5bcba0e8b76af6ee Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 19 Jun 2023 22:55:59 +0800 Subject: [PATCH 113/392] Added 
alltoall node basic version based on other collectives --- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/alltoall.py | 84 ++++++++++++++++++++++++++ tests/library/mpi/mpi_alltoall_test.py | 78 ++++++++++++++++++++++++ 3 files changed, 163 insertions(+) create mode 100644 dace/libraries/mpi/nodes/alltoall.py create mode 100644 tests/library/mpi/mpi_alltoall_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index b4789d952e..0cd36cc82f 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -10,5 +10,6 @@ from .reduce import Reduce from .allreduce import Allreduce from .allgather import Allgather +from .alltoall import Alltoall from .dummy import Dummy from .redistribute import Redistribute diff --git a/dace/libraries/mpi/nodes/alltoall.py b/dace/libraries/mpi/nodes/alltoall.py new file mode 100644 index 0000000000..b0accfb52d --- /dev/null +++ b/dace/libraries/mpi/nodes/alltoall.py @@ -0,0 +1,84 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandAlltoallMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): + (inbuffer, in_count_str), (outbuffer, out_count_str) = node.validate(parent_sdfg, parent_state) + in_mpi_dtype_str = dace.libraries.mpi.utils.MPI_DDT(inbuffer.dtype.base_type) + out_mpi_dtype_str = dace.libraries.mpi.utils.MPI_DDT(outbuffer.dtype.base_type) + + if inbuffer.dtype.veclen > 1: + raise (NotImplementedError) + + comm = "MPI_COMM_WORLD" + if node.grid: + comm = f"__state->{node.grid}_comm" + + # code = f""" + # MPI_Alltoall({buffer}, {count_str}, {mpi_dtype_str}, _outbuffer, {count_str}, {mpi_dtype_str}, {comm}); + # """ + code = f""" + MPI_Alltoall(_inbuffer, {in_count_str}, {in_mpi_dtype_str}, \ + _outbuffer, {out_count_str}, {out_mpi_dtype_str}, \ + {comm}); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP) + return tasklet + + +@dace.library.node +class Alltoall(MPINode): + + # Global properties + implementations = { + "MPI": ExpandAlltoallMPI, + } + default_implementation = "MPI" + + grid = dace.properties.Property(dtype=str, allow_none=True, default=None) + + def __init__(self, name, grid=None, *args, **kwargs): + super().__init__(name, *args, inputs={"_inbuffer"}, outputs={"_outbuffer"}, **kwargs) + self.grid = grid + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer, root) of the three data descriptors in the + parent SDFG. 
+ """ + + inbuffer, outbuffer = None, None + for e in state.out_edges(self): + if e.src_conn == "_outbuffer": + outbuffer = sdfg.arrays[e.data.data] + for e in state.in_edges(self): + if e.dst_conn == "_inbuffer": + inbuffer = sdfg.arrays[e.data.data] + + in_count_str = "XXX" + out_count_str = "XXX" + for _, src_conn, _, _, data in state.out_edges(self): + if src_conn == '_outbuffer': + dims = [str(e) for e in data.subset.size_exact()] + out_count_str = "*".join(dims) + for _, _, _, dst_conn, data in state.in_edges(self): + if dst_conn == '_inbuffer': + dims = [str(e) for e in data.subset.size_exact()] + in_count_str = "*".join(dims) + + return (inbuffer, in_count_str), (outbuffer, out_count_str) diff --git a/tests/library/mpi/mpi_alltoall_test.py b/tests/library/mpi/mpi_alltoall_test.py new file mode 100644 index 0000000000..cf155fc640 --- /dev/null +++ b/tests/library/mpi/mpi_alltoall_test.py @@ -0,0 +1,78 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import numpy as np +import pytest + +############################################################################### + + +def make_sdfg(dtype): + + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_alltoall") + state = sdfg.add_state("dataflow") + + sdfg.add_array("inbuf", [n], dtype, transient=False) + sdfg.add_array("outbuf", [n], dtype, transient=False) + inbuf = state.add_access("inbuf") + outbuf = state.add_access("outbuf") + alltoall_node = mpi.nodes.alltoall.Alltoall("alltoall") + + state.add_memlet_path(inbuf, + alltoall_node, + dst_conn="_inbuffer", + memlet=Memlet.simple(inbuf, "0:n", num_accesses=n)) + state.add_memlet_path(alltoall_node, + outbuf, + src_conn="_outbuffer", + memlet=Memlet.simple(outbuf, "0:n", num_accesses=n)) + + return sdfg + + +############################################################################### + + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.float64, marks=pytest.mark.mpi) +]) +def test_mpi(implementation, dtype): + from mpi4py import MPI as MPI4PY + np_dtype = getattr(np, dtype.to_string()) + comm = MPI4PY.COMM_WORLD + rank = comm.Get_rank() + commsize = comm.Get_size() + mpi_sdfg = None + if commsize < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + for r in range(0, commsize): + if r == rank: + sdfg = make_sdfg(dtype) + mpi_sdfg = sdfg.compile() + comm.Barrier() + + size = 128 + size_per_proc = int(size/commsize) + A = np.arange(0, size, dtype=np_dtype) + B = np.full(size, 0, dtype=np_dtype) + mpi_sdfg(inbuf=A, outbuf=B, n=size_per_proc) + + # now B should be an array of size, + # containing (size / size_per_proc) repeated chunked_data + chunked_data = A[rank * size_per_proc: (rank + 1) * size_per_proc] + correct_data = np.tile(chunked_data, int(size / size_per_proc)) + if (not np.allclose(B, correct_data)): + raise (ValueError("The received values are not what I expected on root.")) + + +############################################################################### + +if __name__ == "__main__": + test_mpi("MPI", dace.float32) + test_mpi("MPI", dace.float64) + +############################################################################### From ee5bb6f245e3e4b7e9cc7d36e7c9bdbe4da63637 Mon Sep 17 00:00:00 2001 From: Philipp Schaad Date: Wed, 21 Jun 2023 07:32:14 +0200 Subject: [PATCH 114/392] Add input config reduction to cutouts (#1224) * Add input 
config reduction to cutouts * Fix issues with mincut * Bugfixes * Fixes round 2 * Add loop body states to cutouts * Translate NSDFGs for cutouts * Sync --- dace/sdfg/analysis/cutout.py | 321 +++++++++++++++++++++++++++++++++-- tests/sdfg/cutout_test.py | 104 +++++++++++- 2 files changed, 406 insertions(+), 19 deletions(-) diff --git a/dace/sdfg/analysis/cutout.py b/dace/sdfg/analysis/cutout.py index b557cb185e..a72a6d7e54 100644 --- a/dace/sdfg/analysis/cutout.py +++ b/dace/sdfg/analysis/cutout.py @@ -3,6 +3,9 @@ Functionality that allows users to "cut out" parts of an SDFG in a smart way (i.e., memory preserving) for localized testing or optimization. """ +import networkx as nx +from networkx.algorithms.flow import edmondskarp +import sympy as sp from collections import deque import copy from typing import Deque, Dict, List, Set, Tuple, Union, Optional, Any @@ -15,6 +18,7 @@ PatternTransformation, SubgraphTransformation, SingleStateTransformation) +from dace.transformation.interstate.loop_detection import DetectLoop from dace.transformation.passes.analysis import StateReachability @@ -77,8 +81,10 @@ def translate_transformation_into(self, transformation: Union[PatternTransformat # Ignore. pass elif isinstance(transformation, MultiStateTransformation): - transformation._sdfg = self - transformation.sdfg_id = 0 + new_sdfg_id = self._in_translation[transformation.sdfg_id] + new_sdfg = self.sdfg_list[new_sdfg_id] + transformation._sdfg = new_sdfg + transformation.sdfg_id = new_sdfg_id for k in transformation.subgraph.keys(): old_state = self._base_sdfg.node(transformation.subgraph[k]) try: @@ -111,18 +117,40 @@ def from_json(cls, json_obj, context_info=None): @classmethod def from_transformation( cls, sdfg: SDFG, transformation: Union[PatternTransformation, SubgraphTransformation], - make_side_effects_global = True, use_alibi_nodes: bool = True + make_side_effects_global = True, use_alibi_nodes: bool = True, reduce_input_config = True, + symbols_map: Optional[Dict[str, Any]] = None ) -> Union['SDFGCutout', SDFG]: + """ + Create a cutout from a transformation's set of affected graph elements. + + :param sdfg: The SDFG to create the cutout from. + :param transformation: The transformation to create the cutout from. + :param make_side_effects_global: Whether to make side effect data containers global, i.e. non-transient. + :param use_alibi_nodes: Whether to use alibi nodes for the cutout across scope borders. + :param reduce_input_config: Whether to reduce the input configuration where possible in singlestate cutouts. + :param symbols_map: A mapping of symbols to values to use for the cutout. Optional, only used when reducing the + input configuration. + :return: The cutout. 
+ """ affected_nodes = _transformation_determine_affected_nodes(sdfg, transformation) + if len(affected_nodes) == 0: + cut_sdfg = copy.deepcopy(sdfg) + transformation._sdfg = cut_sdfg + return cut_sdfg + target_sdfg = sdfg if transformation.sdfg_id >= 0 and target_sdfg.sdfg_list is not None: target_sdfg = target_sdfg.sdfg_list[transformation.sdfg_id] - if isinstance(transformation, (SubgraphTransformation, SingleStateTransformation)): - state = target_sdfg.node(transformation.state_id) + if (all(isinstance(n, nd.Node) for n in affected_nodes) or + isinstance(transformation, (SubgraphTransformation, SingleStateTransformation))): + state = target_sdfg.parent + if transformation.state_id >= 0: + state = target_sdfg.node(transformation.state_id) cutout = cls.singlestate_cutout(state, *affected_nodes, make_side_effects_global=make_side_effects_global, - use_alibi_nodes=use_alibi_nodes) + use_alibi_nodes=use_alibi_nodes, reduce_input_config=reduce_input_config, + symbols_map=symbols_map) cutout.translate_transformation_into(transformation) return cutout elif isinstance(transformation, MultiStateTransformation): @@ -132,14 +160,16 @@ def from_transformation( cutout.translate_transformation_into(transformation) return cutout raise Exception('Unsupported transformation type: {}'.format(type(transformation))) - + @classmethod def singlestate_cutout(cls, state: SDFGState, *nodes: nd.Node, make_copy: bool = True, make_side_effects_global: bool = True, - use_alibi_nodes: bool = True) -> 'SDFGCutout': + use_alibi_nodes: bool = True, + reduce_input_config: bool = False, + symbols_map: Optional[Dict[str, Any]] = None) -> 'SDFGCutout': """ Cut out a subgraph of a state from an SDFG to run separately for localized testing or optimization. The subgraph defined by the list of nodes will be extended to include access nodes of data containers necessary @@ -155,8 +185,13 @@ def singlestate_cutout(cls, inside the cutout but may be read _after_ the cutout, are made global. :param use_alibi_nodes: If True, do not extend the cutout with access nodes that span outside of a scope, but introduce alibi nodes instead that represent only the accesses subset. + :param reduce_input_config: Whether to reduce the input configuration where possible in singlestate cutouts. + :param symbols_map: A mapping of symbols to values to use for the cutout. Optional, only used when reducing the + input configuration. :return: The created SDFGCutout. """ + if reduce_input_config: + nodes = _reduce_in_configuration(state, nodes, use_alibi_nodes, symbols_map) create_element = copy.deepcopy if make_copy else (lambda x: x) sdfg = state.parent subgraph: StateSubgraphView = StateSubgraphView(state, nodes) @@ -272,6 +307,15 @@ def singlestate_cutout(cls, cutout._in_translation = in_translation cutout._out_translation = out_translation + # Translate in nested SDFG nodes and their SDFGs (their list id, specifically). + cutout.reset_sdfg_list() + outers = set(in_translation.keys()) + for outer in outers: + if isinstance(outer, nd.NestedSDFG): + inner: nd.NestedSDFG = in_translation[outer] + cutout._in_translation[outer.sdfg.sdfg_id] = inner.sdfg.sdfg_id + _recursively_set_nsdfg_parents(cutout) + return cutout @classmethod @@ -319,14 +363,14 @@ def multistate_cutout(cls, frontier, frontier_edges = bfs_queue.popleft() if len(frontier_edges) == 0: # No explicit start state, but also no frontier to select from. 
- return sdfg + return copy.deepcopy(sdfg) elif len(frontier_edges) == 1: # If there is only one predecessor frontier edge, its destination must be the start state. start_state = list(frontier_edges)[0].dst else: if len(frontier) == 0: # No explicit start state, but also no frontier to select from. - return sdfg + return copy.deepcopy(sdfg) if len(frontier) == 1: # For many frontier edges but only one frontier state, the frontier state is the new start state # and is included in the cutout. @@ -349,8 +393,18 @@ def multistate_cutout(cls, state_defined_symbols = state.defined_symbols() for sym in state_defined_symbols: defined_symbols[sym] = state_defined_symbols[sym] + for edge in subgraph.edges(): + is_edge: InterstateEdge = edge.data + available_symbols = sdfg.symbols.keys() + free_symbols |= (is_edge.free_symbols & available_symbols) + for rmem in is_edge.get_read_memlets(sdfg.arrays): + if rmem.data in cutout.arrays: + continue + new_desc = sdfg.arrays[rmem.data].clone() + cutout.add_datadesc(rmem.data, new_desc) for sym in free_symbols: - cutout.add_symbol(sym, defined_symbols[sym]) + if not sym in cutout.symbols: + cutout.add_symbol(sym, defined_symbols[sym]) for state in cutout_states: for dnode in state.data_nodes(): @@ -413,6 +467,9 @@ def multistate_cutout(cls, cutout._in_translation = in_translation cutout._out_translation = out_translation + cutout.reset_sdfg_list() + _recursively_set_nsdfg_parents(cutout) + return cutout @@ -447,6 +504,27 @@ def _transformation_determine_affected_nodes( except KeyError: # Ignored. pass + + # Transformations that modify a loop in any way must also include the loop init node, i.e. the state directly + # before the loop guard. Also make sure that ALL loop body states are part of the set of affected nodes. + # TODO: This is hacky and should be replaced with a more general mechanism - this is something that + # transformation intents / transactions will need to solve. + if isinstance(transformation, DetectLoop): + if transformation.loop_guard is not None and transformation.loop_guard in target_sdfg.nodes(): + for iedge in target_sdfg.in_edges(transformation.loop_guard): + affected_nodes.add(iedge.src) + if transformation.loop_begin is not None and transformation.loop_begin in target_sdfg.nodes(): + to_visit = [transformation.loop_begin] + while to_visit: + state = to_visit.pop(0) + for _, dst, _ in target_sdfg.out_edges(state): + if dst not in affected_nodes and dst is not transformation.loop_guard: + to_visit.append(dst) + affected_nodes.add(state) + + if len(affected_nodes) == 0 and transformation.state_id < 0 and target_sdfg.parent_nsdfg_node is not None: + # This is a transformation that affects a nested SDFG node, grab that NSDFG node. + affected_nodes.add(target_sdfg.parent_nsdfg_node) else: if transformation.sdfg_id >= 0 and target_sdfg.sdfg_list: target_sdfg = target_sdfg.sdfg_list[transformation.sdfg_id] @@ -478,6 +556,202 @@ def _transformation_determine_affected_nodes( return affected_nodes +def _reduce_in_configuration(state: SDFGState, affected_nodes: Set[nd.Node], use_alibi_nodes: bool = False, + symbols_map: Optional[Dict[str, Any]] = None) -> Set[nd.Node]: + """ + For a given set of nodes that should be cut out in a single state cutout, try to reduce the size of the input + configuration as much as possible by adding more nodes to find a S-T minimum 2-cut in the state. + + :param state: The state in which to cut out. + :param affected_nodes: The set of nodes that should be cut out. 
+ :param use_alibi_nodes: If True, use alibi nodes across scope borders. + :param symbols_map: A map of symbols to values. An assumption will be made about symbol values if None is provided. + :return: A new set of node greater than or equal to the initial cutout nodes, which makes up a minimized cutout. + """ + subgraph: StateSubgraphView = StateSubgraphView(state, affected_nodes) + subgraph = _extend_subgraph_with_access_nodes(state, subgraph, use_alibi_nodes) + subgraph_nodes = set(subgraph.nodes()) + + # For the given state, determine what should count as the input configuration if we were to cut out the entire + # state. + state_reachability_dict = StateReachability().apply_pass(state.parent, None) + state_reach = state_reachability_dict[state.parent.sdfg_id] + reaching_cutout: Set[SDFGState] = set() + for k, v in state_reach.items(): + if state in v: + reaching_cutout.add(k) + state_input_configuration = set() + check_for_write_before = set() + for dn in state.data_nodes(): + if state.out_degree(dn) > 0: + # This is read from, add to the system state if it is written anywhere else in the graph. + # Except if it is also written to at the same time and is scalar or of size 1. + array = state.parent.arrays[dn.data] + if state.in_degree(dn) > 0 and (array.total_size == 1 or isinstance(array, data.Scalar)): + continue + elif not array.transient: + # Non-transients are always part of the input config if they are read and not overwritten anyway. + state_input_configuration.add(dn.data) + else: + check_for_write_before.add(dn.data) + for pre_state in reaching_cutout: + for dn in pre_state.data_nodes(): + if pre_state.in_degree(dn) > 0: + # For any writes, check if they are reads from the cutout that need to be checked. If they are, they're + # part of the system state. + if dn.data in check_for_write_before: + state_input_configuration.add(dn.data) + + # If no explicit symbol map was provided, we have to make an assumption about symbol values to determine a minimum + # cut. + # TODO: This is a hack. Ideally, we should be able to determine the minimum cut without having to make assumptions + # about symbol values. Not sure how to do that yet. + if symbols_map is None: + symbols_map = dict() + consts = state.parent.constants + for s in state.parent.symbols: + if s in consts: + symbols_map[s] = consts[s] + else: + symbols_map[s] = 20 + + # Use a proxy graph to compute the minium cut. + proxy_graph = nx.DiGraph() + + # By expanding over the borders of a scope (e.g. over the entry of a map), we know that we universally can only + # increase the size of the input configuration. Consequently, we can use the outer-most scope entry node as our + # source node for the minimum cut, if there is such a unique outer entry node. 
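+    # The S-T minimum cut computed further down uses memlet volumes as edge capacities: every node that ends
+    # up on the sink side of the cut is pulled into the cutout, so the cut minimizes the volume of data that
+    # must cross into the cutout, i.e., the size of its input configuration. For example, if the cutout reads
+    # a large temporary that is itself produced from a single scalar, cutting at the scalar edge (capacity 1)
+    # and including the producer is cheaper than treating the whole temporary as an input.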
+ source_candidates = set() + for n in subgraph_nodes: + source_candidates.add(state.entry_node(n)) + + source = None + scope_children = state.scope_children() + transitive_scope_children: Dict[SDFGState, Set[SDFGState]] = dict() + for k, v in scope_children.items(): + queue = deque(v) + k_children = set(v) + while queue: + child = queue.popleft() + if child in scope_children: + n_children = set(scope_children[child]) + queue.extend(n_children) + k_children.update(n_children) + transitive_scope_children[k] = k_children + if len(source_candidates) > 1: + for cand in source_candidates: + if all(other_cand in transitive_scope_children[cand] for other_cand in source_candidates): + source = cand + break + elif len(source_candidates) == 1: + source = list(source_candidates)[0] + + # If there is no unique outer entry node, we use a proxy node as the source. + scope_nodes: Set[nd.Node] = set() + if source == None: + source = nd.Node() + scope_nodes = set(scope_children[None]) + else: + scope_nodes = set(scope_children[source]) + scope_nodes.add(source) + expand_with = set() + for n in scope_nodes: + if isinstance(n, nd.EntryNode): + exit = state.exit_node(n) + expand_with.add(exit) + scope_nodes.update(expand_with) + scope_subgraph = StateSubgraphView(state, scope_nodes) + + # Add the source and a proxy sink to the proxy graph. + proxy_graph.add_node(source) + sink = nd.Node() + proxy_graph.add_node(sink) + + # Build up the proxy graph. + for edge in scope_subgraph.edges(): + proxy_edge_src = edge.src + proxy_edge_dst = edge.dst + + vol = 0 + memlet: Memlet = edge.data + if memlet.data: + vol = memlet.volume + if isinstance(vol, sp.Expr): + vol = vol.subs(symbols_map) + + remain_free = False + if edge.src in subgraph_nodes and edge.dst in subgraph_nodes: + # Edge completely in subgraph, don't do anything. Unless the destination is an access node which is in the + # state input configuration, in which case we add an edge from the source to the sink with that volume. + if isinstance(edge.dst, nd.AccessNode) and memlet.data in state_input_configuration: + if proxy_graph.has_edge(source, sink): + proxy_graph[source][sink]['capacity'] += vol + else: + proxy_graph.add_node(source) + proxy_graph.add_node(sink) + proxy_graph.add_edge(source, sink, capacity=vol) + continue + elif edge.src in subgraph_nodes: + # Edge starts in subgraph, ends outside. + # If there's no path back inside, it's source is the proxy sink. Otherwise, it's source is set to the proxy + # source and the volume is made 0, since the value will already be part of the cutout. + if any([n in nx.descendants(state.nx, proxy_edge_src) for n in subgraph_nodes]): + proxy_edge_src = source + vol = 0 + remain_free = True + else: + proxy_edge_src = sink + elif edge.dst in subgraph_nodes: + # Edge starts outside, ends in the subgraph. It's destination thus is the proxy sink. + proxy_edge_dst = sink + + if isinstance(proxy_edge_dst, nd.AccessNode) and memlet.data in state_input_configuration: + # If the destination is an access node that is part of the state input configuration, we add an edge from + # the source with that volume. + if proxy_graph.has_edge(source, proxy_edge_dst): + proxy_graph[source][proxy_edge_dst]['capacity'] += vol + else: + proxy_graph.add_edge(source, proxy_edge_dst, capacity=vol) + # The actual edge between src and dst is set to have infinite capacity. 
+ vol = float('inf') + elif isinstance(proxy_edge_src, nd.AccessNode) and not remain_free: + # All outgoing edges from access nodes (with data) are set to have infinite capacity. + vol = float('inf') + + if isinstance(proxy_edge_src, nd.ExitNode): + proxy_edge_src = state.entry_node(proxy_edge_src) + + if proxy_graph.has_edge(proxy_edge_src, proxy_edge_dst): + proxy_graph[proxy_edge_src][proxy_edge_dst]['capacity'] += vol + else: + proxy_graph.add_node(proxy_edge_src) + proxy_graph.add_node(proxy_edge_dst) + proxy_graph.add_edge(proxy_edge_src, proxy_edge_dst, capacity=vol) + + for node in scope_nodes: + if isinstance(node, nd.AccessNode) and node.data in state_input_configuration: + if not proxy_graph.has_edge(source, node) and node.data in state.parent.arrays: + vol = state.parent.arrays[node.data].total_size + if isinstance(vol, sp.Expr): + vol = vol.subs(symbols_map) + proxy_graph.add_edge(source, node, capacity=vol) + + _, (_, non_reachable) = nx.minimum_cut(proxy_graph, + source, + sink, + flow_func=edmondskarp.edmonds_karp) + + non_reachable -= {sink} + if len(non_reachable) > 0: + subscope_expansions = set() + for n in non_reachable: + if isinstance(n, nd.EntryNode): + subscope_expansions.update(transitive_scope_children[n]) + elif isinstance(n, nd.ExitNode): + subscope_expansions.update(transitive_scope_children[state.entry_node(n)]) + return subgraph_nodes.union(non_reachable.union(subscope_expansions)) + return subgraph_nodes + def _stateset_predecessor_frontier(states: Set[SDFGState]) -> Tuple[Set[SDFGState], Set[Edge[InterstateEdge]]]: """ For a set of states, return their predecessor frontier. @@ -667,14 +941,17 @@ def _cutout_determine_input_config(ct: SDFG, inverse_cutout_reach: Set[SDFGState for state in cutout_states: for dn in state.data_nodes(): noded_descriptors.add(dn.data) - - array = ct.arrays[dn.data] - if not array.transient: - # Non-transients are always part of the system state. - input_configuration.add(dn.data) - elif state.out_degree(dn) > 0: + if state.out_degree(dn) > 0: # This is read from, add to the system state if it is written anywhere else in the graph. - check_for_write_before.add(dn.data) + # Except if it is also written to at the same time and is scalar or of size 1. + array = ct.arrays[dn.data] + if state.in_degree(dn) > 0 and (array.total_size == 1 or isinstance(array, data.Scalar)): + continue + elif not array.transient: + # Non-transients are always part of the input config if they are read and not overwritten anyway. + input_configuration.add(dn.data) + else: + check_for_write_before.add(dn.data) original_state: Optional[SDFGState] = None try: @@ -759,3 +1036,11 @@ def _cutout_determine_output_configuration(ct: SDFG, cutout_reach: Set[SDFGState system_state.add(dn.data) return system_state + + +def _recursively_set_nsdfg_parents(target: SDFG): + for state in target.states(): + for n in state.nodes(): + if isinstance(n, nd.NestedSDFG): + n.sdfg.parent_sdfg = target + _recursively_set_nsdfg_parents(n.sdfg) diff --git a/tests/sdfg/cutout_test.py b/tests/sdfg/cutout_test.py index 9ac338b3da..151c3cab47 100644 --- a/tests/sdfg/cutout_test.py +++ b/tests/sdfg/cutout_test.py @@ -1,7 +1,7 @@ # Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. 
import numpy as np import dace -from dace.sdfg.analysis.cutout import SDFGCutout +from dace.sdfg.analysis.cutout import SDFGCutout, _reduce_in_configuration import pytest @@ -312,6 +312,106 @@ def test_input_output_configuration(): assert len(ct.arrays) == 4 +def test_minimum_cut_simple_no_further_input_config(): + sdfg = dace.SDFG('mincut') + N = dace.symbol('N') + sdfg.add_array('A', [N], dace.float64) + sdfg.add_array('B', [N], dace.float64) + sdfg.add_array('C', [N, N], dace.float64) + sdfg.add_array('tmp1', [1], dace.float64, transient=True) + sdfg.add_array('tmp2', [1], dace.float64, transient=True) + sdfg.add_array('tmp3', [1], dace.float64, transient=True) + sdfg.add_array('tmp4', [1], dace.float64, transient=True) + sdfg.add_array('tmp5', [1], dace.float64, transient=True) + sdfg.add_array('tmp6', [1], dace.float64, transient=True) + state = sdfg.add_state('state') + mi, mo = state.add_map('map', dict(i='0:N', j='0:N')) + t1 = state.add_tasklet('t1', {'a', 'b'}, {'t'}, 't = a + b') + t2 = state.add_tasklet( + 't2', {'tin'}, {'t1', 't2', 't3', 't4'}, 't1 = tin + 2\nt2 = tin * 2\nt3 = tin / 2\nt4 = tin + 1' + ) + t3 = state.add_tasklet('t3', {'a', 'b'}, {'t'}, 't = a + b') + t4 = state.add_tasklet('t4', {'a', 'b', 'c'}, {'t'}, 't = (a - b) * c') + a_access = state.add_access('A') + b_access = state.add_access('B') + c_access = state.add_access('C') + tmp1_access = state.add_access('tmp1') + tmp2_access = state.add_access('tmp2') + tmp3_access = state.add_access('tmp3') + tmp4_access = state.add_access('tmp4') + tmp5_access = state.add_access('tmp5') + tmp6_access = state.add_access('tmp6') + state.add_memlet_path(a_access, mi, t1, dst_conn='a', memlet=dace.Memlet('A[i]')) + state.add_memlet_path(b_access, mi, t1, dst_conn='b', memlet=dace.Memlet('B[j]')) + state.add_edge(t1, 't', tmp1_access, None, dace.Memlet('tmp1[0]')) + state.add_edge(tmp1_access, None, t2, 'tin', dace.Memlet('tmp1[0]')) + state.add_edge(t2, 't1', tmp2_access, None, dace.Memlet('tmp2[0]')) + state.add_edge(t2, 't2', tmp3_access, None, dace.Memlet('tmp3[0]')) + state.add_edge(t2, 't3', tmp4_access, None, dace.Memlet('tmp4[0]')) + state.add_edge(t2, 't4', tmp5_access, None, dace.Memlet('tmp5[0]')) + state.add_edge(tmp2_access, None, t3, 'a', dace.Memlet('tmp2[0]')) + state.add_edge(tmp3_access, None, t3, 'b', dace.Memlet('tmp3[0]')) + state.add_edge(tmp4_access, None, t4, 'a', dace.Memlet('tmp4[0]')) + state.add_edge(tmp5_access, None, t4, 'b', dace.Memlet('tmp5[0]')) + state.add_edge(t3, 't', tmp6_access, None, dace.Memlet('tmp6[0]')) + state.add_edge(tmp6_access, None, t4, 'c', dace.Memlet('tmp6[0]')) + state.add_memlet_path(t4, mo, c_access, src_conn='t', memlet=dace.Memlet('C[i, j]')) + + cutout = SDFGCutout.singlestate_cutout(state, t3, t4, tmp6_access, reduce_input_config=True) + + c_state = cutout.nodes()[0] + c_nodes = set(c_state.nodes()) + o_nodes = {t2, t3, t4, tmp6_access, tmp4_access, tmp5_access, tmp2_access, tmp3_access, tmp1_access, c_access} + assert len(c_nodes) == 10 + for n in o_nodes: + assert cutout._in_translation[n] in c_nodes + for n in c_nodes: + assert cutout._out_translation[n] in o_nodes + + +def test_minimum_cut_map_scopes(): + sdfg = dace.SDFG('mincut') + sdfg.add_array('A', [10, 10], dace.float64) + sdfg.add_array('B', [10, 10], dace.float64) + sdfg.add_array('tmp_1', [10, 10], dace.float64, transient=True) + sdfg.add_array('tmp_2', [10, 10], dace.float64, transient=True) + sdfg.add_array('C', [10, 10], dace.float64) + + state = sdfg.add_state('state') + t1 = state.add_tasklet('t1', 
{'in1', 'in2'}, {'out1'}, 'out1 = in1 + in2') + t2 = state.add_tasklet('t2', {'in1'}, {'out1'}, 'out1 = in1 * 2') + t3 = state.add_tasklet('t3', {'in1', 'in2'}, {'out1'}, 'out1 = in1 + in2') + m1_i, m1_o = state.add_map('m1', dict(i='0:10', j='0:10')) + m2_i, m2_o = state.add_map('m2', dict(i='0:10', j='0:10')) + m3_i, m3_o = state.add_map('m3', dict(i='0:10', j='0:10')) + + a_access = state.add_access('A') + b_access = state.add_access('B') + c_access = state.add_access('C') + tmp1_access = state.add_access('tmp_1') + tmp2_access = state.add_access('tmp_2') + + state.add_memlet_path(a_access, m1_i, t1, dst_conn='in1', memlet=dace.Memlet('A[i, j]')) + state.add_memlet_path(b_access, m1_i, t1, dst_conn='in2', memlet=dace.Memlet('B[i, j]')) + state.add_memlet_path(t1, m1_o, tmp1_access, src_conn='out1', memlet=dace.Memlet('tmp_1[i, j]')) + state.add_memlet_path(tmp1_access, m2_i, t2, dst_conn='in1', memlet=dace.Memlet('tmp_1[i, j]')) + state.add_memlet_path(t2, m2_o, tmp2_access, src_conn='out1', memlet=dace.Memlet('tmp_2[i, j]')) + state.add_memlet_path(tmp1_access, m3_i, t3, dst_conn='in1', memlet=dace.Memlet('tmp_1[i, j]')) + state.add_memlet_path(tmp2_access, m3_i, t3, dst_conn='in2', memlet=dace.Memlet('tmp_2[i, j]')) + state.add_memlet_path(t3, m3_o, c_access, src_conn='out1', memlet=dace.Memlet('C[i, j]')) + + cutout = SDFGCutout.singlestate_cutout(state, t3, m3_i, m3_o, reduce_input_config=True) + + c_state = cutout.nodes()[0] + c_nodes = set(c_state.nodes()) + o_nodes = {t2, t3, tmp1_access, tmp2_access, c_access, m2_i, m2_o, m3_i, m3_o} + assert len(c_nodes) == 9 + for n in o_nodes: + assert cutout._in_translation[n] in c_nodes + for n in c_nodes: + assert cutout._out_translation[n] in o_nodes + + if __name__ == '__main__': test_cutout_onenode() test_cutout_multinode() @@ -322,3 +422,5 @@ def test_input_output_configuration(): test_multistate_cutout_simple_expand() test_multistate_cutout_complex_expand() test_input_output_configuration() + test_minimum_cut_simple_no_further_input_config() + test_minimum_cut_map_scopes() From e5085aeecdb6dba457053358e898bc75edc81378 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 22 Jun 2023 21:17:44 +0800 Subject: [PATCH 115/392] Fixed mpi_send_recv_test.py --- tests/library/mpi/mpi_send_recv_test.py | 35 +++++++++++-------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/tests/library/mpi/mpi_send_recv_test.py b/tests/library/mpi/mpi_send_recv_test.py index 52034111a5..48c8170949 100644 --- a/tests/library/mpi/mpi_send_recv_test.py +++ b/tests/library/mpi/mpi_send_recv_test.py @@ -1,5 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
import dace +from dace.sdfg import utils from dace.memlet import Memlet import dace.libraries.mpi as mpi import numpy as np @@ -75,21 +76,15 @@ def test_mpi(): ############################################################################### -myrank = dace.symbol('myrank', dtype=dace.int32) -mysize = dace.symbol('mysize', dtype=dace.int32) - - @dace.program -def dace_send_recv(): - tmp1 = np.full([1], myrank, dtype=np.int32) - tmp2 = np.zeros([1], dtype=np.int32) - if myrank == 0: - dace.comm.Send(tmp1, 1, tag=42) - dace.comm.Recv(tmp2, mysize - 1, tag=42) - else: - dace.comm.Recv(tmp2, (myrank - 1) % mysize, tag=42) - dace.comm.Send(tmp1, (myrank + 1) % mysize, tag=42) - return tmp2 +def dace_send_recv(rank: dace.int32, size: dace.int32): + src = np.full([1], (rank - 1) % size, dtype=np.int32) + dst = np.full([1], (rank + 1) % size, dtype=np.int32) + sbuf = np.full([1], rank, dtype=np.int32) + rbuf = np.zeros([1], dtype=np.int32) + dace.comm.Recv(rbuf, src, tag=42) + dace.comm.Send(sbuf, dst, tag=42) + return rbuf @pytest.mark.mpi @@ -101,14 +96,14 @@ def test_dace_send_recv(): mpi_sdfg = None if commsize < 2: raise ValueError("This test is supposed to be run with at least two processes!") - for r in range(0, commsize): - if r == rank: - mpi_sdfg = dace_send_recv.compile() - comm.Barrier() + sdfg = None + if rank == 0: + sdfg = dace_send_recv.to_sdfg(simplify=True) + mpi_sdfg = utils.distributed_compile(sdfg, comm) - prv_rank = mpi_sdfg(myrank=rank, mysize=commsize) + val = mpi_sdfg(rank=rank, size=commsize) - assert (prv_rank[0] == (rank - 1) % commsize) + assert (val[0] == (rank - 1) % commsize) ############################################################################### From f8c9550db8fcfa28ae9c8a54663fd9da08f1b0aa Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 22 Jun 2023 21:26:06 +0800 Subject: [PATCH 116/392] Added mpi4py replacement for send/recv --- dace/frontend/common/distr.py | 2 ++ tests/library/mpi/mpi4py_test.py | 41 ++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index c34fe54f41..c47040728f 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -372,6 +372,7 @@ def _gather(pv: 'ProgramVisitor', ##### Point-To-Point Communication +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Send') @oprepo.replaces('dace.comm.Send') def _send(pv: 'ProgramVisitor', sdfg: SDFG, @@ -572,6 +573,7 @@ def _pgrid_isend(pv: 'ProgramVisitor', return req +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Recv') @oprepo.replaces('dace.comm.Recv') def _recv(pv: 'ProgramVisitor', sdfg: SDFG, diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 7c314b7516..603a6786cb 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -175,14 +175,13 @@ def k3mm(A, B, C, D): @pytest.mark.mpi def test_isend_irecv(): - from mpi4py import MPI commworld = MPI.COMM_WORLD rank = commworld.Get_rank() size = commworld.Get_size() @dace.program - def chain(rank: dace.int32, size: dace.int32): + def mpi4py_isend_irecv(rank: dace.int32, size: dace.int32): src = (rank - 1) % size dst = (rank + 1) % size req = np.empty((2, ), dtype=MPI.Request) @@ -192,21 +191,49 @@ def chain(rank: dace.int32, size: dace.int32): req[1] = commworld.Irecv(rbuf, src, tag=0) MPI.Request.Waitall(req) return rbuf - + sdfg = None if rank == 0: - sdfg = chain.to_sdfg(simplify=True) + sdfg = mpi4py_isend_irecv.to_sdfg(simplify=True) func = 
utils.distributed_compile(sdfg, commworld) val = func(rank=rank, size=size) - ref = chain.f(rank, size) + ref = mpi4py_isend_irecv.f(rank, size) - assert(val[0] == ref[0]) + assert (val[0] == ref[0]) -if __name__ == "__main__": +@pytest.mark.mpi +def test_send_recv(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + @dace.program + def mpi4py_send_recv(rank: dace.int32, size: dace.int32): + src = np.full([1], (rank - 1) % size, dtype=np.int32) + dst = np.full([1], (rank + 1) % size, dtype=np.int32) + sbuf = np.full((1,), rank, dtype=np.int32) + commworld.Send(sbuf, dst, tag=0) + rbuf = np.empty((1, ), dtype=np.int32) + commworld.Recv(rbuf, src, tag=0) + return rbuf + + sdfg = None + if rank == 0: + sdfg = mpi4py_send_recv.to_sdfg(simplify=True) + func = utils.distributed_compile(sdfg, commworld) + + val = func(rank=rank, size=size) + ref = mpi4py_send_recv.f(rank, size) + + assert (val[0] == ref[0]) + + +if __name__ == "__main__": # test_process_grid_bcast() # test_sub_grid_bcast() # test_3mm() test_isend_irecv() + test_send_recv() From c13cdf0ff7776f6fb9c563b659de97b5adfcb8c5 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 22 Jun 2023 10:58:27 -0700 Subject: [PATCH 117/392] Disable OpenMP sections by default --- dace/config_schema.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 30f5bdc924..7f31f7a4cf 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -249,7 +249,7 @@ required: openmp_sections: type: bool - default: true + default: false title: Use OpenMP sections description: > If set to true, multiple connected components will From 2a492f88ca82afe341b12fec7cf6a14ff7d808e1 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Fri, 23 Jun 2023 11:40:07 -0700 Subject: [PATCH 118/392] Add post-SDFG error checks for GPU-enabled runs --- dace/codegen/codegen.py | 7 +++--- dace/codegen/common.py | 38 ++++++++++++++++++++++++++++ dace/codegen/compiled_sdfg.py | 40 ++++++++++++++++++++++++++---- dace/codegen/targets/cuda.py | 9 +++++-- dace/codegen/targets/framecode.py | 14 ++++++++--- dace/codegen/targets/intel_fpga.py | 3 ++- dace/codegen/targets/mpi.py | 3 ++- dace/codegen/targets/snitch.py | 2 +- dace/codegen/targets/xilinx.py | 3 ++- doc/setup/integration.rst | 3 ++- 10 files changed, 104 insertions(+), 18 deletions(-) diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index e697133dae..e6bb6d9a50 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -31,7 +31,7 @@ def generate_headers(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str: exit_params = (sdfg.name, sdfg.name) proto += 'typedef void * %sHandle_t;\n' % sdfg.name proto += 'extern "C" %sHandle_t __dace_init_%s(%s);\n' % init_params - proto += 'extern "C" void __dace_exit_%s(%sHandle_t handle);\n' % exit_params + proto += 'extern "C" int __dace_exit_%s(%sHandle_t handle);\n' % exit_params proto += 'extern "C" void __program_%s(%sHandle_t handle%s);\n' % params return proto @@ -69,15 +69,16 @@ def generate_dummy(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str: int main(int argc, char **argv) {{ {sdfg.name}Handle_t handle; + int err; {allocations} handle = __dace_init_{sdfg.name}({init_params}); __program_{sdfg.name}(handle{params}); - __dace_exit_{sdfg.name}(handle); + err = __dace_exit_{sdfg.name}(handle); {deallocations} - return 0; + return err; }} ''' diff --git a/dace/codegen/common.py b/dace/codegen/common.py index 
a626e36333..8cb5011bd1 100644 --- a/dace/codegen/common.py +++ b/dace/codegen/common.py @@ -144,3 +144,41 @@ def _try_execute(cmd: str) -> bool: 'set the DaCe configuration entry ``compiler.cuda.backend`` ' 'or the ``DACE_compiler_cuda_backend`` environment variable ' 'to either "cuda" or "hip".') + + +def get_gpu_runtime_library() -> ctypes.CDLL: + backend = get_gpu_backend() + if backend == 'cuda': + libpath = ctypes.util.find_library('cudart') + elif backend == 'hip': + libpath = ctypes.util.find_library('amdhip64') + else: + raise RuntimeError(f'Cannot obtain GPU runtime library for backend {backend}') + + if not libpath: + raise RuntimeError(f'GPU runtime library for {backend} not found. Please set LD_LIBRARY_PATH appropriately.') + + return ctypes.CDLL(libpath) + + +def get_gpu_runtime_error_string(err: int) -> str: + lib = get_gpu_runtime_library() + + # Obtain the error string + geterrorstring = getattr(lib, f'{get_gpu_backend()}GetErrorString') + geterrorstring.restype = ctypes.c_char_p + return geterrorstring(err).decode('utf-8') + + +def get_gpu_runtime_last_error() -> str: + lib = get_gpu_runtime_library() + + getlasterror = getattr(lib, f'{get_gpu_backend()}GetLastError') + res: int = getlasterror() + if res == 0: + return None + + # Obtain the error string + geterrorstring = getattr(lib, f'{get_gpu_backend()}GetErrorString') + geterrorstring.restype = ctypes.c_char_p + return geterrorstring(res).decode('utf-8') diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 8821628000..ce984511a8 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -5,14 +5,14 @@ import re import shutil import subprocess -from typing import Any, Callable, Dict, List, Tuple, Optional, Type +from typing import Any, Callable, Dict, List, Tuple, Optional, Type, Union import warnings import numpy as np import sympy as sp from dace import data as dt, dtypes, hooks, symbolic -from dace.codegen import exceptions as cgx +from dace.codegen import exceptions as cgx, common from dace.config import Config from dace.frontend import operations @@ -22,6 +22,7 @@ class ReloadableDLL(object): A reloadable shared object (or dynamically linked library), which bypasses Python's dynamic library reloading issues. """ + def __init__(self, library_filename, program_name): """ Creates a new reloadable shared object. 
@@ -181,6 +182,7 @@ def __init__(self, sdfg, lib: ReloadableDLL, argnames: List[str] = None):
         self._init = lib.get_symbol('__dace_init_{}'.format(sdfg.name))
         self._init.restype = ctypes.c_void_p
         self._exit = lib.get_symbol('__dace_exit_{}'.format(sdfg.name))
+        self._exit.restype = ctypes.c_int
         self._cfunc = lib.get_symbol('__program_{}'.format(sdfg.name))
 
         # Cache SDFG return values
@@ -197,6 +199,17 @@ def __init__(self, sdfg, lib: ReloadableDLL, argnames: List[str] = None):
         self._free_symbols = self._sdfg.free_symbols
         self.argnames = argnames
 
+        self.has_gpu_code = False
+        for _, _, aval in self._sdfg.arrays_recursive():
+            if aval.storage in dtypes.GPU_STORAGES:
+                self.has_gpu_code = True
+                break
+        if not self.has_gpu_code:
+            for node, _ in self._sdfg.all_nodes_recursive():
+                if getattr(node, 'schedule', False) in dtypes.GPU_SCHEDULES:
+                    self.has_gpu_code = True
+                    break
+
     def get_exported_function(self, name: str, restype=None) -> Optional[Callable[..., Any]]:
         """
         Tries to find a symbol by name in the compiled SDFG, and convert it to a callable function
@@ -297,8 +310,20 @@ def initialize(self, *args, **kwargs):
 
     def finalize(self):
         if self._exit is not None:
-            self._exit(self._libhandle)
+            res: int = self._exit(self._libhandle)
         self._initialized = False
+        if res != 0:
+            raise RuntimeError(
+                f'An error was detected after running "{self._sdfg.name}": {self._get_error_text(res)}')
+
+    def _get_error_text(self, result: Union[str, int]) -> str:
+        if self.has_gpu_code:
+            if isinstance(result, int):
+                result = common.get_gpu_runtime_error_string(result)
+            return (f'{result}. Consider enabling synchronous debugging mode (environment variable: '
+                    'DACE_compiler_cuda_syncdebug=1) to see where the issue originates from.')
+        else:
+            return result
 
     def __call__(self, *args, **kwargs):
         # Update arguments from ordered list
@@ -312,11 +337,17 @@ def __call__(self, *args, **kwargs):
             if self._initialized is False:
                 self._lib.load()
                 self._initialize(initargtuple)
-
+
             with hooks.invoke_compiled_sdfg_call_hooks(self, argtuple):
                 if self.do_not_execute is False:
                     self._cfunc(self._libhandle, *argtuple)
 
+            if self.has_gpu_code:
+                lasterror = common.get_gpu_runtime_last_error()
+                if lasterror is not None:
+                    raise RuntimeError(
+                        f'An error was detected when calling "{self._sdfg.name}": {self._get_error_text(lasterror)}')
+
             return self._convert_return_values()
         except (RuntimeError, TypeError, UnboundLocalError, KeyError, cgx.DuplicateDLLError, ReferenceError):
             self._lib.unload()
@@ -545,7 +576,6 @@ def _initialize_return_values(self, kwargs):
                     arr = self._create_array(*shape_desc)
                     self._return_arrays.append(arr)
 
-
     def _convert_return_values(self):
         # Return the values as they would be from a Python function
         if self._return_arrays is None or len(self._return_arrays) == 0:
diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index 056eec7e7e..441cc1fd0c 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -346,7 +346,7 @@ def get_generated_codeobjects(self):
 {file_header}
 
 DACE_EXPORTED int __dace_init_cuda({sdfg.name}_t *__state{params});
-DACE_EXPORTED void __dace_exit_cuda({sdfg.name}_t *__state);
+DACE_EXPORTED int __dace_exit_cuda({sdfg.name}_t *__state);
 
 {other_globalcode}
 
@@ -388,9 +388,13 @@ def get_generated_codeobjects(self):
     return 0;
 }}
 
-void __dace_exit_cuda({sdfg.name}_t *__state) {{
+int __dace_exit_cuda({sdfg.name}_t *__state) {{
     {exitcode}
 
+    // Synchronize and check for CUDA errors
+    int __err = 0;
+    __err = static_cast<int>({backend}DeviceSynchronize());
+
     //
Destroy {backend} streams and events for(int i = 0; i < {nstreams}; ++i) {{ {backend}StreamDestroy(__state->gpu_context->streams[i]); @@ -400,6 +404,7 @@ def get_generated_codeobjects(self): }} delete __state->gpu_context; + return __err; }} DACE_EXPORTED bool __dace_gpu_set_stream({sdfg.name}_t *__state, int streamid, gpuStream_t stream) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 091c893d5b..09bbd30ab8 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -295,8 +295,9 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre return __state; }} -DACE_EXPORTED void __dace_exit_{sdfg.name}({sdfg.name}_t *__state) +DACE_EXPORTED int __dace_exit_{sdfg.name}({sdfg.name}_t *__state) {{ + int __err = 0; """, sdfg) # Instrumentation saving @@ -315,7 +316,13 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre for target in self._dispatcher.used_targets: if target.has_finalizer: - callsite_stream.write('__dace_exit_%s(__state);' % target.target_name, sdfg) + callsite_stream.write( + f''' + int __err_{target.target_name} = __dace_exit_{target.target_name}(__state); + if (__err_{target.target_name}) {{ + __err = __err_{target.target_name}; + }} +''', sdfg) for env in reversed(self.environments): finalize_code = _get_or_eval_sdfg_first_arg(env.finalize_code, sdfg) if finalize_code: @@ -323,7 +330,8 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write(finalize_code) callsite_stream.write("}") - callsite_stream.write('delete __state;\n}\n', sdfg) + callsite_stream.write('delete __state;\n', sdfg) + callsite_stream.write('return __err;\n}\n', sdfg) def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_state_footer=True): diff --git a/dace/codegen/targets/intel_fpga.py b/dace/codegen/targets/intel_fpga.py index ddbf531791..095a5ce9df 100644 --- a/dace/codegen/targets/intel_fpga.py +++ b/dace/codegen/targets/intel_fpga.py @@ -149,8 +149,9 @@ def get_generated_codeobjects(self): return 0; }} -DACE_EXPORTED void __dace_exit_intel_fpga({sdfg.name}_t *__state) {{ +DACE_EXPORTED int __dace_exit_intel_fpga({sdfg.name}_t *__state) {{ delete __state->fpga_context; + return 0; }} {host_code}""".format(signature=params_comma, diff --git a/dace/codegen/targets/mpi.py b/dace/codegen/targets/mpi.py index c2c85f897f..4c4603459a 100644 --- a/dace/codegen/targets/mpi.py +++ b/dace/codegen/targets/mpi.py @@ -46,7 +46,7 @@ def get_generated_codeobjects(self): {file_header} DACE_EXPORTED int __dace_init_mpi({sdfg.name}_t *__state{params}); -DACE_EXPORTED void __dace_exit_mpi({sdfg.name}_t *__state); +DACE_EXPORTED int __dace_exit_mpi({sdfg.name}_t *__state); int __dace_init_mpi({sdfg.name}_t *__state{params}) {{ int isinit = 0; @@ -72,6 +72,7 @@ def get_generated_codeobjects(self): printf(\"MPI was finalized on proc %i of %i\\n\", __dace_comm_rank, __dace_comm_size); + return 0; }} """.format(params=params_comma, sdfg=sdfg, file_header=fileheader.getvalue()), 'cpp', MPICodeGen, 'MPI') return [codeobj] diff --git a/dace/codegen/targets/snitch.py b/dace/codegen/targets/snitch.py index 389d906b36..1c4ba8f821 100644 --- a/dace/codegen/targets/snitch.py +++ b/dace/codegen/targets/snitch.py @@ -1080,7 +1080,7 @@ def gen_code_snitch(sdfg): hdrs += 'typedef void * %sHandle_t;\n' % sdfg.name hdrs += '#ifdef __cplusplus\nextern "C" {\n#endif\n' hdrs += '%sHandle_t __dace_init_%s(%s);\n' % init_params - hdrs += 'void 
__dace_exit_%s(%sHandle_t handle);\n' % exit_params + hdrs += 'int __dace_exit_%s(%sHandle_t handle);\n' % exit_params hdrs += 'void __program_%s(%sHandle_t handle%s);\n' % params hdrs += '#ifdef __cplusplus\n}\n#endif\n' diff --git a/dace/codegen/targets/xilinx.py b/dace/codegen/targets/xilinx.py index 38346601e4..e802907652 100644 --- a/dace/codegen/targets/xilinx.py +++ b/dace/codegen/targets/xilinx.py @@ -149,8 +149,9 @@ def get_generated_codeobjects(self): return 0; }} -DACE_EXPORTED void __dace_exit_xilinx({sdfg.name}_t *__state) {{ +DACE_EXPORTED int __dace_exit_xilinx({sdfg.name}_t *__state) {{ delete __state->fpga_context; + return 0; }} {host_code}""".format(signature=params_comma, diff --git a/doc/setup/integration.rst b/doc/setup/integration.rst index 784c47cf0f..3e1fc5fa70 100644 --- a/doc/setup/integration.rst +++ b/doc/setup/integration.rst @@ -103,7 +103,8 @@ A compiled SDFG library contains three functions, which are named after the SDFG The function returns a handle to the state object, which is a struct containing all information that will persist between invocations of the SDFG. The other functions take this handle as their first argument. The arguments to this function are only the symbols used in the SDFG, ordered by name. - * ``__dace_exit_``: Deallocates all arrays and frees all data descriptors in the given handle. + * ``__dace_exit_``: Deallocates all arrays and frees all data descriptors in the given handle. Returns + a value of 0 if finalized successfully, or another value on failure. * ``__program_``: The actual SDFG function, which takes the handle as its first argument, followed by the arguments to the SDFG, ordered by name, followed by the symbol arguments, also ordered by name. From 226255d91e79d4fcfa154fc7529a5f13a09cf57a Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Mon, 26 Jun 2023 12:35:20 +0200 Subject: [PATCH 119/392] fixing CI --- requirements.txt | 1 + setup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d75a38ac75..da67189b70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ charset-normalizer==3.1.0 click==8.1.3 dill==0.3.6 Flask==2.3.2 +fparser==0.1.2 idna==3.4 importlib-metadata==6.6.0 itsdangerous==2.1.2 diff --git a/setup.py b/setup.py index 9e68ca8036..b1737aed5a 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ include_package_data=True, install_requires=[ 'numpy', 'networkx >= 2.5', 'astunparse', 'sympy<=1.9', 'pyyaml', 'ply', 'websockets', 'requests', 'flask', - 'fparser', 'aenum >= 3.1', 'dataclasses; python_version < "3.7"', 'dill', + 'fparser >= 0.1.2', 'aenum >= 3.1', 'dataclasses; python_version < "3.7"', 'dill', 'pyreadline;platform_system=="Windows"', 'typing-compat; python_version < "3.8"' ] + cmake_requires, extras_require={ From 8a974de236f7ff0fba6217d982d11d5558bb8246 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 26 Jun 2023 05:45:10 -0700 Subject: [PATCH 120/392] Support NumPy 1.25 array function dispatchers --- dace/frontend/python/preprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 48bf5383d7..0df04e422c 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -563,8 +563,9 @@ def global_value_to_node(self, parent_object = value.__self__ # If it is a callable object + # NumPy array dispatchers have an _implementation field and are NOT regarded as functions by 
Python if (not inspect.isfunction(value) and not inspect.ismethod(value) and not inspect.isbuiltin(value) - and hasattr(value, '__call__')): + and hasattr(value, '__call__') and not hasattr(value, '_implementation')): parent_object = value value = value.__call__ From ecbf806e48f321c0d826aa97fd738fb4774ae96f Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 26 Jun 2023 06:00:38 -0700 Subject: [PATCH 121/392] Fix MPI dtor --- dace/codegen/targets/mpi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/codegen/targets/mpi.py b/dace/codegen/targets/mpi.py index 4c4603459a..419334ba5a 100644 --- a/dace/codegen/targets/mpi.py +++ b/dace/codegen/targets/mpi.py @@ -66,7 +66,7 @@ def get_generated_codeobjects(self): return 0; }} -void __dace_exit_mpi({sdfg.name}_t *__state) {{ +int __dace_exit_mpi({sdfg.name}_t *__state) {{ MPI_Comm_free(&__dace_mpi_comm); MPI_Finalize(); From aa42eaa31857a140b1f7b539ecdcfc3101c103aa Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 26 Jun 2023 06:39:14 -0700 Subject: [PATCH 122/392] Add informative details on failed kernel launch --- dace/codegen/targets/cuda.py | 11 +++++++++-- dace/runtime/include/dace/cuda/cudacommon.cuh | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 441cc1fd0c..499b759c42 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1583,17 +1583,24 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), sdfg, state_id, scope_entry) + gdims = 'dace_number_blocks, 1, 1' if is_persistent else ', '.join(_topy(grid_dims)) + bdims = ', '.join(_topy(block_dims)) self._localcode.write( ''' void *{kname}_args[] = {{ {kargs} }}; {backend}LaunchKernel((void*){kname}, dim3({gdims}), dim3({bdims}), {kname}_args, {dynsmem}, {stream});'''.format( kname=kernel_name, kargs=', '.join(['(void *)&' + arg for arg in prototype_kernel_args] + extra_kernel_args), - gdims='dace_number_blocks, 1, 1' if is_persistent else ', '.join(_topy(grid_dims)), - bdims=', '.join(_topy(block_dims)), + gdims=gdims, + bdims=bdims, dynsmem=_topy(dynsmem_size), stream=cudastream, backend=self.backend), sdfg, state_id, scope_entry) + + # Check kernel launch for errors + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK("{kernel_name}", {gdims}, {bdims});') + self._emit_sync(self._localcode) # Close the runkernel function diff --git a/dace/runtime/include/dace/cuda/cudacommon.cuh b/dace/runtime/include/dace/cuda/cudacommon.cuh index 3c050f9d75..4ae6068658 100644 --- a/dace/runtime/include/dace/cuda/cudacommon.cuh +++ b/dace/runtime/include/dace/cuda/cudacommon.cuh @@ -5,6 +5,8 @@ #if defined(__HIPCC__) || defined(WITH_HIP) typedef hipStream_t gpuStream_t; typedef hipEvent_t gpuEvent_t; +#define gpuGetLastError hipGetLastError +#define gpuGetErrorString hipGetErrorString #define DACE_CUDA_CHECK(err) do { \ hipError_t errr = (err); \ @@ -18,6 +20,8 @@ typedef hipEvent_t gpuEvent_t; typedef cudaStream_t gpuStream_t; typedef cudaEvent_t gpuEvent_t; +#define gpuGetLastError cudaGetLastError +#define gpuGetErrorString cudaGetErrorString #define DACE_CUDA_CHECK(err) do { \ cudaError_t errr = (err); \ @@ -29,6 +33,16 @@ typedef cudaEvent_t gpuEvent_t; } while(0) #endif +#define DACE_KERNEL_LAUNCH_CHECK(kernel_name, gdimx, gdimy, gdimz, bdimx, bdimy, bdimz) do { \ + 
auto err = gpuGetLastError(); \ + if (err != decltype(err)(0)) { \ + printf("ERROR launching kernel %s: %s (%d). Grid dimensions: " \ + "(%d, %d, %d); Block dimensions: (%d, %d, %d).\n", kernel_name, \ + gpuGetErrorString(err), (int)err, gdimx, gdimy, gdimz, bdimx, bdimy, bdimz); \ + throw; \ + } \ +} while(0) + namespace dace { namespace cuda { struct Context { From 4ece8004cdbd9c2a18d101ee886fa2bae4ba1619 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Mon, 26 Jun 2023 16:22:32 +0200 Subject: [PATCH 123/392] testing new options to get CI to work --- dace/frontend/fortran/ast_components.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index 8bb4e96936..5d6cb10bd4 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -1,6 +1,6 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -from fparser.two import Fortran2008 -from fparser.two import Fortran2003 +from fparser.two import Fortran2008 as f08 +from fparser.two import Fortran2003 as f03 from fparser.two import symbol_table import copy @@ -101,7 +101,7 @@ class InternalFortranAst: for each entry in the dictionary, the key is the name of the class in the fparser AST and the value is the name of the function that will be used to translate the fparser AST to our AST """ - def __init__(self, ast: Fortran2003.Program, tables: symbol_table.SymbolTables): + def __init__(self, ast: f03.Program, tables: symbol_table.SymbolTables): """ Initialization of the AST converter :param ast: the fparser AST @@ -494,12 +494,12 @@ def type_declaration_stmt(self, node: FASTNode): #decide if its a intrinsic variable type or a derived type - type_of_node = get_child(node, [Fortran2003.Intrinsic_Type_Spec, Fortran2003.Declaration_Type_Spec]) + type_of_node = get_child(node, [f03.Intrinsic_Type_Spec, f03.Declaration_Type_Spec]) - if isinstance(type_of_node, Fortran2003.Intrinsic_Type_Spec): + if isinstance(type_of_node, f03.Intrinsic_Type_Spec): derived_type = False basetype = type_of_node.items[0] - elif isinstance(type_of_node, Fortran2003.Declaration_Type_Spec): + elif isinstance(type_of_node, f03.Declaration_Type_Spec): derived_type = True basetype = type_of_node.items[1].string else: @@ -532,7 +532,7 @@ def type_declaration_stmt(self, node: FASTNode): names_list = get_child(node, ["Entity_Decl_List", "Component_Decl_List"]) #get the names out of the name list - names = get_children(names_list, [Fortran2003.Entity_Decl, Fortran2003.Component_Decl]) + names = get_children(names_list, [f03.Entity_Decl, f03.Component_Decl]) #get the attributes of the variables being defined # alloc relates to whether it is statically (False) or dynamically (True) allocated @@ -560,7 +560,7 @@ def type_declaration_stmt(self, node: FASTNode): size = [] for dim in array_sizes.children: #sanity check - if isinstance(dim, Fortran2003.Explicit_Shape_Spec): + if isinstance(dim, f03.Explicit_Shape_Spec): dim_expr = [i for i in dim.children if i is not None] if len(dim_expr) == 1: dim_expr = dim_expr[0] @@ -571,7 +571,7 @@ def type_declaration_stmt(self, node: FASTNode): #handle initializiation init = None - initialization = get_children(var, Fortran2003.Initialization) + initialization = get_children(var, f03.Initialization) if len(initialization) == 1: initialization = initialization[0] #if there is an initialization, the actual expression is in the second child, with the first 
being the equals sign @@ -957,11 +957,11 @@ def section_subscript_list(self, node: FASTNode): def specification_part(self, node: FASTNode): #TODO this can be refactored to consider more fortran declaration options. Currently limited to what is encountered in code. - others = [self.create_ast(i) for i in node.children if not isinstance(i, Fortran2008.Type_Declaration_Stmt)] + others = [self.create_ast(i) for i in node.children if not isinstance(i, f08.Type_Declaration_Stmt)] - decls = [self.create_ast(i) for i in node.children if isinstance(i, Fortran2008.Type_Declaration_Stmt)] + decls = [self.create_ast(i) for i in node.children if isinstance(i, f08.Type_Declaration_Stmt)] - uses = [self.create_ast(i) for i in node.children if isinstance(i, Fortran2008.Use_Stmt)] + uses = [self.create_ast(i) for i in node.children if isinstance(i, f08.Use_Stmt)] tmp = [self.create_ast(i) for i in node.children] typedecls = [i for i in tmp if isinstance(i, ast_internal_classes.Type_Decl_Node)] symbols = [] From 3143c90bb5b093f114a6805e600c9f7337d7e13d Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Mon, 26 Jun 2023 19:01:35 +0200 Subject: [PATCH 124/392] testing new options to get CI to work --- dace/frontend/fortran/ast_components.py | 2 +- dace/frontend/fortran/fortran_parser.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index 5d6cb10bd4..e386bae23b 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -1,5 +1,5 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -from fparser.two import Fortran2008 as f08 +from fparser.two.Fortran2008 import Fortran2008 as f08 from fparser.two import Fortran2003 as f03 from fparser.two import symbol_table diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index 03e0faae38..3f3df33997 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -357,7 +357,6 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, # First we need to check if the parameters are literals or variables for arg_i, variable in enumerate(variables_in_call): - # print(i.__class__) if isinstance(variable, ast_internal_classes.Name_Node): varname = variable.name elif isinstance(variable, ast_internal_classes.Array_Subscript_Node): @@ -494,7 +493,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, ins_in_new_sdfg.append(self.name_mapping[new_sdfg][local_name.name]) if local_name.name in write_names: outs_in_new_sdfg.append(self.name_mapping[new_sdfg][local_name.name]) - #inouts_in_new_sdfg.append(self.name_mapping[new_sdfg][local_name.name]) + indices = 0 if isinstance(variable_in_call, ast_internal_classes.Array_Subscript_Node): @@ -569,7 +568,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) if i in write_names: outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) - #inouts_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + array_in_global = self.globalsdfg.arrays[self.name_mapping[self.globalsdfg][i]] if isinstance(array_in_global, Scalar): new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], array_in_global.dtype, transient=False) @@ -594,7 +593,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, 
ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) if i in write_names: outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) - #inouts_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + array = sdfg.arrays[self.name_mapping[sdfg][i]] if isinstance(array_in_global, Scalar): new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], array_in_global.dtype, transient=False) @@ -614,7 +613,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) if i in write_names: outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) - #inouts_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) + array = self.globalsdfg.arrays[self.name_mapping[self.globalsdfg][i]] if isinstance(array_in_global, Scalar): new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], array_in_global.dtype, transient=False) From ee4e28923a9e2f120b7808a6c8b95e71b0859b94 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Mon, 26 Jun 2023 20:34:40 +0200 Subject: [PATCH 125/392] fixing is fpga call --- dace/sdfg/utils.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 93ff2c79e8..36084472d2 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1501,13 +1501,19 @@ def is_fpga_kernel(sdfg, state): if ("is_FPGA_kernel" in state.location and state.location["is_FPGA_kernel"] == False): return False data_nodes = state.data_nodes() - if len(data_nodes) == 0: - return False + at_least_one_fpga_array = False for n in data_nodes: - if n.desc(sdfg).storage not in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, - dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): + desc = n.desc(sdfg) + if desc.storage in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, + dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): + at_least_one_fpga_array = True + if isinstance(desc, dt.Scalar): + continue + if desc.storage not in (dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, + dtypes.StorageType.FPGA_Registers, dtypes.StorageType.FPGA_ShiftRegister): return False - return True + + return at_least_one_fpga_array def postdominators( @@ -1745,7 +1751,10 @@ def make_dynamic_map_inputs_unique(sdfg: SDFG): else: dynamic_map_inputs.add(e.dst_conn) if repl_dict: - in_connectors = {repl_dict[n] if n in repl_dict else n: t for n, t in node.in_connectors.items()} + in_connectors = { + repl_dict[n] if n in repl_dict else n: t + for n, t in node.in_connectors.items() + } node.in_connectors = in_connectors node.map.range.replace(repl_dict) state.scope_subgraph(node).replace_dict(repl_dict) @@ -1782,7 +1791,7 @@ def get_thread_local_data(sdfg: SDFG) -> List[str]: # ... 
if we have seen the data before, but in a different scope, remove it from the candidates elif data_to_check[node.data] != scope_dict[node]: del data_to_check[node.data] - + result = list(data_to_check.keys()) for name in result: if not sdfg.arrays[name].transient: From 5c554f0a41bf2a40de2e8cde14568b8b70769275 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 26 Jun 2023 22:06:20 -0700 Subject: [PATCH 126/392] Add Windows support for GPU runtime check, make optional --- dace/codegen/common.py | 9 ++++++++- dace/codegen/compiled_sdfg.py | 8 +++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dace/codegen/common.py b/dace/codegen/common.py index 8cb5011bd1..e8f2972c63 100644 --- a/dace/codegen/common.py +++ b/dace/codegen/common.py @@ -150,13 +150,20 @@ def get_gpu_runtime_library() -> ctypes.CDLL: backend = get_gpu_backend() if backend == 'cuda': libpath = ctypes.util.find_library('cudart') + if os.name == 'nt' and not libpath: # Windows-based search + for version in (12, 11, 10, 9): + libpath = ctypes.util.find_library(f'cudart64_{version}0') + if libpath: + break elif backend == 'hip': libpath = ctypes.util.find_library('amdhip64') else: raise RuntimeError(f'Cannot obtain GPU runtime library for backend {backend}') if not libpath: - raise RuntimeError(f'GPU runtime library for {backend} not found. Please set LD_LIBRARY_PATH appropriately.') + envname = 'PATH' if os.name == 'nt' else 'LD_LIBRARY_PATH' + raise RuntimeError(f'GPU runtime library for {backend} not found. Please set the {envname} ' + 'environment variable to point to the libraries.') return ctypes.CDLL(libpath) diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index ce984511a8..4538d6d9b4 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -343,7 +343,13 @@ def __call__(self, *args, **kwargs): self._cfunc(self._libhandle, *argtuple) if self.has_gpu_code: - lasterror = common.get_gpu_runtime_last_error() + # Optionally get errors from call + try: + lasterror = common.get_gpu_runtime_last_error() + except RuntimeError as ex: + warnings.warn(f'Could not get last error from GPU runtime: {ex}') + lasterror = None + if lasterror is not None: raise RuntimeError( f'An error was detected when calling "{self._sdfg.name}": {self._get_error_text(lasterror)}') From f221a26f1679db5c0b198f387df5b4e664985d0e Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 26 Jun 2023 22:17:37 -0700 Subject: [PATCH 127/392] Add GPU launch bounds property --- dace/codegen/targets/cuda.py | 20 ++++++-- dace/sdfg/nodes.py | 20 ++++---- tests/codegen/gpu_launch_bounds_test.py | 68 +++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 14 deletions(-) create mode 100644 tests/codegen/gpu_launch_bounds_test.py diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 499b759c42..6c1e8a20ef 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1503,10 +1503,22 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st if create_grid_barrier: extra_kernel_args_typed.append('cub::GridBarrier __gbar') - # Write kernel prototype node = dfg_scope.source_nodes()[0] + + # Set kernel launch bounds + if node.gpu_launch_bounds == "-1": + launch_bounds = '' + elif node.gpu_launch_bounds == "0": + if any(symbolic.issymbolic(b) for b in block_dims): + launch_bounds = '' + else: + launch_bounds = f'__launch_bounds__({_topy(prod(block_dims))})' + else: + launch_bounds = f'__launch_bounds__({node.gpu_launch_bounds})' + + # 
Write kernel prototype self._localcode.write( - '__global__ void %s(%s) {\n' % (kernel_name, ', '.join(kernel_args_typed + extra_kernel_args_typed)), sdfg, + '__global__ void %s %s(%s) {\n' % (launch_bounds, kernel_name, ', '.join(kernel_args_typed + extra_kernel_args_typed)), sdfg, state_id, node) # Write constant expressions in GPU code @@ -1755,7 +1767,7 @@ def get_kernel_dimensions(self, dfg_scope): if block_size is None: if has_dtbmap: if (Config.get('compiler', 'cuda', 'dynamic_map_block_size') == 'max'): - block_size = ['max', 1, 1] + raise NotImplementedError('max dynamic block size unimplemented') else: block_size = [ int(b) for b in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',') @@ -1766,7 +1778,7 @@ def get_kernel_dimensions(self, dfg_scope): Config.get('compiler', 'cuda', 'default_block_size')) if (Config.get('compiler', 'cuda', 'default_block_size') == 'max'): - block_size = ['max', 1, 1] + raise NotImplementedError('max dynamic block size unimplemented') else: block_size = [int(b) for b in Config.get('compiler', 'cuda', 'default_block_size').split(',')] diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 866d77bed6..e703c7863e 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -549,7 +549,7 @@ def __init__(self, self.symbol_mapping = symbol_mapping or {} self.schedule = schedule self.debuginfo = debuginfo - + def __deepcopy__(self, memo): cls = self.__class__ result = cls.__new__(cls) @@ -648,7 +648,6 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None): # Scope entry class class EntryNode(Node): """ A type of node that opens a scope (e.g., Map or Consume). """ - def validate(self, sdfg, state): self.map.validate(sdfg, state, self) @@ -659,7 +658,6 @@ def validate(self, sdfg, state): # Scope exit class class ExitNode(Node): """ A type of node that closes a scope (e.g., Map or Consume). """ - def validate(self, sdfg, state): self.map.validate(sdfg, state, self) @@ -673,7 +671,6 @@ class MapEntry(EntryNode): :see: Map """ - def __init__(self, map: 'Map', dynamic_inputs=None): super(MapEntry, self).__init__(dynamic_inputs or set()) if map is None: @@ -750,7 +747,6 @@ class MapExit(ExitNode): :see: Map """ - def __init__(self, map: 'Map'): super(MapExit, self).__init__() if map is None: @@ -849,6 +845,14 @@ class Map(object): optional=True, optional_condition=lambda m: m.schedule in dtypes.GPU_SCHEDULES) + gpu_launch_bounds = Property(dtype=str, + default="0", + desc="GPU kernel launch bounds. 
A value of -1 disables the statement, 0 (default) " + "enables the statement if block size is not symbolic, and any other value " + "(including tuples) sets it explicitly.", + optional=True, + optional_condition=lambda m: m.schedule in dtypes.GPU_SCHEDULES) + def __init__(self, label, params, @@ -899,7 +903,6 @@ class ConsumeEntry(EntryNode): :see: Consume """ - def __init__(self, consume: 'Consume', dynamic_inputs=None): super(ConsumeEntry, self).__init__(dynamic_inputs or set()) if consume is None: @@ -978,7 +981,6 @@ class ConsumeExit(ExitNode): :see: Consume """ - def __init__(self, consume: 'Consume'): super(ConsumeExit, self).__init__() if consume is None: @@ -1090,7 +1092,6 @@ def get_param_num(self): @dace.serialize.serializable class PipelineEntry(MapEntry): - @staticmethod def map_type(): return PipelineScope @@ -1123,7 +1124,6 @@ def new_symbols(self, sdfg, state, symbols) -> Dict[str, dtypes.typeclass]: @dace.serialize.serializable class PipelineExit(MapExit): - @staticmethod def map_type(): return PipelineScope @@ -1332,7 +1332,7 @@ def register_implementation(cls, name, transformation_type): """Register an implementation to belong to this library node type.""" cls.implementations[name] = transformation_type transformation_type._match_node = cls - + @property def free_symbols(self) -> Set[str]: fsyms = super(LibraryNode, self).free_symbols diff --git a/tests/codegen/gpu_launch_bounds_test.py b/tests/codegen/gpu_launch_bounds_test.py new file mode 100644 index 0000000000..4618c9cab0 --- /dev/null +++ b/tests/codegen/gpu_launch_bounds_test.py @@ -0,0 +1,68 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. + +import dace +import pytest + + +@pytest.mark.gpu +def test_launch_bounds_default(): + @dace.program + def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:100, 0:20] @ dace.ScheduleType.GPU_Device: + a[i, j] = 1 + + with dace.config.set_temporary('compiler', 'cuda', 'default_block_size', value='32,2,1'): + assert '__launch_bounds__(64)' in prog.to_sdfg().generate_code()[1].code + + +@pytest.mark.gpu +def test_launch_bounds_implicit(): + @dace.program + def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:50, 0:10] @ dace.ScheduleType.GPU_Device: + for bi, bj in dace.map[0:2, 0:2] @ dace.ScheduleType.GPU_ThreadBlock: + a[i * 2 + bi, j * 2 + bj] = 1 + + assert '__launch_bounds__(4)' in prog.to_sdfg().generate_code()[1].code + + +@pytest.mark.gpu +def test_launch_bounds_implicit_sym(): + B = dace.symbol('B') + + @dace.program + def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:50, 0:10] @ dace.ScheduleType.GPU_Device: + for bi, bj in dace.map[0:B, 0:B] @ dace.ScheduleType.GPU_ThreadBlock: + a[i * B + bi, j * B + bj] = 1 + + assert '__launch_bounds__' not in prog.to_sdfg().generate_code()[1].code + + +@pytest.mark.gpu +def test_launch_bounds_explicit(): + B = 2 + + @dace.program + def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:50, 0:10] @ dace.ScheduleType.GPU_Device: + for bi, bj in dace.map[0:B, 0:B] @ dace.ScheduleType.GPU_ThreadBlock: + a[i * B + bi, j * B + bj] = 1 + + sdfg = prog.to_sdfg() + for n, _ in sdfg.all_nodes_recursive(): + if isinstance(n, dace.nodes.MapEntry) and n.map.schedule == dace.ScheduleType.GPU_Device: + mapentry = n + break + + mapentry.map.gpu_launch_bounds = '-1' + assert '__launch_bounds__' not in sdfg.generate_code()[1].code + mapentry.map.gpu_launch_bounds = 
'5, 1' + assert '__launch_bounds__(5, 1)' in sdfg.generate_code()[1].code + + +if __name__ == '__main__': + test_launch_bounds_default() + test_launch_bounds_implicit() + test_launch_bounds_implicit_sym() + test_launch_bounds_explicit() From b6c5f9e33ae4f48ddf73f79ca7c453daea1985ec Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 06:33:07 -0700 Subject: [PATCH 128/392] Always check for GPU runtime errors if returned --- dace/codegen/targets/cpp.py | 10 +- dace/codegen/targets/cuda.py | 104 +++---- dace/runtime/include/dace/cuda/cudacommon.cuh | 257 +++++++++--------- dace/runtime/include/dace/cuda/stream.cuh | 26 +- dace/runtime/include/dace/stream.h | 8 +- tests/parse_state_struct_test.py | 9 +- 6 files changed, 209 insertions(+), 205 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 868419db3d..d5e7cacc53 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -1318,7 +1318,7 @@ def presynchronize_streams(sdfg, dfg, state_id, node, callsite_stream): if hasattr(e.src, "_cuda_stream") and e.src._cuda_stream != 'nullptr': cudastream = "__state->gpu_context->streams[%d]" % e.src._cuda_stream callsite_stream.write( - "%sStreamSynchronize(%s);" % (common.get_gpu_backend(), cudastream), + "DACE_GPU_CHECK(%sStreamSynchronize(%s));" % (common.get_gpu_backend(), cudastream), sdfg, state_id, [e.src, e.dst], @@ -1356,9 +1356,9 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream, if isinstance(desc, data.Array) and desc.start_offset != 0: ptrname = f'({ptrname} - {sym2cpp(desc.start_offset)})' if Config.get_bool('compiler', 'cuda', 'syncdebug'): - callsite_stream.write(f'DACE_CUDA_CHECK({backend}FreeAsync({ptrname}, {cudastream}));\n', sdfg, + callsite_stream.write(f'DACE_GPU_CHECK({backend}FreeAsync({ptrname}, {cudastream}));\n', sdfg, state_id, scope_exit) - callsite_stream.write(f'DACE_CUDA_CHECK({backend}DeviceSynchronize());') + callsite_stream.write(f'DACE_GPU_CHECK({backend}DeviceSynchronize());') else: callsite_stream.write(f'{backend}FreeAsync({ptrname}, {cudastream});\n', sdfg, state_id, scope_exit) to_remove.add((sd, name)) @@ -1380,8 +1380,8 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream, if (isinstance(edge.dst, nodes.AccessNode) and hasattr(edge.dst, '_cuda_stream') and edge.dst._cuda_stream != node._cuda_stream): callsite_stream.write( - """{backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream}); -{backend}StreamWaitEvent(__state->gpu_context->streams[{dst_stream}], __state->gpu_context->events[{ev}], 0);""".format( + """DACE_GPU_CHECK({backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream})); +DACE_GPU_CHECK({backend}StreamWaitEvent(__state->gpu_context->streams[{dst_stream}], __state->gpu_context->events[{ev}], 0));""".format( ev=edge._cuda_event if hasattr(edge, "_cuda_event") else 0, src_stream=cudastream, dst_stream=edge.dst._cuda_stream, diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 6c1e8a20ef..1c2471807a 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -136,8 +136,8 @@ def __init__(self, frame_codegen, sdfg: SDFG): def _emit_sync(self, codestream: CodeIOStream): if Config.get_bool('compiler', 'cuda', 'syncdebug'): - codestream.write('''DACE_CUDA_CHECK({backend}GetLastError()); - DACE_CUDA_CHECK({backend}DeviceSynchronize());'''.format(backend=self.backend)) + codestream.write('''DACE_GPU_CHECK({backend}GetLastError()); + 
DACE_GPU_CHECK({backend}DeviceSynchronize());'''.format(backend=self.backend)) def preprocess(self, sdfg: SDFG) -> None: # Determine GPU backend @@ -368,8 +368,8 @@ def get_generated_codeobjects(self): // Initialize {backend} before we run the application float *dev_X; - {backend}Malloc((void **) &dev_X, 1); - {backend}Free(dev_X); + DACE_GPU_CHECK({backend}Malloc((void **) &dev_X, 1)); + DACE_GPU_CHECK({backend}Free(dev_X)); {pool_header} @@ -377,10 +377,10 @@ def get_generated_codeobjects(self): // Create {backend} streams and events for(int i = 0; i < {nstreams}; ++i) {{ - {backend}StreamCreateWithFlags(&__state->gpu_context->streams[i], {backend}StreamNonBlocking); + DACE_GPU_CHECK({backend}StreamCreateWithFlags(&__state->gpu_context->streams[i], {backend}StreamNonBlocking)); }} for(int i = 0; i < {nevents}; ++i) {{ - {backend}EventCreateWithFlags(&__state->gpu_context->events[i], {backend}EventDisableTiming); + DACE_GPU_CHECK({backend}EventCreateWithFlags(&__state->gpu_context->events[i], {backend}EventDisableTiming)); }} {initcode} @@ -397,10 +397,10 @@ def get_generated_codeobjects(self): // Destroy {backend} streams and events for(int i = 0; i < {nstreams}; ++i) {{ - {backend}StreamDestroy(__state->gpu_context->streams[i]); + DACE_GPU_CHECK({backend}StreamDestroy(__state->gpu_context->streams[i])); }} for(int i = 0; i < {nevents}; ++i) {{ - {backend}EventDestroy(__state->gpu_context->events[i]); + DACE_GPU_CHECK({backend}EventDestroy(__state->gpu_context->events[i])); }} delete __state->gpu_context; @@ -587,14 +587,17 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d cudastream = getattr(node, '_cuda_stream', 'nullptr') if cudastream != 'nullptr': cudastream = f'__state->gpu_context->streams[{cudastream}]' - result_alloc.write(f'{self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {cudastream});\n') + result_alloc.write( + f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {cudastream}));\n' + ) self._emit_sync(result_alloc) else: # Strides are left to the user's discretion - result_alloc.write('%sMalloc((void**)&%s, %s);\n' % (self.backend, dataname, arrsize_malloc)) + result_alloc.write('DACE_GPU_CHECK(%sMalloc((void**)&%s, %s));\n' % + (self.backend, dataname, arrsize_malloc)) if node.setzero: - result_alloc.write('%sMemset(%s, 0, %s);\n' % (self.backend, dataname, arrsize_malloc)) + result_alloc.write('DACE_GPU_CHECK(%sMemset(%s, 0, %s));\n' % (self.backend, dataname, arrsize_malloc)) if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: result_alloc.write(f'{dataname} += {cpp.sym2cpp(nodedesc.start_offset)};\n') elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: @@ -603,7 +606,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) # Strides are left to the user's discretion - result_alloc.write('%sMallocHost(&%s, %s);\n' % (self.backend, dataname, arrsize_malloc)) + result_alloc.write('DACE_GPU_CHECK(%sMallocHost(&%s, %s));\n' % (self.backend, dataname, arrsize_malloc)) if node.setzero: result_alloc.write('memset(%s, 0, %s);\n' % (dataname, arrsize_malloc)) if nodedesc.start_offset != 0: @@ -723,9 +726,9 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, if nodedesc.storage == dtypes.StorageType.GPU_Global: if not nodedesc.pool: # If pooled, will be freed somewhere else - callsite_stream.write('%sFree(%s);\n' % (self.backend, dataname), sdfg, 
state_id, node) + callsite_stream.write('DACE_GPU_CHECK(%sFree(%s));\n' % (self.backend, dataname), sdfg, state_id, node) elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: - callsite_stream.write('%sFreeHost(%s);\n' % (self.backend, dataname), sdfg, state_id, node) + callsite_stream.write('DACE_GPU_CHECK(%sFreeHost(%s));\n' % (self.backend, dataname), sdfg, state_id, node) elif nodedesc.storage == dtypes.StorageType.GPU_Shared or \ nodedesc.storage == dtypes.StorageType.Register: pass # Do nothing @@ -994,7 +997,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst current_dst_expr = dst_expr + " + " + "+ ".join( ["(__copyidx{} * ({}))".format(d, sym2cpp(s)) for d, s in enumerate(dst_strides[:-2])]) callsite_stream.write( - '%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s);\n' % + 'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % (self.backend, current_dst_expr, _topy(dst_strides[-2]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, current_src_expr, sym2cpp(src_strides[-2]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, @@ -1011,7 +1014,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst copysize += ' * sizeof(%s)' % dtype.ctype callsite_stream.write( - '%sMemcpyAsync(%s, %s, %s, %sMemcpy%sTo%s, %s);\n' % + 'DACE_GPU_CHECK(%sMemcpyAsync(%s, %s, %s, %sMemcpy%sTo%s, %s));\n' % (self.backend, dst_expr, src_expr, copysize, self.backend, src_location, dst_location, cudastream), sdfg, state_id, [src_node, dst_node]) node_dtype = dst_node.desc(sdfg).dtype @@ -1023,27 +1026,27 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst tclass = field_type.type length = node_dtype._length[field_name] size = 'sizeof({})*{}[__idx].{}'.format(dtypes._CTYPES[tclass], str(src_node), length) - callsite_stream.write('{backend}Malloc(&{dst}[__idx].{fname}, ' - '{sz});'.format(dst=str(dst_node), - fname=field_name, - sz=size, - backend=self.backend)) + callsite_stream.write('DACE_GPU_CHECK({backend}Malloc(&{dst}[__idx].{fname}, ' + '{sz}));'.format(dst=str(dst_node), + fname=field_name, + sz=size, + backend=self.backend)) callsite_stream.write( - '{backend}MemcpyAsync({dst}[__idx].{fname}, ' + 'DACE_GPU_CHECK({backend}MemcpyAsync({dst}[__idx].{fname}, ' '{src}[__idx].{fname}, {sz}, ' - '{backend}Memcpy{sloc}To{dloc}, {stream});'.format(dst=str(dst_node), - src=str(src_node), - fname=field_name, - sz=size, - sloc=src_location, - dloc=dst_location, - stream=cudastream, - backend=self.backend), sdfg, + '{backend}Memcpy{sloc}To{dloc}, {stream}));'.format(dst=str(dst_node), + src=str(src_node), + fname=field_name, + sz=size, + sloc=src_location, + dloc=dst_location, + stream=cudastream, + backend=self.backend), sdfg, state_id, [src_node, dst_node]) callsite_stream.write('}') elif dims == 1 and ((src_strides[-1] != 1 or dst_strides[-1] != 1)): callsite_stream.write( - '%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s);\n' % + 'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % (self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, 'sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp( @@ -1051,7 +1054,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst [src_node, dst_node]) elif dims == 2: callsite_stream.write( - '%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, 
%sMemcpy%sTo%s, %s);\n' % + 'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % (self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, sym2cpp(copy_shape[1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp( @@ -1068,8 +1071,8 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst syncstream = '__state->gpu_context->streams[%d]' % streamid callsite_stream.write( ''' - {backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream}); - {backend}StreamWaitEvent({dst_stream}, __state->gpu_context->events[{ev}], 0); + DACE_GPU_CHECK({backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream})); + DACE_GPU_CHECK({backend}StreamWaitEvent({dst_stream}, __state->gpu_context->events[{ev}], 0)); '''.format(ev=event, src_stream=cudastream, dst_stream=syncstream, backend=self.backend), sdfg, state_id, [src_node, dst_node]) @@ -1219,7 +1222,7 @@ def generate_state(self, sdfg, state, function_stream, callsite_stream): if isinstance(desc, dt.Array) and desc.start_offset != 0: ptrname = f'({ptrname} - {cpp.sym2cpp(desc.start_offset)})' - callsite_stream.write(f'{backend}Free({ptrname});\n', sd) + callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));\n', sd) self._emit_sync(callsite_stream) to_remove.add((sd, name)) for sd, name in to_remove: @@ -1249,8 +1252,8 @@ def generate_state(self, sdfg, state, function_stream, callsite_stream): for stream in streams_to_sync: callsite_stream.write( - '%sStreamSynchronize(__state->gpu_context->streams[%d]);' % (self.backend, stream), sdfg, - sdfg.node_id(state)) + 'DACE_GPU_CHECK(%sStreamSynchronize(__state->gpu_context->streams[%d]));' % + (self.backend, stream), sdfg, sdfg.node_id(state)) # After synchronizing streams, generate state footer normally callsite_stream.write('\n') @@ -1518,8 +1521,8 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st # Write kernel prototype self._localcode.write( - '__global__ void %s %s(%s) {\n' % (launch_bounds, kernel_name, ', '.join(kernel_args_typed + extra_kernel_args_typed)), sdfg, - state_id, node) + '__global__ void %s %s(%s) {\n' % + (launch_bounds, kernel_name, ', '.join(kernel_args_typed + extra_kernel_args_typed)), sdfg, state_id, node) # Write constant expressions in GPU code self._frame.generate_constants(sdfg, self._localcode) @@ -1550,7 +1553,7 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st if is_persistent: self._localcode.write(''' int dace_number_SMs; -{backend}DeviceGetAttribute(&dace_number_SMs, {backend}DevAttrMultiProcessorCount, 0); +DACE_GPU_CHECK({backend}DeviceGetAttribute(&dace_number_SMs, {backend}DevAttrMultiProcessorCount, 0)); int dace_number_blocks = ((int) ceil({fraction} * dace_number_SMs)) * {occupancy}; '''.format(fraction=Config.get('compiler', 'cuda', 'persistent_map_SM_fraction'), occupancy=Config.get('compiler', 'cuda', 'persistent_map_occupancy'), @@ -1600,18 +1603,17 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st self._localcode.write( ''' void *{kname}_args[] = {{ {kargs} }}; -{backend}LaunchKernel((void*){kname}, dim3({gdims}), dim3({bdims}), {kname}_args, {dynsmem}, {stream});'''.format( - kname=kernel_name, - kargs=', '.join(['(void *)&' + arg for arg in prototype_kernel_args] + extra_kernel_args), - gdims=gdims, - bdims=bdims, - 
dynsmem=_topy(dynsmem_size), - stream=cudastream, - backend=self.backend), sdfg, state_id, scope_entry) +gpuError_t __err = {backend}LaunchKernel((void*){kname}, dim3({gdims}), dim3({bdims}), {kname}_args, {dynsmem}, {stream});''' + .format(kname=kernel_name, + kargs=', '.join(['(void *)&' + arg for arg in prototype_kernel_args] + extra_kernel_args), + gdims=gdims, + bdims=bdims, + dynsmem=_topy(dynsmem_size), + stream=cudastream, + backend=self.backend), sdfg, state_id, scope_entry) # Check kernel launch for errors - if Config.get_bool('compiler', 'cuda', 'syncdebug'): - self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK("{kernel_name}", {gdims}, {bdims});') + self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});') self._emit_sync(self._localcode) @@ -1633,7 +1635,7 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st if hasattr(e, '_cuda_event'): ev = e._cuda_event callsite_stream.write( - 'DACE_CUDA_CHECK({backend}EventSynchronize(__state->gpu_context->events[{ev}]));'.format( + 'DACE_GPU_CHECK({backend}EventSynchronize(__state->gpu_context->events[{ev}]));'.format( ev=ev, backend=self.backend), sdfg, state_id, [e.src, e.dst]) callsite_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), diff --git a/dace/runtime/include/dace/cuda/cudacommon.cuh b/dace/runtime/include/dace/cuda/cudacommon.cuh index 4ae6068658..6a2c9d6da7 100644 --- a/dace/runtime/include/dace/cuda/cudacommon.cuh +++ b/dace/runtime/include/dace/cuda/cudacommon.cuh @@ -5,198 +5,197 @@ #if defined(__HIPCC__) || defined(WITH_HIP) typedef hipStream_t gpuStream_t; typedef hipEvent_t gpuEvent_t; +typedef hipError_t gpuError_t; #define gpuGetLastError hipGetLastError #define gpuGetErrorString hipGetErrorString - -#define DACE_CUDA_CHECK(err) do { \ - hipError_t errr = (err); \ - if(errr != (hipError_t)0) \ - { \ - printf("HIP ERROR at %s:%d, code: %d\n", __FILE__, __LINE__, errr); \ - } \ -} while(0) - #else - typedef cudaStream_t gpuStream_t; typedef cudaEvent_t gpuEvent_t; +typedef cudaError_t gpuError_t; #define gpuGetLastError cudaGetLastError #define gpuGetErrorString cudaGetErrorString +#endif -#define DACE_CUDA_CHECK(err) do { \ - cudaError_t errr = (err); \ - if(errr != (cudaError_t)0) \ - { \ - printf("CUDA ERROR at %s:%d, code: %d\n", __FILE__, __LINE__, errr); \ - throw; \ - } \ -} while(0) +#define DACE_GPU_CHECK(err) \ + do { \ + gpuError_t errr = (err); \ + if (errr != (gpuError_t)0) { \ + printf("GPU runtime error at %s:%d: %s (%d)\n", __FILE__, __LINE__, \ + errr, gpuGetErrorString(err)); \ + throw; \ + } \ + } while (0) #endif -#define DACE_KERNEL_LAUNCH_CHECK(kernel_name, gdimx, gdimy, gdimz, bdimx, bdimy, bdimz) do { \ - auto err = gpuGetLastError(); \ - if (err != decltype(err)(0)) { \ - printf("ERROR launching kernel %s: %s (%d). Grid dimensions: " \ - "(%d, %d, %d); Block dimensions: (%d, %d, %d).\n", kernel_name, \ - gpuGetErrorString(err), (int)err, gdimx, gdimy, gdimz, bdimx, bdimy, bdimz); \ - throw; \ - } \ -} while(0) +#define DACE_KERNEL_LAUNCH_CHECK(err, kernel_name, gdimx, gdimy, gdimz, bdimx, \ + bdimy, bdimz) \ + do { \ + if (err != decltype(err)(0)) { \ + printf( \ + "ERROR launching kernel %s: %s (%d). 
Grid dimensions: " \ + "(%d, %d, %d); Block dimensions: (%d, %d, %d).\n", \ + kernel_name, gpuGetErrorString(err), (int)err, gdimx, gdimy, gdimz, \ + bdimx, bdimy, bdimz); \ + throw; \ + } \ + } while (0) namespace dace { namespace cuda { struct Context { - int num_streams; - int num_events; - gpuStream_t *streams; - gpuEvent_t *events; - Context(int nstreams, int nevents) : num_streams(nstreams), - num_events(nevents) { - streams = new gpuStream_t[nstreams]; - events = new gpuEvent_t[nevents]; - } - ~Context() { - delete[] streams; - delete[] events; - } + int num_streams; + int num_events; + gpuStream_t *streams; + gpuEvent_t *events; + Context(int nstreams, int nevents) + : num_streams(nstreams), num_events(nevents) { + streams = new gpuStream_t[nstreams]; + events = new gpuEvent_t[nevents]; + } + ~Context() { + delete[] streams; + delete[] events; + } }; - + } // namespace cuda } // namespace dace #ifdef __CUDACC__ DACE_DFI dace::vec operator+(float f, dace::vec v) { - dace::vec result; - result.x = v.x + f; - result.y = v.y + f; - result.z = v.z + f; - result.w = v.w + f; - return result; + dace::vec result; + result.x = v.x + f; + result.y = v.y + f; + result.z = v.z + f; + result.w = v.w + f; + return result; } DACE_DFI dace::vec operator/(float f, dace::vec v) { - dace::vec result; - result.x = f / v.x; - result.y = f / v.y; - result.z = f / v.z; - result.w = f / v.w; - return result; + dace::vec result; + result.x = f / v.x; + result.y = f / v.y; + result.z = f / v.z; + result.w = f / v.w; + return result; } DACE_DFI dace::vec operator/(dace::vec v, float f) { - dace::vec result; - result.x = v.x / f; - result.y = v.y / f; - result.z = v.z / f; - result.w = v.w / f; - return result; + dace::vec result; + result.x = v.x / f; + result.y = v.y / f; + result.z = v.z / f; + result.w = v.w / f; + return result; } DACE_DFI dace::vec operator-(dace::vec v) { - dace::vec result; - result.x = - v.x; - result.y = - v.y; - result.z = - v.z; - result.w = - v.w; - return result; + dace::vec result; + result.x = -v.x; + result.y = -v.y; + result.z = -v.z; + result.w = -v.w; + return result; } DACE_DFI dace::vec operator-(float f, dace::vec v) { - dace::vec result; - result.x = f - v.x; - result.y = f - v.y; - result.z = f - v.z; - result.w = f - v.w; - return result; + dace::vec result; + result.x = f - v.x; + result.y = f - v.y; + result.z = f - v.z; + result.w = f - v.w; + return result; } -DACE_DFI dace::vec operator-(dace::vec u, dace::vec v) { - dace::vec result; - result.x = u.x - v.x; - result.y = u.y - v.y; - result.z = u.z - v.z; - result.w = u.w - v.w; - return result; +DACE_DFI dace::vec operator-(dace::vec u, + dace::vec v) { + dace::vec result; + result.x = u.x - v.x; + result.y = u.y - v.y; + result.z = u.z - v.z; + result.w = u.w - v.w; + return result; } DACE_DFI dace::vec operator*(float f, dace::vec v) { - dace::vec result; - result.x = v.x * f; - result.y = v.y * f; - result.z = v.z * f; - result.w = v.w * f; - return result; + dace::vec result; + result.x = v.x * f; + result.y = v.y * f; + result.z = v.z * f; + result.w = v.w * f; + return result; } DACE_DFI dace::vec operator*(dace::vec v, float f) { - dace::vec result; - result.x = v.x * f; - result.y = v.y * f; - result.z = v.z * f; - result.w = v.w * f; - return result; + dace::vec result; + result.x = v.x * f; + result.y = v.y * f; + result.z = v.z * f; + result.w = v.w * f; + return result; } - -namespace dace { namespace math { +namespace dace { +namespace math { DACE_DFI dace::vec exp(dace::vec v) { - dace::vec 
result; - result.x = exp(v.x); - result.y = exp(v.y); - return result; + dace::vec result; + result.x = exp(v.x); + result.y = exp(v.y); + return result; } DACE_DFI dace::vec exp(dace::vec v) { - dace::vec result; - result.x = exp(v.x); - result.y = exp(v.y); - result.z = exp(v.z); - result.w = exp(v.w); - return result; + dace::vec result; + result.x = exp(v.x); + result.y = exp(v.y); + result.z = exp(v.z); + result.w = exp(v.w); + return result; } DACE_DFI dace::vec log(dace::vec v) { - dace::vec result; - result.x = log(v.x); - result.y = log(v.y); - result.z = log(v.z); - result.w = log(v.w); - return result; + dace::vec result; + result.x = log(v.x); + result.y = log(v.y); + result.z = log(v.z); + result.w = log(v.w); + return result; } DACE_DFI dace::vec log10(dace::vec v) { - dace::vec result; - result.x = log10(v.x); - result.y = log10(v.y); - result.z = log10(v.z); - result.w = log10(v.w); - return result; + dace::vec result; + result.x = log10(v.x); + result.y = log10(v.y); + result.z = log10(v.z); + result.w = log10(v.w); + return result; } DACE_DFI dace::vec tanh(dace::vec v) { - dace::vec result; - result.x = tanh(v.x); - result.y = tanh(v.y); - result.z = tanh(v.z); - result.w = tanh(v.w); - return result; + dace::vec result; + result.x = tanh(v.x); + result.y = tanh(v.y); + result.z = tanh(v.z); + result.w = tanh(v.w); + return result; } -DACE_DFI dace::vec heaviside(const dace::vec& a) { - dace::vec result; - result.x = (a.x > 0) ? 1.0f : 0.0f; - result.y = (a.y > 0) ? 1.0f : 0.0f; - result.z = (a.z > 0) ? 1.0f : 0.0f; - result.w = (a.w > 0) ? 1.0f : 0.0f; - return result; +DACE_DFI dace::vec heaviside(const dace::vec &a) { + dace::vec result; + result.x = (a.x > 0) ? 1.0f : 0.0f; + result.y = (a.y > 0) ? 1.0f : 0.0f; + result.z = (a.z > 0) ? 1.0f : 0.0f; + result.w = (a.w > 0) ? 
1.0f : 0.0f; + return result; } -} } // namespace dace::math +} // namespace math +} // namespace dace using dace::math::exp; +using dace::math::heaviside; using dace::math::log; using dace::math::log10; using dace::math::tanh; -using dace::math::heaviside; #endif #endif // __DACE_CUDACOMMON_CUH diff --git a/dace/runtime/include/dace/cuda/stream.cuh b/dace/runtime/include/dace/cuda/stream.cuh index 61f4c99430..e22ba72d2e 100644 --- a/dace/runtime/include/dace/cuda/stream.cuh +++ b/dace/runtime/include/dace/cuda/stream.cuh @@ -213,7 +213,7 @@ namespace dace { void ResetGPUStream(GPUStream& stream) { void *args_reset[1] = { &stream }; - DACE_CUDA_CHECK(gpuLaunchKernel((void *)&ResetGPUStream_kernel, + DACE_GPU_CHECK(gpuLaunchKernel((void *)&ResetGPUStream_kernel, dim3(1, 1, 1), dim3(1, 1, 1), args_reset, 0, (gpuStream_t)0)); } @@ -229,7 +229,7 @@ namespace dace { void PushToGPUStream(GPUStream& stream, const T& item) { void *args_push[2] = { &stream, &item }; - DACE_CUDA_CHECK(gpuLaunchKernel((void *)&PushToGPUStream_kernel, + DACE_GPU_CHECK(gpuLaunchKernel((void *)&PushToGPUStream_kernel, dim3(1, 1, 1), dim3(1, 1, 1), args_push, 0, (gpuStream_t)0)); } @@ -242,12 +242,12 @@ namespace dace { GPUStream AllocGPUArrayStreamView(T *ptr, uint32_t capacity) { uint32_t *gStart, *gEnd, *gPending; - DACE_CUDA_CHECK(gpuMalloc(&gStart, sizeof(uint32_t))); - DACE_CUDA_CHECK(gpuMalloc(&gEnd, sizeof(uint32_t))); - DACE_CUDA_CHECK(gpuMalloc(&gPending, sizeof(uint32_t))); - DACE_CUDA_CHECK(gpuMemset(gStart, 0, sizeof(uint32_t))); - DACE_CUDA_CHECK(gpuMemset(gEnd, 0, sizeof(uint32_t))); - DACE_CUDA_CHECK(gpuMemset(gPending, 0, sizeof(uint32_t))); + DACE_GPU_CHECK(gpuMalloc(&gStart, sizeof(uint32_t))); + DACE_GPU_CHECK(gpuMalloc(&gEnd, sizeof(uint32_t))); + DACE_GPU_CHECK(gpuMalloc(&gPending, sizeof(uint32_t))); + DACE_GPU_CHECK(gpuMemset(gStart, 0, sizeof(uint32_t))); + DACE_GPU_CHECK(gpuMemset(gEnd, 0, sizeof(uint32_t))); + DACE_GPU_CHECK(gpuMemset(gPending, 0, sizeof(uint32_t))); return GPUStream(ptr, capacity, gStart, gEnd, gPending); } @@ -255,23 +255,23 @@ namespace dace { GPUStream AllocGPUStream(uint32_t capacity) { T *gData; - DACE_CUDA_CHECK(gpuMalloc(&gData, capacity * sizeof(T))); + DACE_GPU_CHECK(gpuMalloc(&gData, capacity * sizeof(T))); return AllocGPUArrayStreamView(gData, capacity); } template void FreeGPUArrayStreamView(GPUStream& stream) { - DACE_CUDA_CHECK(gpuFree(stream.m_start)); - DACE_CUDA_CHECK(gpuFree(stream.m_end)); - DACE_CUDA_CHECK(gpuFree(stream.m_pending)); + DACE_GPU_CHECK(gpuFree(stream.m_start)); + DACE_GPU_CHECK(gpuFree(stream.m_end)); + DACE_GPU_CHECK(gpuFree(stream.m_pending)); } template void FreeGPUStream(GPUStream& stream) { FreeGPUArrayStreamView(stream); - DACE_CUDA_CHECK(gpuFree(stream.m_data)); + DACE_GPU_CHECK(gpuFree(stream.m_data)); } } // namespace dace diff --git a/dace/runtime/include/dace/stream.h b/dace/runtime/include/dace/stream.h index dd4400784b..34d3552879 100644 --- a/dace/runtime/include/dace/stream.h +++ b/dace/runtime/include/dace/stream.h @@ -39,16 +39,16 @@ namespace dace { template void FreeGPUArrayStreamView(GPUStream& stream) { - DACE_CUDA_CHECK(gpuFree(stream.m_start)); - DACE_CUDA_CHECK(gpuFree(stream.m_end)); - DACE_CUDA_CHECK(gpuFree(stream.m_pending)); + DACE_GPU_CHECK(gpuFree(stream.m_start)); + DACE_GPU_CHECK(gpuFree(stream.m_end)); + DACE_GPU_CHECK(gpuFree(stream.m_pending)); } template void FreeGPUStream(GPUStream& stream) { FreeGPUArrayStreamView(stream); - DACE_CUDA_CHECK(gpuFree(stream.m_data)); + 
DACE_GPU_CHECK(gpuFree(stream.m_data)); } } // namespace dace #endif diff --git a/tests/parse_state_struct_test.py b/tests/parse_state_struct_test.py index 89bb2550f8..59c0e9279c 100644 --- a/tests/parse_state_struct_test.py +++ b/tests/parse_state_struct_test.py @@ -13,10 +13,12 @@ from dace import dtypes from dace.codegen import codeobject, targets, compiler, compiled_sdfg + @pytest.fixture def cuda_helper(): return _cuda_helper() + def _cuda_helper(): helper_code = """ @@ -25,8 +27,8 @@ def _cuda_helper(): extern "C" { int host_to_gpu(void* gpu, void* host, size_t size) { auto result = cudaMemcpy(gpu, host, size, cudaMemcpyHostToDevice); - DACE_CUDA_CHECK(cudaGetLastError()); - DACE_CUDA_CHECK(cudaDeviceSynchronize()); + DACE_GPU_CHECK(cudaGetLastError()); + DACE_GPU_CHECK(cudaDeviceSynchronize()); return result; } } @@ -92,5 +94,6 @@ def persistent_transient(A: dace.float32[3, 3]): assert np.allclose(result, A @ B) -if __name__ =='__main__': + +if __name__ == '__main__': test_preallocate_transients_in_state_struct(_cuda_helper()) From afac4d3f06e8551af660c933004a3d4a6f621b96 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 07:05:15 -0700 Subject: [PATCH 129/392] Fix C++ issue --- dace/runtime/include/dace/cuda/cudacommon.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/dace/runtime/include/dace/cuda/cudacommon.cuh b/dace/runtime/include/dace/cuda/cudacommon.cuh index 6a2c9d6da7..61a8c86d6a 100644 --- a/dace/runtime/include/dace/cuda/cudacommon.cuh +++ b/dace/runtime/include/dace/cuda/cudacommon.cuh @@ -25,7 +25,6 @@ typedef cudaError_t gpuError_t; throw; \ } \ } while (0) -#endif #define DACE_KERNEL_LAUNCH_CHECK(err, kernel_name, gdimx, gdimy, gdimz, bdimx, \ bdimy, bdimz) \ From b8adc4faace742640a0471d458973069668e4d90 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 27 Jun 2023 17:05:21 +0200 Subject: [PATCH 130/392] In `_add_write_access`, if the name is SDFG-local, return the input rng instead of None. --- dace/frontend/python/newast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index ecf09417d8..541b17af06 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -3067,11 +3067,11 @@ def _add_write_access(self, arr_type: data.Data = None): if name in self.sdfg.arrays: - return (name, None) + return (name, rng) if (name, rng, 'w') in self.accesses: return self.accesses[(name, rng, 'w')] elif name in self.variables: - return (self.variables[name], None) + return (self.variables[name], rng) elif (name, rng, 'r') in self.accesses or name in self.scope_vars: return self._add_access(name, rng, 'w', target, new_name, arr_type) else: From 3e5eff41a60f3ecdac9a9ed742790e7c18251eb9 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 27 Jun 2023 17:05:44 +0200 Subject: [PATCH 131/392] Added tests. 
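
The new tests exercise writing to a transient that is defined inside a map
scope, which relies on the `_add_write_access` change above returning the
subscript range for SDFG-local names. A minimal sketch of the pattern under
test follows; the program name, shapes, and sizes are illustrative only and
are not taken from the test suite:

    # Illustrative sketch of the tested pattern (not part of the added tests).
    import dace
    import numpy as np

    N, M = 4, 3

    @dace.program
    def scope_local_transient(inp: dace.float64[N, M + 1], out: dace.float64[N, M]):
        for i in dace.map[0:N]:
            tmp = np.zeros([M + 1])        # transient local to the map scope
            for j in range(M + 1):
                tmp[j] = inp[i, j]         # write access into the scope-local transient
            for j in range(M):
                out[i, j] = tmp[j] + tmp[j + 1]

Calling `scope_local_transient(inp_arr, out_arr)` with matching NumPy arrays
should parse and validate, matching the pure-Python reference
(`scope_local_transient.f`), in the same way the tests below compare against
their `.f` versions.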
--- .../nested_name_accesses_test.py | 85 +++++++++++++++++-- 1 file changed, 76 insertions(+), 9 deletions(-) diff --git a/tests/python_frontend/nested_name_accesses_test.py b/tests/python_frontend/nested_name_accesses_test.py index ffc2b68e40..14ded42960 100644 --- a/tests/python_frontend/nested_name_accesses_test.py +++ b/tests/python_frontend/nested_name_accesses_test.py @@ -26,7 +26,7 @@ def test_nested_name_accesses(): diff_norm = np.linalg.norm(dc_out - np_out) ref_norm = np.linalg.norm(np_out) rel_err = diff_norm / ref_norm - assert (rel_err < 1e-7) + assert rel_err < 1e-7 def test_nested_offset_access(): @@ -42,7 +42,7 @@ def nested_offset_access(inp: dc.float64[6, 5, 5]): inp = np.reshape(np.arange(6 * 5 * 5, dtype=np.float64), (6, 5, 5)).copy() out = nested_offset_access(inp) ref = nested_offset_access.f(inp) - assert (np.allclose(out, ref)) + assert np.allclose(out, ref) def test_nested_offset_access_dappy(): @@ -62,7 +62,7 @@ def nested_offset_access(inp: dc.float64[6, 5, 5]): inp = np.reshape(np.arange(6 * 5 * 5, dtype=np.float64), (6, 5, 5)).copy() out = nested_offset_access(inp) ref = nested_offset_access.f(inp) - assert (np.allclose(out, ref)) + assert np.allclose(out, ref) def test_nested_multi_offset_access(): @@ -79,7 +79,7 @@ def nested_offset_access(inp: dc.float64[6, 5, 10]): inp = np.reshape(np.arange(6 * 5 * 10, dtype=np.float64), (6, 5, 10)).copy() out = nested_offset_access(inp) ref = nested_offset_access.f(inp) - assert (np.allclose(out, ref)) + assert np.allclose(out, ref) def test_nested_multi_offset_access_dappy(): @@ -100,7 +100,7 @@ def nested_offset_access(inp: dc.float64[6, 5, 10]): inp = np.reshape(np.arange(6 * 5 * 10, dtype=np.float64), (6, 5, 10)).copy() out = nested_offset_access(inp) ref = nested_offset_access.f(inp) - assert (np.allclose(out, ref)) + assert np.allclose(out, ref) def test_nested_dec_offset_access(): @@ -116,7 +116,7 @@ def nested_offset_access(inp: dc.float64[6, 5, 5]): inp = np.reshape(np.arange(6 * 5 * 5, dtype=np.float64), (6, 5, 5)).copy() out = nested_offset_access(inp) ref = nested_offset_access.f(inp) - assert (np.allclose(out, ref)) + assert np.allclose(out, ref) def test_nested_dec_offset_access_dappy(): @@ -136,7 +136,7 @@ def nested_offset_access(inp: dc.float64[6, 5, 5]): inp = np.reshape(np.arange(6 * 5 * 5, dtype=np.float64), (6, 5, 5)).copy() out = nested_offset_access(inp) ref = nested_offset_access.f(inp) - assert (np.allclose(out, ref)) + assert np.allclose(out, ref) def test_nested_offset_access_nested_dependency(): @@ -157,7 +157,7 @@ def nested_offset_access_nested_dep(inp: dc.float64[6, 5, 5]): out = nested_offset_access_nested_dep(inp) os.environ['DACE_testing_serialization'] = last_value ref = nested_offset_access_nested_dep.f(inp) - assert (np.allclose(out, ref)) + assert np.allclose(out, ref) def test_nested_offset_access_nested_dependency_dappy(): @@ -178,9 +178,74 @@ def nested_offset_access_nested_dep(inp: dc.float64[6, 5, 10]): inp = np.reshape(np.arange(6 * 5 * 10, dtype=np.float64), (6, 5, 10)).copy() out = nested_offset_access_nested_dep(inp) ref = nested_offset_access_nested_dep.f(inp) - assert (np.allclose(out, ref)) + assert np.allclose(out, ref) +def test_access_to_nested_transient(): + + KLEV = 3 + KLON = 4 + NBLOCKS = 5 + + @dc.program + def small_wip(inp: dc.float64[KLEV+1, KLON, NBLOCKS], out: dc.float64[KLEV, KLON, NBLOCKS]): + for jn in dc.map[0:NBLOCKS]: + tmp = np.zeros([KLEV+1, KLON]) + for jl in range(KLON): + for jk in range(KLEV): + tmp[jk, jl] = inp[jk, jl, jn] + inp[jk+1, jl, jn] + 
+ for jl in range(KLON): + for jk in range(KLEV): + out[jk, jl, jn] = tmp[jk, jl] + tmp[jk+1, jl] + + rng = np.random.default_rng(42) + inp = rng.random((KLEV+1, KLON, NBLOCKS)) + ref = np.zeros((KLEV, KLON, NBLOCKS)) + val = np.zeros((KLEV, KLON, NBLOCKS)) + + small_wip(inp, val) + small_wip.f(inp, ref) + + assert np.allclose(val, ref) + + +def test_access_to_nested_transient_dappy(): + + KLEV = 3 + KLON = 4 + NBLOCKS = 5 + + @dc.program + def small_wip_dappy(inp: dc.float64[KLEV+1, KLON, NBLOCKS], out: dc.float64[KLEV, KLON, NBLOCKS]): + for jn in dc.map[0:NBLOCKS]: + tmp = np.zeros([KLEV+1, KLON]) + for jl in range(KLON): + for jk in range(KLEV): + with dc.tasklet(): + in1 << inp[jk, jl, jn] + in2 << inp[jk+1, jl, jn] + out1 >> tmp[jk, jl] + out1 = in1 + in2 + + for jl in range(KLON): + for jk in range(KLEV): + with dc.tasklet(): + in1 << tmp[jk, jl] + in2 << tmp[jk+1, jl] + out1 >> out[jk, jl, jn] + out1 = in1 + in2 + + rng = np.random.default_rng(42) + inp = rng.random((KLEV+1, KLON, NBLOCKS)) + ref = np.zeros((KLEV, KLON, NBLOCKS)) + val = np.zeros((KLEV, KLON, NBLOCKS)) + + small_wip_dappy(inp, val) + small_wip_dappy.f(inp, ref) + + assert np.allclose(val, ref) + if __name__ == "__main__": test_nested_name_accesses() @@ -192,3 +257,5 @@ def nested_offset_access_nested_dep(inp: dc.float64[6, 5, 10]): test_nested_dec_offset_access_dappy() test_nested_offset_access_nested_dependency() test_nested_offset_access_nested_dependency_dappy() + test_access_to_nested_transient() + test_access_to_nested_transient_dappy() From 8fc8412e3b92b36504b02c67961c383dcfb19fd0 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 09:58:07 -0700 Subject: [PATCH 132/392] Check for and warn on empty GPU grids --- dace/codegen/targets/cuda.py | 21 +++++++++++++++++++ dace/runtime/include/dace/cuda/cudacommon.cuh | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 1c2471807a..b4a910f4f3 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1600,9 +1600,28 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st gdims = 'dace_number_blocks, 1, 1' if is_persistent else ', '.join(_topy(grid_dims)) bdims = ', '.join(_topy(block_dims)) + + # Prepare an empty-grid check for runtime grids + dimcheck = 'false' + if is_persistent: + dimcheck = 'dace_number_blocks > 0' + else: + for gdim in grid_dims: + if symbolic.issymbolic(gdim) and (gdim > 0) != True: + dimcheck += f' || ({_topy(gdim)}) == 0' + + emptygrid_warning = '' + if Config.get('debugprint') == 'verbose' or Config.get_bool('compiler', 'cuda', 'syncdebug'): + emptygrid_warning = (f'printf("Warning: Skipping launching kernel \\"{kernel_name}\\" ' + 'due to an empty grid.\\n");') + self._localcode.write( ''' void *{kname}_args[] = {{ {kargs} }}; +if ({dimcheck}) {{ + {emptygrid_warning} + return; +}} gpuError_t __err = {backend}LaunchKernel((void*){kname}, dim3({gdims}), dim3({bdims}), {kname}_args, {dynsmem}, {stream});''' .format(kname=kernel_name, kargs=', '.join(['(void *)&' + arg for arg in prototype_kernel_args] + extra_kernel_args), @@ -1610,6 +1629,8 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st bdims=bdims, dynsmem=_topy(dynsmem_size), stream=cudastream, + dimcheck=dimcheck, + emptygrid_warning=emptygrid_warning, backend=self.backend), sdfg, state_id, scope_entry) # Check kernel launch for errors diff --git a/dace/runtime/include/dace/cuda/cudacommon.cuh 
b/dace/runtime/include/dace/cuda/cudacommon.cuh index 61a8c86d6a..6ad657b0d8 100644 --- a/dace/runtime/include/dace/cuda/cudacommon.cuh +++ b/dace/runtime/include/dace/cuda/cudacommon.cuh @@ -21,7 +21,7 @@ typedef cudaError_t gpuError_t; gpuError_t errr = (err); \ if (errr != (gpuError_t)0) { \ printf("GPU runtime error at %s:%d: %s (%d)\n", __FILE__, __LINE__, \ - errr, gpuGetErrorString(err)); \ + gpuGetErrorString(err), errr); \ throw; \ } \ } while (0) From 5448343e2e6b3aa9b2e087be5b51ffcc4a2c7fc4 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 10:00:14 -0700 Subject: [PATCH 133/392] Prettier code generation --- dace/codegen/targets/cuda.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index b4a910f4f3..2f92bd0e17 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1602,13 +1602,16 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st bdims = ', '.join(_topy(block_dims)) # Prepare an empty-grid check for runtime grids - dimcheck = 'false' + dimcheck = '' if is_persistent: dimcheck = 'dace_number_blocks > 0' else: for gdim in grid_dims: if symbolic.issymbolic(gdim) and (gdim > 0) != True: - dimcheck += f' || ({_topy(gdim)}) == 0' + if not dimcheck: + dimcheck = f'({_topy(gdim)}) == 0' + else: + dimcheck += f' || ({_topy(gdim)}) == 0' emptygrid_warning = '' if Config.get('debugprint') == 'verbose' or Config.get_bool('compiler', 'cuda', 'syncdebug'): From bbd686bcb99f5d4fc89f6ccdcd69e4bda3db6480 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 10:11:32 -0700 Subject: [PATCH 134/392] Propagate GPU runtime error codes instead of throwing exceptions --- dace/codegen/targets/cuda.py | 5 +- dace/runtime/include/dace/cuda/cudacommon.cuh | 9 +- dace/runtime/include/dace/cuda/stream.cuh | 478 +++++++++--------- dace/runtime/include/dace/stream.h | 8 +- 4 files changed, 243 insertions(+), 257 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 2f92bd0e17..42335690a3 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -392,8 +392,9 @@ def get_generated_codeobjects(self): {exitcode} // Synchronize and check for CUDA errors - int __err = 0; - __err = static_cast({backend}DeviceSynchronize()); + int __err = static_cast(__state->gpu_context->lasterror); + if (__err == 0) + __err = static_cast({backend}DeviceSynchronize()); // Destroy {backend} streams and events for(int i = 0; i < {nstreams}; ++i) {{ diff --git a/dace/runtime/include/dace/cuda/cudacommon.cuh b/dace/runtime/include/dace/cuda/cudacommon.cuh index 6ad657b0d8..84129c09eb 100644 --- a/dace/runtime/include/dace/cuda/cudacommon.cuh +++ b/dace/runtime/include/dace/cuda/cudacommon.cuh @@ -22,20 +22,20 @@ typedef cudaError_t gpuError_t; if (errr != (gpuError_t)0) { \ printf("GPU runtime error at %s:%d: %s (%d)\n", __FILE__, __LINE__, \ gpuGetErrorString(err), errr); \ - throw; \ + __state->gpu_context->lasterror = errr; \ } \ } while (0) #define DACE_KERNEL_LAUNCH_CHECK(err, kernel_name, gdimx, gdimy, gdimz, bdimx, \ bdimy, bdimz) \ do { \ - if (err != decltype(err)(0)) { \ + if (err != (gpuError_t)0) { \ printf( \ "ERROR launching kernel %s: %s (%d). 
Grid dimensions: " \ "(%d, %d, %d); Block dimensions: (%d, %d, %d).\n", \ kernel_name, gpuGetErrorString(err), (int)err, gdimx, gdimy, gdimz, \ bdimx, bdimy, bdimz); \ - throw; \ + __state->gpu_context->lasterror = err; \ } \ } while (0) @@ -46,8 +46,9 @@ struct Context { int num_events; gpuStream_t *streams; gpuEvent_t *events; + gpuError_t lasterror; Context(int nstreams, int nevents) - : num_streams(nstreams), num_events(nevents) { + : num_streams(nstreams), num_events(nevents), lasterror((gpuError_t)0) { streams = new gpuStream_t[nstreams]; events = new gpuEvent_t[nevents]; } diff --git a/dace/runtime/include/dace/cuda/stream.cuh b/dace/runtime/include/dace/cuda/stream.cuh index e22ba72d2e..2790f64f4f 100644 --- a/dace/runtime/include/dace/cuda/stream.cuh +++ b/dace/runtime/include/dace/cuda/stream.cuh @@ -3,22 +3,22 @@ #define __DACE_STREAM_CUH #include -#include #include #include #include -#include // Used for the in-memory ctor call in the move assignment operator below +#include // Used for the in-memory ctor call in the move assignment operator below +#include #ifdef __HIPCC__ -#include #include +#include #define gpuLaunchKernel hipLaunchKernel #define gpuMalloc hipMalloc #define gpuMemset hipMemset #define gpuFree hipFree #else -#include #include +#include #define gpuLaunchKernel cudaLaunchKernel #define gpuMalloc cudaMalloc #define gpuMemset cudaMemset @@ -28,252 +28,236 @@ #include "cudacommon.cuh" namespace dace { - // Adapted from https://devblogs.nvidia.com/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ +// Adapted from +// https://devblogs.nvidia.com/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ #ifndef __HIPCC__ - __inline__ __device__ uint32_t atomicAggInc(uint32_t *ctr) { - auto g = cooperative_groups::coalesced_threads(); - uint32_t warp_res; - int rank = g.thread_rank(); - if (rank == 0) - warp_res = atomicAdd(ctr, g.size()); - return g.shfl(warp_res, 0) + rank; - } - - __inline__ __device__ uint32_t atomicAggDec(uint32_t *ctr) { - auto g = cooperative_groups::coalesced_threads(); - uint32_t warp_res; - int rank = g.thread_rank(); - if (rank == 0) - warp_res = atomicAdd(ctr, -g.size()); - return g.shfl(warp_res, 0) - g.size() + rank; - } +__inline__ __device__ uint32_t atomicAggInc(uint32_t *ctr) { + auto g = cooperative_groups::coalesced_threads(); + uint32_t warp_res; + int rank = g.thread_rank(); + if (rank == 0) warp_res = atomicAdd(ctr, g.size()); + return g.shfl(warp_res, 0) + rank; +} + +__inline__ __device__ uint32_t atomicAggDec(uint32_t *ctr) { + auto g = cooperative_groups::coalesced_threads(); + uint32_t warp_res; + int rank = g.thread_rank(); + if (rank == 0) warp_res = atomicAdd(ctr, -g.size()); + return g.shfl(warp_res, 0) - g.size() + rank; +} #else - // Version without cooperative groups to support HIP - __inline__ __device__ uint32_t atomicAggInc(uint32_t *ctr) { - unsigned int lane = threadIdx.x % warpSize; - unsigned long long int mask = __ballot(1); // 64-bit because AMD warp size is 64 - unsigned int thread_offset = __popcll(mask & ((1ULL << lane) - 1)); - unsigned int leader = __ffsll(mask) - 1; - unsigned int count = __popcll(mask); - uint32_t warp_res; - if (lane == leader) - warp_res = atomicAdd(ctr, count); - - warp_res = __shfl(warp_res, leader); - return warp_res + thread_offset; - } - - __inline__ __device__ uint32_t atomicAggDec(uint32_t *ctr) { - unsigned int lane = threadIdx.x % warpSize; - unsigned long long int mask = __ballot(1); // 64-bit because AMD warp size is 64 - unsigned int thread_offset = __popcll(mask 
& ((1ULL << lane) - 1)); - unsigned int leader = __ffsll(mask) - 1; - unsigned int count = __popcll(mask); - uint32_t warp_res; - if (lane == leader) - warp_res = atomicAdd(ctr, -count); - - warp_res = __shfl(warp_res, leader); - return warp_res - count + thread_offset; - } -#endif // __HIPCC__ - - // - // Queue classes (device): - // - - /* - * @brief A device-level MPMC Queue - */ - template - class GPUStream - { - public: - T* m_data; - uint32_t *m_start, *m_end, *m_pending; - uint32_t m_capacity_mask; - - __host__ GPUStream() : m_data(nullptr), m_start(nullptr), m_end(nullptr), - m_pending(nullptr), m_capacity_mask(0) {} - __host__ __device__ GPUStream(T* data, uint32_t capacity, - uint32_t *start, uint32_t *end, - uint32_t *pending) : - m_data(data), m_start(start), m_end(end), m_pending(pending), - m_capacity_mask(IS_POWEROFTWO ? (capacity - 1) : capacity) - { - if (IS_POWEROFTWO) { - assert((capacity - 1 & capacity) == 0); // Must be a power of two for handling circular overflow correctly - } - } - - __device__ __forceinline__ void reset() const - { - *m_start = 0; - *m_end = 0; - *m_pending = 0; - } - - __device__ __forceinline__ T pop() - { - uint32_t allocation = atomicAggInc(m_start); - return m_data[get_addr(allocation)]; - } - - __device__ __forceinline__ T *leader_pop(uint32_t count) { - uint32_t current = *m_start; - T *result = m_data + get_addr(current); - *m_start += count; - return result; - } - - - __device__ __forceinline__ uint32_t get_addr(const uint32_t& i) const { - if (IS_POWEROFTWO) - return i & m_capacity_mask; - else - return i % m_capacity_mask; - } - - __device__ __forceinline__ void push(const T& item) - { - uint32_t allocation = atomicAggInc(m_pending); - m_data[get_addr(allocation)] = item; - } - - /* - __device__ __forceinline__ void push(T *items, int count) - { - // Perform a warp-wide scan to get thread offsets - typedef cub::WarpScan WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage[4]; - int offset; - int warp_id = threadIdx.x / 32; - WarpScan(temp_storage[warp_id]).ExclusiveSum(count, offset); - - // Atomic-add the total count once per warp - uint32_t addr; - if (threadIdx.x & 31 == 31) // Last thread - addr = atomicAdd(m_pending, offset + count); - // Broadcast starting address - addr = cub::ShuffleIndex(addr, 31, 0xffffffff); - - // Copy data from each thread - for(int i = 0; i < count; ++i) - m_data[get_addr(addr + offset + i)] = items[i]; - } - */ - - __device__ __forceinline__ void prepend(const T& item) - { - uint32_t allocation = atomicAggDec(m_start) - 1; - m_data[get_addr(allocation)] = item; - } - - __device__ __forceinline__ T read(uint32_t i) const - { - return m_data[get_addr(*m_start + i)]; - } - - __device__ __forceinline__ uint32_t count() const - { - return *m_end - *m_start; - } - - // Returns the 'count' of pending items and commits - __device__ __forceinline__ uint32_t commit_pending() const - { - uint32_t count = *m_pending - *m_end; - - // Sync end with pending, this makes the pushed items visible to the consumer - *m_end = *m_pending; - return count; - } - - __device__ __forceinline__ uint32_t get_start() const - { - return *m_start; - } - - __device__ __forceinline__ uint32_t get_start_delta(uint32_t prev_start) const - { - return prev_start - *m_start; - } - }; - - //////////////////////////////////////////////////////////// - // Host controllers for GPU streams - - template - __global__ void ResetGPUStream_kernel(GPUStream stream) - { - stream.reset(); - } - - template - void ResetGPUStream(GPUStream& 
stream) - { - void *args_reset[1] = { &stream }; - DACE_GPU_CHECK(gpuLaunchKernel((void *)&ResetGPUStream_kernel, - dim3(1, 1, 1), dim3(1, 1, 1), - args_reset, 0, (gpuStream_t)0)); - } - - template - __global__ void PushToGPUStream_kernel(GPUStream stream, T item) - { - stream.push(item); - stream.commit_pending(); - } - - template - void PushToGPUStream(GPUStream& stream, const T& item) - { - void *args_push[2] = { &stream, &item }; - DACE_GPU_CHECK(gpuLaunchKernel((void *)&PushToGPUStream_kernel, - dim3(1, 1, 1), dim3(1, 1, 1), - args_push, 0, (gpuStream_t)0)); - } - - //////////////////////////////////////////////////////////// - // Host memory management for GPU streams - - - template - GPUStream AllocGPUArrayStreamView(T *ptr, uint32_t capacity) - { - uint32_t *gStart, *gEnd, *gPending; - DACE_GPU_CHECK(gpuMalloc(&gStart, sizeof(uint32_t))); - DACE_GPU_CHECK(gpuMalloc(&gEnd, sizeof(uint32_t))); - DACE_GPU_CHECK(gpuMalloc(&gPending, sizeof(uint32_t))); - DACE_GPU_CHECK(gpuMemset(gStart, 0, sizeof(uint32_t))); - DACE_GPU_CHECK(gpuMemset(gEnd, 0, sizeof(uint32_t))); - DACE_GPU_CHECK(gpuMemset(gPending, 0, sizeof(uint32_t))); - return GPUStream(ptr, capacity, gStart, gEnd, gPending); - } - - template - GPUStream AllocGPUStream(uint32_t capacity) - { - T *gData; - DACE_GPU_CHECK(gpuMalloc(&gData, capacity * sizeof(T))); - return AllocGPUArrayStreamView(gData, capacity); - } - - template - void FreeGPUArrayStreamView(GPUStream& stream) - { - DACE_GPU_CHECK(gpuFree(stream.m_start)); - DACE_GPU_CHECK(gpuFree(stream.m_end)); - DACE_GPU_CHECK(gpuFree(stream.m_pending)); - } - - template - void FreeGPUStream(GPUStream& stream) - { - FreeGPUArrayStreamView(stream); - DACE_GPU_CHECK(gpuFree(stream.m_data)); +// Version without cooperative groups to support HIP +__inline__ __device__ uint32_t atomicAggInc(uint32_t *ctr) { + unsigned int lane = threadIdx.x % warpSize; + unsigned long long int mask = + __ballot(1); // 64-bit because AMD warp size is 64 + unsigned int thread_offset = __popcll(mask & ((1ULL << lane) - 1)); + unsigned int leader = __ffsll(mask) - 1; + unsigned int count = __popcll(mask); + uint32_t warp_res; + if (lane == leader) warp_res = atomicAdd(ctr, count); + + warp_res = __shfl(warp_res, leader); + return warp_res + thread_offset; +} + +__inline__ __device__ uint32_t atomicAggDec(uint32_t *ctr) { + unsigned int lane = threadIdx.x % warpSize; + unsigned long long int mask = + __ballot(1); // 64-bit because AMD warp size is 64 + unsigned int thread_offset = __popcll(mask & ((1ULL << lane) - 1)); + unsigned int leader = __ffsll(mask) - 1; + unsigned int count = __popcll(mask); + uint32_t warp_res; + if (lane == leader) warp_res = atomicAdd(ctr, -count); + + warp_res = __shfl(warp_res, leader); + return warp_res - count + thread_offset; +} +#endif // __HIPCC__ + +// +// Queue classes (device): +// + +/* + * @brief A device-level MPMC Queue + */ +template +class GPUStream { + public: + T *m_data; + uint32_t *m_start, *m_end, *m_pending; + uint32_t m_capacity_mask; + + __host__ GPUStream() + : m_data(nullptr), + m_start(nullptr), + m_end(nullptr), + m_pending(nullptr), + m_capacity_mask(0) {} + __host__ __device__ GPUStream(T *data, uint32_t capacity, uint32_t *start, + uint32_t *end, uint32_t *pending) + : m_data(data), + m_start(start), + m_end(end), + m_pending(pending), + m_capacity_mask(IS_POWEROFTWO ? 
(capacity - 1) : capacity) { + if (IS_POWEROFTWO) { + assert((capacity - 1 & capacity) == + 0); // Must be a power of two for handling circular overflow + // correctly } + } + + __device__ __forceinline__ void reset() const { + *m_start = 0; + *m_end = 0; + *m_pending = 0; + } + + __device__ __forceinline__ T pop() { + uint32_t allocation = atomicAggInc(m_start); + return m_data[get_addr(allocation)]; + } + + __device__ __forceinline__ T *leader_pop(uint32_t count) { + uint32_t current = *m_start; + T *result = m_data + get_addr(current); + *m_start += count; + return result; + } + + __device__ __forceinline__ uint32_t get_addr(const uint32_t &i) const { + if (IS_POWEROFTWO) + return i & m_capacity_mask; + else + return i % m_capacity_mask; + } + + __device__ __forceinline__ void push(const T &item) { + uint32_t allocation = atomicAggInc(m_pending); + m_data[get_addr(allocation)] = item; + } + + /* + __device__ __forceinline__ void push(T *items, int count) + { + // Perform a warp-wide scan to get thread offsets + typedef cub::WarpScan WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[4]; + int offset; + int warp_id = threadIdx.x / 32; + WarpScan(temp_storage[warp_id]).ExclusiveSum(count, offset); + + // Atomic-add the total count once per warp + uint32_t addr; + if (threadIdx.x & 31 == 31) // Last thread + addr = atomicAdd(m_pending, offset + count); + // Broadcast starting address + addr = cub::ShuffleIndex(addr, 31, 0xffffffff); + + // Copy data from each thread + for(int i = 0; i < count; ++i) + m_data[get_addr(addr + offset + i)] = items[i]; + } + */ + + __device__ __forceinline__ void prepend(const T &item) { + uint32_t allocation = atomicAggDec(m_start) - 1; + m_data[get_addr(allocation)] = item; + } + + __device__ __forceinline__ T read(uint32_t i) const { + return m_data[get_addr(*m_start + i)]; + } + + __device__ __forceinline__ uint32_t count() const { + return *m_end - *m_start; + } + + // Returns the 'count' of pending items and commits + __device__ __forceinline__ uint32_t commit_pending() const { + uint32_t count = *m_pending - *m_end; + + // Sync end with pending, this makes the pushed items visible to the + // consumer + *m_end = *m_pending; + return count; + } + + __device__ __forceinline__ uint32_t get_start() const { return *m_start; } + + __device__ __forceinline__ uint32_t + get_start_delta(uint32_t prev_start) const { + return prev_start - *m_start; + } +}; + +//////////////////////////////////////////////////////////// +// Host controllers for GPU streams + +template +__global__ void ResetGPUStream_kernel(GPUStream stream) { + stream.reset(); +} + +template +void ResetGPUStream(GPUStream &stream) { + void *args_reset[1] = {&stream}; + gpuLaunchKernel((void *)&ResetGPUStream_kernel, dim3(1, 1, 1), + dim3(1, 1, 1), args_reset, 0, (gpuStream_t)0); +} + +template +__global__ void PushToGPUStream_kernel(GPUStream stream, T item) { + stream.push(item); + stream.commit_pending(); +} + +template +void PushToGPUStream(GPUStream &stream, const T &item) { + void *args_push[2] = {&stream, &item}; + gpuLaunchKernel((void *)&PushToGPUStream_kernel, dim3(1, 1, 1), + dim3(1, 1, 1), args_push, 0, (gpuStream_t)0); +} + +//////////////////////////////////////////////////////////// +// Host memory management for GPU streams + +template +GPUStream AllocGPUArrayStreamView(T *ptr, uint32_t capacity) { + uint32_t *gStart, *gEnd, *gPending; + gpuMalloc(&gStart, sizeof(uint32_t)); + gpuMalloc(&gEnd, sizeof(uint32_t)); + gpuMalloc(&gPending, sizeof(uint32_t)); + 
gpuMemset(gStart, 0, sizeof(uint32_t)); + gpuMemset(gEnd, 0, sizeof(uint32_t)); + gpuMemset(gPending, 0, sizeof(uint32_t)); + return GPUStream(ptr, capacity, gStart, gEnd, gPending); +} + +template +GPUStream AllocGPUStream(uint32_t capacity) { + T *gData; + gpuMalloc(&gData, capacity * sizeof(T)); + return AllocGPUArrayStreamView(gData, capacity); +} + +template +void FreeGPUArrayStreamView(GPUStream &stream) { + gpuFree(stream.m_start); + gpuFree(stream.m_end); + gpuFree(stream.m_pending); +} + +template +void FreeGPUStream(GPUStream &stream) { + FreeGPUArrayStreamView(stream); + gpuFree(stream.m_data); +} } // namespace dace -#endif // __DACE_STREAM_CUH +#endif // __DACE_STREAM_CUH diff --git a/dace/runtime/include/dace/stream.h b/dace/runtime/include/dace/stream.h index 34d3552879..255e16ec2b 100644 --- a/dace/runtime/include/dace/stream.h +++ b/dace/runtime/include/dace/stream.h @@ -39,16 +39,16 @@ namespace dace { template void FreeGPUArrayStreamView(GPUStream& stream) { - DACE_GPU_CHECK(gpuFree(stream.m_start)); - DACE_GPU_CHECK(gpuFree(stream.m_end)); - DACE_GPU_CHECK(gpuFree(stream.m_pending)); + gpuFree(stream.m_start); + gpuFree(stream.m_end); + gpuFree(stream.m_pending); } template void FreeGPUStream(GPUStream& stream) { FreeGPUArrayStreamView(stream); - DACE_GPU_CHECK(gpuFree(stream.m_data)); + gpuFree(stream.m_data); } } // namespace dace #endif From 84c668abb0990b9abf80d2c7a96b3f5226c7051c Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 20:26:24 -0700 Subject: [PATCH 135/392] Better handle casts and never-empty grids --- dace/codegen/targets/cuda.py | 22 ++++++++++--------- dace/runtime/include/dace/cuda/cudacommon.cuh | 8 ++++--- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 42335690a3..4974b6f6d9 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1614,18 +1614,22 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st else: dimcheck += f' || ({_topy(gdim)}) == 0' - emptygrid_warning = '' - if Config.get('debugprint') == 'verbose' or Config.get_bool('compiler', 'cuda', 'syncdebug'): - emptygrid_warning = (f'printf("Warning: Skipping launching kernel \\"{kernel_name}\\" ' - 'due to an empty grid.\\n");') + if dimcheck: + emptygrid_warning = '' + if Config.get('debugprint') == 'verbose' or Config.get_bool('compiler', 'cuda', 'syncdebug'): + emptygrid_warning = (f'printf("Warning: Skipping launching kernel \\"{kernel_name}\\" ' + 'due to an empty grid.\\n");') + + self._localcode.write( + f''' + if ({dimcheck}) {{ + {emptygrid_warning} + return; + }}''', sdfg, state_id, scope_entry) self._localcode.write( ''' void *{kname}_args[] = {{ {kargs} }}; -if ({dimcheck}) {{ - {emptygrid_warning} - return; -}} gpuError_t __err = {backend}LaunchKernel((void*){kname}, dim3({gdims}), dim3({bdims}), {kname}_args, {dynsmem}, {stream});''' .format(kname=kernel_name, kargs=', '.join(['(void *)&' + arg for arg in prototype_kernel_args] + extra_kernel_args), @@ -1633,8 +1637,6 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st bdims=bdims, dynsmem=_topy(dynsmem_size), stream=cudastream, - dimcheck=dimcheck, - emptygrid_warning=emptygrid_warning, backend=self.backend), sdfg, state_id, scope_entry) # Check kernel launch for errors diff --git a/dace/runtime/include/dace/cuda/cudacommon.cuh b/dace/runtime/include/dace/cuda/cudacommon.cuh index 84129c09eb..6390c9909c 100644 --- 
a/dace/runtime/include/dace/cuda/cudacommon.cuh +++ b/dace/runtime/include/dace/cuda/cudacommon.cuh @@ -32,9 +32,11 @@ typedef cudaError_t gpuError_t; if (err != (gpuError_t)0) { \ printf( \ "ERROR launching kernel %s: %s (%d). Grid dimensions: " \ - "(%d, %d, %d); Block dimensions: (%d, %d, %d).\n", \ - kernel_name, gpuGetErrorString(err), (int)err, gdimx, gdimy, gdimz, \ - bdimx, bdimy, bdimz); \ + "(%u, %u, %u); Block dimensions: (%u, %u, %u).\n", \ + kernel_name, gpuGetErrorString(err), (int)err, \ + (unsigned int)(gdimx), (unsigned int)(gdimy), (unsigned int)(gdimz), \ + (unsigned int)(bdimx), (unsigned int)(bdimy), \ + (unsigned int)(bdimz)); \ __state->gpu_context->lasterror = err; \ } \ } while (0) From 4484d4ca0bad278032f2f90f0f32d956d6df83fe Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 21:54:03 -0700 Subject: [PATCH 136/392] Add inaccessible memlet checks in validation --- dace/sdfg/nodes.py | 4 +- dace/sdfg/scope.py | 2 +- dace/sdfg/sdfg.py | 8 +- dace/sdfg/validation.py | 125 +++++++++++++++++++++++++-- tests/sdfg/disallowed_access_test.py | 56 ++++++++++++ 5 files changed, 178 insertions(+), 17 deletions(-) create mode 100644 tests/sdfg/disallowed_access_test.py diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index e703c7863e..e4e9831fec 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -603,7 +603,7 @@ def __str__(self): else: return self.label - def validate(self, sdfg, state, references: Optional[Set[int]] = None): + def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context: bool): if not dtypes.validate_name(self.label): raise NameError('Invalid nested SDFG name "%s"' % self.label) for in_conn in self.in_connectors: @@ -639,7 +639,7 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None): warnings.warn(f"{self.label} maps to unused symbol(s): {extra_symbols}") # Recursively validate nested SDFG - self.sdfg.validate(references) + self.sdfg.validate(references, **context) # ------------------------------------------------------------------------------ diff --git a/dace/sdfg/scope.py b/dace/sdfg/scope.py index bbea6e2688..95f278b06a 100644 --- a/dace/sdfg/scope.py +++ b/dace/sdfg/scope.py @@ -262,7 +262,7 @@ def is_devicelevel_fpga(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', no """ from dace.sdfg.utils import is_fpga_kernel return (is_in_scope(sdfg, state, node, [dtypes.ScheduleType.FPGA_Device]) - or (state and is_fpga_kernel(sdfg, state))) + or (state is not None and is_fpga_kernel(sdfg, state))) def devicelevel_block_size(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index bee601e7b1..ce8d3e7aa8 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -572,7 +572,6 @@ def hash_sdfg(self, jsondict: Optional[Dict[str, Any]] = None) -> str: :param jsondict: If not None, uses given JSON dictionary as input. :return: The hash (in SHA-256 format). 
""" - def keyword_remover(json_obj: Any, last_keyword=""): # Makes non-unique in SDFG hierarchy v2 # Recursively remove attributes from the SDFG which are not used in @@ -1966,8 +1965,7 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str if find_new_name: name = self._find_new_name(name) else: - raise NameError('Array or Stream with name "%s" already exists ' - "in SDFG" % name) + raise NameError(f'Array or Stream with name "{name}" already exists in SDFG') self._arrays[name] = datadesc # Add free symbols to the SDFG global symbol storage @@ -2363,8 +2361,8 @@ def predecessor_states(self, state): before computing the given state. """ return (e.src for e in self.bfs_edges(state, reverse=True)) - def validate(self, references: Optional[Set[int]] = None) -> None: - validate_sdfg(self, references) + def validate(self, references: Optional[Set[int]] = None, **context: bool) -> None: + validate_sdfg(self, references, **context) def is_valid(self) -> bool: """ Returns True if the SDFG is verified correctly (using `validate`). diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 78de22bda2..e7de491e4e 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -3,7 +3,7 @@ import copy from dace.dtypes import DebugInfo, StorageType import os -from typing import TYPE_CHECKING, Dict, Set, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Set, Tuple, Union import warnings from dace import dtypes, data as dt, subsets from dace import symbolic @@ -11,6 +11,8 @@ if TYPE_CHECKING: import dace from dace.sdfg import SDFG + from dace.sdfg import graph as gr + from dace.memlet import Memlet ########################################### # Validation @@ -25,18 +27,22 @@ def validate(graph: 'dace.sdfg.graph.SubgraphView'): validate_state(graph) -def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None): +def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context: bool): """ Verifies the correctness of an SDFG by applying multiple tests. :param sdfg: The SDFG to verify. :param references: An optional set keeping seen IDs for object miscopy validation. + :param context: An optional dictionary of boolean attributes + used to understand the context of this validation + (e.g., is this in a GPU kernel). Raises an InvalidSDFGError with the erroneous node/edge on failure. 
""" # Avoid import loop from dace.codegen.targets import fpga + from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga references = references or set() @@ -101,6 +107,10 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None): "Arrays that use a multibank access pattern must have the size of the first dimension equal" f" the number of banks and have at least 2 dimensions for array {name}", sdfg, None) + # Check if SDFG is located within a GPU kernel + context['in_gpu'] = is_devicelevel_gpu(sdfg, None, None) + context['in_fpga'] = is_devicelevel_fpga(sdfg, None, None) + # Check every state separately start_state = sdfg.start_state initialized_transients = {'__pystate'} @@ -135,7 +145,8 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None): # Source if edge.src not in visited: visited.add(edge.src) - validate_state(edge.src, sdfg.node_id(edge.src), sdfg, symbols, initialized_transients, references) + validate_state(edge.src, sdfg.node_id(edge.src), sdfg, symbols, initialized_transients, references, + **context) ########################################## # Edge @@ -154,6 +165,16 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None): eid = sdfg.edge_id(edge) raise InvalidSDFGInterstateEdgeError("Invalid interstate symbol name %s" % invalid, sdfg, eid) + # Ensure accessed data containers in assignments and conditions are accessible in this context + ise_memlets = edge.data.get_read_memlets(sdfg.arrays) + for memlet in ise_memlets: + container = memlet.data + if not _accessible(sdfg, container, context): + eid = sdfg.edge_id(edge) + raise InvalidSDFGInterstateEdgeError( + f'Trying to read an inaccessible data container "{container}" ' + f'(Storage: {sdfg.arrays[container].storage}) in interstate edge', sdfg, eid) + # Add edge symbols into defined symbols symbols.update(issyms) @@ -161,12 +182,14 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None): # Destination if edge.dst not in visited: visited.add(edge.dst) - validate_state(edge.dst, sdfg.node_id(edge.dst), sdfg, symbols, initialized_transients, references) + validate_state(edge.dst, sdfg.node_id(edge.dst), sdfg, symbols, initialized_transients, references, + **context) # End of state DFS # If there is only one state, the DFS will miss it if start_state not in visited: - validate_state(start_state, sdfg.node_id(start_state), sdfg, symbols, initialized_transients, references) + validate_state(start_state, sdfg.node_id(start_state), sdfg, symbols, initialized_transients, references, + **context) # Validate all inter-state edges (including self-loops not found by DFS) for eid, edge in enumerate(sdfg.edges()): @@ -190,18 +213,73 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None): invalid = next(s for s in issyms if not dtypes.validate_name(s)) raise InvalidSDFGInterstateEdgeError("Invalid interstate symbol name %s" % invalid, sdfg, eid) + # Ensure accessed data containers in assignments and conditions are accessible in this context + ise_memlets = edge.data.get_read_memlets(sdfg.arrays) + for memlet in ise_memlets: + container = memlet.data + if not _accessible(sdfg, container, context): + raise InvalidSDFGInterstateEdgeError( + f'Trying to read an inaccessible data container "{container}" ' + f'(Storage: {sdfg.arrays[container].storage}) in interstate edge', sdfg, eid) + except InvalidSDFGError as ex: # If the SDFG is invalid, save it sdfg.save(os.path.join('_dacegraphs', 'invalid.sdfg'), exception=ex) raise +def _accessible(sdfg: 
'dace.sdfg.SDFG', container: str, context: Dict[str, bool]): + """ + Helper function that returns False if a data container cannot be accessed in the current SDFG context. + """ + storage = sdfg.arrays[container].storage + if storage == dtypes.StorageType.GPU_Global or storage in dtypes.GPU_STORAGES: + return context.get('in_gpu', False) + if storage == dtypes.StorageType.FPGA_Global or storage in dtypes.FPGA_STORAGES: + return context.get('in_fpga', False) + + return True + + +def _is_scalar(edge: 'gr.MultiConnectorEdge[Memlet]', memlet_path: List['gr.MultiConnectorEdge[Memlet]']): + """ + Helper function that determines if a memlet is going to dereference a scalar value. + Returns False in any case the memlet _may not_ be dereferenced (but could be). + """ + # If any of the connectors is a pointer, it takes precedence + src_conn = memlet_path[0].src_conn + if src_conn and src_conn in memlet_path[0].src.out_connectors: + src_conntype = memlet_path[0].src.out_connectors[src_conn] + else: + src_conntype = None + dst_conn = memlet_path[-1].dst_conn + if dst_conn and dst_conn in memlet_path[0].dst.in_connectors: + dst_conntype = memlet_path[-1].dst.in_connectors[dst_conn] + else: + dst_conntype = None + for conntype in (src_conntype, dst_conntype): + if isinstance(conntype, dtypes.pointer): + return False + + # If the memlet is dynamically accessed, it may also not be a scalar + if edge.data.dynamic and edge.data.volume < 0: + return False + + # If the memlet has more than one element, it is definitely not a scalar + if edge.data.num_elements() != 1: + return False + + # Otherwise, we can assume this is a scalar + return True + + def validate_state(state: 'dace.sdfg.SDFGState', state_id: int = None, sdfg: 'dace.sdfg.SDFG' = None, symbols: Dict[str, dtypes.typeclass] = None, initialized_transients: Set[str] = None, - references: Set[int] = None): + references: Set[int] = None, + **context: bool): """ Verifies the correctness of an SDFG state by applying multiple tests. Raises an InvalidSDFGError with the erroneous node on failure. 
@@ -214,16 +292,21 @@ def validate_state(state: 'dace.sdfg.SDFGState', from dace.sdfg import SDFG from dace.sdfg import nodes as nd from dace.sdfg import utils as sdutil - from dace.sdfg.scope import scope_contains_scope + from dace.sdfg.scope import scope_contains_scope, is_devicelevel_gpu, is_devicelevel_fpga sdfg = sdfg or state.parent state_id = state_id or sdfg.node_id(state) symbols = symbols or {} initialized_transients = (initialized_transients if initialized_transients is not None else {'__pystate'}) references = references or set() - scope_local_constants: dict[nd.MapEntry, list[str]] = dict() scope = state.scope_dict() + # Obtain whether we are already in an accelerator context + if not hasattr(context, 'in_gpu'): + context['in_gpu'] = is_devicelevel_gpu(sdfg, state, None) + if not hasattr(context, 'in_fpga'): + context['in_fpga'] = is_devicelevel_fpga(sdfg, state, None) + # Reference check if id(state) in references: raise InvalidSDFGError( @@ -256,7 +339,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', # Node validation try: if isinstance(node, nd.NestedSDFG): - node.validate(sdfg, state, references) + node.validate(sdfg, state, references, **context) else: node.validate(sdfg, state) except InvalidSDFGError: @@ -475,6 +558,18 @@ def validate_state(state: 'dace.sdfg.SDFGState', src_node = path[0].src dst_node = path[-1].dst + # Set up memlet-specific SDFG context + memlet_context = copy.copy(context) + for pe in path: + for pn in (pe.src, pe.dst): + if isinstance(pn, nd.EntryNode): + if pn.schedule in dtypes.GPU_SCHEDULES: + memlet_context['in_gpu'] = True + break + if pn.schedule in dtypes.ScheduleType.FPGA_Device: + memlet_context['in_fpga'] = True + break + # Check if memlet data matches src or dst nodes if (e.data.data is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode)) and (not isinstance(src_node, nd.AccessNode) or e.data.data != src_node.data) @@ -487,6 +582,17 @@ def validate_state(state: 'dace.sdfg.SDFGState', eid, ) + # Check accessibility of scalar memlet data in tasklets and dynamic map ranges + if (not e.data.is_empty() and _is_scalar(e, path) + and (isinstance(e.src, nd.Tasklet) or isinstance(e.dst, nd.Tasklet) or isinstance(e.dst, nd.MapEntry))): + if not _accessible(sdfg, e.data.data, memlet_context): + raise InvalidSDFGEdgeError( + f'Data container "{e.data.data}" is stored as {sdfg.arrays[e.data.data]} but accessed in host', + sdfg, + state_id, + eid, + ) + # Check memlet subset validity with respect to source/destination nodes if e.data.data is not None and e.data.allow_oob == False: subset_node = (dst_node @@ -519,6 +625,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', warnings.warn(f'Potential out-of-bounds memlet subset: {e}') else: raise InvalidSDFGEdgeError("Memlet subset out-of-bounds", sdfg, state_id, eid) + # Test other_subset as well if e.data.other_subset is not None and isinstance(other_subset_node, nd.AccessNode): arr = sdfg.arrays[other_subset_node.data] diff --git a/tests/sdfg/disallowed_access_test.py b/tests/sdfg/disallowed_access_test.py new file mode 100644 index 0000000000..b23d92de7f --- /dev/null +++ b/tests/sdfg/disallowed_access_test.py @@ -0,0 +1,56 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+ +import dace +from dace.sdfg.validation import InvalidSDFGInterstateEdgeError, InvalidSDFGEdgeError +import pytest + + +@pytest.mark.gpu +def test_gpu_access_on_host_interstate_ok(): + sdfg = dace.SDFG('tester') + sdfg.add_array('A', [20], dace.float64, storage=dace.StorageType.GPU_Global) + state = sdfg.add_state() + me, mx = state.add_map('map', dict(i='0:20'), dace.ScheduleType.GPU_Device) + + nsdfg = dace.SDFG('inner') + nsdfg.add_array('a', [20], dace.float64, storage=dace.StorageType.GPU_Global) + state1 = nsdfg.add_state() + state2 = nsdfg.add_state() + nsdfg.add_edge(state1, state2, dace.InterstateEdge(assignments=dict(s='a[i]'))) + + nnode = state.add_nested_sdfg(nsdfg, None, {'a'}, {}, {'i': 'i'}) + r = state.add_read('A') + state.add_memlet_path(r, me, nnode, dst_conn='a', memlet=dace.Memlet('A[0:20]')) + state.add_nedge(nnode, mx, dace.Memlet()) + + sdfg.validate() + + +@pytest.mark.gpu +def test_gpu_access_on_host_interstate_invalid(): + sdfg = dace.SDFG('tester') + sdfg.add_array('A', [20], dace.float64, storage=dace.StorageType.GPU_Global) + state1 = sdfg.add_state() + state2 = sdfg.add_state() + sdfg.add_edge(state1, state2, dace.InterstateEdge(assignments=dict(s='A[4]'))) + + with pytest.raises(InvalidSDFGInterstateEdgeError): + sdfg.validate() + + +@pytest.mark.gpu +def test_gpu_access_on_host_tasklet(): + @dace.program + def tester(a: dace.float64[20] @ dace.StorageType.GPU_Global): + for i in dace.map[0:20] @ dace.ScheduleType.CPU_Multicore: + a[i] = 1 + + with pytest.raises(InvalidSDFGEdgeError): + tester.to_sdfg(validate=True) + + +if __name__ == '__main__': + # test_gpu_access_on_host_interstate_ok() + test_gpu_access_on_host_interstate_invalid() + # test_gpu_access_on_host_tasklet() + \ No newline at end of file From 73f2d5778aa223aae20c0dee9613ae28e494de68 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 21:59:19 -0700 Subject: [PATCH 137/392] Fix test --- tests/parse_state_struct_test.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/parse_state_struct_test.py b/tests/parse_state_struct_test.py index 59c0e9279c..58ec2dfd14 100644 --- a/tests/parse_state_struct_test.py +++ b/tests/parse_state_struct_test.py @@ -11,7 +11,7 @@ import dace import dace.library from dace import dtypes -from dace.codegen import codeobject, targets, compiler, compiled_sdfg +from dace.codegen import codeobject, targets, compiler, compiled_sdfg, common @pytest.fixture @@ -21,17 +21,15 @@ def cuda_helper(): def _cuda_helper(): - helper_code = """ + helper_code = f""" #include - extern "C" { - int host_to_gpu(void* gpu, void* host, size_t size) { - auto result = cudaMemcpy(gpu, host, size, cudaMemcpyHostToDevice); - DACE_GPU_CHECK(cudaGetLastError()); - DACE_GPU_CHECK(cudaDeviceSynchronize()); + extern "C" {{ + DACE_EXPORTED int host_to_gpu(void* gpu, void* host, size_t size) {{ + auto result = {common.get_gpu_backend()}Memcpy(gpu, host, size, {common.get_gpu_backend()}MemcpyHostToDevice); return result; - } - } + }} + }} """ program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") From 9f17d01a3db33df5be8535683a948e9d989893ff Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 22:01:13 -0700 Subject: [PATCH 138/392] Clarify error message --- dace/sdfg/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index e7de491e4e..b07e964efa 100644 --- a/dace/sdfg/validation.py +++ 
b/dace/sdfg/validation.py @@ -173,7 +173,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context eid = sdfg.edge_id(edge) raise InvalidSDFGInterstateEdgeError( f'Trying to read an inaccessible data container "{container}" ' - f'(Storage: {sdfg.arrays[container].storage}) in interstate edge', sdfg, eid) + f'(Storage: {sdfg.arrays[container].storage}) in host code interstate edge', sdfg, eid) # Add edge symbols into defined symbols symbols.update(issyms) @@ -220,7 +220,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context if not _accessible(sdfg, container, context): raise InvalidSDFGInterstateEdgeError( f'Trying to read an inaccessible data container "{container}" ' - f'(Storage: {sdfg.arrays[container].storage}) in interstate edge', sdfg, eid) + f'(Storage: {sdfg.arrays[container].storage}) in host code interstate edge', sdfg, eid) except InvalidSDFGError as ex: # If the SDFG is invalid, save it From d7eafefa42b5970eaa86938ac3da0ae3493b671c Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 22:06:26 -0700 Subject: [PATCH 139/392] More information when saving invalid file --- dace/sdfg/sdfg.py | 4 +++- dace/sdfg/validation.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index ce8d3e7aa8..2863060c68 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2251,7 +2251,9 @@ def compile(self, output_file=None, validate=True) -> \ # Generate code for the program by traversing the SDFG state by state program_objects = codegen.generate_code(sdfg, validate=validate) except Exception: - self.save(os.path.join('_dacegraphs', 'failing.sdfg')) + fpath = os.path.join('_dacegraphs', 'failing.sdfg') + self.save(fpath) + print(f'Failing SDFG saved for inspection in {os.path.abspath(fpath)}') raise # Generate the program folder and write the source files diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index b07e964efa..84748eef42 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -224,7 +224,9 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context except InvalidSDFGError as ex: # If the SDFG is invalid, save it - sdfg.save(os.path.join('_dacegraphs', 'invalid.sdfg'), exception=ex) + fpath = os.path.join('_dacegraphs', 'invalid.sdfg') + sdfg.save(fpath, exception=ex) + print(f'Invalid SDFG saved for inspection in {os.path.abspath(fpath)}') raise From a72cfd175c1f2054d6fbeb7070e1879cc2c93a4c Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 22:09:15 -0700 Subject: [PATCH 140/392] Minor typos --- dace/sdfg/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 84748eef42..03223cdf35 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -568,7 +568,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', if pn.schedule in dtypes.GPU_SCHEDULES: memlet_context['in_gpu'] = True break - if pn.schedule in dtypes.ScheduleType.FPGA_Device: + if pn.schedule == dtypes.ScheduleType.FPGA_Device: memlet_context['in_fpga'] = True break @@ -589,7 +589,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', and (isinstance(e.src, nd.Tasklet) or isinstance(e.dst, nd.Tasklet) or isinstance(e.dst, nd.MapEntry))): if not _accessible(sdfg, e.data.data, memlet_context): raise InvalidSDFGEdgeError( - f'Data container "{e.data.data}" is stored as {sdfg.arrays[e.data.data]} but accessed in host', + f'Data container 
"{e.data.data}" is stored as {sdfg.arrays[e.data.data].storage} but accessed in host', sdfg, state_id, eid, From 0add7d3b8e00b9f9e3aee3178980fb45013fd699 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 22:23:51 -0700 Subject: [PATCH 141/392] Fix important typos and allow GPU transform without simplification --- dace/codegen/targets/cuda.py | 2 +- dace/sdfg/sdfg.py | 5 +++-- dace/sdfg/validation.py | 2 +- tests/persistent_fusion_cudatest.py | 3 +-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 4974b6f6d9..828f05d376 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1605,7 +1605,7 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st # Prepare an empty-grid check for runtime grids dimcheck = '' if is_persistent: - dimcheck = 'dace_number_blocks > 0' + dimcheck = 'dace_number_blocks == 0' else: for gdim in grid_dims: if symbolic.issymbolic(gdim) and (gdim > 0) != True: diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 2863060c68..adebe51c9b 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2598,7 +2598,8 @@ def apply_gpu_transformations(self, validate_all=False, permissive=False, sequential_innermaps=True, - register_transients=True): + register_transients=True, + simplify=True): """ Applies a series of transformations on the SDFG for it to generate GPU code. @@ -2614,7 +2615,7 @@ def apply_gpu_transformations(self, self.apply_transformations(GPUTransformSDFG, options=dict(sequential_innermaps=sequential_innermaps, - register_trans=register_transients), + register_trans=register_transients, simplify=simplify), validate=validate, validate_all=validate_all, permissive=permissive, diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 03223cdf35..9493c5bfbc 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -564,7 +564,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', memlet_context = copy.copy(context) for pe in path: for pn in (pe.src, pe.dst): - if isinstance(pn, nd.EntryNode): + if isinstance(pn, (nd.EntryNode, nd.ExitNode)): if pn.schedule in dtypes.GPU_SCHEDULES: memlet_context['in_gpu'] = True break diff --git a/tests/persistent_fusion_cudatest.py b/tests/persistent_fusion_cudatest.py index da8ba86616..ac05761bee 100644 --- a/tests/persistent_fusion_cudatest.py +++ b/tests/persistent_fusion_cudatest.py @@ -270,7 +270,7 @@ def fill_update_state(state, front_in, front_in_count, front_out, front_out_coun def test_persistent_fusion(): sdfg = bfs - sdfg.apply_gpu_transformations() + sdfg.apply_gpu_transformations(validate=False, simplify=False) # Only validate after fusion # All nodes but copy-in, copy-out, and init content_nodes = set(sdfg.nodes()) - {sdfg.start_state, sdfg.sink_nodes()[0], s_init} @@ -319,7 +319,6 @@ def test_persistent_fusion(): sdfg(row_index=G_row, col_index=G_col, result=depth, root=srcnode, N=V, nnz=E) assert np.allclose(depth, reference), "Result doesn't match!" 
- print("Complete.") # Actual execution From 33a96c2f800949d832b769242afe6665d9fb09cc Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 23:56:28 -0700 Subject: [PATCH 142/392] Fix symbolic test --- dace/sdfg/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 9493c5bfbc..096a326bf3 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -264,7 +264,7 @@ def _is_scalar(edge: 'gr.MultiConnectorEdge[Memlet]', memlet_path: List['gr.Mult return False # If the memlet is dynamically accessed, it may also not be a scalar - if edge.data.dynamic and edge.data.volume < 0: + if edge.data.dynamic and edge.data.volume == -1: return False # If the memlet has more than one element, it is definitely not a scalar From 17065664cc252bfa52f8eba86dc693dc2a6c6288 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 27 Jun 2023 23:58:35 -0700 Subject: [PATCH 143/392] Further correct symbolic test --- dace/sdfg/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 096a326bf3..fa21bef356 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -264,7 +264,7 @@ def _is_scalar(edge: 'gr.MultiConnectorEdge[Memlet]', memlet_path: List['gr.Mult return False # If the memlet is dynamically accessed, it may also not be a scalar - if edge.data.dynamic and edge.data.volume == -1: + if edge.data.dynamic and (edge.data.volume == -1 or edge.data.volume == 0): return False # If the memlet has more than one element, it is definitely not a scalar From 8175e5b7fddc10734d1bf1a608424eefb10bbdf9 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 00:10:58 -0700 Subject: [PATCH 144/392] Consider default schedule maps in access validation --- dace/sdfg/validation.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index fa21bef356..e72fd0c194 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -571,6 +571,11 @@ def validate_state(state: 'dace.sdfg.SDFGState', if pn.schedule == dtypes.ScheduleType.FPGA_Device: memlet_context['in_fpga'] = True break + if pn.schedule == dtypes.ScheduleType.Default: + # Default schedule memlet accessibility validation is deferred + # to after schedule/storage inference + memlet_context['in_default'] = True + break # Check if memlet data matches src or dst nodes if (e.data.data is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode)) @@ -587,7 +592,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', # Check accessibility of scalar memlet data in tasklets and dynamic map ranges if (not e.data.is_empty() and _is_scalar(e, path) and (isinstance(e.src, nd.Tasklet) or isinstance(e.dst, nd.Tasklet) or isinstance(e.dst, nd.MapEntry))): - if not _accessible(sdfg, e.data.data, memlet_context): + if not memlet_context.get('in_default', False) and not _accessible(sdfg, e.data.data, memlet_context): raise InvalidSDFGEdgeError( f'Data container "{e.data.data}" is stored as {sdfg.arrays[e.data.data].storage} but accessed in host', sdfg, From d665d3447ba233f8847e2e5aab9d5ccda0a33315 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 00:17:22 -0700 Subject: [PATCH 145/392] Refine validation test on potential failure --- dace/sdfg/validation.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dace/sdfg/validation.py 
b/dace/sdfg/validation.py index e72fd0c194..43d48bd7bd 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -593,12 +593,13 @@ def validate_state(state: 'dace.sdfg.SDFGState', if (not e.data.is_empty() and _is_scalar(e, path) and (isinstance(e.src, nd.Tasklet) or isinstance(e.dst, nd.Tasklet) or isinstance(e.dst, nd.MapEntry))): if not memlet_context.get('in_default', False) and not _accessible(sdfg, e.data.data, memlet_context): - raise InvalidSDFGEdgeError( - f'Data container "{e.data.data}" is stored as {sdfg.arrays[e.data.data].storage} but accessed in host', - sdfg, - state_id, - eid, - ) + # Rerun slightly more expensive but foolproof test + memlet_context['in_gpu'] = is_devicelevel_gpu(sdfg, state, e.dst) + memlet_context['in_fpga'] = is_devicelevel_fpga(sdfg, state, e.dst) + if not _accessible(sdfg, e.data.data, memlet_context): + raise InvalidSDFGEdgeError( + f'Data container "{e.data.data}" is stored as {sdfg.arrays[e.data.data].storage} ' + 'but accessed on host', sdfg, state_id, eid) # Check memlet subset validity with respect to source/destination nodes if e.data.data is not None and e.data.allow_oob == False: From edf01150ba537b4a5647bc9930d655c3b030f876 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 00:27:04 -0700 Subject: [PATCH 146/392] Check for extra arguments when calling dace.programs --- dace/frontend/python/parser.py | 8 +++++--- tests/python_frontend/argument_test.py | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 tests/python_frontend/argument_test.py diff --git a/dace/frontend/python/parser.py b/dace/frontend/python/parser.py index 379c313e11..991613a9ea 100644 --- a/dace/frontend/python/parser.py +++ b/dace/frontend/python/parser.py @@ -120,8 +120,7 @@ def infer_symbols_from_datadescriptor(sdfg: SDFG, # Solve for all at once results = sympy.solve(equations, *symbols, dict=True, exclude=exclude) if len(results) > 1: - raise ValueError('Ambiguous values for symbols in inference. ' - 'Options: %s' % str(results)) + raise ValueError('Ambiguous values for symbols in inference. Options: %s' % str(results)) if len(results) == 0: raise ValueError('Cannot infer values for symbols in inference.') @@ -136,7 +135,6 @@ def infer_symbols_from_datadescriptor(sdfg: SDFG, class DaceProgram(pycommon.SDFGConvertible): """ A data-centric program object, obtained by decorating a function with ``@dace.program``. """ - def __init__(self, f, args, @@ -684,6 +682,10 @@ def _get_type_annotations( else: types['__return'] = create_datadescriptor(rettype) + # Too many arguments given + if nargs > len(specified_args): + raise TypeError(f'{self.name}() takes {len(specified_args)} arguments but {nargs} were given') + return types, arg_mapping, gvar_mapping, specified_args def _load_sdfg(self, path: str, *args, **kwargs): diff --git a/tests/python_frontend/argument_test.py b/tests/python_frontend/argument_test.py new file mode 100644 index 0000000000..1f43337eb8 --- /dev/null +++ b/tests/python_frontend/argument_test.py @@ -0,0 +1,20 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+ +import dace +import pytest + +N = dace.symbol('N') + + +@dace.program +def imgcpy(img1: dace.float64[N, N], img2: dace.float64[N, N], coefficient: dace.float64): + img1[:, :] = img2[:, :] * coefficient + + +def test_extra_args(): + with pytest.raises(TypeError): + imgcpy([[1, 2], [3, 4]], [[4, 3], [2, 1]], 0.0, 1.0) + + +if __name__ == '__main__': + test_extra_args() From 561f9fb7c98d9a39a0dd2086e189fe6d4da15c7f Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 01:27:54 -0700 Subject: [PATCH 147/392] Fix invalid tests --- tests/numpy/advanced_indexing_test.py | 4 ++-- tests/python_frontend/sdfg_convertible_test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/numpy/advanced_indexing_test.py b/tests/numpy/advanced_indexing_test.py index 9dc31cf7ff..48853cdf26 100644 --- a/tests/numpy/advanced_indexing_test.py +++ b/tests/numpy/advanced_indexing_test.py @@ -107,7 +107,7 @@ def indexing_test(A: dace.float64[20]): A = np.random.rand(20) indices = [1, 10, 15] - res = indexing_test(A, indices) + res = indexing_test(A) assert np.allclose(A[indices], res) @@ -119,7 +119,7 @@ def indexing_test(A: dace.float64[20]): return A[indices] A = np.random.rand(20) - res = indexing_test(A, indices) + res = indexing_test(A) assert np.allclose(A[indices], res) diff --git a/tests/python_frontend/sdfg_convertible_test.py b/tests/python_frontend/sdfg_convertible_test.py index 87e2cf4bd2..38a3d34e02 100644 --- a/tests/python_frontend/sdfg_convertible_test.py +++ b/tests/python_frontend/sdfg_convertible_test.py @@ -27,7 +27,7 @@ def test_constants_in_signature(): class AConvertible(SDFGConvertible): def __sdfg__(self, grid, arr): @dace.program - def func(arr: dace.float64[10]): + def func(_: dace.compiletime, arr: dace.float64[10]): arr[grid.start:grid.end] = 7.0 return func.to_sdfg(grid, arr) From 8b9dae5f0eb2a8f78759ad57b79d1141fc18f7d4 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 03:08:35 -0700 Subject: [PATCH 148/392] Fix more broken tests --- tests/polybench/correlation.py | 4 ++-- tests/polybench/covariance.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/polybench/correlation.py b/tests/polybench/correlation.py index 102b450b9a..c5ccc476ff 100644 --- a/tests/polybench/correlation.py +++ b/tests/polybench/correlation.py @@ -12,10 +12,10 @@ # Dataset sizes sizes = [{M: 28, N: 32}, {M: 80, N: 100}, {M: 240, N: 260}, {M: 1200, N: 1400}, {M: 2600, N: 3000}] -args = [([N, M], datatype), ([M, M], datatype), ([M], datatype), ([M], datatype), M, N] +args = [([N, M], datatype), ([M, M], datatype), ([M], datatype), ([M], datatype)] -def init_array(data, corr, mean, stddev, M, N): +def init_array(data, corr, mean, stddev): n = N.get() m = M.get() for i in range(n): diff --git a/tests/polybench/covariance.py b/tests/polybench/covariance.py index 9021c189b0..db68a0651d 100644 --- a/tests/polybench/covariance.py +++ b/tests/polybench/covariance.py @@ -11,10 +11,10 @@ # Dataset sizes sizes = [{M: 28, N: 32}, {M: 80, N: 100}, {M: 240, N: 260}, {M: 1200, N: 1400}, {M: 2600, N: 3000}] -args = [([N, M], datatype), ([M, M], datatype), ([M], datatype), M, N] +args = [([N, M], datatype), ([M, M], datatype), ([M], datatype)] -def init_array(data, cov, mean, M, N): +def init_array(data, cov, mean): n = N.get() m = M.get() for i in range(n): From 4c2c9f72e65555bf6989c6e210ce4ec7ccf1c270 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 03:14:58 -0700 Subject: [PATCH 149/392] Place validation printout at the end of the 
exception --- dace/sdfg/validation.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 43d48bd7bd..fa86163063 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -226,7 +226,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context # If the SDFG is invalid, save it fpath = os.path.join('_dacegraphs', 'invalid.sdfg') sdfg.save(fpath, exception=ex) - print(f'Invalid SDFG saved for inspection in {os.path.abspath(fpath)}') + ex.path = fpath raise @@ -740,6 +740,7 @@ def __init__(self, message: str, sdfg: 'SDFG', state_id: int): self.message = message self.sdfg = sdfg self.state_id = state_id + self.path = None def _getlineinfo(self, obj) -> str: """ @@ -781,6 +782,9 @@ def __str__(self): if locinfo: locinfo = '\nOriginating from source code at ' + locinfo + if self.path: + locinfo += f'\nInvalid SDFG saved for inspection in {os.path.abspath(self.path)}' + return f'{self.message}{suffix}{locinfo}' @@ -790,6 +794,7 @@ def __init__(self, message: str, sdfg: 'SDFG', edge_id: int): self.message = message self.sdfg = sdfg self.edge_id = edge_id + self.path = None def to_json(self): return dict(message=self.message, sdfg_id=self.sdfg.sdfg_id, isedge_id=self.edge_id) @@ -822,6 +827,9 @@ def __str__(self): else: locinfo = '' + if self.path: + locinfo += f'\nInvalid SDFG saved for inspection in {os.path.abspath(self.path)}' + return f'{self.message}{edgestr}{locinfo}' @@ -832,6 +840,7 @@ def __init__(self, message: str, sdfg: 'SDFG', state_id: int, node_id: int): self.sdfg = sdfg self.state_id = state_id self.node_id = node_id + self.path = None def to_json(self): return dict(message=self.message, sdfg_id=self.sdfg.sdfg_id, state_id=self.state_id, node_id=self.node_id) @@ -852,6 +861,9 @@ def __str__(self): if locinfo: locinfo = '\nOriginating from source code at ' + locinfo + if self.path: + locinfo += f'\nInvalid SDFG saved for inspection in {os.path.abspath(self.path)}' + return f'{self.message} (at state {state.label}{nodestr}){locinfo}' @@ -871,6 +883,7 @@ def __init__(self, message: str, sdfg: 'SDFG', state_id: int, edge_id: int): self.sdfg = sdfg self.state_id = state_id self.edge_id = edge_id + self.path = None def to_json(self): return dict(message=self.message, sdfg_id=self.sdfg.sdfg_id, state_id=self.state_id, edge_id=self.edge_id) @@ -895,4 +908,7 @@ def __str__(self): if locinfo: locinfo = '\nOriginating from source code at ' + locinfo + if self.path: + locinfo += f'\nInvalid SDFG saved for inspection in {os.path.abspath(self.path)}' + return f'{self.message} (at state {state.label}{edgestr}){locinfo}' From f1e0e1b10cbc1ec79f075674b50bbcb781daf1fd Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 03:39:49 -0700 Subject: [PATCH 150/392] Add nested SDFG parent pointer validation --- dace/sdfg/nodes.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index e4e9831fec..5c270153e1 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -612,6 +612,13 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context for out_conn in self.out_connectors: if not dtypes.validate_name(out_conn): raise NameError('Invalid output connector "%s"' % out_conn) + if self.sdfg.parent_nsdfg_node is not self: + raise ValueError('Parent nested SDFG node not properly set') + if self.sdfg.parent is not state: + raise ValueError('Parent state not properly set for nested SDFG node') + if 
self.sdfg.parent_sdfg is not sdfg: + raise ValueError('Parent SDFG not properly set for nested SDFG node') + connectors = self.in_connectors.keys() | self.out_connectors.keys() for conn in connectors: if conn not in self.sdfg.arrays: From 86eea027ee33547f6a94a6a60d51753268615303 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 03:40:47 -0700 Subject: [PATCH 151/392] Fix parent-pointing bug in loop to map --- dace/transformation/interstate/loop_to_map.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dace/transformation/interstate/loop_to_map.py b/dace/transformation/interstate/loop_to_map.py index 0b0baabddb..03b4efac1c 100644 --- a/dace/transformation/interstate/loop_to_map.py +++ b/dace/transformation/interstate/loop_to_map.py @@ -668,3 +668,11 @@ def apply(self, _, sdfg: sd.SDFG): for sym in symbols_to_remove: if sym in sdfg.symbols and helpers.is_symbol_unused(sdfg, sym): sdfg.remove_symbol(sym) + + # Reset all nested SDFG parent pointers + for nstate in nsdfg.nodes(): + for nnode in nstate.nodes(): + if isinstance(nnode, nodes.NestedSDFG): + nnode.sdfg.parent_nsdfg_node = nnode + nnode.sdfg.parent = nstate + nnode.sdfg.parent_sdfg = nsdfg From 29d89aa24d463352d1cb8a09ce19d02d934f682c Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 05:05:53 -0700 Subject: [PATCH 152/392] Fix potentially unbound local --- dace/transformation/interstate/loop_to_map.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/dace/transformation/interstate/loop_to_map.py b/dace/transformation/interstate/loop_to_map.py index 03b4efac1c..bb21774619 100644 --- a/dace/transformation/interstate/loop_to_map.py +++ b/dace/transformation/interstate/loop_to_map.py @@ -365,6 +365,8 @@ def apply(self, _, sdfg: sd.SDFG): to_visit.append(dst) states.add(state) + nsdfg = None + # Nest loop-body states if len(states) > 1: @@ -670,9 +672,10 @@ def apply(self, _, sdfg: sd.SDFG): sdfg.remove_symbol(sym) # Reset all nested SDFG parent pointers - for nstate in nsdfg.nodes(): - for nnode in nstate.nodes(): - if isinstance(nnode, nodes.NestedSDFG): - nnode.sdfg.parent_nsdfg_node = nnode - nnode.sdfg.parent = nstate - nnode.sdfg.parent_sdfg = nsdfg + if nsdfg is not None: + for nstate in nsdfg.nodes(): + for nnode in nstate.nodes(): + if isinstance(nnode, nodes.NestedSDFG): + nnode.sdfg.parent_nsdfg_node = nnode + nnode.sdfg.parent = nstate + nnode.sdfg.parent_sdfg = nsdfg From 8375456faef4040bfff536c341aa014957686e32 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 05:14:56 -0700 Subject: [PATCH 153/392] Fix curly braces in RTL tasklets --- dace/codegen/targets/rtl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dace/codegen/targets/rtl.py b/dace/codegen/targets/rtl.py index e0044e114a..dcb752e215 100644 --- a/dace/codegen/targets/rtl.py +++ b/dace/codegen/targets/rtl.py @@ -56,6 +56,7 @@ def generate_node(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: (2) generate tasklet->out (3) generate tasklet """ + callsite_stream.write('{', sdfg, state_id, dfg.node_id(node)) # generate code to handle data input to the tasklet for edge in dfg.in_edges(node): # find input array @@ -72,6 +73,7 @@ def generate_node(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: callsite_stream) # generate tasklet code self.unparse_tasklet(sdfg, dfg, state_id, node, function_stream, callsite_stream) + callsite_stream.write('}', sdfg, state_id, dfg.node_id(node)) else: raise RuntimeError( "Only tasklets are handled here, not {}. 
This should have been filtered by the predicate".format(

From c30bf4d857d6309d67d59bec5a9e1be9e88530ea Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Wed, 28 Jun 2023 05:37:50 -0700
Subject: [PATCH 154/392] Fix yet another LoopToMap bug

---
 dace/transformation/interstate/loop_to_map.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dace/transformation/interstate/loop_to_map.py b/dace/transformation/interstate/loop_to_map.py
index bb21774619..8fb6600b76 100644
--- a/dace/transformation/interstate/loop_to_map.py
+++ b/dace/transformation/interstate/loop_to_map.py
@@ -673,6 +673,9 @@ def apply(self, _, sdfg: sd.SDFG):
 
         # Reset all nested SDFG parent pointers
         if nsdfg is not None:
+            if isinstance(nsdfg, nodes.NestedSDFG):
+                nsdfg = nsdfg.sdfg
+
             for nstate in nsdfg.nodes():
                 for nnode in nstate.nodes():
                     if isinstance(nnode, nodes.NestedSDFG):

From 3c0f5059d2aabe9ed69ccca2f4b645b3815f6b91 Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Wed, 28 Jun 2023 06:00:32 -0700
Subject: [PATCH 155/392] GPU runtime: Use pre-installed CUB if exists

---
 dace/runtime/include/dace/cuda/multidim_gbar.cuh | 10 +++++++---
 dace/runtime/include/dace/reduction.h            |  4 ++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/dace/runtime/include/dace/cuda/multidim_gbar.cuh b/dace/runtime/include/dace/cuda/multidim_gbar.cuh
index 97e817d829..6f11fb07eb 100644
--- a/dace/runtime/include/dace/cuda/multidim_gbar.cuh
+++ b/dace/runtime/include/dace/cuda/multidim_gbar.cuh
@@ -35,9 +35,13 @@
 
 #pragma once
 
-#include "../../../../external/cub/cub/util_debug.cuh"
-#include "../../../../external/cub/cub/util_namespace.cuh"
-#include "../../../../external/cub/cub/thread/thread_load.cuh"
+#if __has_include(<cub/cub.cuh>)
+    #include <cub/cub.cuh>
+#else
+    #include "../../../../external/cub/cub/util_debug.cuh"
+    #include "../../../../external/cub/cub/util_namespace.cuh"
+    #include "../../../../external/cub/cub/thread/thread_load.cuh"
+#endif
 
 /// Optional outer namespace(s)
 CUB_NS_PREFIX
diff --git a/dace/runtime/include/dace/reduction.h b/dace/runtime/include/dace/reduction.h
index b78bf45d17..9d8c59997c 100644
--- a/dace/runtime/include/dace/reduction.h
+++ b/dace/runtime/include/dace/reduction.h
@@ -9,12 +9,16 @@
 #include "math.h"  // for ::min, ::max
 
 #ifdef __CUDACC__
+#if __has_include(<cub/cub.cuh>)
+    #include <cub/cub.cuh>
+#else
 #include "../../../external/cub/cub/device/device_segmented_reduce.cuh"
 #include "../../../external/cub/cub/device/device_reduce.cuh"
 #include "../../../external/cub/cub/block/block_reduce.cuh"
 #include "../../../external/cub/cub/iterator/counting_input_iterator.cuh"
 #include "../../../external/cub/cub/iterator/transform_input_iterator.cuh"
 #endif
+#endif
 
 #ifdef __HIPCC__
 // HIP supports the same set of atomic ops as CUDA SM 6.0+

From ff964a60e633042b5eaef1b3fb3065ae9acdd3b4 Mon Sep 17 00:00:00 2001
From: Tal Ben-Nun
Date: Wed, 28 Jun 2023 08:12:43 -0700
Subject: [PATCH 156/392] More informative block warnings and default block size linearization if extra dimensions specified

---
 dace/codegen/targets/cuda.py      | 19 ++++++++++++++++---
 tests/cuda_highdim_kernel_test.py | 18 ++++++++++++++----
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index 828f05d376..0896e12f12 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1802,15 +1802,28 @@ def get_kernel_dimensions(self, dfg_scope):
                     int(b) for b in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',')
                 ]
         else:
-            if Config.get_bool('debugprint'):
-                warnings.warn('Thread-block maps 
not found in kernel, assuming block size of (%s)' % - Config.get('compiler', 'cuda', 'default_block_size')) + def_bsize = Config.get('compiler', 'cuda', 'default_block_size') + warnings.warn(f'No ``gpu_block_size`` property specified on map "{kernelmap_entry.map.label}". ' + f'Falling back to the configured ``default_block_size``: {def_bsize}. ' + 'You can either specify the block size to use with the gpu_block_size property, ' + 'or by adding nested ``GPU_Threadblock`` maps, which map work to individual threads. ' + 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') if (Config.get('compiler', 'cuda', 'default_block_size') == 'max'): raise NotImplementedError('max dynamic block size unimplemented') else: block_size = [int(b) for b in Config.get('compiler', 'cuda', 'default_block_size').split(',')] + block_ndim = sum(1 if b != 1 else 0 for b in block_size) + grid_ndim = sum(1 if g != 1 else 0 for g in grid_size) + if block_ndim > grid_ndim: + linearized_remainder = prod(block_size[grid_ndim:]) + block_size = block_size[:grid_ndim] + [1] * (3 - grid_ndim) + block_size[grid_ndim - 1] *= linearized_remainder + warnings.warn(f'Default block size has more dimensions ({block_ndim}) than kernel dimensions ' + f'({grid_ndim}) in map "{kernelmap_entry.map.label}". Linearizing block ' + f'size to {block_size}. Consider setting the ``gpu_block_size`` property.') + assert (len(block_size) >= 1 and len(block_size) <= 3) # Grid size = ceil(|S|/32) for first dimension, rest = |S| diff --git a/tests/cuda_highdim_kernel_test.py b/tests/cuda_highdim_kernel_test.py index 7b1553ad3c..79e0eaf6cd 100644 --- a/tests/cuda_highdim_kernel_test.py +++ b/tests/cuda_highdim_kernel_test.py @@ -19,10 +19,8 @@ @dace.program def highdim(A: dace.uint64[N, M, K, L, X, Y, Z, W, U], B: dace.uint64[N, M, K, L]): - @dace.mapscope def kernel(i: _[5:N - 5], j: _[0:M], k: _[7:K - 1], l: _[0:L]): - @dace.map def block(a: _[0:X], b: _[0:Y], c: _[1:Z], d: _[2:W - 2], e: _[0:U]): input << A[i, j, k, l, a, b, c, d, e] @@ -83,7 +81,6 @@ def test_gpu(): @pytest.mark.gpu def test_highdim_implicit_block(): - @dace.program def tester(x: dace.float64[32, 90, 80, 70]): for i, j, k, l in dace.map[0:32, 0:90, 0:80, 0:70]: @@ -105,7 +102,6 @@ def tester(x: dace.float64[32, 90, 80, 70]): @pytest.mark.gpu def test_highdim_implicit_block_threadsplit(): - @dace.program def tester(x: dace.float64[2, 2, 80, 70]): for i, j, k, l in dace.map[0:2, 0:2, 0:80, 0:70]: @@ -125,8 +121,22 @@ def tester(x: dace.float64[2, 2, 80, 70]): assert np.allclose(a, 2) +def test_highdim_default_block_size(): + @dace.program + def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:1024, 0:1024] @ dace.ScheduleType.GPU_Device: + a[i, j] = 1 + + with dace.config.set_temporary('compiler', 'cuda', 'default_block_size', value='32, 8, 2'): + with pytest.warns(UserWarning, match='has more dimensions'): + sdfg = tester.to_sdfg() + gpu_code = sdfg.generate_code()[1] + assert 'dim3(32, 16, 1)' in gpu_code.code + + if __name__ == "__main__": test_cpu() test_gpu() test_highdim_implicit_block() test_highdim_implicit_block_threadsplit() + test_highdim_default_block_size() From b302ceac1e2bfc676962a176bec88163912f5230 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 18:15:27 -0700 Subject: [PATCH 157/392] Fix erroneously detecting 1d kernels as 0d --- dace/codegen/targets/cuda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/cuda.py 
b/dace/codegen/targets/cuda.py index 0896e12f12..71ef4c0ce6 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1804,7 +1804,7 @@ def get_kernel_dimensions(self, dfg_scope): else: def_bsize = Config.get('compiler', 'cuda', 'default_block_size') warnings.warn(f'No ``gpu_block_size`` property specified on map "{kernelmap_entry.map.label}". ' - f'Falling back to the configured ``default_block_size``: {def_bsize}. ' + f'Falling back to the configuration entry ``compiler.cuda.default_block_size``: {def_bsize}. ' 'You can either specify the block size to use with the gpu_block_size property, ' 'or by adding nested ``GPU_Threadblock`` maps, which map work to individual threads. ' 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') @@ -1814,8 +1814,8 @@ def get_kernel_dimensions(self, dfg_scope): else: block_size = [int(b) for b in Config.get('compiler', 'cuda', 'default_block_size').split(',')] - block_ndim = sum(1 if b != 1 else 0 for b in block_size) - grid_ndim = sum(1 if g != 1 else 0 for g in grid_size) + block_ndim = max(1, sum(1 if b != 1 else 0 for b in block_size)) + grid_ndim = max(1, sum(1 if g != 1 else 0 for g in grid_size)) if block_ndim > grid_ndim: linearized_remainder = prod(block_size[grid_ndim:]) block_size = block_size[:grid_ndim] + [1] * (3 - grid_ndim) From 3e02efbf3ac6a21fcd72f84f36a0f299eb907ba7 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 18:20:48 -0700 Subject: [PATCH 158/392] Warn when multiple block sizes are used --- dace/codegen/targets/cuda.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 71ef4c0ce6..9027b142e2 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1857,6 +1857,13 @@ def get_kernel_dimensions(self, dfg_scope): # kernel, raise an invalid SDFG exception and recommend # overapproximation. + # Warn when multiple detected block sizes have different sizes + if len(detected_block_sizes) > 1: + warnings.warn('Multiple thread-block maps with different sizes detected for ' + f'kernel "{kernelmap_entry.map.label}": {detected_block_sizes}. ' + f'Over-approximating to block size {block_size}.\n' + 'If this was not the intent, try tiling one of the thread-block maps to match.') + # both thread-block map and dynamic thread-block map exist at the same # time if has_dtbmap: From b870f968c657846765b5cad2c7cdf9fdd2f29e09 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 18:34:03 -0700 Subject: [PATCH 159/392] Error when gpu_block_size and thread-block map sizes conflict --- dace/codegen/targets/cuda.py | 40 ++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 9027b142e2..644989764b 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1803,11 +1803,12 @@ def get_kernel_dimensions(self, dfg_scope): ] else: def_bsize = Config.get('compiler', 'cuda', 'default_block_size') - warnings.warn(f'No ``gpu_block_size`` property specified on map "{kernelmap_entry.map.label}". ' - f'Falling back to the configuration entry ``compiler.cuda.default_block_size``: {def_bsize}. ' - 'You can either specify the block size to use with the gpu_block_size property, ' - 'or by adding nested ``GPU_Threadblock`` maps, which map work to individual threads. 
' - 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') + warnings.warn( + f'No `gpu_block_size` property specified on map "{kernelmap_entry.map.label}". ' + f'Falling back to the configuration entry `compiler.cuda.default_block_size`: {def_bsize}. ' + 'You can either specify the block size to use with the gpu_block_size property, ' + 'or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. ' + 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') if (Config.get('compiler', 'cuda', 'default_block_size') == 'max'): raise NotImplementedError('max dynamic block size unimplemented') @@ -1831,8 +1832,7 @@ def get_kernel_dimensions(self, dfg_scope): else: # Find all thread-block maps to determine overall block size - block_size = block_size if block_size is not None else [1, 1, 1] - detected_block_sizes = [block_size] + detected_block_sizes = [block_size] if block_size is not None else [] for tbmap, sym_map in tb_maps_sym_map: tbsize = [s.subs(list(sym_map.items())) for s in tbmap.range.size()[::-1]] @@ -1847,18 +1847,33 @@ def get_kernel_dimensions(self, dfg_scope): del tbsize[3:] # Extend to 3 dimensions if necessary - tbsize = tbsize + [1] * (len(block_size) - len(tbsize)) + tbsize = tbsize + [1] * (3 - len(tbsize)) - block_size = [sympy.Max(sz, bbsz) for sz, bbsz in zip(block_size, tbsize)] - if block_size != tbsize: + if len(detected_block_sizes) == 0: + block_size = tbsize + else: + block_size = [sympy.Max(sz, bbsz) for sz, bbsz in zip(block_size, tbsize)] + + if block_size != tbsize or len(detected_block_sizes) == 0: detected_block_sizes.append(tbsize) # TODO: If grid/block sizes contain elements only defined within the # kernel, raise an invalid SDFG exception and recommend # overapproximation. - # Warn when multiple detected block sizes have different sizes if len(detected_block_sizes) > 1: + + # Error when both gpu_block_size and thread-block maps were defined and conflict + if kernelmap_entry.map.gpu_block_size is not None: + raise ValueError('Both the `gpu_block_size` property and internal thread-block ' + 'maps were defined with conflicting sizes for kernel ' + f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). ' + 'Use `gpu_block_size` only if you do not need access to individual ' + 'thread-block threads, or explicit block-level synchronization (e.g., ' + '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or ' + '`GPU_ThreadBlock_Dynamic` schedules. For more information, see ' + 'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') + warnings.warn('Multiple thread-block maps with different sizes detected for ' f'kernel "{kernelmap_entry.map.label}": {detected_block_sizes}. 
' f'Over-approximating to block size {block_size}.\n' @@ -1873,6 +1888,9 @@ def get_kernel_dimensions(self, dfg_scope): if is_persistent: grid_size = ['gridDim.x', '1', '1'] + # Check block size against configured maximum values + # TODO "to increase this limit, modify the ``bla`` configuration entry" + return grid_size, block_size, len(tb_maps_sym_map) > 0, has_dtbmap, extra_dim_offsets def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_id: int, kernel_map: nodes.Map, From b8717a5cb2b5976dce7df68726ebd5490c950f39 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 18:41:10 -0700 Subject: [PATCH 160/392] Warnings and errors on mismatching block sizes --- tests/cuda_highdim_kernel_test.py | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/cuda_highdim_kernel_test.py b/tests/cuda_highdim_kernel_test.py index 79e0eaf6cd..00ae530933 100644 --- a/tests/cuda_highdim_kernel_test.py +++ b/tests/cuda_highdim_kernel_test.py @@ -134,9 +134,41 @@ def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): assert 'dim3(32, 16, 1)' in gpu_code.code +def test_block_size_mismatch_warning(): + @dace.program + def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:512:2, 0:512:2] @ dace.ScheduleType.GPU_Device: + for bi, bj in dace.map[0:2, 0:2] @ dace.ScheduleType.GPU_ThreadBlock: + a[i + bi, j + bj] = 1 + for bi, bj in dace.map[0:2, 0:1] @ dace.ScheduleType.GPU_ThreadBlock: + a[i + bi, j + bj] = 1 + + sdfg = tester.to_sdfg() + with pytest.warns(UserWarning, match='Multiple thread-block maps'): + sdfg.generate_code() + + +def test_block_size_mismatch_error(): + @dace.program + def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:512:2, 0:512:2] @ dace.ScheduleType.GPU_Device: + for bi, bj in dace.map[0:2, 0:2] @ dace.ScheduleType.GPU_ThreadBlock: + a[i + bi, j + bj] = 1 + + sdfg = tester.to_sdfg() + for n, _ in sdfg.all_nodes_recursive(): + if isinstance(n, dace.nodes.MapEntry) and n.schedule == dace.ScheduleType.GPU_Device: + n.gpu_block_size = [4, 2, 1] + + with pytest.raises(ValueError): + sdfg.generate_code() + + if __name__ == "__main__": test_cpu() test_gpu() test_highdim_implicit_block() test_highdim_implicit_block_threadsplit() test_highdim_default_block_size() + test_block_size_mismatch_warning() + test_block_size_mismatch_error() From 52f95e53971982159d5faed53ca0e075804abfab Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 19:10:25 -0700 Subject: [PATCH 161/392] Errors for block sizes that are too large --- dace/codegen/targets/cuda.py | 18 ++++++++++++++++-- dace/config_schema.yml | 19 +++++++++++++++++++ tests/cuda_highdim_kernel_test.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 644989764b..f2e11fee7b 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1888,8 +1888,22 @@ def get_kernel_dimensions(self, dfg_scope): if is_persistent: grid_size = ['gridDim.x', '1', '1'] - # Check block size against configured maximum values - # TODO "to increase this limit, modify the ``bla`` configuration entry" + # Check block size against configured maximum values, if those can be determined + total_bsize = prod(block_size) + total_limit = Config.get('compiler', 'cuda', 'block_size_limit') + lastdim_limit = Config.get('compiler', 'cuda', 'block_size_lastdim_limit') + if 
(total_bsize > total_limit) == True: + raise ValueError(f'Block size for kernel "{kernelmap_entry.map.label}" ({block_size}) ' + f'is larger than the possible number of threads per block ({total_limit}). ' + 'The kernel will potentially not run, please reduce the thread-block size. ' + 'To increase this limit, modify the `compiler.cuda.block_size_limit` ' + 'configuration entry.') + if (block_size[-1] > lastdim_limit) == True: + raise ValueError(f'Last block size dimension for kernel "{kernelmap_entry.map.label}" ({block_size}) ' + 'is larger than the possible number of threads in the last block dimension ' + f'({lastdim_limit}). The kernel will potentially not run, please reduce the ' + 'thread-block size. To increase this limit, modify the ' + '`compiler.cuda.block_size_lastdim_limit` configuration entry.') return grid_size, block_size, len(tb_maps_sym_map) > 0, has_dtbmap, extra_dim_offsets diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 30f5bdc924..df809e2264 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -395,6 +395,25 @@ required: For more information, see ``cudaMemPoolAttrReleaseThreshold`` in the CUDA toolkit documentation. + block_size_limit: + type: int + title: Maximum thread-block size in code generation + default: 1024 + description: > + Threshold for the GPU code generator to fail in generating a kernel with + a specified overall larger block size. Default value is derived from hardware + limits on common GPUs. + + block_size_lastdim_limit: + type: int + title: Maximum last dimension thread-block size in code generation + default: 64 + description: > + Threshold for the GPU code generator to fail in generating a kernel with + a specified larger block size in the third dimension. Default value is + derived from hardware limits on common GPUs. 
+ + ############################################# # General FPGA flags fpga: diff --git a/tests/cuda_highdim_kernel_test.py b/tests/cuda_highdim_kernel_test.py index 00ae530933..8a3dade4e5 100644 --- a/tests/cuda_highdim_kernel_test.py +++ b/tests/cuda_highdim_kernel_test.py @@ -164,6 +164,35 @@ def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): sdfg.generate_code() +def test_block_size_too_large(): + @dace.program + def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:1024, 0:1024] @ dace.ScheduleType.GPU_Device: + a[i, j] = 1 + + sdfg = tester.to_sdfg() + for n, _ in sdfg.all_nodes_recursive(): + if isinstance(n, dace.nodes.MapEntry) and n.schedule == dace.ScheduleType.GPU_Device: + n.gpu_block_size = [64, 32, 1] + + with pytest.raises(ValueError): + sdfg.generate_code() + + +def test_highdim_block_size_too_large(): + BX, BY, BZ, BW = 64, 2, 2, 2 + + @dace.program + def tester(a: dace.float64[1024, 2, 2, 20] @ dace.StorageType.GPU_Global): + for i, j, k, l in dace.map[0:16, 0:1, 0:1, 0:10:2] @ dace.ScheduleType.GPU_Device: + for bi, bj, bk, bl in dace.map[0:BX, 0:BY, 0:BZ, 0:BW] @ dace.ScheduleType.GPU_ThreadBlock: + a[i + bi, j + bj, k + bk, l + bl] = 1 + + sdfg = tester.to_sdfg() + with pytest.raises(ValueError): + sdfg.generate_code() + + if __name__ == "__main__": test_cpu() test_gpu() @@ -172,3 +201,5 @@ def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): test_highdim_default_block_size() test_block_size_mismatch_warning() test_block_size_mismatch_error() + test_block_size_too_large() + test_highdim_block_size_too_large() From 9c61c9d6d0b219c1ac775036340865f5eecd33e8 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 19:17:15 -0700 Subject: [PATCH 162/392] Fix comments in test --- tests/sdfg/disallowed_access_test.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/sdfg/disallowed_access_test.py b/tests/sdfg/disallowed_access_test.py index b23d92de7f..8700e34db5 100644 --- a/tests/sdfg/disallowed_access_test.py +++ b/tests/sdfg/disallowed_access_test.py @@ -50,7 +50,6 @@ def tester(a: dace.float64[20] @ dace.StorageType.GPU_Global): if __name__ == '__main__': - # test_gpu_access_on_host_interstate_ok() + test_gpu_access_on_host_interstate_ok() test_gpu_access_on_host_interstate_invalid() - # test_gpu_access_on_host_tasklet() - \ No newline at end of file + test_gpu_access_on_host_tasklet() From a6093b9986ef7ae29305f50233a70b11f58f93ce Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 20:06:43 -0700 Subject: [PATCH 163/392] Fix tests --- tests/cuda_block_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/cuda_block_test.py b/tests/cuda_block_test.py index 8888291810..f77e80673f 100644 --- a/tests/cuda_block_test.py +++ b/tests/cuda_block_test.py @@ -117,14 +117,14 @@ def tester(A: dace.float64[400, 300]): mapentry: dace.nodes.MapEntry = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) # Test 1: too many dimensions - mapentry.map.gpu_block_size = (257, 5, 3, 4) + mapentry.map.gpu_block_size = (13, 5, 3, 4) code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) - assert 'dim3(257, 5, 12)' in code + assert 'dim3(13, 5, 12)' in code # Test 2: too few dimensions - mapentry.map.gpu_block_size = (257, 5) + mapentry.map.gpu_block_size = (127, 5) code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) - assert 'dim3(257, 5, 1)' in code + assert 
'dim3(127, 5, 1)' in code # Test 3: compilation sdfg.compile() @@ -141,14 +141,14 @@ def tester(A: dace.float64[400, 300, 2, 32]): a = 1 sdfg = tester.to_sdfg() - sdfg.apply_gpu_transformations(sequential_innermaps=False) + sdfg.apply_gpu_transformations(sequential_innermaps=True) mapentry: dace.nodes.MapEntry = next( n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry) and n.map.schedule == dace.ScheduleType.GPU_Device) - mapentry.map.gpu_block_size = (257, 5) + mapentry.map.gpu_block_size = (127, 5) code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) - assert 'dim3(257, 5, 1)' in code + assert 'dim3(127, 5, 1)' in code # Test 3: compilation sdfg.compile() From c942f4b95fe67e079497606e039c889420a33af5 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 20:23:44 -0700 Subject: [PATCH 164/392] Run reference fix pass on SDFG after deepcopy --- dace/sdfg/sdfg.py | 10 ++++- dace/transformation/passes/fusion_inline.py | 43 ++++++++++++++++++++- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index adebe51c9b..8d9d442bbc 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -491,7 +491,14 @@ def __deepcopy__(self, memo): setattr(result, '_transformation_hist', copy.deepcopy(self._transformation_hist, memo)) result._sdfg_list = [] if self._parent_sdfg is None: + # Avoid import loops + from dace.transformation.passes.fusion_inline import FixNestedSDFGReferences + result._sdfg_list = result.reset_sdfg_list() + fixed = FixNestedSDFGReferences().apply_pass(result, {}) + if fixed: + warnings.warn(f'Fixed {fixed} nested SDFG parent references during deep copy.') + return result @property @@ -2615,7 +2622,8 @@ def apply_gpu_transformations(self, self.apply_transformations(GPUTransformSDFG, options=dict(sequential_innermaps=sequential_innermaps, - register_trans=register_transients, simplify=simplify), + register_trans=register_transients, + simplify=simplify), validate=validate, validate_all=validate_all, permissive=permissive, diff --git a/dace/transformation/passes/fusion_inline.py b/dace/transformation/passes/fusion_inline.py index abb4a9fe74..74f73e3c93 100644 --- a/dace/transformation/passes/fusion_inline.py +++ b/dace/transformation/passes/fusion_inline.py @@ -7,6 +7,7 @@ from typing import Any, Dict, Optional from dace import SDFG, properties +from dace.sdfg import nodes from dace.sdfg.utils import fuse_states, inline_sdfgs from dace.transformation import pass_pipeline as ppl @@ -20,7 +21,7 @@ class FuseStates(ppl.Pass): CATEGORY: str = 'Simplification' - permissive = properties.Property(dtype=bool, default=False, desc='If True, ignores some race conditions checks.') + permissive = properties.Property(dtype=bool, default=False, desc='If True, ignores some race condition checks.') progress = properties.Property(dtype=bool, default=None, allow_none=True, @@ -82,3 +83,43 @@ def apply_pass(self, sdfg: SDFG, _: Dict[str, Any]) -> Optional[int]: def report(self, pass_retval: int) -> str: return f'Inlined {pass_retval} SDFGs.' 
+ + +@dataclass(unsafe_hash=True) +@properties.make_properties +class FixNestedSDFGReferences(ppl.Pass): + """ + Fixes nested SDFG references to parent state/SDFG/node + """ + + CATEGORY: str = 'Simplification' + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return modified & (ppl.Modifies.States | ppl.Modifies.NestedSDFGs) + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.NestedSDFGs + + def apply_pass(self, sdfg: SDFG, _: Dict[str, Any]) -> Optional[int]: + modified = 0 + for node, state in sdfg.all_nodes_recursive(): + if not isinstance(node, nodes.NestedSDFG): + continue + was_modified = False + if node.sdfg.parent_nsdfg_node is not node: + was_modified = True + node.sdfg.parent_nsdfg_node = node + if node.sdfg.parent is not state: + was_modified = True + node.sdfg.parent = state + if node.sdfg.parent_sdfg is not state.parent: + was_modified = True + node.sdfg.parent_sdfg = state.parent + + if was_modified: + modified += 1 + + return modified or None + + def report(self, pass_retval: int) -> str: + return f'Fixed {pass_retval} nested SDFG references.' From 672fc30e7098be325290f91108996863fcdc4d5a Mon Sep 17 00:00:00 2001 From: Philipp Schaad Date: Thu, 29 Jun 2023 06:56:24 +0200 Subject: [PATCH 165/392] Make SDFG.name a proper property (#1289) --- dace/sdfg/sdfg.py | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index bee601e7b1..5cb14cd7b3 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -359,6 +359,7 @@ class SDFG(OrderedDiGraph[SDFGState, InterstateEdge]): the `Memlet` class documentation. """ + name = Property(dtype=str, desc="Name of the SDFG") arg_names = ListProperty(element_type=str, desc='Ordered argument names (used for calling conventions).') constants_prop = Property(dtype=dict, default={}, desc="Compile-time constants") _arrays = Property(dtype=dict, @@ -425,7 +426,7 @@ def __init__(self, :param parent: The parent SDFG or SDFG state (for nested SDFGs). """ super(SDFG, self).__init__() - self._name = name + self.name = name if name is not None and not validate_name(name): raise InvalidSDFGError('Invalid SDFG name "%s"' % name, self, None) @@ -1107,27 +1108,6 @@ def set_sourcecode(self, code: str, lang=None): """ self.sourcecode = {'code': code, 'language': lang} - @property - def name(self): - """ The name of this SDFG. """ - if self._name != self._orig_name: - return self._name - newname = self._orig_name - numbers = [] - for sdfg in self._sdfg_list: - if sdfg is not self and sdfg._orig_name == self._orig_name: - numbers.append(sdfg._num) - while self._num in numbers: - self._num += 1 - if self._num > 0: - newname = '{}_{}'.format(self._orig_name, self._num) - self._name = newname - return newname - - @name.setter - def name(self, newname: str): - self._name = newname - @property def label(self): """ The name of this SDFG. 
""" @@ -2240,7 +2220,7 @@ def compile(self, output_file=None, validate=True) -> \ # Rename SDFG to avoid runtime issues with clashing names index = 0 while sdfg.is_loaded(): - sdfg._name = f'{self._name}_{index}' + sdfg.name = f'{self.name}_{index}' index += 1 if self.name != sdfg.name: warnings.warn('SDFG "%s" is already loaded by another object, ' From 5d2ce3ed7e97f3748b83aab265e1a876d8b61a8e Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 28 Jun 2023 23:53:12 -0700 Subject: [PATCH 166/392] Apply review suggestions --- dace/runtime/include/dace/cuda/stream.cuh | 23 --------------------- dace/transformation/passes/fusion_inline.py | 2 +- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/dace/runtime/include/dace/cuda/stream.cuh b/dace/runtime/include/dace/cuda/stream.cuh index 2790f64f4f..3600a81ce9 100644 --- a/dace/runtime/include/dace/cuda/stream.cuh +++ b/dace/runtime/include/dace/cuda/stream.cuh @@ -141,29 +141,6 @@ class GPUStream { m_data[get_addr(allocation)] = item; } - /* - __device__ __forceinline__ void push(T *items, int count) - { - // Perform a warp-wide scan to get thread offsets - typedef cub::WarpScan WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage[4]; - int offset; - int warp_id = threadIdx.x / 32; - WarpScan(temp_storage[warp_id]).ExclusiveSum(count, offset); - - // Atomic-add the total count once per warp - uint32_t addr; - if (threadIdx.x & 31 == 31) // Last thread - addr = atomicAdd(m_pending, offset + count); - // Broadcast starting address - addr = cub::ShuffleIndex(addr, 31, 0xffffffff); - - // Copy data from each thread - for(int i = 0; i < count; ++i) - m_data[get_addr(addr + offset + i)] = items[i]; - } - */ - __device__ __forceinline__ void prepend(const T &item) { uint32_t allocation = atomicAggDec(m_start) - 1; m_data[get_addr(allocation)] = item; diff --git a/dace/transformation/passes/fusion_inline.py b/dace/transformation/passes/fusion_inline.py index 74f73e3c93..93764670e8 100644 --- a/dace/transformation/passes/fusion_inline.py +++ b/dace/transformation/passes/fusion_inline.py @@ -92,7 +92,7 @@ class FixNestedSDFGReferences(ppl.Pass): Fixes nested SDFG references to parent state/SDFG/node """ - CATEGORY: str = 'Simplification' + CATEGORY: str = 'Cleanup' def should_reapply(self, modified: ppl.Modifies) -> bool: return modified & (ppl.Modifies.States | ppl.Modifies.NestedSDFGs) From e66392fb8eb7cc275edecf47f5b9edeb0d1c7e77 Mon Sep 17 00:00:00 2001 From: Phillip Allen Lane Date: Fri, 30 Jun 2023 05:23:34 -0500 Subject: [PATCH 167/392] Fix ROCm issue where target ID is set to gfxgfx*** (#1290) --- AUTHORS | 1 + dace/codegen/targets/cuda.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/AUTHORS b/AUTHORS index 9b7763593e..573f142cf9 100644 --- a/AUTHORS +++ b/AUTHORS @@ -35,5 +35,6 @@ Tiancheng Chen Reid Wahl Yihang Luo Alexandru Calotoiu +Phillip Lane and other contributors listed in https://github.com/spcl/dace/graphs/contributors diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index f2e11fee7b..f4db868730 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -497,7 +497,7 @@ def cmake_options(): hip_arch = [ha for ha in hip_arch if ha is not None and len(ha) > 0] flags = Config.get("compiler", "cuda", "hip_args") - flags += ' ' + ' '.join('--offload-arch=gfx{arch}'.format(arch=arch) for arch in hip_arch) + flags += ' ' + ' '.join('--offload-arch={arch}'.format(arch=arch if arch.startswith("gfx") else "gfx" + arch) for arch in hip_arch) 
options.append("-DEXTRA_HIP_FLAGS=\"{}\"".format(flags)) if Config.get('compiler', 'cpu', 'executable'): From a43c8bb4aaabfaac0863e0e1d6e81641ef800e14 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 30 Jun 2023 15:52:52 +0200 Subject: [PATCH 168/392] Quick-fix for npbench-spmv-fusion. --- .../interstate/gpu_transform_sdfg.py | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/dace/transformation/interstate/gpu_transform_sdfg.py b/dace/transformation/interstate/gpu_transform_sdfg.py index 7d0e7be4d2..c33fd6ae29 100644 --- a/dace/transformation/interstate/gpu_transform_sdfg.py +++ b/dace/transformation/interstate/gpu_transform_sdfg.py @@ -195,11 +195,14 @@ def apply(self, _, sdfg: sd.SDFG): ####################################################### # Step 1: Create cloned GPU arrays and replace originals + data_already_on_gpu = {} + cloned_arrays = {} for inodename, inode in set(input_nodes): - if isinstance(inode, data.Scalar): # Scalars can remain on host - continue if inode.storage == dtypes.StorageType.GPU_Global: + data_already_on_gpu[inodename] = None + continue + if isinstance(inode, data.Scalar): # Scalars can remain on host continue newdesc = inode.clone() newdesc.storage = dtypes.StorageType.GPU_Global @@ -208,9 +211,10 @@ def apply(self, _, sdfg: sd.SDFG): cloned_arrays[inodename] = name for onodename, onode in set(output_nodes): - if onodename in cloned_arrays: - continue if onode.storage == dtypes.StorageType.GPU_Global: + data_already_on_gpu[onodename] = None + continue + if onodename in cloned_arrays: continue newdesc = onode.clone() newdesc.storage = dtypes.StorageType.GPU_Global @@ -246,6 +250,12 @@ def apply(self, _, sdfg: sd.SDFG): if not found_full_write: input_nodes.append((onodename, onode)) + for edge in sdfg.edges(): + memlets = edge.data.get_read_memlets(sdfg.arrays) + for mem in memlets: + if sdfg.arrays[mem.data].storage == dtypes.StorageType.GPU_Global: + data_already_on_gpu[mem.data] = None + # Replace nodes for state in sdfg.nodes(): for node in state.nodes(): @@ -459,7 +469,7 @@ def apply(self, _, sdfg: sd.SDFG): ####################################################### # Step 8: Introduce copy-out if data used in outgoing interstate edges - cloned_data = set(cloned_arrays.keys()).union(gpu_scalars.keys()) + cloned_data = set(cloned_arrays.keys()).union(gpu_scalars.keys()).union(data_already_on_gpu.keys()) for state in list(sdfg.nodes()): arrays_used = set() @@ -493,6 +503,17 @@ def apply(self, _, sdfg: sd.SDFG): else: desc = sdfg.arrays[hostname] devicename = nname + elif nname in data_already_on_gpu: + hostname = data_already_on_gpu[nname] + if not hostname: + desc = sdfg.arrays[nname].clone() + desc.storage = dtypes.StorageType.CPU_Heap + desc.transient = True + hostname = sdfg.add_datadesc('host_' + nname, desc, find_new_name=True) + data_already_on_gpu[nname] = hostname + else: + desc = sdfg.arrays[hostname] + devicename = nname else: desc = sdfg.arrays[nname] hostname = nname From 3e83820b7e9c1d5425801219acc010c332989eb5 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 3 Jul 2023 10:53:18 -0700 Subject: [PATCH 169/392] Refactor and fix performance regression with GPU runtime checks (#1292) --- dace/codegen/common.py | 34 +++++++---------------------- dace/codegen/compiled_sdfg.py | 4 ++-- dace/codegen/tools/gpu_runtime.py | 36 +++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 28 deletions(-) create mode 100644 dace/codegen/tools/gpu_runtime.py diff --git a/dace/codegen/common.py 
b/dace/codegen/common.py index e8f2972c63..5dafc696cf 100644 --- a/dace/codegen/common.py +++ b/dace/codegen/common.py @@ -1,4 +1,4 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import ast from copy import deepcopy import ctypes.util @@ -6,6 +6,7 @@ from dace.sdfg import SDFG from dace.properties import CodeBlock from dace.codegen import cppunparse +from dace.codegen.tools import gpu_runtime from functools import lru_cache from io import StringIO import os @@ -146,7 +147,11 @@ def _try_execute(cmd: str) -> bool: 'to either "cuda" or "hip".') -def get_gpu_runtime_library() -> ctypes.CDLL: +@lru_cache() +def get_gpu_runtime() -> gpu_runtime.GPURuntime: + """ + Returns the GPU runtime library (CUDA / HIP) if exists. The result is cached for performance. + """ backend = get_gpu_backend() if backend == 'cuda': libpath = ctypes.util.find_library('cudart') @@ -165,27 +170,4 @@ def get_gpu_runtime_library() -> ctypes.CDLL: raise RuntimeError(f'GPU runtime library for {backend} not found. Please set the {envname} ' 'environment variable to point to the libraries.') - return ctypes.CDLL(libpath) - - -def get_gpu_runtime_error_string(err: int) -> str: - lib = get_gpu_runtime_library() - - # Obtain the error string - geterrorstring = getattr(lib, f'{get_gpu_backend()}GetErrorString') - geterrorstring.restype = ctypes.c_char_p - return geterrorstring(err).decode('utf-8') - - -def get_gpu_runtime_last_error() -> str: - lib = get_gpu_runtime_library() - - getlasterror = getattr(lib, f'{get_gpu_backend()}GetLastError') - res: int = getlasterror() - if res == 0: - return None - - # Obtain the error string - geterrorstring = getattr(lib, f'{get_gpu_backend()}GetErrorString') - geterrorstring.restype = ctypes.c_char_p - return geterrorstring(res).decode('utf-8') + return gpu_runtime.GPURuntime(backend, libpath) diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 4538d6d9b4..ea1b9e9cb8 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -319,7 +319,7 @@ def finalize(self): def _get_error_text(self, result: Union[str, int]) -> str: if self.has_gpu_code: if isinstance(result, int): - result = common.get_gpu_runtime_error_string(result) + result = common.get_gpu_runtime().get_error_string(result) return (f'{result}. Consider enabling synchronous debugging mode (environment variable: ' 'DACE_compiler_cuda_syncdebug=1) to see where the issue originates from.') else: @@ -345,7 +345,7 @@ def __call__(self, *args, **kwargs): if self.has_gpu_code: # Optionally get errors from call try: - lasterror = common.get_gpu_runtime_last_error() + lasterror = common.get_gpu_runtime().get_last_error_string() except RuntimeError as ex: warnings.warn(f'Could not get last error from GPU runtime: {ex}') lasterror = None diff --git a/dace/codegen/tools/gpu_runtime.py b/dace/codegen/tools/gpu_runtime.py new file mode 100644 index 0000000000..1a2c4abcef --- /dev/null +++ b/dace/codegen/tools/gpu_runtime.py @@ -0,0 +1,36 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" +GPU runtime testing functionality. Used for checking error codes after GPU-capable SDFG execution. +""" +import ctypes +from typing import Optional + + +class GPURuntime: + """ + GPU runtime object containing the library (CUDA / HIP) and some functions to query errors. 
+ """ + + def __init__(self, backend_name: str, path: str) -> None: + self.backend = backend_name + self.library = ctypes.CDLL(path) + + # Prefetch runtime functions + self._geterrorstring = getattr(self.library, f'{self.backend}GetErrorString') + self._geterrorstring.restype = ctypes.c_char_p + self._getlasterror = getattr(self.library, f'{self.backend}GetLastError') + + def get_error_string(self, err: int) -> str: + # Obtain the error string + return self._geterrorstring(err).decode('utf-8') + + def get_last_error(self) -> int: + return self._getlasterror() + + def get_last_error_string(self) -> Optional[str]: + res: int = self._getlasterror() + if res == 0: + return None + + # Obtain the error string + return self.get_error_string(res) From 5a2938c2697e423f4444bf362efdd9a71e30a198 Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Mon, 3 Jul 2023 23:43:59 +0200 Subject: [PATCH 170/392] basic allocatable functionality --- dace/frontend/fortran/ast_components.py | 34 +++++++ dace/frontend/fortran/ast_internal_classes.py | 20 ++++ dace/frontend/fortran/ast_transforms.py | 11 +-- dace/frontend/fortran/fortran_parser.py | 96 +++++++++++++------ tests/fortran/allocate_test.py | 51 ++++++++++ 5 files changed, 176 insertions(+), 36 deletions(-) create mode 100644 tests/fortran/allocate_test.py diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index e386bae23b..e917409017 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -248,6 +248,12 @@ def __init__(self, ast: f03.Program, tables: symbol_table.SymbolTables): "Structure_Constructor": self.structure_constructor, "Component_Spec_List": self.component_spec_list, "Write_Stmt": self.write_stmt, + "Assumed_Shape_Spec_List": self.assumed_shape_spec_list, + "Allocate_Stmt": self.allocate_stmt, + "Allocation_List": self.allocation_list, + "Allocation": self.allocation, + "Allocate_Shape_Spec": self.allocate_shape_spec, + "Allocate_Shape_Spec_List": self.allocate_shape_spec_list, } def list_tables(self): @@ -359,6 +365,30 @@ def array_constructor(self, node: FASTNode): value_list = get_child(children, ast_internal_classes.Ac_Value_List_Node) return ast_internal_classes.Array_Constructor_Node(value_list=value_list.value_list) + def allocate_stmt(self, node: FASTNode): + children = self.create_children(node) + return ast_internal_classes.Allocate_Stmt_Node(allocation_list=children[1]) + + def allocation_list(self, node: FASTNode): + children = self.create_children(node) + return children + + def allocation(self, node: FASTNode): + children = self.create_children(node) + name = get_child(children, ast_internal_classes.Name_Node) + shape = get_child(children, ast_internal_classes.Allocate_Shape_Spec_List) + return ast_internal_classes.Allocation_Node(name=name, shape=shape) + + def allocate_shape_spec_list(self, node: FASTNode): + children = self.create_children(node) + return ast_internal_classes.Allocate_Shape_Spec_List(shape_list=children) + + def allocate_shape_spec(self, node: FASTNode): + children = self.create_children(node) + if len(children) != 2: + raise NotImplementedError("Only simple allocate shape specs are supported") + return children[1] + def structure_constructor(self, node: FASTNode): children = self.create_children(node) name = get_child(children, ast_internal_classes.Type_Name_Node) @@ -490,6 +520,10 @@ def declaration_type_spec(self, node: FASTNode): raise NotImplementedError("Declaration type spec is not supported yet") return node + 
def assumed_shape_spec_list(self, node: FASTNode): + + return node + def type_declaration_stmt(self, node: FASTNode): #decide if its a intrinsic variable type or a derived type diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py index f4dba68fb4..6bdfb61faf 100644 --- a/dace/frontend/fortran/ast_internal_classes.py +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -158,6 +158,26 @@ class Type_Decl_Node(Statement_Node): _fields = () +class Allocate_Shape_Spec_Node(FNode): + _attributes = () + _fields = ('sizes', ) + + +class Allocate_Shape_Spec_List(FNode): + _attributes = () + _fields = ('shape_list', ) + + +class Allocation_Node(FNode): + _attributes = ('name', ) + _fields = ('shape', ) + + +class Allocate_Stmt_Node(FNode): + _attributes = () + _fields = ('allocation_list', ) + + class Symbol_Decl_Node(Statement_Node): _attributes = ( 'name', diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index 62c5ad0c7e..7e5cd3bf00 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -357,10 +357,7 @@ def visit_Array_Subscript_Node(self, node: ast_internal_classes.Array_Subscript_ new_indices.append(ast_internal_classes.Name_Node(name="tmp_index_" + str(tmp))) tmp = tmp + 1 self.count = tmp - return ast_internal_classes.Array_Subscript_Node( - name=node.name, - indices=new_indices, - ) + return ast_internal_classes.Array_Subscript_Node(name=node.name, indices=new_indices) def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): newbody = [] @@ -543,8 +540,8 @@ def localFunctionStatementEliminator(node: ast_internal_classes.FNode): i.lval, ast_internal_classes.Structure_Constructor_Node): function_statement_name = i.lval.name is_actually_function_statement = False - # In Fortran, function statement are defined as scalar values, - # but called as arrays, so by identifiying that it is called as + # In Fortran, function statement are defined as scalar values, + # but called as arrays, so by identifiying that it is called as # a call_expr or structure_constructor, we also need to match # the specification part and see that it is scalar rather than an array. 
found = False @@ -562,7 +559,7 @@ def localFunctionStatementEliminator(node: ast_internal_classes.FNode): if is_actually_function_statement: to_change.append([i.lval, i.rval]) new_exec.remove(i) - + else: #There are no function statements after the first one that isn't a function statement break diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index 3f3df33997..4180b45371 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -42,6 +42,7 @@ def __init__(self, ast: ast_components.InternalFortranAst, source: str): self.views = 0 self.libstates = [] self.file_name = source + self.unallocated_arrays = [] self.all_array_names = [] self.last_sdfg_states = {} self.last_loop_continues = {} @@ -64,8 +65,8 @@ def __init__(self, ast: ast_components.InternalFortranAst, source: str): ast_internal_classes.Call_Expr_Node: self.call2sdfg, ast_internal_classes.Program_Node: self.ast2sdfg, ast_internal_classes.Write_Stmt_Node: self.write2sdfg, + ast_internal_classes.Allocate_Stmt_Node: self.allocate2sdfg, } - def get_dace_type(self, type): """ @@ -176,6 +177,44 @@ def basicblock2sdfg(self, node: ast_internal_classes.Execution_Part_Node, sdfg: for i in node.execution: self.translate(i, sdfg) + def allocate2sdfg(self, node: ast_internal_classes.Allocate_Stmt_Node, sdfg: SDFG): + """ + This function is responsible for translating Fortran allocate statements into a SDFG. + :param node: The node to be translated + :param sdfg: The SDFG to which the node should be translated + :note: We pair the allocate with a list of unallocated arrays. + """ + for i in node.allocation_list: + for j in self.unallocated_arrays: + if j[0] == i.name.name and sdfg == j[2]: + datatype = j[1] + transient = j[3] + self.unallocated_arrays.remove(j) + offset_value = -1 + sizes = [] + offset = [] + for j in i.shape.shape_list: + tw = ast_utils.TaskletWriter([], [], sdfg, self.name_mapping) + text = tw.write_code(j) + sizes.append(sym.pystr_to_symbolic(text)) + offset.append(offset_value) + strides = [dat._prod(sizes[:i]) for i in range(len(sizes))] + self.name_mapping[sdfg][i.name.name] = sdfg._find_new_name(i.name.name) + + self.all_array_names.append(self.name_mapping[sdfg][i.name.name]) + if self.contexts.get(sdfg.name) is None: + self.contexts[sdfg.name] = ast_utils.Context(name=sdfg.name) + if i.name.name not in self.contexts[sdfg.name].containers: + self.contexts[sdfg.name].containers.append(i.name.name) + sdfg.add_array(self.name_mapping[sdfg][i.name.name], + shape=sizes, + dtype=datatype, + offset=offset, + strides=strides, + transient=transient) + + #raise NotImplementedError("Fortran allocate statements are not implemented yet") + def write2sdfg(self, node: ast_internal_classes.Write_Stmt_Node, sdfg: SDFG): #TODO implement raise NotImplementedError("Fortran write statements are not implemented yet") @@ -321,7 +360,6 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, write_names = list(dict.fromkeys([i.name for i in output_vars])) read_names = list(dict.fromkeys([i.name for i in input_vars])) - # Collect the parameters and the function signature to comnpare and link parameters = node.args.copy() @@ -493,7 +531,6 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, ins_in_new_sdfg.append(self.name_mapping[new_sdfg][local_name.name]) if local_name.name in write_names: outs_in_new_sdfg.append(self.name_mapping[new_sdfg][local_name.name]) - indices = 0 if 
isinstance(variable_in_call, ast_internal_classes.Array_Subscript_Node): @@ -568,7 +605,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) if i in write_names: outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) - + array_in_global = self.globalsdfg.arrays[self.name_mapping[self.globalsdfg][i]] if isinstance(array_in_global, Scalar): new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], array_in_global.dtype, transient=False) @@ -580,7 +617,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, transient=False, strides=array_in_global.strides, offset=array_in_global.offset) - # This handles the case where the function is called with wrriten but not read variables found in a module + # This handles the case where the function is called with wrriten but not read variables found in a module for i in not_found_write_names: if i in not_found_read_names: continue @@ -593,7 +630,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) if i in write_names: outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) - + array = sdfg.arrays[self.name_mapping[sdfg][i]] if isinstance(array_in_global, Scalar): new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], array_in_global.dtype, transient=False) @@ -613,7 +650,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, ins_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) if i in write_names: outs_in_new_sdfg.append(self.name_mapping[new_sdfg][i]) - + array = self.globalsdfg.arrays[self.name_mapping[self.globalsdfg][i]] if isinstance(array_in_global, Scalar): new_sdfg.add_scalar(self.name_mapping[new_sdfg][i], array_in_global.dtype, transient=False) @@ -631,16 +668,16 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, ins_in_new_sdfg, outs_in_new_sdfg, symbol_mapping=sym_dict) - + # Now adding memlets for i in self.libstates: memlet = "0" if i in write_names: ast_utils.add_memlet_write(substate, self.name_mapping[sdfg][i], internal_sdfg, - self.name_mapping[new_sdfg][i], memlet) + self.name_mapping[new_sdfg][i], memlet) if i in read_names: ast_utils.add_memlet_read(substate, self.name_mapping[sdfg][i], internal_sdfg, - self.name_mapping[new_sdfg][i], memlet) + self.name_mapping[new_sdfg][i], memlet) for i in variables_in_call: @@ -658,7 +695,6 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, else: raise NameError("Variable name not found: " + ast_utils.get_name(i)) - if not hasattr(var, "shape") or len(var.shape) == 0: memlet = "" elif (len(var.shape) == 1 and var.shape[0] == 1): @@ -687,29 +723,29 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, if not found: if local_name.name in write_names: ast_utils.add_memlet_write(substate, mapped_name, internal_sdfg, - self.name_mapping[new_sdfg][local_name.name], memlet) + self.name_mapping[new_sdfg][local_name.name], memlet) if local_name.name in read_names: ast_utils.add_memlet_read(substate, mapped_name, internal_sdfg, - self.name_mapping[new_sdfg][local_name.name], memlet) + self.name_mapping[new_sdfg][local_name.name], memlet) for i in addedmemlets: memlet = ast_utils.generate_memlet(ast_internal_classes.Name_Node(name=i), sdfg, self) if local_name.name in write_names: ast_utils.add_memlet_write(substate, self.name_mapping[sdfg][i], internal_sdfg, - 
self.name_mapping[new_sdfg][i], memlet) + self.name_mapping[new_sdfg][i], memlet) if local_name.name in read_names: ast_utils.add_memlet_read(substate, self.name_mapping[sdfg][i], internal_sdfg, - self.name_mapping[new_sdfg][i], memlet) + self.name_mapping[new_sdfg][i], memlet) for i in globalmemlets: memlet = ast_utils.generate_memlet(ast_internal_classes.Name_Node(name=i), sdfg, self) if local_name.name in write_names: ast_utils.add_memlet_write(substate, self.name_mapping[self.globalsdfg][i], internal_sdfg, - self.name_mapping[new_sdfg][i], memlet) + self.name_mapping[new_sdfg][i], memlet) if local_name.name in read_names: ast_utils.add_memlet_read(substate, self.name_mapping[self.globalsdfg][i], internal_sdfg, - self.name_mapping[new_sdfg][i], memlet) + self.name_mapping[new_sdfg][i], memlet) #Finally, now that the nested sdfg is built and the memlets are added, we can parse the internal of the subroutine and add it to the SDFG. @@ -731,7 +767,6 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, self.translate(i, new_sdfg) self.translate(node.execution_part, new_sdfg) - def binop2sdfg(self, node: ast_internal_classes.BinOp_Node, sdfg: SDFG): """ This parses binary operations to tasklets in a new state or creates @@ -787,8 +822,8 @@ def binop2sdfg(self, node: ast_internal_classes.BinOp_Node, sdfg: SDFG): output_names_changed = [o_t + "_out" for o_t in output_names] tasklet = ast_utils.add_tasklet(substate, "_l" + str(node.line_number[0]) + "_c" + str(node.line_number[1]), - input_names_tasklet, output_names_changed, "text", node.line_number, - self.file_name) + input_names_tasklet, output_names_changed, "text", node.line_number, + self.file_name) for i, j in zip(input_names, input_names_tasklet): memlet_range = self.get_memlet_range(sdfg, input_vars, i, j) @@ -799,12 +834,11 @@ def binop2sdfg(self, node: ast_internal_classes.BinOp_Node, sdfg: SDFG): memlet_range = self.get_memlet_range(sdfg, output_vars, i, j) ast_utils.add_memlet_write(substate, i, tasklet, k, memlet_range) tw = ast_utils.TaskletWriter(output_names, output_names_changed, sdfg, self.name_mapping, input_names, - input_names_tasklet) + input_names_tasklet) text = tw.write_code(node) tasklet.code = CodeBlock(text, lang.Python) - def call2sdfg(self, node: ast_internal_classes.Call_Expr_Node, sdfg: SDFG): """ This parses function calls to a nested SDFG @@ -879,7 +913,7 @@ def call2sdfg(self, node: ast_internal_classes.Call_Expr_Node, sdfg: SDFG): output_names_changed.append(o_t + "_out") tw = ast_utils.TaskletWriter(output_names_tasklet.copy(), output_names_changed.copy(), sdfg, - self.name_mapping) + self.name_mapping) if not isinstance(rettype, ast_internal_classes.Void) and hasret: special_list_in[retval.name] = pointer(self.get_dace_type(rettype)) special_list_out.append(retval.name + "_out") @@ -896,15 +930,15 @@ def call2sdfg(self, node: ast_internal_classes.Call_Expr_Node, sdfg: SDFG): }, output_names_changed + special_list_out, "text", node.line_number, self.file_name) if libstate is not None: ast_utils.add_memlet_read(substate, self.name_mapping[sdfg][libstate], tasklet, - self.name_mapping[sdfg][libstate] + "_task", "0") + self.name_mapping[sdfg][libstate] + "_task", "0") ast_utils.add_memlet_write(substate, self.name_mapping[sdfg][libstate], tasklet, - self.name_mapping[sdfg][libstate] + "_task_out", "0") + self.name_mapping[sdfg][libstate] + "_task_out", "0") if not isinstance(rettype, ast_internal_classes.Void) and hasret: ast_utils.add_memlet_read(substate, 
self.name_mapping[sdfg][retval.name], tasklet, retval.name, "0") ast_utils.add_memlet_write(substate, self.name_mapping[sdfg][retval.name], tasklet, - retval.name + "_out", "0") + retval.name + "_out", "0") for i, j in zip(input_names, input_names_tasklet): memlet_range = self.get_memlet_range(sdfg, used_vars, i, j) @@ -938,6 +972,10 @@ def vardecl2sdfg(self, node: ast_internal_classes.Var_Decl_Node, sdfg: SDFG): transient = True # find the type datatype = self.get_dace_type(node.type) + if hasattr(node, "alloc"): + if node.alloc: + self.unallocated_arrays.append([node.name, datatype, sdfg, transient]) + return # get the dimensions if node.sizes is not None: sizes = [] @@ -954,10 +992,10 @@ def vardecl2sdfg(self, node: ast_internal_classes.Var_Decl_Node, sdfg: SDFG): # create and check name - if variable is already defined (function argument and defined in declaration part) simply stop if self.name_mapping[sdfg].get(node.name) is not None: return - + if node.name in sdfg.symbols: return - + self.name_mapping[sdfg][node.name] = sdfg._find_new_name(node.name) if sizes is None: @@ -970,7 +1008,7 @@ def vardecl2sdfg(self, node: ast_internal_classes.Var_Decl_Node, sdfg: SDFG): offset=offset, strides=strides, transient=transient) - + self.all_array_names.append(self.name_mapping[sdfg][node.name]) if self.contexts.get(sdfg.name) is None: self.contexts[sdfg.name] = ast_utils.Context(name=sdfg.name) diff --git a/tests/fortran/allocate_test.py b/tests/fortran/allocate_test.py new file mode 100644 index 0000000000..8d7dbbc856 --- /dev/null +++ b/tests/fortran/allocate_test.py @@ -0,0 +1,51 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. + +from fparser.common.readfortran import FortranStringReader +from fparser.common.readfortran import FortranFileReader +from fparser.two.parser import ParserFactory +import sys, os +import numpy as np +import pytest + +from dace import SDFG, SDFGState, nodes, dtypes, data, subsets, symbolic +from dace.frontend.fortran import fortran_parser +from fparser.two.symbol_table import SymbolTable +from dace.sdfg import utils as sdutil + +import dace.frontend.fortran.ast_components as ast_components +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_utils as ast_utils +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + + +def test_fortran_frontend_basic_allocate(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. 
+ """ + test_string = """ + PROGRAM allocate_test + implicit none + double precision, allocatable :: d(:,:) + allocate(d(4,5)) + CALL allocate_test_function(d) + end + + SUBROUTINE allocate_test_function(d) + double precision d(4,5) + + d(2,1)=5.5 + + END SUBROUTINE allocate_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "allocate_test") + sdfg.simplify(verbose=True) + a = np.full([4,5], 42, order="F", dtype=np.float64) + sdfg(d=a) + assert (a[0,0] == 42) + assert (a[1,0] == 5.5) + assert (a[2,0] == 42) + + +if __name__ == "__main__": + + test_fortran_frontend_basic_allocate() From b79b00854096e56368c1e83267ba922190bac9d9 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 3 Jul 2023 23:37:11 -0700 Subject: [PATCH 171/392] Externally-managed memory lifetime --- dace/codegen/common.py | 4 +- dace/codegen/compiled_sdfg.py | 50 ++++++++-- dace/codegen/dispatcher.py | 17 +++- dace/codegen/targets/cpp.py | 14 +-- dace/codegen/targets/cpu.py | 19 ++-- dace/codegen/targets/cuda.py | 15 ++- dace/codegen/targets/fpga.py | 2 +- dace/codegen/targets/framecode.py | 46 ++++++++- dace/codegen/targets/snitch.py | 5 +- dace/dtypes.py | 1 + dace/sdfg/sdfg.py | 4 +- dace/sdfg/validation.py | 16 ++- .../transformation/interstate/sdfg_nesting.py | 6 +- .../passes/dead_dataflow_elimination.py | 2 +- .../transformation/passes/scalar_to_symbol.py | 6 +- tests/codegen/external_memory_test.py | 98 +++++++++++++++++++ 16 files changed, 258 insertions(+), 47 deletions(-) create mode 100644 tests/codegen/external_memory_test.py diff --git a/dace/codegen/common.py b/dace/codegen/common.py index 5dafc696cf..37cfb864eb 100644 --- a/dace/codegen/common.py +++ b/dace/codegen/common.py @@ -74,7 +74,7 @@ def update_persistent_desc(desc: data.Data, sdfg: SDFG): Replaces the symbols used in a persistent data descriptor according to NestedSDFG's symbol mapping. The replacement happens recursively up to the top-level SDFG. """ - if (desc.lifetime == dtypes.AllocationLifetime.Persistent and sdfg.parent + if (desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External) and sdfg.parent and any(str(s) in sdfg.parent_nsdfg_node.symbol_mapping for s in desc.free_symbols)): newdesc = deepcopy(desc) csdfg = sdfg @@ -155,7 +155,7 @@ def get_gpu_runtime() -> gpu_runtime.GPURuntime: backend = get_gpu_backend() if backend == 'cuda': libpath = ctypes.util.find_library('cudart') - if os.name == 'nt' and not libpath: # Windows-based search + if os.name == 'nt' and not libpath: # Windows-based search for version in (12, 11, 10, 9): libpath = ctypes.util.find_library(f'cudart64_{version}0') if libpath: diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index ea1b9e9cb8..d0d29cfa1e 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -147,21 +147,20 @@ def __exit__(self, *args, **kwargs): self.unload() -def _array_interface_ptr(array: Any, array_type: dt.Array) -> int: +def _array_interface_ptr(array: Any, storage: dtypes.StorageType) -> int: """ If the given array implements ``__array_interface__`` (see ``dtypes.is_array``), returns the base host or device pointer to the array's allocated memory. :param array: Array object that implements NumPy's array interface. - :param array_type: Data descriptor of the array (used to get storage - location to determine whether it's a host or GPU device - pointer). + :param array_type: Storage location of the array, used to determine whether + it is a host or device pointer (e.g. GPU). 
:return: A pointer to the base location of the allocated buffer. """ if hasattr(array, 'data_ptr'): return array.data_ptr() - if array_type.storage == dtypes.StorageType.GPU_Global: + if storage == dtypes.StorageType.GPU_Global: return array.__cuda_array_interface__['data'][0] return array.__array_interface__['data'][0] @@ -200,10 +199,13 @@ def __init__(self, sdfg, lib: ReloadableDLL, argnames: List[str] = None): self.argnames = argnames self.has_gpu_code = False + self.external_memory_types = set() for _, _, aval in self._sdfg.arrays_recursive(): if aval.storage in dtypes.GPU_STORAGES: self.has_gpu_code = True break + if aval.lifetime == dtypes.AllocationLifetime.External: + self.external_memory_types.add(aval.storage) if not self.has_gpu_code: for node, _ in self._sdfg.all_nodes_recursive(): if getattr(node, 'schedule', False) in dtypes.GPU_SCHEDULES: @@ -271,6 +273,42 @@ class State(ctypes.Structure): return State + def get_workspace_sizes(self) -> Dict[dtypes.StorageType, int]: + """ + Returns the total external memory size to be allocated for this SDFG. + + :return: A dictionary mapping storage types to the number of bytes necessary + to allocate for the SDFG to work properly. + """ + if not self._initialized: + raise ValueError('Compiled SDFG is uninitialized, please call ``initialize`` prior to ' + 'querying external memory size.') + + result: Dict[dtypes.StorageType, int] = {} + for storage in self.external_memory_types: + func = self._lib.get_symbol(f'__dace_get_external_memory_size_{storage.name}') + result[storage] = func(self._libhandle, *self._lastargs[1]) + + return result + + def set_workspace(self, storage: dtypes.StorageType, workspace: Any): + """ + Sets the workspace for the given storage type to the given buffer. + + :param storage: The storage type to fill. + :param workspace: An array-convertible object (through ``__[cuda_]array_interface__``, + see ``_array_interface_ptr``) to use for the workspace. + """ + if not self._initialized: + raise ValueError('Compiled SDFG is uninitialized, please call ``initialize`` prior to ' + 'setting external memory.') + if storage not in self.external_memory_types: + raise ValueError(f'Compiled SDFG does not specify external memory of {storage}') + + func = self._lib.get_symbol(f'__dace_set_external_memory_{storage.name}', None) + ptr = _array_interface_ptr(workspace, storage) + func(self._libhandle, ctypes.c_void_p(ptr), *self._lastargs[1]) + @property def filename(self): return self._lib._library_filename @@ -487,7 +525,7 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: for arg, actype, atype, aname in callparams if aname in symbols) # Replace arrays with their base host/device pointers - newargs = tuple((ctypes.c_void_p(_array_interface_ptr(arg, atype)), actype, + newargs = tuple((ctypes.c_void_p(_array_interface_ptr(arg, atype.storage)), actype, atype) if dtypes.is_array(arg) else (arg, actype, atype) for arg, actype, atype, _ in callparams) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 103cb5fa2e..0b4f58d5ef 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -32,6 +32,7 @@ class DefinedMemlets: referenced correctly in nested scopes and SDFGs. The ones defined in the first (top) scope, refer to global variables. 
""" + def __init__(self): self._scopes = [(None, {}, True), (None, {}, True)] @@ -142,6 +143,7 @@ def remove(self, name: str, ancestor: int = 0, is_global: bool = False) -> Tuple class TargetDispatcher(object): """ Dispatches sub-SDFG generation (according to scope), storage<->storage copies, and storage<->tasklet copies to targets. """ + def __init__(self, framecode): # Avoid import loop from dace.codegen.targets import framecode as fc @@ -215,7 +217,8 @@ def register_state_dispatcher(self, dispatcher, predicate=None): """ if not hasattr(dispatcher, "generate_state"): - raise TypeError("State dispatcher \"{}\" does not " "implement \"generate_state\"".format(dispatcher)) + raise TypeError("State dispatcher \"{}\" does not " + "implement \"generate_state\"".format(dispatcher)) if predicate is None: self._generic_state_dispatcher = dispatcher else: @@ -241,7 +244,8 @@ def register_node_dispatcher(self, dispatcher, predicate=None): :see: TargetCodeGenerator """ if not hasattr(dispatcher, "generate_node"): - raise TypeError("Node dispatcher must " "implement \"generate_node\"") + raise TypeError("Node dispatcher must " + "implement \"generate_node\"") if predicate is None: self._generic_node_dispatcher = dispatcher else: @@ -448,9 +452,12 @@ def dispatch_allocate(self, """ Dispatches a code generator for data allocation. """ self._used_targets.add(self._array_dispatchers[datadesc.storage]) - if datadesc.lifetime is dtypes.AllocationLifetime.Persistent: + if datadesc.lifetime == dtypes.AllocationLifetime.Persistent: declaration_stream = CodeIOStream() callsite_stream = self.frame._initcode + elif datadesc.lifetime == dtypes.AllocationLifetime.External: + declaration_stream = CodeIOStream() + callsite_stream = CodeIOStream() else: declaration_stream = callsite_stream @@ -468,8 +475,10 @@ def dispatch_deallocate(self, sdfg: SDFG, dfg: ScopeSubgraphView, state_id: int, """ Dispatches a code generator for a data deallocation. 
""" self._used_targets.add(self._array_dispatchers[datadesc.storage]) - if datadesc.lifetime is dtypes.AllocationLifetime.Persistent: + if datadesc.lifetime == dtypes.AllocationLifetime.Persistent: callsite_stream = self.frame._exitcode + elif datadesc.lifetime == dtypes.AllocationLifetime.External: + return self._array_dispatchers[datadesc.storage].deallocate_array(sdfg, dfg, state_id, node, datadesc, function_stream, callsite_stream) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index d5e7cacc53..295bf21310 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -62,7 +62,8 @@ def copy_expr( offset_cppstr = "0" dt = "" - is_global = data_desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent) + is_global = data_desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) defined_types = None # Non-free symbol dependent Arrays due to their shape dependent_shape = (isinstance(data_desc, data.Array) and not isinstance(data_desc, data.View) and any( @@ -219,7 +220,7 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: # Special case: If memory is persistent and defined in this SDFG, add state # struct to name - if (desc.transient and desc.lifetime is dtypes.AllocationLifetime.Persistent): + if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): from dace.codegen.targets.cuda import CUDACodeGen # Avoid import loop if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays @@ -1252,7 +1253,7 @@ def visit_BinOp(self, node: ast.BinOp): if isinstance(node.op, ast.Pow): from dace.frontend.python import astutils try: - evaluated_node = astutils.evalnode(node.right, {**self.constants, 'dace': dace,'math': math}) + evaluated_node = astutils.evalnode(node.right, {**self.constants, 'dace': dace, 'math': math}) unparsed = symbolic.pystr_to_symbolic(evaluated_node) evaluated_constant = symbolic.evaluate(unparsed, self.constants) evaluated = symbolic.symstr(evaluated_constant, cpp_mode=True) @@ -1356,8 +1357,8 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream, if isinstance(desc, data.Array) and desc.start_offset != 0: ptrname = f'({ptrname} - {sym2cpp(desc.start_offset)})' if Config.get_bool('compiler', 'cuda', 'syncdebug'): - callsite_stream.write(f'DACE_GPU_CHECK({backend}FreeAsync({ptrname}, {cudastream}));\n', sdfg, - state_id, scope_exit) + callsite_stream.write(f'DACE_GPU_CHECK({backend}FreeAsync({ptrname}, {cudastream}));\n', sdfg, state_id, + scope_exit) callsite_stream.write(f'DACE_GPU_CHECK({backend}DeviceSynchronize());') else: callsite_stream.write(f'{backend}FreeAsync({ptrname}, {cudastream});\n', sdfg, state_id, scope_exit) @@ -1381,7 +1382,8 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream, and edge.dst._cuda_stream != node._cuda_stream): callsite_stream.write( """DACE_GPU_CHECK({backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream})); -DACE_GPU_CHECK({backend}StreamWaitEvent(__state->gpu_context->streams[{dst_stream}], __state->gpu_context->events[{ev}], 0));""".format( +DACE_GPU_CHECK({backend}StreamWaitEvent(__state->gpu_context->streams[{dst_stream}], __state->gpu_context->events[{ev}], 0));""" + .format( ev=edge._cuda_event if hasattr(edge, "_cuda_event") else 0, src_stream=cudastream, dst_stream=edge.dst._cuda_stream, diff --git 
a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 83f178c538..eb7d232966 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -222,7 +222,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if # `nodedesc` is a View and `dfg` is None. if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): - raise NotImplementedError("The declare_array method should only be used for variables " + raise NotImplementedError("The declare_array method should only be used for variables " "that must have their declaration and allocation separate.") name = node.data @@ -278,7 +278,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d declared = self._dispatcher.declared_arrays.has(alloc_name) define_var = self._dispatcher.defined_vars.add - if nodedesc.lifetime == dtypes.AllocationLifetime.Persistent: + if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): define_var = self._dispatcher.defined_vars.add_global nodedesc = update_persistent_desc(nodedesc, sdfg) @@ -449,7 +449,8 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, alloc_name = f'({alloc_name} - {cpp.sym2cpp(nodedesc.start_offset)})' if self._dispatcher.declared_arrays.has(alloc_name): - is_global = nodedesc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent) + is_global = nodedesc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) self._dispatcher.declared_arrays.remove(alloc_name, is_global=is_global) if isinstance(nodedesc, (data.Scalar, data.View, data.Stream, data.Reference)): @@ -932,7 +933,8 @@ def process_out_memlets(self, desc = sdfg.arrays[memlet.data] ptrname = cpp.ptr(memlet.data, desc, sdfg, self._frame) is_global = desc.lifetime in (dtypes.AllocationLifetime.Global, - dtypes.AllocationLifetime.Persistent) + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) try: defined_type, _ = self._dispatcher.declared_arrays.get(ptrname, is_global=is_global) except KeyError: @@ -1430,7 +1432,8 @@ def define_out_memlet(self, sdfg, state_dfg, state_id, src_node, dst_node, edge, # If pointer, also point to output desc = sdfg.arrays[edge.data.data] ptrname = cpp.ptr(edge.data.data, desc, sdfg, self._frame) - is_global = desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent) + is_global = desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) defined_type, _ = self._dispatcher.defined_vars.get(ptrname, is_global=is_global) base_ptr = cpp.cpp_ptr_expr(sdfg, edge.data, defined_type, codegen=self._frame) callsite_stream.write(f'{cdtype.ctype} {edge.src_conn} = {base_ptr};', sdfg, state_id, src_node) @@ -1448,18 +1451,22 @@ def generate_nsdfg_header(self, sdfg, state, state_id, node, memlet_references, # Add "__restrict__" keywords to arguments that do not alias with others in the context of this SDFG restrict_args = [] for atype, aname, _ in memlet_references: + def make_restrict(expr: str) -> str: # Check whether "restrict" has already been added before and can be added if expr.strip().endswith('*'): return '__restrict__' else: return '' + if aname in node.sdfg.arrays and not node.sdfg.arrays[aname].may_alias: 
restrict_args.append(make_restrict(atype)) else: restrict_args.append('') - arguments += [f'{atype} {restrict} {aname}' for (atype, aname, _), restrict in zip(memlet_references, restrict_args)] + arguments += [ + f'{atype} {restrict} {aname}' for (atype, aname, _), restrict in zip(memlet_references, restrict_args) + ] arguments += [ f'{node.sdfg.symbols[aname].as_arg(aname)}' for aname in sorted(node.symbol_mapping.keys()) if aname not in sdfg.constants diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index f4db868730..1e06a1d3ef 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -497,7 +497,9 @@ def cmake_options(): hip_arch = [ha for ha in hip_arch if ha is not None and len(ha) > 0] flags = Config.get("compiler", "cuda", "hip_args") - flags += ' ' + ' '.join('--offload-arch={arch}'.format(arch=arch if arch.startswith("gfx") else "gfx" + arch) for arch in hip_arch) + flags += ' ' + ' '.join( + '--offload-arch={arch}'.format(arch=arch if arch.startswith("gfx") else "gfx" + arch) + for arch in hip_arch) options.append("-DEXTRA_HIP_FLAGS=\"{}\"".format(flags)) if Config.get('compiler', 'cpu', 'executable'): @@ -568,7 +570,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d return self._cpu_codegen.allocate_reference(sdfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) - if nodedesc.lifetime == dtypes.AllocationLifetime.Persistent: + if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) result_decl = StringIO() @@ -717,7 +719,8 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, dataname = f'({dataname} - {cpp.sym2cpp(nodedesc.start_offset)})' if self._dispatcher.declared_arrays.has(dataname): - is_global = nodedesc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent) + is_global = nodedesc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) self._dispatcher.declared_arrays.remove(dataname, is_global=is_global) if isinstance(nodedesc, dace.data.Stream): @@ -1449,7 +1452,8 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st if aname in sdfg.arrays: data_desc = sdfg.arrays[aname] is_global = data_desc.lifetime in (dtypes.AllocationLifetime.Global, - dtypes.AllocationLifetime.Persistent) + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) # Non-free symbol dependent Arrays due to their shape dependent_shape = (isinstance(data_desc, dt.Array) and not isinstance(data_desc, dt.View) and any( str(s) not in self._frame.symbols_and_constants(sdfg) @@ -1482,7 +1486,8 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st data_desc = sdfg.arrays[aname] ptrname = cpp.ptr(aname, data_desc, sdfg, self._frame) is_global = data_desc.lifetime in (dtypes.AllocationLifetime.Global, - dtypes.AllocationLifetime.Persistent) + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) defined_type, ctype = self._dispatcher.defined_vars.get(ptrname, is_global=is_global) CUDACodeGen._in_device_code = True inner_ptrname = cpp.ptr(aname, data_desc, sdfg, self._frame) diff --git a/dace/codegen/targets/fpga.py b/dace/codegen/targets/fpga.py index b920b0e9d5..413cb751d6 100644 --- a/dace/codegen/targets/fpga.py +++ b/dace/codegen/targets/fpga.py @@ -1171,7 +1171,7 @@ def 
allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d # NOTE: The code below fixes symbol-related issues with transient data originally defined in a NestedSDFG scope # but promoted to be persistent. These data must have their free symbols replaced with the corresponding # top-level SDFG symbols. - if nodedesc.lifetime == dtypes.AllocationLifetime.Persistent: + if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) result_decl = StringIO() diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 09bbd30ab8..888941312e 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -333,6 +333,49 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write('delete __state;\n', sdfg) callsite_stream.write('return __err;\n}\n', sdfg) + def generate_external_memory_management(self, sdfg: SDFG, callsite_stream: CodeIOStream): + # Collect external arrays + ext_arrays: Dict[dtypes.StorageType, List[Tuple[SDFG, str, data.Data]]] = collections.defaultdict(list) + for subsdfg, aname, arr in sdfg.arrays_recursive(): + if arr.lifetime == dtypes.AllocationLifetime.External: + ext_arrays[arr.storage].append((subsdfg, aname, arr)) + + # Only generate functions as necessary + if not ext_arrays: + return + + initparams = sdfg.init_signature(free_symbols=self.free_symbols(sdfg)) + initparams_comma = (', ' + initparams) if initparams else '' + + for storage, arrays in ext_arrays.items(): + size = 0 + for subsdfg, aname, arr in arrays: + size += arr.total_size * arr.dtype.bytes + + # Size query functions + callsite_stream.write( + f''' +DACE_EXPORTED size_t __dace_get_external_memory_size_{storage.name}({sdfg.name}_t *__state{initparams_comma}) +{{ + return {sym2cpp(size)}; +}} +''', sdfg) + + # Pointer set functions + callsite_stream.write( + f''' +DACE_EXPORTED void __dace_set_external_memory_{storage.name}({sdfg.name}_t *__state, char *ptr{initparams_comma}) +{{''', sdfg) + + offset = 0 + for subsdfg, aname, arr in arrays: + allocname = f'__state->__{subsdfg.sdfg_id}_{aname}' + callsite_stream.write(f'{allocname} = decltype({allocname})(ptr + {sym2cpp(offset)});', subsdfg) + offset += arr.total_size * arr.dtype.bytes + + # Footer + callsite_stream.write('}', sdfg) + def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_state_footer=True): sid = sdfg.node_id(state) @@ -525,7 +568,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): access_instances[sdfg.sdfg_id].get(name, [(None, None)])[-1] # Cases - if desc.lifetime is dtypes.AllocationLifetime.Persistent: + if desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): # Persistent memory is allocated in initialization code and # exists in the library state structure @@ -872,6 +915,7 @@ def generate_code(self, function_signature = ('void __program_%s_internal(%s_t *__state%s)\n{\n' % (sdfg.name, sdfg.name, params)) self.generate_footer(sdfg, footer_global_stream, footer_stream) + self.generate_external_memory_management(sdfg, footer_stream) header_global_stream.write(global_stream.getvalue()) header_global_stream.write(footer_global_stream.getvalue()) diff --git a/dace/codegen/targets/snitch.py b/dace/codegen/targets/snitch.py index 1c4ba8f821..1eb6f68a2a 100644 --- a/dace/codegen/targets/snitch.py +++ b/dace/codegen/targets/snitch.py @@ -366,7 +366,7 @@ def 
allocate_array(self, sdfg, dfg, state_id, node, global_stream, function_stre # NOTE: The code below fixes symbol-related issues with transient data originally defined in a NestedSDFG scope # but promoted to be persistent. These data must have their free symbols replaced with the corresponding # top-level SDFG symbols. - if nodedesc.lifetime == dtypes.AllocationLifetime.Persistent: + if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) # Compute array size @@ -411,7 +411,8 @@ def allocate_array(self, sdfg, dfg, state_id, node, global_stream, function_stre elif not symbolic.issymbolic(arrsize, sdfg.constants): # static allocation declaration_stream.write(f'// static allocate storage "{nodedesc.storage}"') - if node.desc(sdfg).lifetime == dace.AllocationLifetime.Persistent: + if node.desc(sdfg).lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): # Don't put a static if it is declared in the state struct for C compliance declaration_stream.write(f'{nodedesc.dtype.ctype} {name}[{cpp.sym2cpp(arrsize)}];\n', sdfg, state_id, node) diff --git a/dace/dtypes.py b/dace/dtypes.py index a86a746884..dee2283f25 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -132,6 +132,7 @@ class AllocationLifetime(aenum.AutoNumberEnum): SDFG = () #: Allocated throughout the innermost SDFG (possibly nested) Global = () #: Allocated throughout the entire program (outer SDFG) Persistent = () #: Allocated throughout multiple invocations (init/exit) + External = () #: Allocated and managed outside the generated code @undefined_safe_enum diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 260360776f..3abef05dc9 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -45,6 +45,7 @@ if TYPE_CHECKING: from dace.codegen.instrumentation.report import InstrumentationReport from dace.codegen.instrumentation.data.data_report import InstrumentedDataReport + from dace.codegen.compiled_sdfg import CompiledSDFG def _arrays_to_json(arrays): @@ -2189,8 +2190,7 @@ def is_loaded(self) -> bool: dll = cs.ReloadableDLL(binary_filename, self.name) return dll.is_loaded() - def compile(self, output_file=None, validate=True) -> \ - 'dace.codegen.compiler.CompiledSDFG': + def compile(self, output_file=None, validate=True) -> 'CompiledSDFG': """ Compiles a runnable binary from this SDFG. :param output_file: If not None, copies the output library file to diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index fa86163063..abad1e7907 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -76,9 +76,10 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context if name is not None and not dtypes.validate_name(name): raise InvalidSDFGError("Invalid array name %s" % name, sdfg, None) # Allocation lifetime checks - if (desc.lifetime is dtypes.AllocationLifetime.Persistent and desc.storage is dtypes.StorageType.Register): + if (desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External) + and desc.storage == dtypes.StorageType.Register): raise InvalidSDFGError( - "Array %s cannot be both persistent and use Register as " + "Array %s cannot be both persistent/external and use Register as " "storage type. Please use a different storage location." 
% name, sdfg, None) # Check for valid bank assignments @@ -320,7 +321,8 @@ def validate_state(state: 'dace.sdfg.SDFGState', raise InvalidSDFGError("Invalid state name", sdfg, state_id) if state._parent != sdfg: - raise InvalidSDFGError("State does not point to the correct " "parent", sdfg, state_id) + raise InvalidSDFGError("State does not point to the correct " + "parent", sdfg, state_id) # Unreachable ######################################## @@ -736,6 +738,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', class InvalidSDFGError(Exception): """ A class of exceptions thrown when SDFG validation fails. """ + def __init__(self, message: str, sdfg: 'SDFG', state_id: int): self.message = message self.sdfg = sdfg @@ -759,7 +762,8 @@ def _getlineinfo(self, obj) -> str: if lineinfo.start_line >= 0: if lineinfo.start_column > 0: - return (f'File "{lineinfo.filename}", line {lineinfo.start_line}, ' f'column {lineinfo.start_column}') + return (f'File "{lineinfo.filename}", line {lineinfo.start_line}, ' + f'column {lineinfo.start_column}') return f'File "{lineinfo.filename}", line {lineinfo.start_line}' return f'File "{lineinfo.filename}"' @@ -790,6 +794,7 @@ def __str__(self): class InvalidSDFGInterstateEdgeError(InvalidSDFGError): """ Exceptions of invalid inter-state edges in an SDFG. """ + def __init__(self, message: str, sdfg: 'SDFG', edge_id: int): self.message = message self.sdfg = sdfg @@ -835,6 +840,7 @@ def __str__(self): class InvalidSDFGNodeError(InvalidSDFGError): """ Exceptions of invalid nodes in an SDFG state. """ + def __init__(self, message: str, sdfg: 'SDFG', state_id: int, node_id: int): self.message = message self.sdfg = sdfg @@ -872,12 +878,14 @@ class NodeNotExpandedError(InvalidSDFGNodeError): Exception that is raised whenever a library node was not expanded before code generation. """ + def __init__(self, sdfg: 'SDFG', state_id: int, node_id: int): super().__init__('Library node not expanded', sdfg, state_id, node_id) class InvalidSDFGEdgeError(InvalidSDFGError): """ Exceptions of invalid edges in an SDFG state. 
""" + def __init__(self, message: str, sdfg: 'SDFG', state_id: int, edge_id: int): self.message = message self.sdfg = sdfg diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index a63b37aa19..b33ad43a3b 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -590,7 +590,7 @@ def apply(self, state: SDFGState, sdfg: SDFG): for dnode in state.data_nodes(): if state.degree(dnode) == 0 and dnode not in isolated_nodes: state.remove_node(dnode) - + sdfg._sdfg_list = sdfg.reset_sdfg_list() def _modify_access_to_access(self, @@ -764,8 +764,8 @@ def _candidates(sdfg: SDFG, graph: SDFGState, nsdfg: nodes.NestedSDFG) -> Dict[s if not desc.transient: continue # Needs to be allocated in "Scope" or "Persistent" lifetime - if (desc.lifetime != dtypes.AllocationLifetime.Scope - and desc.lifetime != dtypes.AllocationLifetime.Persistent): + if (desc.lifetime not in (dtypes.AllocationLifetime.Scope, dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External)): continue # If same transient is connected with multiple connectors, bail # for now diff --git a/dace/transformation/passes/dead_dataflow_elimination.py b/dace/transformation/passes/dead_dataflow_elimination.py index 7c0949ce4d..aeaf1cdbd1 100644 --- a/dace/transformation/passes/dead_dataflow_elimination.py +++ b/dace/transformation/passes/dead_dataflow_elimination.py @@ -222,7 +222,7 @@ def _is_node_dead(self, node: nodes.Node, sdfg: SDFG, state: SDFGState, dead_nod # If access node is persistent, mark as dead only if self.remove_persistent_memory is set if not self.remove_persistent_memory: - if desc.lifetime == dtypes.AllocationLifetime.Persistent: + if desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): return False # If data will be used later, cannot remove diff --git a/dace/transformation/passes/scalar_to_symbol.py b/dace/transformation/passes/scalar_to_symbol.py index f751ebc271..124efdaae1 100644 --- a/dace/transformation/passes/scalar_to_symbol.py +++ b/dace/transformation/passes/scalar_to_symbol.py @@ -89,7 +89,7 @@ def find_promotable_scalars(sdfg: sd.SDFG, transients_only: bool = True, integer continue if desc.total_size != 1: continue - if desc.lifetime is dtypes.AllocationLifetime.Persistent: + if desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): continue candidates.add(aname) @@ -589,9 +589,7 @@ class ScalarToSymbolPromotion(passes.Pass): CATEGORY: str = 'Simplification' - ignore = props.SetProperty(element_type=str, - default=set(), - desc='Fields that should not be promoted.') + ignore = props.SetProperty(element_type=str, default=set(), desc='Fields that should not be promoted.') transients_only = props.Property(dtype=bool, default=True, desc='Promote only transients.') integers_only = props.Property(dtype=bool, default=True, desc='Allow promotion of integer scalars only.') diff --git a/tests/codegen/external_memory_test.py b/tests/codegen/external_memory_test.py new file mode 100644 index 0000000000..c72c574806 --- /dev/null +++ b/tests/codegen/external_memory_test.py @@ -0,0 +1,98 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" +Tests external memory allocation. 
+""" +import dace +import numpy as np +import pytest + + +@pytest.mark.parametrize('symbolic', (False, True)) +def test_external_mem(symbolic): + N = dace.symbol('N') if symbolic else 20 + + @dace.program + def tester(a: dace.float64[N]): + workspace = dace.ndarray([N], dace.float64, lifetime=dace.AllocationLifetime.External) + + workspace[:] = a + workspace += 1 + a[:] = workspace + + sdfg = tester.to_sdfg() + + # Test that there is no allocation + code = sdfg.generate_code()[0].clean_code + assert 'new double' not in code + assert 'delete[]' not in code + assert 'set_external_memory' in code + + a = np.random.rand(20) + + if symbolic: + extra_args = dict(a=a, N=20) + else: + extra_args = {} + + # Test workspace size + csdfg = sdfg.compile() + csdfg.initialize(**extra_args) + sizes = csdfg.get_workspace_sizes() + assert sizes == {dace.StorageType.CPU_Heap: 20 * 8} + + # Test setting the workspace + wsp = np.random.rand(20) + csdfg.set_workspace(dace.StorageType.CPU_Heap, wsp) + + ref = a + 1 + + csdfg(a, **extra_args) + + assert np.allclose(a, ref) + assert np.allclose(wsp, ref) + + +def test_external_twobuffers(): + N = dace.symbol('N') + + @dace.program + def tester(a: dace.float64[N]): + workspace = dace.ndarray([N], dace.float64, lifetime=dace.AllocationLifetime.External) + workspace2 = dace.ndarray([2], dace.float64, lifetime=dace.AllocationLifetime.External) + + workspace[:] = a + workspace += 1 + workspace2[0] = np.sum(workspace) + workspace2[1] = np.mean(workspace) + a[0] = workspace2[0] + workspace2[1] + + sdfg = tester.to_sdfg() + csdfg = sdfg.compile() + + # Test workspace size + a = np.random.rand(20) + csdfg.initialize(a=a, N=20) + sizes = csdfg.get_workspace_sizes() + assert sizes == {dace.StorageType.CPU_Heap: 22 * 8} + + # Test setting the workspace + wsp = np.random.rand(22) + csdfg.set_workspace(dace.StorageType.CPU_Heap, wsp) + + ref = a + 1 + ref2 = np.copy(a) + s, m = np.sum(ref), np.mean(ref) + ref2[0] = s + m + + csdfg(a=a, N=20) + + assert np.allclose(a, ref2) + assert np.allclose(wsp[:-2], ref) + assert np.allclose(wsp[-2], s) + assert np.allclose(wsp[-1], m) + + +if __name__ == '__main__': + test_external_mem(False) + test_external_mem(True) + test_external_twobuffers() From ad1d8347cdcd43cc3f1d6d16966ec26d4107d7c2 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 4 Jul 2023 00:52:42 -0700 Subject: [PATCH 172/392] Testing mode should not affect the behavior of the framework --- dace/codegen/codegen.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index e6bb6d9a50..c502a47376 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -178,11 +178,6 @@ def generate_code(sdfg, validate=True) -> List[CodeObject]: shutil.move(f"{tmp_dir}/test2.sdfg", "test2.sdfg") raise RuntimeError('SDFG serialization failed - files do not match') - # Run with the deserialized version - # NOTE: This means that all subsequent modifications to `sdfg` - # are not reflected outside of this function (e.g., library - # node expansion). - sdfg = sdfg2 # Before generating the code, run type inference on the SDFG connectors infer_types.infer_connector_types(sdfg) From 537205d8741345becc0c2d2dff086c25605fc947 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 4 Jul 2023 15:24:28 +0200 Subject: [PATCH 173/392] Attributes realized by replacement methods now create new SDFGStates. 
--- dace/frontend/python/newast.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 541b17af06..52a6862083 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -4612,7 +4612,12 @@ def visit_Attribute(self, node: ast.Attribute): # Try to find sub-SDFG attribute func = oprepo.Replacements.get_attribute(type(arr), node.attr) if func is not None: - return func(self, self.sdfg, self.last_state, result) + # A new state is likely needed here, e.g., for transposition (ndarray.T) + self._add_state('%s_%d' % (type(node).__name__, node.lineno)) + self.last_state.set_default_lineinfo(self.current_lineinfo) + result = func(self, self.sdfg, self.last_state, result) + self.last_state.set_default_lineinfo(None) + return result # Otherwise, try to find compile-time attribute (such as shape) try: From 1ee37649d3820e5fe55202a4dd633ec3755d054c Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 4 Jul 2023 15:31:15 +0200 Subject: [PATCH 174/392] Added test. --- tests/numpy/attribute_test.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/numpy/attribute_test.py b/tests/numpy/attribute_test.py index da9ac508e9..2181883015 100644 --- a/tests/numpy/attribute_test.py +++ b/tests/numpy/attribute_test.py @@ -33,6 +33,28 @@ def test_attribute_in_ranged_loop_symbolic(): assert np.allclose(a, regression) +def test_attribute_new_state(): + + N, F_in, F_out, heads = 2, 3, 4, 5 + + @dace.program + def fn(a: dace.float64[N, F_in], b: dace.float64[N, heads, F_out], c: dace.float64[heads * F_out, F_in]): + tmp = a.T @ np.reshape(b, (N, heads * F_out)) + c[:] = tmp.T + + rng = np.random.default_rng(42) + + a = rng.random((N, F_in)) + b = rng.random((N, heads, F_out)) + c_expected = np.zeros((heads * F_out, F_in)) + c = np.zeros((heads * F_out, F_in)) + + fn.f(a, b, c_expected) + fn(a, b, c) + assert np.allclose(c, c_expected) + + if __name__ == '__main__': test_attribute_in_ranged_loop() test_attribute_in_ranged_loop_symbolic() + test_attribute_new_state() From f3495fa81ad42da0e3445f2a649ed3c05c010b00 Mon Sep 17 00:00:00 2001 From: acalotoiu <61420859+acalotoiu@users.noreply.github.com> Date: Tue, 4 Jul 2023 17:38:05 +0200 Subject: [PATCH 175/392] Update dace/frontend/fortran/ast_components.py Co-authored-by: Philipp Schaad --- dace/frontend/fortran/ast_components.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index e917409017..a66ee5c0d6 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -521,7 +521,6 @@ def declaration_type_spec(self, node: FASTNode): return node def assumed_shape_spec_list(self, node: FASTNode): - return node def type_declaration_stmt(self, node: FASTNode): From 0dba0111da93c2f945f391daae20665b81b05696 Mon Sep 17 00:00:00 2001 From: acalotoiu <61420859+acalotoiu@users.noreply.github.com> Date: Tue, 4 Jul 2023 17:38:17 +0200 Subject: [PATCH 176/392] Update dace/frontend/fortran/fortran_parser.py Co-authored-by: Philipp Schaad --- dace/frontend/fortran/fortran_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index 4180b45371..6d1be7138a 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -213,7 +213,6 @@ def allocate2sdfg(self, node: 
ast_internal_classes.Allocate_Stmt_Node, sdfg: SDF strides=strides, transient=transient) - #raise NotImplementedError("Fortran allocate statements are not implemented yet") def write2sdfg(self, node: ast_internal_classes.Write_Stmt_Node, sdfg: SDFG): #TODO implement From d610ce979db68a67c6f3d36f1cd6f644c6278f1a Mon Sep 17 00:00:00 2001 From: Alexandru Calotoiu Date: Tue, 4 Jul 2023 22:59:34 +0200 Subject: [PATCH 177/392] copyright fix --- tests/fortran/allocate_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fortran/allocate_test.py b/tests/fortran/allocate_test.py index 8d7dbbc856..498c97d932 100644 --- a/tests/fortran/allocate_test.py +++ b/tests/fortran/allocate_test.py @@ -1,4 +1,4 @@ -# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. from fparser.common.readfortran import FortranStringReader from fparser.common.readfortran import FortranFileReader From 211582edf4bcf12572c514029874c9d7e047a951 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 4 Jul 2023 14:10:23 -0700 Subject: [PATCH 178/392] Add documentation --- dace/codegen/targets/framecode.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 888941312e..6f302c11ba 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -334,6 +334,14 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write('return __err;\n}\n', sdfg) def generate_external_memory_management(self, sdfg: SDFG, callsite_stream: CodeIOStream): + """ + If external data descriptors are found in the SDFG (or any nested SDFGs), + this function will generate exported functions to (1) get the required memory size + per storage location (``__dace_get_external_memory_size_``, where ```` + can be ``CPU_Heap`` or any other ``dtypes.StorageType``); and (2) set the externally-allocated + pointer to the generated code's internal state (``__dace_set_external_memory_``). + """ + # Collect external arrays ext_arrays: Dict[dtypes.StorageType, List[Tuple[SDFG, str, data.Data]]] = collections.defaultdict(list) for subsdfg, aname, arr in sdfg.arrays_recursive(): From 191ef93917858c00742f5a1b4195aa6bd3d416e5 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 5 Jul 2023 19:20:40 +0200 Subject: [PATCH 179/392] visit_Name and visit_Subscript now look into the node's context to choose between read/write accesses. 
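Previously, a name or subscript referring to an array from an enclosing scope was always materialized as a read access, even when it appeared as an assignment target. With this change, `ast.Store` contexts go through `_add_write_access` instead, while loads keep using `_add_read_access`; cases that would require slicing a write access further remain unsupported and raise NotImplementedError. A rough sketch of the kind of nested write this targets (names, shapes and the exact nesting are illustrative only):

    import dace

    N = dace.symbol('N')

    @dace.program
    def set_interior(a: dace.float64[N]):
        a[1:N - 1] = 0.0

    @dace.program
    def outer(a: dace.float64[N]):
        set_interior(a)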
--- dace/frontend/python/newast.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 52a6862083..2cc5bd7db7 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -4563,7 +4563,10 @@ def _visitname(self, name: str, node: ast.AST): rname = self.scope_vars[name] if rname in self.scope_arrays: rng = subsets.Range.from_array(self.scope_arrays[rname]) - rname, _ = self._add_read_access(rname, rng, node) + if isinstance(node.ctx, ast.Store): + rname, _ = self._add_write_access(rname, rng, node) + else: + rname, _ = self._add_read_access(rname, rng, node) return rname #### Visitors that return arrays @@ -4898,6 +4901,8 @@ def _promote(node: ast.AST) -> Union[Any, str, symbolic.symbol]: ### Subscript (slicing) handling def visit_Subscript(self, node: ast.Subscript, inference: bool = False): + is_read: bool = not isinstance(node.ctx, ast.Store) + if self.nested: defined_vars = {**self.variables, **self.scope_vars} @@ -4924,13 +4929,19 @@ def visit_Subscript(self, node: ast.Subscript, inference: bool = False): if inference: rng.offset(rng, True) return self.sdfg.arrays[true_name].dtype, rng.size() - new_name, new_rng = self._add_read_access(name, rng, node) + if is_read: + new_name, new_rng = self._add_read_access(name, rng, node) + else: + new_name, new_rng = self._add_write_access(name, rng, node) new_arr = self.sdfg.arrays[new_name] full_rng = subsets.Range.from_array(new_arr) if new_rng.ranges == full_rng.ranges: return new_name else: - new_name, _ = self.make_slice(new_name, new_rng) + if is_read: + new_name, _ = self.make_slice(new_name, new_rng) + else: + raise NotImplementedError('Cannot slice a write access') return new_name # Obtain array/tuple @@ -4971,8 +4982,11 @@ def visit_Subscript(self, node: ast.Subscript, inference: bool = False): rng = expr.subset rng.offset(rng, True) return self.sdfg.arrays[array].dtype, rng.size() - - return self._add_read_slice(array, node, expr) + + if is_read: + return self._add_read_slice(array, node, expr) + else: + raise NotImplementedError('Write slicing not implemented') def _visit_ast_or_value(self, node: ast.AST) -> Any: result = self.visit(node) From 345c36b01820ecc12f0b8c43197fd1d060a76401 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 6 Jul 2023 14:24:48 +0800 Subject: [PATCH 180/392] Updated mpi_send_recv_test.py for correctness of blocking comm --- tests/library/mpi/mpi_send_recv_test.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/library/mpi/mpi_send_recv_test.py b/tests/library/mpi/mpi_send_recv_test.py index 48c8170949..ec094e7cf5 100644 --- a/tests/library/mpi/mpi_send_recv_test.py +++ b/tests/library/mpi/mpi_send_recv_test.py @@ -82,8 +82,12 @@ def dace_send_recv(rank: dace.int32, size: dace.int32): dst = np.full([1], (rank + 1) % size, dtype=np.int32) sbuf = np.full([1], rank, dtype=np.int32) rbuf = np.zeros([1], dtype=np.int32) - dace.comm.Recv(rbuf, src, tag=42) - dace.comm.Send(sbuf, dst, tag=42) + if rank % 2 == 0: + dace.comm.Recv(rbuf, src, tag=42) + dace.comm.Send(sbuf, dst, tag=42) + else: + dace.comm.Send(sbuf, dst, tag=42) + dace.comm.Recv(rbuf, src, tag=42) return rbuf @@ -99,10 +103,11 @@ def test_dace_send_recv(): sdfg = None if rank == 0: sdfg = dace_send_recv.to_sdfg(simplify=True) + # disable openMP section for blocking + sdfg.openmp_sections = False mpi_sdfg = utils.distributed_compile(sdfg, comm) val = mpi_sdfg(rank=rank, 
size=commsize) - assert (val[0] == (rank - 1) % commsize) From 1cea59ec62a3ecaaa224d0dbe9949b6db6bd4329 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 6 Jul 2023 16:15:40 +0800 Subject: [PATCH 181/392] Updated Isend/Irecv test --- tests/library/mpi/mpi_isend_irecv_test.py | 65 ++++++++++------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/tests/library/mpi/mpi_isend_irecv_test.py b/tests/library/mpi/mpi_isend_irecv_test.py index 0c9a1ef0a9..9fab8c0158 100644 --- a/tests/library/mpi/mpi_isend_irecv_test.py +++ b/tests/library/mpi/mpi_isend_irecv_test.py @@ -1,5 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import dace +from dace.sdfg import utils from dace.memlet import Memlet import dace.libraries.mpi as mpi import numpy as np @@ -104,55 +105,45 @@ def _test_mpi(info, sdfg, dtype): raise (ValueError("The received values are not what I expected.")) -# TODO: The test deadlocks in the CI (Ubuntu 18.04, MPICH 3.3a2) -# but works fine in up-to-date systems, including when using pytest. -@pytest.mark.skip +@pytest.mark.mpi def test_mpi(): - _test_mpi("MPI Send/Recv", make_sdfg(np.float64), np.float64) - + _test_mpi("MPI Isend/Irecv", make_sdfg(np.float64), np.float64) ############################################################################### -myrank = dace.symbol('myrank', dtype=dace.int32) -mysize = dace.symbol('mysize', dtype=dace.int32) - +@pytest.mark.mpi +def test_isend_irecv(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() -@dace.program -def dace_send_recv(): - tmp1 = np.full([1], myrank, dtype=np.int32) - tmp2 = np.zeros([1], dtype=np.int32) - if myrank == 0: - dace.comm.Send(tmp1, 1, tag=42) - dace.comm.Recv(tmp2, mysize - 1, tag=42) - else: - dace.comm.Recv(tmp2, (myrank - 1) % mysize, tag=42) - dace.comm.Send(tmp1, (myrank + 1) % mysize, tag=42) - return tmp2 + @dace.program + def mpi4py_isend_irecv(rank: dace.int32, size: dace.int32): + src = (rank - 1) % size + dst = (rank + 1) % size + req = np.empty((2, ), dtype=MPI.Request) + sbuf = np.full((1,), rank, dtype=np.int32) + req[0] = commworld.Isend(sbuf, dst, tag=0) + rbuf = np.empty((1, ), dtype=np.int32) + req[1] = commworld.Irecv(rbuf, src, tag=0) + MPI.Request.Waitall(req) + return rbuf + sdfg = None + if rank == 0: + sdfg = mpi4py_isend_irecv.to_sdfg(simplify=True) + func = utils.distributed_compile(sdfg, commworld) -# TODO: The test is redundant. It must be updated to use Isend/Irecv. 
-@pytest.mark.skip -def test_dace_send_recv(): - from mpi4py import MPI as MPI4PY - comm = MPI4PY.COMM_WORLD - rank = comm.Get_rank() - commsize = comm.Get_size() - mpi_sdfg = None - if commsize < 2: - raise ValueError("This test is supposed to be run with at least two processes!") - for r in range(0, commsize): - if r == rank: - mpi_sdfg = dace_send_recv.compile() - comm.Barrier() - - prv_rank = mpi_sdfg(myrank=rank, mysize=commsize) + val = func(rank=rank, size=size) + ref = mpi4py_isend_irecv.f(rank, size) - assert (prv_rank[0] == (rank - 1) % commsize) + assert (val[0] == ref[0]) ############################################################################### if __name__ == "__main__": test_mpi() - test_dace_send_recv() + test_isend_irecv() ############################################################################### From b82b06ac2ca122789e1d4c2bbbc2a45b277faf36 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 7 Jul 2023 00:01:49 +0800 Subject: [PATCH 182/392] Updated alltoall library node for logical correctness --- dace/libraries/mpi/nodes/alltoall.py | 10 +++++----- tests/library/mpi/mpi_alltoall_test.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dace/libraries/mpi/nodes/alltoall.py b/dace/libraries/mpi/nodes/alltoall.py index b0accfb52d..92be24ce45 100644 --- a/dace/libraries/mpi/nodes/alltoall.py +++ b/dace/libraries/mpi/nodes/alltoall.py @@ -25,12 +25,12 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): if node.grid: comm = f"__state->{node.grid}_comm" - # code = f""" - # MPI_Alltoall({buffer}, {count_str}, {mpi_dtype_str}, _outbuffer, {count_str}, {mpi_dtype_str}, {comm}); - # """ code = f""" - MPI_Alltoall(_inbuffer, {in_count_str}, {in_mpi_dtype_str}, \ - _outbuffer, {out_count_str}, {out_mpi_dtype_str}, \ + int size; + MPI_Comm_size({comm}, &size); + int sendrecv_amt = {in_count_str} / size; + MPI_Alltoall(_inbuffer, sendrecv_amt, {in_mpi_dtype_str}, \ + _outbuffer, sendrecv_amt, {out_mpi_dtype_str}, \ {comm}); """ tasklet = dace.sdfg.nodes.Tasklet(node.name, diff --git a/tests/library/mpi/mpi_alltoall_test.py b/tests/library/mpi/mpi_alltoall_test.py index cf155fc640..e1eb4fe5f1 100644 --- a/tests/library/mpi/mpi_alltoall_test.py +++ b/tests/library/mpi/mpi_alltoall_test.py @@ -59,7 +59,7 @@ def test_mpi(implementation, dtype): size_per_proc = int(size/commsize) A = np.arange(0, size, dtype=np_dtype) B = np.full(size, 0, dtype=np_dtype) - mpi_sdfg(inbuf=A, outbuf=B, n=size_per_proc) + mpi_sdfg(inbuf=A, outbuf=B, n=size) # now B should be an array of size, # containing (size / size_per_proc) repeated chunked_data From a115db68447e4f6dfc6f599757de12d5c2f4e86a Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 7 Jul 2023 00:14:09 +0800 Subject: [PATCH 183/392] Added replacement and test for mpi4py alltoall --- dace/frontend/common/distr.py | 30 +++++++++++++++++++++++++++++- tests/library/mpi/mpi4py_test.py | 27 +++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index c47040728f..4200ad9024 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -254,9 +254,37 @@ def _Reduce(pv: 'ProgramVisitor', return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Alltoall') +@oprepo.replaces('dace.comm.Alltoall') +def _allreduce(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + inbuffer: str, + outbuffer: str, + grid: str = None): + + from dace.libraries.mpi.nodes.alltoall import Alltoall + + + 
libnode = Alltoall('_Alltoall_', grid) + in_desc = sdfg.arrays[inbuffer] + in_buffer = state.add_read(inbuffer) + out_desc = sdfg.arrays[inbuffer] + out_buffer = state.add_write(outbuffer) + state.add_edge(in_buffer, None, libnode, '_inbuffer', Memlet.from_array(in_buffer, in_desc)) + state.add_edge(libnode, '_outbuffer', out_buffer, None, Memlet.from_array(out_buffer, out_desc)) + + return None + + @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') -def _allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): +def _allreduce(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + buffer: str, + op: str, + grid: str = None): from dace.libraries.mpi.nodes.allreduce import Allreduce diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 603a6786cb..a9c94ea4a0 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -231,9 +231,36 @@ def mpi4py_send_recv(rank: dace.int32, size: dace.int32): assert (val[0] == ref[0]) +@pytest.mark.mpi +def test_alltoall(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def mpi4py_alltoall(rank: dace.int32, size: dace.int32): + sbuf = np.full((128,), rank, dtype=np.int32) + rbuf = np.zeros((128, ), dtype=np.int32) + commworld.Alltoall(sbuf, rbuf) + return rbuf + + sdfg = None + if rank == 0: + sdfg = mpi4py_alltoall.to_sdfg(simplify=True) + func = utils.distributed_compile(sdfg, commworld) + + val = func(rank=rank, size=size) + ref = mpi4py_alltoall.f(rank, size) + + if (not np.allclose(val, ref)): + raise (ValueError("The received values are not what I expected.")) + + if __name__ == "__main__": # test_process_grid_bcast() # test_sub_grid_bcast() # test_3mm() test_isend_irecv() test_send_recv() + test_alltoall() From eebefe4618a715c02099da3375245bb4c737c7b9 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 7 Jul 2023 13:38:08 +0800 Subject: [PATCH 184/392] Corrected the out_desc in alltoall replacement --- dace/frontend/common/distr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 4200ad9024..dd20a7b6fe 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -269,7 +269,7 @@ def _allreduce(pv: 'ProgramVisitor', libnode = Alltoall('_Alltoall_', grid) in_desc = sdfg.arrays[inbuffer] in_buffer = state.add_read(inbuffer) - out_desc = sdfg.arrays[inbuffer] + out_desc = sdfg.arrays[outbuffer] out_buffer = state.add_write(outbuffer) state.add_edge(in_buffer, None, libnode, '_inbuffer', Memlet.from_array(in_buffer, in_desc)) state.add_edge(libnode, '_outbuffer', out_buffer, None, Memlet.from_array(out_buffer, out_desc)) From 110d0f2a334bca1ac96a0cae1c7ba6d82f990d11 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 7 Jul 2023 14:05:17 +0800 Subject: [PATCH 185/392] Added alltoall replacement for ProcessGrid and Intracomm --- dace/frontend/common/distr.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index dd20a7b6fe..f20e6f6729 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -256,7 +256,7 @@ def _Reduce(pv: 'ProgramVisitor', @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Alltoall') @oprepo.replaces('dace.comm.Alltoall') -def _allreduce(pv: 'ProgramVisitor', 
+def _alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, inbuffer: str, @@ -265,7 +265,6 @@ def _allreduce(pv: 'ProgramVisitor', from dace.libraries.mpi.nodes.alltoall import Alltoall - libnode = Alltoall('_Alltoall_', grid) in_desc = sdfg.arrays[inbuffer] in_buffer = state.add_read(inbuffer) @@ -277,6 +276,36 @@ def _allreduce(pv: 'ProgramVisitor', return None +@oprepo.replaces_method('Intracomm', 'Alltoall') +def _intracomm_alltoall(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + icomm: 'Intracomm', + inp_buffer: str, + out_buffer: str): + + """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer)`. """ + + from mpi4py import MPI + if icomm != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + return _alltoall(pv, sdfg, state, inp_buffer, out_buffer) + + +@oprepo.replaces_method('ProcessGrid', 'Alltoall') +def _pgrid_alltoall(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + pgrid: str, + inp_buffer: str, + out_buffer: str): + + """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer, grid=pgrid)`. """ + + from mpi4py import MPI + return _alltoall(pv, sdfg, state, inp_buffer, out_buffer, grid=pgrid) + + @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') def _allreduce(pv: 'ProgramVisitor', From 8626b9a8f7e74c805430adf5e3649f09e31ee718 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 17:15:44 +0200 Subject: [PATCH 186/392] Fixed bad merge. --- dace/frontend/common/distr.py | 128 ++++++++++++---------------------- 1 file changed, 44 insertions(+), 84 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 653ca38337..af08623083 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -16,7 +16,6 @@ RankType = Union[Integral, str, symbolic.symbol, symbolic.SymExpr, symbolic.sympy.Basic] ProgramVisitor = 'dace.frontend.python.newast.ProgramVisitor' - ##### MPI Cartesian Communicators @@ -64,7 +63,6 @@ def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: return _cart_create(pv, sdfg, state, dims) - @oprepo.replaces('dace.comm.Cart_sub') def _cart_sub(pv: 'ProgramVisitor', sdfg: SDFG, @@ -107,11 +105,8 @@ def _cart_sub(pv: 'ProgramVisitor', @oprepo.replaces_method('ProcessGrid', 'Sub') -def _pgrid_sub(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - parent_grid: str, - color: Sequence[Union[Integral, bool]]): +def _pgrid_sub(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, parent_grid: str, color: Sequence[Union[Integral, + bool]]): """ Equivalent to `dace.comm.Cart_sub(parent_grid, color). :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`). :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped (see `remain_dims` input of `MPI_Cart_sub`). @@ -197,7 +192,6 @@ def _intracomm_bcast(pv: 'ProgramVisitor', icomm: 'Intracomm', buffer: str, root: Union[str, sp.Expr, Number] = 0): - """ Equivalent to `dace.comm.Bcast(buffer, root)`. """ from mpi4py import MPI @@ -213,7 +207,6 @@ def _pgrid_bcast(pv: 'ProgramVisitor', pgrid: str, buffer: str, root: Union[str, sp.Expr, Number] = 0): - """ Equivalent to `dace.comm.Bcast(buffer, root, grid=pgrid)`. 
""" return _bcast(pv, sdfg, state, buffer, root, grid=pgrid) @@ -257,12 +250,7 @@ def _Reduce(pv: ProgramVisitor, @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Alltoall') @oprepo.replaces('dace.comm.Alltoall') -def _alltoall(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - inbuffer: str, - outbuffer: str, - grid: str = None): +def _alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, inbuffer: str, outbuffer: str, grid: str = None): from dace.libraries.mpi.nodes.alltoall import Alltoall @@ -278,13 +266,8 @@ def _alltoall(pv: 'ProgramVisitor', @oprepo.replaces_method('Intracomm', 'Alltoall') -def _intracomm_alltoall(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - icomm: 'Intracomm', - inp_buffer: str, - out_buffer: str): - +def _intracomm_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', inp_buffer: str, + out_buffer: str): """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer)`. """ from mpi4py import MPI @@ -294,13 +277,7 @@ def _intracomm_alltoall(pv: 'ProgramVisitor', @oprepo.replaces_method('ProcessGrid', 'Alltoall') -def _pgrid_alltoall(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - pgrid: str, - inp_buffer: str, - out_buffer: str): - +def _pgrid_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, inp_buffer: str, out_buffer: str): """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer, grid=pgrid)`. """ from mpi4py import MPI @@ -309,7 +286,7 @@ def _pgrid_alltoall(pv: 'ProgramVisitor', @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') -def _Allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): +def _allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): from dace.libraries.mpi.nodes.allreduce import Allreduce @@ -324,14 +301,8 @@ def _Allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op @oprepo.replaces_method('Intracomm', 'Allreduce') -def _intracomm_allreduce(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - icomm: 'Intracomm', - inp_buffer: 'InPlace', - out_buffer: str, - op: str): - +def _intracomm_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', inp_buffer: 'InPlace', + out_buffer: str, op: str): """ Equivalent to `dace.comm.Allreduce(out_buffer, op)`. """ from mpi4py import MPI @@ -345,14 +316,8 @@ def _intracomm_allreduce(pv: 'ProgramVisitor', @oprepo.replaces_method('ProcessGrid', 'Allreduce') -def _pgrid_allreduce(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - pgrid: str, - inp_buffer: 'InPlace', - out_buffer: str, - op: str): - +def _pgrid_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, inp_buffer: 'InPlace', + out_buffer: str, op: str): """ Equivalent to `dace.comm.Allreduce(out_buffer, op, grid=pgrid)`. 
""" from mpi4py import MPI @@ -425,6 +390,7 @@ def _gather(pv: ProgramVisitor, ##### Point-To-Point Communication + @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Send') @oprepo.replaces('dace.comm.Send') def _send(pv: ProgramVisitor, @@ -500,15 +466,24 @@ def _send(pv: ProgramVisitor, @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Isend') @oprepo.replaces('dace.comm.Isend') -def _isend(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, dst: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number], request: str): +def _isend(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + buffer: str, + dst: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number], + request: str = None, + grid: str = None): from dace.libraries.mpi.nodes.isend import Isend ret_req = False if not request: ret_req = True - request, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + request, _ = sdfg.add_array("isend_req", [1], + dace.dtypes.opaque("MPI_Request"), + transient=True, + find_new_name=True) libnode = Isend('_Isend_', grid=grid) @@ -591,14 +566,8 @@ def _isend(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, dst: U @oprepo.replaces_method('Intracomm', 'Isend') -def _intracomm_isend(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - icomm: 'Intracomm', - buffer: str, - dst: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number]): - +def _intracomm_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req)`. """ from mpi4py import MPI @@ -610,14 +579,8 @@ def _intracomm_isend(pv: 'ProgramVisitor', @oprepo.replaces_method('ProcessGrid', 'Isend') -def _pgrid_isend(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - pgrid: str, - buffer: str, - dst: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number]): - +def _pgrid_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, buffer: str, + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req, grid=pgrid)`. 
""" from mpi4py import MPI @@ -701,15 +664,24 @@ def _recv(pv: ProgramVisitor, @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Irecv') @oprepo.replaces('dace.comm.Irecv') -def _irecv(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, src: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number], request: str): +def _irecv(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + buffer: str, + src: Union[str, sp.Expr, Number], + tag: Union[str, sp.Expr, Number], + request: str = None, + grid: str = None): from dace.libraries.mpi.nodes.irecv import Irecv ret_req = False if not request: ret_req = True - request, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) + request, _ = sdfg.add_array("irecv_req", [1], + dace.dtypes.opaque("MPI_Request"), + transient=True, + find_new_name=True) libnode = Irecv('_Irecv_', grid=grid) @@ -790,14 +762,8 @@ def _irecv(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, src: U @oprepo.replaces_method('Intracomm', 'Irecv') -def _intracomm_irecv(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - icomm: 'Intracomm', - buffer: str, - src: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number]): - +def _intracomm_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, + src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Irecv(buffer, src, tag, req)`. """ from mpi4py import MPI @@ -809,14 +775,8 @@ def _intracomm_irecv(pv: 'ProgramVisitor', @oprepo.replaces_method('ProcessGrid', 'Irecv') -def _pgrid_irecv(pv: 'ProgramVisitor', - sdfg: SDFG, - state: SDFGState, - pgrid: str, - buffer: str, - src: Union[str, sp.Expr, Number], - tag: Union[str, sp.Expr, Number]): - +def _pgrid_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, buffer: str, + src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req, grid=pgrid)`. """ from mpi4py import MPI From 442a8734419393f2797e82328829eb74a4ce8377 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 17:15:57 +0200 Subject: [PATCH 187/392] Updated tests. 
--- tests/library/mpi/mpi4py_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index a9c94ea4a0..bbc72bc6c4 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -239,18 +239,18 @@ def test_alltoall(): size = commworld.Get_size() @dace.program - def mpi4py_alltoall(rank: dace.int32, size: dace.int32): - sbuf = np.full((128,), rank, dtype=np.int32) - rbuf = np.zeros((128, ), dtype=np.int32) + def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): + sbuf = np.full((size,), rank, dtype=np.int32) + rbuf = np.zeros((size, ), dtype=np.int32) commworld.Alltoall(sbuf, rbuf) return rbuf sdfg = None if rank == 0: - sdfg = mpi4py_alltoall.to_sdfg(simplify=True) + sdfg = mpi4py_alltoall.to_sdfg(simplify=True, size=size) func = utils.distributed_compile(sdfg, commworld) - val = func(rank=rank, size=size) + val = func(rank=rank) ref = mpi4py_alltoall.f(rank, size) if (not np.allclose(val, ref)): @@ -261,6 +261,6 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.int32): # test_process_grid_bcast() # test_sub_grid_bcast() # test_3mm() - test_isend_irecv() - test_send_recv() + # test_isend_irecv() + # test_send_recv() test_alltoall() From ee55b6ef5ce3ed84f87e2e8fdf88aebed2c1e733 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Fri, 7 Jul 2023 08:41:27 -0700 Subject: [PATCH 188/392] Fixes for double-free of external streams and auto-optimize removing external lifetime --- dace/codegen/targets/cuda.py | 5 +++-- dace/runtime/include/dace/cuda/cudacommon.cuh | 2 ++ dace/transformation/auto/auto_optimize.py | 4 ++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 1e06a1d3ef..8f0139f8fb 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -377,7 +377,8 @@ def get_generated_codeobjects(self): // Create {backend} streams and events for(int i = 0; i < {nstreams}; ++i) {{ - DACE_GPU_CHECK({backend}StreamCreateWithFlags(&__state->gpu_context->streams[i], {backend}StreamNonBlocking)); + DACE_GPU_CHECK({backend}StreamCreateWithFlags(&__state->gpu_context->internal_streams[i], {backend}StreamNonBlocking)); + __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams }} for(int i = 0; i < {nevents}; ++i) {{ DACE_GPU_CHECK({backend}EventCreateWithFlags(&__state->gpu_context->events[i], {backend}EventDisableTiming)); @@ -398,7 +399,7 @@ def get_generated_codeobjects(self): // Destroy {backend} streams and events for(int i = 0; i < {nstreams}; ++i) {{ - DACE_GPU_CHECK({backend}StreamDestroy(__state->gpu_context->streams[i])); + DACE_GPU_CHECK({backend}StreamDestroy(__state->gpu_context->internal_streams[i])); }} for(int i = 0; i < {nevents}; ++i) {{ DACE_GPU_CHECK({backend}EventDestroy(__state->gpu_context->events[i])); diff --git a/dace/runtime/include/dace/cuda/cudacommon.cuh b/dace/runtime/include/dace/cuda/cudacommon.cuh index 6390c9909c..01d7ca2146 100644 --- a/dace/runtime/include/dace/cuda/cudacommon.cuh +++ b/dace/runtime/include/dace/cuda/cudacommon.cuh @@ -47,11 +47,13 @@ struct Context { int num_streams; int num_events; gpuStream_t *streams; + gpuStream_t *internal_streams; gpuEvent_t *events; gpuError_t lasterror; Context(int nstreams, int nevents) : num_streams(nstreams), num_events(nevents), lasterror((gpuError_t)0) { streams = new gpuStream_t[nstreams]; + internal_streams = new 
gpuStream_t[nstreams]; events = new gpuEvent_t[nevents]; } ~Context() { diff --git a/dace/transformation/auto/auto_optimize.py b/dace/transformation/auto/auto_optimize.py index 6177e9e38e..301846a4f6 100644 --- a/dace/transformation/auto/auto_optimize.py +++ b/dace/transformation/auto/auto_optimize.py @@ -494,6 +494,10 @@ def make_transients_persistent(sdfg: SDFG, not_persistent.add(dnode.data) continue + if desc.lifetime == dtypes.AllocationLifetime.External: + not_persistent.add(dnode.data) + continue + persistent.add(dnode.data) for aname in (persistent - not_persistent): From 832c203598dba64abc093b39743e1b153954ed9e Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 17:52:57 +0200 Subject: [PATCH 189/392] uncommented out tests. --- tests/library/mpi/mpi4py_test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index bbc72bc6c4..a81294c47f 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -258,9 +258,9 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): if __name__ == "__main__": - # test_process_grid_bcast() - # test_sub_grid_bcast() - # test_3mm() - # test_isend_irecv() - # test_send_recv() + test_process_grid_bcast() + test_sub_grid_bcast() + test_3mm() + test_isend_irecv() + test_send_recv() test_alltoall() From 67874c3fbf01eb967b590e94a17f5c592f158b32 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 21:30:56 +0200 Subject: [PATCH 190/392] In case of indirection, treat the slice in the Tasklet's code the same as the memlet's strides. --- dace/codegen/targets/cpp.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 295bf21310..1ec383815d 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -1082,11 +1082,17 @@ def _subscript_expr(self, slicenode: ast.AST, target: str) -> symbolic.SymbolicT ] if isinstance(visited_slice, ast.Tuple): - if len(strides) != len(visited_slice.elts): + # If slice is multi-dimensional and writes to array with more than 1 elements, then: + # - Assume this is indirection (?) + # - Soft-squeeze the slice (remove unit-modes) to match the treatment of the strides above. + desc = self.sdfg.arrays[dname] + if isinstance(desc, data.Array) and data._prod(desc.shape) != 1: + elts = [e for i, e in enumerate(visited_slice.elts) if desc.shape[i] != 1] + if len(strides) != len(elts): raise SyntaxError('Invalid number of dimensions in expression (expected %d, ' - 'got %d)' % (len(strides), len(visited_slice.elts))) + 'got %d)' % (len(strides), len(elts))) - return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(visited_slice.elts, strides)) + return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(elts, strides)) if len(strides) != 1: raise SyntaxError('Missing dimensions in expression (expected %d, got one)' % len(strides)) From 4a375ef533f23a5be4111ec06b29e784f67c18a7 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 21:31:10 +0200 Subject: [PATCH 191/392] Added test. 
--- tests/python_frontend/indirections_test.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/python_frontend/indirections_test.py b/tests/python_frontend/indirections_test.py index c59dffb922..fa6af21e4f 100644 --- a/tests/python_frontend/indirections_test.py +++ b/tests/python_frontend/indirections_test.py @@ -387,6 +387,27 @@ def test_spmv(): assert (np.allclose(y, ref)) +def test_indirection_size_1(): + + def compute_index(scal: dc.int32[5]): + result = 0 + with dace.tasklet: + s << scal + r >> result + r = s[1] + 1 - 1 + return result + + @dc.program + def tester(a: dc.float64[1, 2, 3], scal: dc.int32[5]): + ind = compute_index(scal) + a[0, ind, 0] = 1 + + arr = np.random.rand(1, 2, 3) + scal = np.array([1, 1, 1, 1, 1], dtype=np.int32) + tester(arr, scal) + assert arr[0, 1, 0] == 1 + + if __name__ == "__main__": test_indirection_scalar() test_indirection_scalar_assign() @@ -412,3 +433,4 @@ def test_spmv(): test_indirection_array_nested() test_indirection_array_nested_nsdfg() test_spmv() + test_indirection_size_1() From 5d66e936bd2fe198dd1f23bdfcf090842dfc3619 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 21:35:46 +0200 Subject: [PATCH 192/392] Rolled-back accidental push to master. --- dace/codegen/targets/cpp.py | 12 +++--------- tests/python_frontend/indirections_test.py | 22 ---------------------- 2 files changed, 3 insertions(+), 31 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 1ec383815d..295bf21310 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -1082,17 +1082,11 @@ def _subscript_expr(self, slicenode: ast.AST, target: str) -> symbolic.SymbolicT ] if isinstance(visited_slice, ast.Tuple): - # If slice is multi-dimensional and writes to array with more than 1 elements, then: - # - Assume this is indirection (?) - # - Soft-squeeze the slice (remove unit-modes) to match the treatment of the strides above. 
- desc = self.sdfg.arrays[dname] - if isinstance(desc, data.Array) and data._prod(desc.shape) != 1: - elts = [e for i, e in enumerate(visited_slice.elts) if desc.shape[i] != 1] - if len(strides) != len(elts): + if len(strides) != len(visited_slice.elts): raise SyntaxError('Invalid number of dimensions in expression (expected %d, ' - 'got %d)' % (len(strides), len(elts))) + 'got %d)' % (len(strides), len(visited_slice.elts))) - return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(elts, strides)) + return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(visited_slice.elts, strides)) if len(strides) != 1: raise SyntaxError('Missing dimensions in expression (expected %d, got one)' % len(strides)) diff --git a/tests/python_frontend/indirections_test.py b/tests/python_frontend/indirections_test.py index fa6af21e4f..c59dffb922 100644 --- a/tests/python_frontend/indirections_test.py +++ b/tests/python_frontend/indirections_test.py @@ -387,27 +387,6 @@ def test_spmv(): assert (np.allclose(y, ref)) -def test_indirection_size_1(): - - def compute_index(scal: dc.int32[5]): - result = 0 - with dace.tasklet: - s << scal - r >> result - r = s[1] + 1 - 1 - return result - - @dc.program - def tester(a: dc.float64[1, 2, 3], scal: dc.int32[5]): - ind = compute_index(scal) - a[0, ind, 0] = 1 - - arr = np.random.rand(1, 2, 3) - scal = np.array([1, 1, 1, 1, 1], dtype=np.int32) - tester(arr, scal) - assert arr[0, 1, 0] == 1 - - if __name__ == "__main__": test_indirection_scalar() test_indirection_scalar_assign() @@ -433,4 +412,3 @@ def tester(a: dc.float64[1, 2, 3], scal: dc.int32[5]): test_indirection_array_nested() test_indirection_array_nested_nsdfg() test_spmv() - test_indirection_size_1() From 8981b815300183baba64bad11279deb9d860c907 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 21:38:04 +0200 Subject: [PATCH 193/392] Treat strides and tasklet-code slices the same way. --- dace/codegen/targets/cpp.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 295bf21310..1ec383815d 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -1082,11 +1082,17 @@ def _subscript_expr(self, slicenode: ast.AST, target: str) -> symbolic.SymbolicT ] if isinstance(visited_slice, ast.Tuple): - if len(strides) != len(visited_slice.elts): + # If slice is multi-dimensional and writes to array with more than 1 elements, then: + # - Assume this is indirection (?) + # - Soft-squeeze the slice (remove unit-modes) to match the treatment of the strides above. + desc = self.sdfg.arrays[dname] + if isinstance(desc, data.Array) and data._prod(desc.shape) != 1: + elts = [e for i, e in enumerate(visited_slice.elts) if desc.shape[i] != 1] + if len(strides) != len(elts): raise SyntaxError('Invalid number of dimensions in expression (expected %d, ' - 'got %d)' % (len(strides), len(visited_slice.elts))) + 'got %d)' % (len(strides), len(elts))) - return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(visited_slice.elts, strides)) + return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(elts, strides)) if len(strides) != 1: raise SyntaxError('Missing dimensions in expression (expected %d, got one)' % len(strides)) From a68b23ee1bf29b6733cbbb88b78a67a8a4d79228 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 7 Jul 2023 21:38:18 +0200 Subject: [PATCH 194/392] Added test. 
--- tests/python_frontend/indirections_test.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/python_frontend/indirections_test.py b/tests/python_frontend/indirections_test.py index c59dffb922..fa6af21e4f 100644 --- a/tests/python_frontend/indirections_test.py +++ b/tests/python_frontend/indirections_test.py @@ -387,6 +387,27 @@ def test_spmv(): assert (np.allclose(y, ref)) +def test_indirection_size_1(): + + def compute_index(scal: dc.int32[5]): + result = 0 + with dace.tasklet: + s << scal + r >> result + r = s[1] + 1 - 1 + return result + + @dc.program + def tester(a: dc.float64[1, 2, 3], scal: dc.int32[5]): + ind = compute_index(scal) + a[0, ind, 0] = 1 + + arr = np.random.rand(1, 2, 3) + scal = np.array([1, 1, 1, 1, 1], dtype=np.int32) + tester(arr, scal) + assert arr[0, 1, 0] == 1 + + if __name__ == "__main__": test_indirection_scalar() test_indirection_scalar_assign() @@ -412,3 +433,4 @@ def test_spmv(): test_indirection_array_nested() test_indirection_array_nested_nsdfg() test_spmv() + test_indirection_size_1() From b61acccef0ac8d6ed831c9634e8e759210b766b9 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 10 Jul 2023 13:34:58 +0200 Subject: [PATCH 195/392] Fix for the case where dname was not defined. --- dace/codegen/targets/cpp.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 1ec383815d..afbc6fca12 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -1085,9 +1085,12 @@ def _subscript_expr(self, slicenode: ast.AST, target: str) -> symbolic.SymbolicT # If slice is multi-dimensional and writes to array with more than 1 elements, then: # - Assume this is indirection (?) # - Soft-squeeze the slice (remove unit-modes) to match the treatment of the strides above. - desc = self.sdfg.arrays[dname] - if isinstance(desc, data.Array) and data._prod(desc.shape) != 1: - elts = [e for i, e in enumerate(visited_slice.elts) if desc.shape[i] != 1] + if target not in self.constants: + desc = self.sdfg.arrays[dname] + if isinstance(desc, data.Array) and data._prod(desc.shape) != 1: + elts = [e for i, e in enumerate(visited_slice.elts) if desc.shape[i] != 1] + else: + elts = visited_slice.elts if len(strides) != len(elts): raise SyntaxError('Invalid number of dimensions in expression (expected %d, ' 'got %d)' % (len(strides), len(elts))) From 4b055e60568810192d025f4068a8a88940a30dbb Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 10 Jul 2023 21:39:30 +0200 Subject: [PATCH 196/392] WIP: Reworking intemediate node removal by tracking dependencies of intermediate accesses to same data. --- .../subgraph/subgraph_fusion.py | 129 ++++++++++++------ 1 file changed, 84 insertions(+), 45 deletions(-) diff --git a/dace/transformation/subgraph/subgraph_fusion.py b/dace/transformation/subgraph/subgraph_fusion.py index fa66319ddf..5cd693eb03 100644 --- a/dace/transformation/subgraph/subgraph_fusion.py +++ b/dace/transformation/subgraph/subgraph_fusion.py @@ -1,7 +1,7 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -""" This module contains classes that implement subgraph fusion -""" +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" This module contains classes that implement subgraph fusion. 
""" import dace +import networkx as nx from dace import dtypes, registry, symbolic, subsets, data from dace.sdfg import nodes, utils, replace, SDFG, scope_contains_scope @@ -1144,7 +1144,35 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s # Try to remove intermediate nodes that are not contained in the subgraph # by reconnecting their adjacent edges to nodes outside the subgraph. - for node in intermediate_nodes: + + intermediate_data = dict() + for acc in intermediate_nodes: + if acc.data in intermediate_data: + intermediate_data[acc.data].append(acc) + else: + intermediate_data[acc.data] = [acc] + + filtered_intermediate_nodes = [] + intermediate_nodes_deps = dict() + for _, accesses in intermediate_data.items(): + if len(accesses) == 1: + filtered_intermediate_nodes.append(accesses[0]) + else: + accesses_copy = list(accesses) + access_dict = {a: [] for a in accesses} + for acc in accesses: + other_accesses = [a for a in accesses_copy if a != acc] + for other_acc in other_accesses: + if nx.has_path(graph.nx, other_acc, acc): + accesses_copy.remove(other_acc) + access_dict[acc].append(other_acc) + access_dict[acc].extend(access_dict[other_acc]) + del access_dict[other_acc] + for acc in accesses_copy: + filtered_intermediate_nodes.append(acc) + intermediate_nodes_deps[acc] = [a for a in access_dict[acc]] + + for node in filtered_intermediate_nodes: # Checking if data are contained in the subgraph if not subgraph_contains_data[node.data]: # Find existing outer access nodes @@ -1163,59 +1191,70 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s # Compute the union of all incoming subsets. # TODO: Do we expect this operation to ever fail? in_subset: subsets.Subset = None - for ie in graph.in_edges(node): - if in_subset: - in_subset = subsets.union(in_subset, ie.data.dst_subset) - else: - in_subset = ie.data.dst_subset + accesses = [node] + intermediate_nodes_deps[node] + for acc in accesses: + for ie in graph.in_edges(acc): + if in_subset: + in_subset = subsets.union(in_subset, ie.data.dst_subset) + else: + in_subset = ie.data.dst_subset + # for ie in graph.in_edges(node): + # if in_subset: + # in_subset = subsets.union(in_subset, ie.data.dst_subset) + # else: + # in_subset = ie.data.dst_subset # Create transient data corresponding to the union of the incoming subsets. desc = sdfg.arrays[node.data] - name, new_desc = sdfg.add_temp_transient(in_subset.bounding_box_size(), desc.dtype, desc.storage) - new_node = graph.add_access(name) + name, _ = sdfg.add_temp_transient(in_subset.bounding_box_size(), desc.dtype, desc.storage) # Reconnect incoming edges through the transient data. - for ie in graph.in_edges(node): - mem = Memlet(data=name, - subset=ie.data.dst_subset.offset_new(in_subset, True), - other_subset=ie.data.src_subset) - new_edge = graph.add_edge(ie.src, ie.src_conn, new_node, None, mem) - to_remove.add(ie) - # Update memlet paths. - for e in graph.memlet_path(new_edge): - if e.data.data == node.data: - e.data.data = name - e.data.dst_subset.offset(in_subset, True) + for acc in accesses: + + new_node = graph.add_access(name) - # Reconnect outgoing edges through the transient data. 
- for oe in graph.out_edges(node): - if in_subset.covers(oe.data.src_subset): + for ie in graph.in_edges(acc): mem = Memlet(data=name, - subset=oe.data.src_subset.offset_new(in_subset, True), - other_subset=oe.data.dst_subset) - new_edge = graph.add_edge(new_node, None, oe.dst, oe.dst_conn, mem) + subset=ie.data.dst_subset.offset_new(in_subset, True), + other_subset=ie.data.src_subset) + new_edge = graph.add_edge(ie.src, ie.src_conn, new_node, None, mem) + to_remove.add(ie) # Update memlet paths. for e in graph.memlet_path(new_edge): if e.data.data == node.data: e.data.data = name - e.data.src_subset.offset(in_subset, True) - else: - # If the outgoing subset is not covered by the transient data, connect to the outer input node. - if not inode: - inode = graph.add_access(node.data) - graph.add_memlet_path(inode, global_map_entry, oe.dst, memlet=oe.data, dst_conn=oe.dst_conn) - to_remove.add(oe) - - # Connect transient data to the outer output node. - if not onode: - onode = graph.add_access(node.data) - graph.add_memlet_path(new_node, - global_map_exit, - onode, - memlet=Memlet(data=node.data, subset=in_subset), - src_conn=None) + e.data.dst_subset.offset(in_subset, True) + + # Reconnect outgoing edges through the transient data. + for oe in graph.out_edges(acc): + if in_subset.covers(oe.data.src_subset): + mem = Memlet(data=name, + subset=oe.data.src_subset.offset_new(in_subset, True), + other_subset=oe.data.dst_subset) + new_edge = graph.add_edge(new_node, None, oe.dst, oe.dst_conn, mem) + # Update memlet paths. + for e in graph.memlet_path(new_edge): + if e.data.data == node.data: + e.data.data = name + e.data.src_subset.offset(in_subset, True) + else: + # If the outgoing subset is not covered by the transient data, connect to the outer input node. + if not inode: + inode = graph.add_access(node.data) + graph.add_memlet_path(inode, global_map_entry, oe.dst, memlet=oe.data, dst_conn=oe.dst_conn) + to_remove.add(oe) + + # Connect transient data to the outer output node. + if acc is node: + if not onode: + onode = graph.add_access(node.data) + graph.add_memlet_path(new_node, + global_map_exit, + onode, + memlet=Memlet(data=node.data, subset=in_subset), + src_conn=None) for e in to_remove: graph.remove_edge(e) if to_remove: - graph.remove_node(node) + graph.remove_nodes_from(accesses) From b7806f0f028805615b3f435692ce4cab958ca295 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 10 Jul 2023 21:39:45 +0200 Subject: [PATCH 197/392] WIP: Test --- .../subgraph_fusion/intermediate_mimo_test.py | 45 +++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/tests/transformations/subgraph_fusion/intermediate_mimo_test.py b/tests/transformations/subgraph_fusion/intermediate_mimo_test.py index 008ea75bca..850c4aca07 100644 --- a/tests/transformations/subgraph_fusion/intermediate_mimo_test.py +++ b/tests/transformations/subgraph_fusion/intermediate_mimo_test.py @@ -1,9 +1,8 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import copy import dace from dace.sdfg import nodes from dace.sdfg.graph import SubgraphView -from dace.transformation.dataflow import MapFission from dace.transformation.helpers import nest_state_subgraph import numpy as np import unittest @@ -101,5 +100,45 @@ def test_mimo(): _test_quantitatively(sdfg) +def test_single_data_multiple_intermediate_accesses(): + + @dace.program + def sdmi_accesses(ZSOLQA: dace.float64[1, 5, 5], ZEPSEC: dace.float64, ZQX: dace.float64[1, 137, 5], + LLINDEX3: dace.bool[1, 5, 5], ZRATIO: dace.float64[1, 5], ZSINKSUM: dace.float64[1, 5]): + + for i in dace.map[0:5]: + ZSINKSUM[0, i] = 0.0 + for j in dace.map[0:5]: + LLINDEX3[0, j, i] = False + + for i in dace.map[0:5]: + for k in range(5): + ZSINKSUM[0, i] = ZSINKSUM[0, i] - ZSOLQA[0, 0, k] + + for i in dace.map[0:5]: + t0 = max(ZEPSEC, ZQX[0, 0, i]) + t1 = max(t0, ZSINKSUM[0, i]) + ZRATIO[0, i] = t0 / t1 + + sdfg = sdmi_accesses.to_sdfg(simplify=True) + assert len(sdfg.states()) == 1 + + graph = sdfg.states()[0] + subgraph = SubgraphView(graph, [node for node in graph.nodes()]) + + me = MultiExpansion() + me.setup_match(subgraph) + assert me.can_be_applied(sdfg, subgraph) == True + me.apply(sdfg) + + sf = SubgraphFusion() + sf.setup_match(subgraph) + assert sf.can_be_applied(sdfg, subgraph) == True + sf.apply(sdfg) + + sdfg.view() + + if __name__ == '__main__': - test_mimo() + # test_mimo() + test_single_data_multiple_intermediate_accesses() From 767557d4a39f2e94d5324344d5438abe47c67709 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 9 Mar 2022 18:58:44 +0100 Subject: [PATCH 198/392] Working on tensor-transpose library. --- .../ttranspose/environments/__init__.py | 2 + .../libraries/ttranspose/environments/hptt.py | 21 ++++ dace/libraries/ttranspose/nodes/ttranspose.py | 105 ++++++++++++++++++ 3 files changed, 128 insertions(+) create mode 100644 dace/libraries/ttranspose/environments/__init__.py create mode 100644 dace/libraries/ttranspose/environments/hptt.py create mode 100644 dace/libraries/ttranspose/nodes/ttranspose.py diff --git a/dace/libraries/ttranspose/environments/__init__.py b/dace/libraries/ttranspose/environments/__init__.py new file mode 100644 index 0000000000..0c6487def2 --- /dev/null +++ b/dace/libraries/ttranspose/environments/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +from .hptt import * diff --git a/dace/libraries/ttranspose/environments/hptt.py b/dace/libraries/ttranspose/environments/hptt.py new file mode 100644 index 0000000000..c289abc96b --- /dev/null +++ b/dace/libraries/ttranspose/environments/hptt.py @@ -0,0 +1,21 @@ +# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +from dace import library + + +@library.environment +class HPTT: + + cmake_minimum_version = None + cmake_packages = [] + cmake_variables = {} + cmake_includes = [] + cmake_libraries = [] + cmake_compile_flags = ["-I${HPTT_ROOT}/include"] + cmake_link_flags = ["-L${HPTT_ROOT}/lib -lhptt"] + cmake_files = [] + + headers = ["http.h"] + state_fields = [] + init_code = "" + finalize_code = "" + dependencies = [] diff --git a/dace/libraries/ttranspose/nodes/ttranspose.py b/dace/libraries/ttranspose/nodes/ttranspose.py new file mode 100644 index 0000000000..8ec026ea1c --- /dev/null +++ b/dace/libraries/ttranspose/nodes/ttranspose.py @@ -0,0 +1,105 @@ +# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. 
+import dace +from dace import library, nodes, properties, subsets +from dace.transformation.transformation import ExpandTransformation +from numbers import Number + + +@library.expansion +class ExpandPure(ExpandTransformation): + """ Implements the pure expansion of TensorTranspose library node. """ + + environments = [] + + @staticmethod + def expansion(node, parent_state, parent_sdfg): + inp_tensor, out_tensor = node.validate(parent_sdfg, parent_state) + + sdfg = dace.SDFG(f"{node.label}_sdfg") + _, inp_arr = sdfg.add_array("_inp", inp_tensor.shape, inp_tensor.dtype, inp_tensor.storage, strides=inp_tensor.storage) + _, out_arr = sdfg.add_array("_out", out_tensor.shape, out_tensor.dtype, out_tensor.storage, strides=out_tensor.storage) + + state = sdfg.add_state(f"{node.label}_state") + inp_rng = subsets.Range.from_array(inp_arr) + map_params = [f"__i{i}" for i in range(inp_arr.shape)] + map_rng = {i: subsets.Range([r])for i, r in zip(map_params, inp_rng)} + inp_mem = dace.Memlet(expr=f"_inp[{','.join([map_params])}]") + out_mem = dace.Memlet(expr=f"_out[{','.join(map_params[node.axes])}]") + inputs = {"_inp": inp_mem} + outputs = {"_out": out_mem} + code = f"_out = {node.alpha} * _inp" + if node.beta != 0: + inputs["_inout": out_mem] + code = f"_out = {node.alpha} * _inp + {node.beta} * _inout" + state.add_mapped_tasklet(f"{node.label}_tasklet", map_rng, inputs, code, outputs, external_edges=True) + + return sdfg + + +@library.expansion +class ExpantHPTT(ExpandTransformation): + """ + Implements the TensorTranspose library node using the High-Performance Tensor Transpose Library (HPTT). + For more information, see https://github.com/springer13/hptt. + """ + pass + + + +@library.node +class TensorTranspose(nodes.LibraryNode): + """ Implements out-of-place tensor transpositions. """ + + implementations = { + "pure": ExpandPure, + "HPTT": ExpandHPTT + } + default_implementation = "HPTT" + + axes = properties.ListProperty(element_type=int, default=[], description="Permutation of input tensor's modes") + alpha = properties.Property(dtype=Number, default=1, description="Input tensor scaling factor") + beta = properties.Property(dtype=Number, default=0, description="Output tensor scaling factor") + + def __init__(self, name, axes=[], alpha=1, beta=0, *args, **kwargs): + super().__init__(name, *args, inputs={"_inp_tensor"}, outputs={"_out_tensor"}, **kwargs) + self.axes = axes + self.alpha = alpha + self.beta = beta + + def validate(self, sdfg, state): + """ + Validates the tensor transposition operation. + :return: A tuple (inp_tensor, out_tensor) for the data descriptors in the parent SDFG. 
+ """ + + inp_tensor, out_tensor = None, None + for e in state.out_edges(self): + if e.src_conn == "_out_tensor": + out_tensor = sdfg.arrays[e.data.data] + for e in state.in_edges(self): + if e.dst_conn == "_inp_tensor": + inp_tensor = sdfg.arrays[e.data.data] + + if not inp_tensor: + raise ValueError("Missing the input tensor.") + if not out_tensor: + raise ValueError("Missing the output tensor.") + + if inp_tensor.dtype != out_tensor.dtype: + raise ValueError("The datatype of the input and output tensors must match.") + + if inp_tensor.storage != out_tensor.storage: + raise ValueError("The storage of the input and output tensors must match.") + + if len(inp_tensor.shape) != len(out_tensor.shape): + raise ValueError("The input and output tensors must have the same number of modes.") + if len(inp_tensor.shape) != len(self.axes): + raise ValueError("The axes list property must have as many elements as the number of tensor modes.") + if sorted(self.axes) != list(range(len(inp_tensor.shape))): + raise ValueError("The axes list property is not a perimutation of the input tensor's modes.") + + transposed_shape = [inp_tensor.shape[t] for t in self.axes] + if transposed_shape != list(out_tensor.shape): + raise ValueError("The permutation of the input shape does not match the output shape.") + + return inp_tensor, out_tensor From c8291dc856e0fabd53391924c3f6802887c40d21 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 9 Mar 2022 21:29:36 +0100 Subject: [PATCH 199/392] Small fixes in pure expansion of TensorTranspose. --- dace/libraries/ttranspose/__init__.py | 6 ++++ dace/libraries/ttranspose/nodes/__init__.py | 2 ++ dace/libraries/ttranspose/nodes/ttranspose.py | 29 ++++++++++--------- 3 files changed, 23 insertions(+), 14 deletions(-) create mode 100644 dace/libraries/ttranspose/__init__.py create mode 100644 dace/libraries/ttranspose/nodes/__init__.py diff --git a/dace/libraries/ttranspose/__init__.py b/dace/libraries/ttranspose/__init__.py new file mode 100644 index 0000000000..512d177beb --- /dev/null +++ b/dace/libraries/ttranspose/__init__.py @@ -0,0 +1,6 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +from dace.library import register_library +from .nodes import * +from .environments import * + +register_library(__name__, "ttranpose") \ No newline at end of file diff --git a/dace/libraries/ttranspose/nodes/__init__.py b/dace/libraries/ttranspose/nodes/__init__.py new file mode 100644 index 0000000000..0babde8a78 --- /dev/null +++ b/dace/libraries/ttranspose/nodes/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserv +from .ttranspose import TensorTranspose diff --git a/dace/libraries/ttranspose/nodes/ttranspose.py b/dace/libraries/ttranspose/nodes/ttranspose.py index 8ec026ea1c..e852374936 100644 --- a/dace/libraries/ttranspose/nodes/ttranspose.py +++ b/dace/libraries/ttranspose/nodes/ttranspose.py @@ -3,6 +3,7 @@ from dace import library, nodes, properties, subsets from dace.transformation.transformation import ExpandTransformation from numbers import Number +from .. 
import environments @library.expansion @@ -16,20 +17,19 @@ def expansion(node, parent_state, parent_sdfg): inp_tensor, out_tensor = node.validate(parent_sdfg, parent_state) sdfg = dace.SDFG(f"{node.label}_sdfg") - _, inp_arr = sdfg.add_array("_inp", inp_tensor.shape, inp_tensor.dtype, inp_tensor.storage, strides=inp_tensor.storage) - _, out_arr = sdfg.add_array("_out", out_tensor.shape, out_tensor.dtype, out_tensor.storage, strides=out_tensor.storage) + _, inp_arr = sdfg.add_array("_inp_tensor", inp_tensor.shape, inp_tensor.dtype, inp_tensor.storage, strides=inp_tensor.strides) + _, out_arr = sdfg.add_array("_out_tensor", out_tensor.shape, out_tensor.dtype, out_tensor.storage, strides=out_tensor.strides) state = sdfg.add_state(f"{node.label}_state") - inp_rng = subsets.Range.from_array(inp_arr) - map_params = [f"__i{i}" for i in range(inp_arr.shape)] - map_rng = {i: subsets.Range([r])for i, r in zip(map_params, inp_rng)} - inp_mem = dace.Memlet(expr=f"_inp[{','.join([map_params])}]") - out_mem = dace.Memlet(expr=f"_out[{','.join(map_params[node.axes])}]") + map_params = [f"__i{i}" for i in range(len(inp_arr.shape))] + map_rng = {i: f"0:{s}"for i, s in zip(map_params, inp_arr.shape)} + inp_mem = dace.Memlet(expr=f"_inp_tensor[{','.join(map_params)}]") + out_mem = dace.Memlet(expr=f"_out_tensor[{','.join([map_params[i] for i in node.axes])}]") inputs = {"_inp": inp_mem} outputs = {"_out": out_mem} code = f"_out = {node.alpha} * _inp" if node.beta != 0: - inputs["_inout": out_mem] + inputs["_inout"] = out_mem code = f"_out = {node.alpha} * _inp + {node.beta} * _inout" state.add_mapped_tasklet(f"{node.label}_tasklet", map_rng, inputs, code, outputs, external_edges=True) @@ -37,12 +37,13 @@ def expansion(node, parent_state, parent_sdfg): @library.expansion -class ExpantHPTT(ExpandTransformation): +class ExpandHPTT(ExpandTransformation): """ Implements the TensorTranspose library node using the High-Performance Tensor Transpose Library (HPTT). For more information, see https://github.com/springer13/hptt. """ - pass + + environments = [environments.HPTT] @@ -54,11 +55,11 @@ class TensorTranspose(nodes.LibraryNode): "pure": ExpandPure, "HPTT": ExpandHPTT } - default_implementation = "HPTT" + default_implementation = "pure" - axes = properties.ListProperty(element_type=int, default=[], description="Permutation of input tensor's modes") - alpha = properties.Property(dtype=Number, default=1, description="Input tensor scaling factor") - beta = properties.Property(dtype=Number, default=0, description="Output tensor scaling factor") + axes = properties.ListProperty(element_type=int, default=[], desc="Permutation of input tensor's modes") + alpha = properties.Property(dtype=Number, default=1, desc="Input tensor scaling factor") + beta = properties.Property(dtype=Number, default=0, desc="Output tensor scaling factor") def __init__(self, name, axes=[], alpha=1, beta=0, *args, **kwargs): super().__init__(name, *args, inputs={"_inp_tensor"}, outputs={"_out_tensor"}, **kwargs) From c569286ce10bdbd1e8b744d651b1425b348c6622 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 9 Mar 2022 21:36:34 +0100 Subject: [PATCH 200/392] Improvements to numpy transpose replacement to follow numpy semantics. 
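For reference, a minimal usage sketch of the NumPy semantics this change targets (the program name and shapes below are illustrative only, not part of the patch): an explicit axes argument permutes the tensor modes, while omitting axes reverses them.

    import dace
    import numpy as np

    @dace.program
    def permute(A: dace.float32[10, 5, 3, 2]):
        # Explicit mode permutation; np.transpose(A) with no axes would reverse the modes.
        return np.transpose(A, axes=[3, 1, 0, 2])

    if __name__ == '__main__':
        A = np.random.rand(10, 5, 3, 2).astype(np.float32)
        assert np.allclose(permute(A), np.transpose(A, axes=[3, 1, 0, 2]))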
--- dace/frontend/python/replacements.py | 19 +++++++++++-------- tests/numpy/transpose_test.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 3586d40374..00c7fc15c2 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -783,14 +783,17 @@ def _transpose(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, inpname: str, a state.add_node(tasklet) state.add_edge(acc1, None, tasklet, '_inp', Memlet.from_array(inpname, arr1)) state.add_edge(tasklet, '_out', acc2, None, Memlet.from_array(outname, arr2)) - else: - state.add_mapped_tasklet( - "_transpose_", {"_i{}".format(i): "0:{}".format(s) - for i, s in enumerate(arr1.shape)}, - dict(_in=Memlet.simple(inpname, ", ".join("_i{}".format(i) for i, _ in enumerate(arr1.shape)))), - "_out = _in", - dict(_out=Memlet.simple(outname, ", ".join("_i{}".format(axes[i]) for i, _ in enumerate(arr1.shape)))), - external_edges=True) + else: # tensor transpose + if len(axes) != len(arr1.shape) or sorted(axes) != list(range(len(arr1.shape))): + raise ValueError("axes don't match array") + + read = state.add_read(inpname) + write = state.add_write(outname) + from dace.libraries.ttranspose import TensorTranspose + tasklet = TensorTranspose('_TensorTranspose', axes or list(range(len(arr1.shape)))) + state.add_node(tasklet) + state.add_edge(read, None, tasklet, '_inp_tensor', Memlet.from_array(inpname, arr1)) + state.add_edge(tasklet, '_out_tensor', write, None, Memlet.from_array(outname, arr2)) return outname diff --git a/tests/numpy/transpose_test.py b/tests/numpy/transpose_test.py index bdbf687c37..5dd73af730 100644 --- a/tests/numpy/transpose_test.py +++ b/tests/numpy/transpose_test.py @@ -25,6 +25,14 @@ def test_transpose_axes1(A: dace.float32[10, 5, 3, 2]): def test_transpose_axes2(A: dace.float32[10, 5, 3, 2]): return np.transpose(A, axes=[3, 0, 2]) +@compare_numpy_output() +def test_transpose_none(A: dace.float32[10, 5, 3, 2]): + return np.transpose(A) + +@compare_numpy_output() +def test_transpose_no(A: dace.float32[10, 5, 3, 2]): + return np.transpose(A, axes=[0, 1, 2, 3]) + def test_transpose(): A = np.random.rand(M, N).astype(np.float32) @@ -42,3 +50,5 @@ def test_transpose(): test_transpose_axes1() test_transpose_axes2() test_transpose() + test_transpose_none() + test_transpose_no() From 459e07c5e466bccd2273fdf7b64a310823fac479 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 9 Mar 2022 22:52:34 +0100 Subject: [PATCH 201/392] Added HPTT expansion. 
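Rough usage sketch, not part of the patch itself: the expansion assumes an HPTT installation whose prefix is exported as HPTT_ROOT before the program is compiled. The install path below is a placeholder, and the implementation is selected through the usual DaCe library-node mechanism.

    import os
    import numpy as np
    import dace
    from dace.libraries.ttranspose import TensorTranspose

    os.environ['HPTT_ROOT'] = '/opt/hptt'  # placeholder: point this at your HPTT install prefix

    @dace.program
    def swap_inner(A: dace.float64[8, 4, 2]):
        # Permutation that is lowered to a TensorTranspose library node.
        return np.transpose(A, axes=[0, 2, 1])

    if __name__ == '__main__':
        # Request the HPTT expansion for all TensorTranspose nodes before first compilation.
        TensorTranspose.default_implementation = 'HPTT'
        A = np.random.rand(8, 4, 2)
        assert np.allclose(swap_inner(A), A.transpose(0, 2, 1))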
--- dace/libraries/ttranspose/__init__.py | 2 +- .../libraries/ttranspose/environments/hptt.py | 29 +++++++++++++++---- dace/libraries/ttranspose/nodes/ttranspose.py | 28 ++++++++++++++++-- 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/dace/libraries/ttranspose/__init__.py b/dace/libraries/ttranspose/__init__.py index 512d177beb..6c49f26fa6 100644 --- a/dace/libraries/ttranspose/__init__.py +++ b/dace/libraries/ttranspose/__init__.py @@ -3,4 +3,4 @@ from .nodes import * from .environments import * -register_library(__name__, "ttranpose") \ No newline at end of file +register_library(__name__, "ttranspose") \ No newline at end of file diff --git a/dace/libraries/ttranspose/environments/hptt.py b/dace/libraries/ttranspose/environments/hptt.py index c289abc96b..67a6057ed0 100644 --- a/dace/libraries/ttranspose/environments/hptt.py +++ b/dace/libraries/ttranspose/environments/hptt.py @@ -1,5 +1,7 @@ # Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. -from dace import library +import ctypes +import os +from dace import config, library @library.environment @@ -8,14 +10,31 @@ class HPTT: cmake_minimum_version = None cmake_packages = [] cmake_variables = {} - cmake_includes = [] cmake_libraries = [] - cmake_compile_flags = ["-I${HPTT_ROOT}/include"] - cmake_link_flags = ["-L${HPTT_ROOT}/lib -lhptt"] + cmake_compile_flags = [] + cmake_link_flags = [] cmake_files = [] - headers = ["http.h"] + headers = ["hptt.h"] state_fields = [] init_code = "" finalize_code = "" dependencies = [] + + @staticmethod + def cmake_includes(): + if 'HPTT_ROOT' in os.environ: + return [os.path.join(os.environ['HPTT_ROOT'], 'include')] + else: + return [] + + @staticmethod + def cmake_libraries(): + if 'HPTT_ROOT' in os.environ: + prefix = config.Config.get('compiler', 'library_prefix') + suffix = config.Config.get('compiler', 'library_extension') + libfile = os.path.join(os.environ['HPTT_ROOT'], 'lib', prefix + 'hptt.' + suffix) + if os.path.isfile(libfile): + return [libfile] + + return ['hptt'] diff --git a/dace/libraries/ttranspose/nodes/ttranspose.py b/dace/libraries/ttranspose/nodes/ttranspose.py index e852374936..42582168db 100644 --- a/dace/libraries/ttranspose/nodes/ttranspose.py +++ b/dace/libraries/ttranspose/nodes/ttranspose.py @@ -1,6 +1,9 @@ # Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. import dace -from dace import library, nodes, properties, subsets +import multiprocessing +from dace import library, nodes, properties +from dace.libraries.blas import blas_helpers +from dace.symbolic import symstr from dace.transformation.transformation import ExpandTransformation from numbers import Number from .. 
import environments @@ -45,6 +48,27 @@ class ExpandHPTT(ExpandTransformation): environments = [environments.HPTT] + @staticmethod + def expansion(node, parent_state, parent_sdfg): + inp_tensor, out_tensor = node.validate(parent_sdfg, parent_state) + axes = ','.join([symstr(a) for a in node.axes]) + shape = ','.join([symstr(s) for s in inp_tensor.shape]) + dchar = blas_helpers.to_blastype(inp_tensor.dtype.type).lower() + alpha = symstr(node.alpha) + beta = symstr(node.beta) + code = f""" + int perm[{len(inp_tensor.shape)}] = {{{axes}}}; + int size[{len(inp_tensor.shape)}] = {{{shape}}}; + {dchar}TensorTranspose(perm, {len(inp_tensor.shape)}, {alpha}, _inp_tensor, size, NULL, {beta}, _out_tensor, NULL, {multiprocessing.cpu_count()}, 1); + """ + + tasklet = nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP) + + return tasklet @library.node @@ -55,7 +79,7 @@ class TensorTranspose(nodes.LibraryNode): "pure": ExpandPure, "HPTT": ExpandHPTT } - default_implementation = "pure" + default_implementation = None axes = properties.ListProperty(element_type=int, default=[], desc="Permutation of input tensor's modes") alpha = properties.Property(dtype=Number, default=1, desc="Input tensor scaling factor") From f272b051fc522def83aac9df5eb5aa69264086db Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 9 Mar 2022 22:53:04 +0100 Subject: [PATCH 202/392] Added ttranspose library to the configuration schema. --- dace/config_schema.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/dace/config_schema.yml b/dace/config_schema.yml index e378b6c1f2..ea862e5237 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -1016,3 +1016,18 @@ required: description: > Force the default implementation, even if an implementation has been explicitly set on a node. + ttranspose: + type: dict + title: ttranspose + description: Built-in Tensor-Transpose DaCe library. + required: + default_implementation: + type: str + default: pure + description: Default implementation of Tensor-Transpose library nodes. + override: + type: bool + default: false + description: > + Force the default implementation, even if an + implementation has been explicitly set on a node. From ba77a1f32bfad44218785c6ff65fe66b22e677de Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 9 Mar 2022 22:53:20 +0100 Subject: [PATCH 203/392] Added HPTT tests. --- pytest.ini | 1 + tests/numpy/transpose_test.py | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/pytest.ini b/pytest.ini index 340e3d3620..087be3d897 100644 --- a/pytest.ini +++ b/pytest.ini @@ -13,6 +13,7 @@ markers = mpi: Test requires MPI. (select with '-m mpi') scalapack: Test requires ScaLAPACK (Intel MKL and OpenMPI). (select with '-m scalapack') datainstrument: Test uses data instrumentation (select with '-m datainstrument') + hptt: Test requires the HPTT library (select with '-m "hptt') python_files = *_test.py *_cudatest.py diff --git a/tests/numpy/transpose_test.py b/tests/numpy/transpose_test.py index 5dd73af730..df2c9ff2e5 100644 --- a/tests/numpy/transpose_test.py +++ b/tests/numpy/transpose_test.py @@ -1,6 +1,7 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
import numpy as np import dace +import pytest from common import compare_numpy_output M, N = 24, 24 @@ -45,6 +46,15 @@ def test_transpose(): assert rel_error <= 1e-5 +@pytest.mark.hptt +def test_hptt(): + with dace.config.set_temporary('library', 'ttranspose', 'default_implementation', value='HPTT'): + test_transpose_axes0() + test_transpose_axes1() + test_transpose_axes2() + test_transpose_none() + + if __name__ == '__main__': test_transpose_axes0() test_transpose_axes1() @@ -52,3 +62,4 @@ def test_transpose(): test_transpose() test_transpose_none() test_transpose_no() + test_hptt() From c0dbfa2289ff02d8177f9aa7dd8e5b26bae38c2c Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 9 Mar 2022 23:02:10 +0100 Subject: [PATCH 204/392] Added TypeError exception for unsupported datatypes. --- dace/libraries/ttranspose/nodes/ttranspose.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dace/libraries/ttranspose/nodes/ttranspose.py b/dace/libraries/ttranspose/nodes/ttranspose.py index 42582168db..c8ecd00136 100644 --- a/dace/libraries/ttranspose/nodes/ttranspose.py +++ b/dace/libraries/ttranspose/nodes/ttranspose.py @@ -54,6 +54,8 @@ def expansion(node, parent_state, parent_sdfg): axes = ','.join([symstr(a) for a in node.axes]) shape = ','.join([symstr(s) for s in inp_tensor.shape]) dchar = blas_helpers.to_blastype(inp_tensor.dtype.type).lower() + if dchar not in ('s', 'd', 'c', 'z'): + raise TypeError("HPTT supports only single and double (and corresponding complex) FP datatypes") alpha = symstr(node.alpha) beta = symstr(node.beta) code = f""" From 9bb00a964005c56ebbdbceabc919cb0038ef3a6a Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 10 Mar 2022 11:13:28 +0100 Subject: [PATCH 205/392] Started working on TensorDot library node (see numpy linalg). --- dace/libraries/linalg/nodes/tensordot.py | 124 +++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 dace/libraries/linalg/nodes/tensordot.py diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py new file mode 100644 index 0000000000..5d3ed3c68b --- /dev/null +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -0,0 +1,124 @@ +# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import multiprocessing +from dace import library, nodes, properties +from dace.libraries.blas import blas_helpers +from dace.symbolic import symstr +from dace.transformation.transformation import ExpandTransformation +from numbers import Number +from .. import environments + + +@library.expansion +class ExpandPure(ExpandTransformation): + """ Implements the pure expansion of TensorDot library node. """ + + environments = [] + + +@library.expansion +class ExpandTTGT(ExpandTransformation): + """ + Expands the TensorDot library node to TensorTranspose + GEMM operations. + TTGT stands for Transpose-Transpose-GEMM-Transpose. + """ + + environments = [] + + +@library.expansion +class ExpandCuTensor(ExpandTransformation): + """ + Implements the TensorDot library node using cuTENSOR for CUDA-compatible GPUs. + For more information, see https://developer.nvidia.com/cutensor. + """ + + +@library.node +class TensorDot(nodes.LibraryNode): + """ Implements tensor dot-product. 
""" + + implementations = { + "pure": ExpandPure, + "TTGT": ExpandTTGT, + "cuTENSOR": ExpandCuTensor + } + default_implementation = None + + left_axes = properties.ListProperty(element_type=int, default=[], desc="Left tensor's contracting modes") + right_axes = properties.ListProperty(element_type=int, default=[], desc="Right tensor's contracting modes") + permutation = properties.ListProperty(element_type=int, allow_none=True, default=None, desc="Permutation of the output tensor") + + def __init__(self, name, left_axes=[], right_axes=[], *args, **kwargs): + super().__init__(name, *args, inputs={"_left_tensor", "_right_tensor"}, outputs={"_out_tensor"}, **kwargs) + self.left_axes = left_axes + self.right_axes = right_axes + + def validate(self, sdfg, state): + """ + Validates the tensor dot-product operation. + :return: A triple (left_tensor, right_tensor, out_tensor) for the data descriptors in the parent SDFG. + """ + + left_tensor, right_tensor, out_tensor = None, None + for e in state.out_edges(self): + if e.src_conn == "_out_tensor": + out_tensor = sdfg.arrays[e.data.data] + for e in state.in_edges(self): + if e.dst_conn == "_left_tensor": + left_tensor = sdfg.arrays[e.data.data] + elif e.dst_conn == "_right_tensor": + right_tensor = sdfg.arrays[e.data.data] + + if not left_tensor or not right_tensor: + raise ValueError("Missing the input tensors.") + if not out_tensor: + raise ValueError("Missing the output tensor.") + + if left_tensor.dtype != right_tensor.dtype or left_tensor.dtype != out_tensor.dtype: + raise TypeError("The datatype of the input and output tensors must match.") + if left_tensor.storage != right_tensor.storage or left_tensor.storage != out_tensor.storage: + raise ValueError("The storage of the input and output tensors must match.") + + if any(a >= len(left_tensor.shape) or a < 0 for a in self.left_axes): + raise ValueError("Axes for left tensor are out-of-bounds.") + if any(a >= len(right_tensor.shape) or a < 0 for a in self.right_axes): + raise ValueError("Axes for right tensor are out-of-bounds.") + if len(self.left_axes) != len(self.right_axes): + raise ValueError("The input tensors must have the same number of contracting modes.") + if any(left_tensor.shape[l] != left_tensor.shape[r] for l, r in zip(self.left_axes, self.right_axes)): + raise ValueError("The input tensors' contracting modes must have the same length.") + + dot_shape = [s for i, s in enumerate(left_tensor.shape) if i not in self.left_axes] + dot_shape.extend([s for i, s in enumerate(right_tensor.shape) if i not in self.right_axes]) + out_shape = list(out_tensor.shape) + if len(dot_shape) != len(out_shape): + raise ValueError("The intermediate (dot-product) and output tensors must have the same number of modes..") + + # # We check if the output shape is a permutation of a dot-product shape. + # # NOTE: Since the shapes may be symbolic, we cannot just sort and compare them. + # for s in out_shape: + # try: + # idx = dot_shape.index(s) + # dot_shape.pop(idx) + # except ValueError: + # raise ValueError("The output tensor shape is not a permutation of the dot-product shape.") + # if dot_shape: + # raise ValueError("The output tensor shape is not a permutation of the dot-product shape.") + + + if not self.permutation: + if dot_shape != out_shape: + raise ValueError("The shapes of the intermediate (dot-product) and output tensors must match.") + else: + # NOTE: If the output tensor is transposed, then the permutation must be given explicitely. 
The permutation + # can only be inferred if each tensor mode has different length, which should never be assumed. + if len(out_tensor.shape) != len(self.permutation): + raise ValueError("The permutation list property must have as many elements as the number of output tensor modes.") + if sorted(self.permutation) != list(range(len(out_tensor.shape))): + raise ValueError("The permutation list property is not a perimutation of the output tensor's modes.") + transposed_shape = [dot_shape[p] for p in self.permutation] + if transposed_shape != list(out_tensor.shape): + raise ValueError("The permutation of the intermediate (dot-product) shape does not match the output shape.") + + return left_tensor, right_tensor, out_tensor From 1e0a3cd4d1c2cae4d7b86844b1f7feaede008695 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 10 Mar 2022 12:58:59 +0100 Subject: [PATCH 206/392] Added tensor-dot pure expansion. --- dace/libraries/linalg/nodes/tensordot.py | 50 ++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index 5d3ed3c68b..f228bd68f4 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -1,4 +1,5 @@ # Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +import collections import dace import multiprocessing from dace import library, nodes, properties @@ -15,6 +16,55 @@ class ExpandPure(ExpandTransformation): environments = [] + @staticmethod + def expansion(node, parent_state, parent_sdfg): + left_tensor, right_tensor, out_tensor = node.validate(parent_sdfg, parent_state) + + sdfg = dace.SDFG(f"{node.label}_sdfg") + _, left_arr = sdfg.add_array("_left_tensor", left_tensor.shape, left_tensor.dtype, left_tensor.storage, strides=left_tensor.strides) + _, right_arr = sdfg.add_array("_right_tensor", right_tensor.shape, right_tensor.dtype, right_tensor.storage, strides=right_tensor.strides) + _, out_arr = sdfg.add_array("_out_tensor", out_tensor.shape, out_tensor.dtype, out_tensor.storage, strides=out_tensor.strides) + + state = sdfg.add_state(f"{node.label}_init") + state.add_mapped_tasklet(f"{node.label}_init_tasklet", + {f"__i{i}": f"0:{symstr(s)}" for i, s in enumerate(out_tensor.shape)}, + {}, + '__out = 0', + {'__out': dace.Memlet(expr=f"_out_tensor[{','.join(['__i%d' % i for i in range(len(out_tensor.shape))])}]")}, + external_edges=True) + + state = sdfg.add_state(f"{node.label}_state") + + outer_map_shape = list([s for i, s in enumerate(left_tensor.shape) if i not in node.left_axes]) + outer_map_shape.extend([s for i, s in enumerate(right_tensor.shape) if i not in node.right_axes]) + outer_map_params = [f"__oi{i}" for i in range(len(outer_map_shape))] + outer_map_rng = {i: f"0:{symstr(s)}"for i, s in zip(outer_map_params, outer_map_shape)} + inner_map_shape = list([left_tensor.shape[i] for i in node.left_axes]) + inner_map_params = [f"__ii{i}" for i in range(len(inner_map_shape))] + inner_map_rng = {i: f"0:{symstr(s)}"for i, s in zip(inner_map_params, inner_map_shape)} + + left_idx = outer_map_params[:len(left_tensor.shape)-len(node.left_axes)] + left_dict = {j: inner_map_params[i] for i, j in enumerate(node.left_axes)} + left_sorted_dict = collections.OrderedDict(sorted(left_dict.items())) + for k, v in left_sorted_dict.items(): + left_idx.insert(k, v) + right_idx = outer_map_params[len(left_tensor.shape)-len(node.left_axes):] + right_dict = {j: inner_map_params[i] for i, j in enumerate(node.right_axes)} + 
right_sorted_dict = collections.OrderedDict(sorted(right_dict.items())) + for k, v in right_sorted_dict.items(): + right_idx.insert(k, v) + out_idx = outer_map_params + + left_mem = dace.Memlet(expr=f"_left_tensor[{','.join(left_idx)}]") + right_mem = dace.Memlet(expr=f"_right_tensor[{','.join(right_idx)}]") + out_mem = dace.Memlet(expr=f"_out_tensor[{','.join(out_idx)}]", wcr="lambda x, y: x + y") + inputs = {"_left": left_mem, "_right": right_mem} + outputs = {"_out": out_mem} + code = f"_out = _left * _right" + state.add_mapped_tasklet(f"{node.label}_tasklet", {**outer_map_rng, **inner_map_rng}, inputs, code, outputs, external_edges=True) + + return sdfg + @library.expansion class ExpandTTGT(ExpandTransformation): From 8b73eb069690a6a00fcdf403114848bfe1925ed1 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 10 Mar 2022 13:18:07 +0100 Subject: [PATCH 207/392] Added tensordot replacement. --- dace/frontend/python/replacements.py | 44 ++++++++++++++++++++++++ dace/libraries/linalg/nodes/__init__.py | 1 + dace/libraries/linalg/nodes/tensordot.py | 7 ++-- 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 00c7fc15c2..d145fa317e 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -4540,6 +4540,50 @@ def _inv(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, inp_op: str): return out_arr[0] +@oprepo.replaces('dace.tensordot') +@oprepo.replaces('numpy.tensordot') +def _tensordot(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op_a: str, op_b: str, axes: Union[int, Sequence[int]] = 2]): + + for op in (op_a, op_b): + if not isinstance(op, str) or not op in sdfg.arrays.keys(): + raise SyntaxError() + + arr_a = sdfg.arrays[op_a] + arr_b = sdfg.arrays[op_b] + + if isinstance(axes, Integral): + left_axes = list(range(len(arr_a.shape) - axes, len(arr_a.shape))) + right_axes = list(range(0, axes)) + else: + left_axes = axes[0] + right_axes = axes[1] + + # Some validation (more detailed validation is done inside the TensorDot library node) + if any(a >= len(arr_a.shape) or a < 0 for a in left_axes): + raise ValueError("Axes for left tensor are out-of-bounds.") + if any(a >= len(arr_b.shape) or a < 0 for a in right_axes): + raise ValueError("Axes for right tensor are out-of-bounds.") + if len(left_axes) != len(right_axes): + raise ValueError("The input tensors must have the same number of contracting modes.") + if any(arr_a.shape[l] != arr_b.shape[r] for l, r in zip(left_axes, right_axes)): + raise ValueError("The input tensors' contracting modes must have the same length.") + + dot_shape = [s for i, s in enumerate(arr_a.shape) if i not in left_axes] + dot_shape.extend([s for i, s in enumerate(arr_b.shape) if i not in right_axes]) + op_c, arr_c = sdfg.add_temp_transient(arr_a.shape, arr_a.dtype, storage=arr_a.storage) + + from dace.libraries.linalg import TensorDot + a = state.add_read(op_a) + b = state.add_read(op_b) + c = state.add_write(op_c) + tasklet = TensorDot("_TensorDot_", left_axes, right_axes) + state.add_edge(a, None, tasklet, '_left_tensor', Memlet.from_array(op_a, arr_a)) + state.add_edge(b, None, tasklet, '_right_tensor', Memlet.from_array(op_b, arr_b)) + state.add_edge(tasklet, '_out_tensor', c, None, Memlet.from_array(op_c, arr_c)) + + return op_c + + # CuPy replacements diff --git a/dace/libraries/linalg/nodes/__init__.py b/dace/libraries/linalg/nodes/__init__.py index 640d229db5..7d3c935dbf 100644 --- 
a/dace/libraries/linalg/nodes/__init__.py +++ b/dace/libraries/linalg/nodes/__init__.py @@ -2,3 +2,4 @@ from .inv import Inv from .solve import Solve from .cholesky import Cholesky +from .tensordot import TensorDot diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index f228bd68f4..98963626dd 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -54,6 +54,8 @@ def expansion(node, parent_state, parent_sdfg): for k, v in right_sorted_dict.items(): right_idx.insert(k, v) out_idx = outer_map_params + if node.permutation: + out_idx = [outer_map_params[i] for i in node.permutation] left_mem = dace.Memlet(expr=f"_left_tensor[{','.join(left_idx)}]") right_mem = dace.Memlet(expr=f"_right_tensor[{','.join(right_idx)}]") @@ -99,10 +101,11 @@ class TensorDot(nodes.LibraryNode): right_axes = properties.ListProperty(element_type=int, default=[], desc="Right tensor's contracting modes") permutation = properties.ListProperty(element_type=int, allow_none=True, default=None, desc="Permutation of the output tensor") - def __init__(self, name, left_axes=[], right_axes=[], *args, **kwargs): + def __init__(self, name, left_axes=[], right_axes=[], permutation=NOne, *args, **kwargs): super().__init__(name, *args, inputs={"_left_tensor", "_right_tensor"}, outputs={"_out_tensor"}, **kwargs) self.left_axes = left_axes self.right_axes = right_axes + self.permutation = permutation def validate(self, sdfg, state): """ @@ -136,7 +139,7 @@ def validate(self, sdfg, state): raise ValueError("Axes for right tensor are out-of-bounds.") if len(self.left_axes) != len(self.right_axes): raise ValueError("The input tensors must have the same number of contracting modes.") - if any(left_tensor.shape[l] != left_tensor.shape[r] for l, r in zip(self.left_axes, self.right_axes)): + if any(left_tensor.shape[l] != right_tensor.shape[r] for l, r in zip(self.left_axes, self.right_axes)): raise ValueError("The input tensors' contracting modes must have the same length.") dot_shape = [s for i, s in enumerate(left_tensor.shape) if i not in self.left_axes] From 79f37fc34d993a7b4a6ca47b37d95ea426918880 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 10 Mar 2022 13:32:31 +0100 Subject: [PATCH 208/392] Minor fixes. 
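For context, the two axes forms that the tensordot replacement mirrors, shown as a plain NumPy worked example (shapes chosen arbitrarily): an integer k contracts the last k modes of the left tensor with the first k modes of the right tensor, while a pair of sequences contracts the listed mode pairs.

    import numpy as np

    A = np.random.rand(3, 4, 5)
    B = np.random.rand(5, 4, 6)

    C1 = np.tensordot(A, B, axes=1)                 # contracts A's mode 2 with B's mode 0
    C2 = np.tensordot(A, B, axes=([1, 2], [1, 0]))  # contracts modes (1, 2) of A with (1, 0) of B

    print(C1.shape)  # (3, 4, 4, 6)
    print(C2.shape)  # (3, 6)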
--- dace/frontend/python/replacements.py | 4 ++-- dace/libraries/linalg/nodes/tensordot.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index d145fa317e..e908dfd962 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -4542,7 +4542,7 @@ def _inv(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, inp_op: str): @oprepo.replaces('dace.tensordot') @oprepo.replaces('numpy.tensordot') -def _tensordot(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op_a: str, op_b: str, axes: Union[int, Sequence[int]] = 2]): +def _tensordot(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op_a: str, op_b: str, axes: Union[int, Sequence[int]] = 2): for op in (op_a, op_b): if not isinstance(op, str) or not op in sdfg.arrays.keys(): @@ -4570,7 +4570,7 @@ def _tensordot(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op_a: str, op dot_shape = [s for i, s in enumerate(arr_a.shape) if i not in left_axes] dot_shape.extend([s for i, s in enumerate(arr_b.shape) if i not in right_axes]) - op_c, arr_c = sdfg.add_temp_transient(arr_a.shape, arr_a.dtype, storage=arr_a.storage) + op_c, arr_c = sdfg.add_temp_transient(dot_shape, arr_a.dtype, storage=arr_a.storage) from dace.libraries.linalg import TensorDot a = state.add_read(op_a) diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index 98963626dd..dab2e96e62 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -25,8 +25,8 @@ def expansion(node, parent_state, parent_sdfg): _, right_arr = sdfg.add_array("_right_tensor", right_tensor.shape, right_tensor.dtype, right_tensor.storage, strides=right_tensor.strides) _, out_arr = sdfg.add_array("_out_tensor", out_tensor.shape, out_tensor.dtype, out_tensor.storage, strides=out_tensor.strides) - state = sdfg.add_state(f"{node.label}_init") - state.add_mapped_tasklet(f"{node.label}_init_tasklet", + init_state = sdfg.add_state(f"{node.label}_init", is_start_state=True) + init_state.add_mapped_tasklet(f"{node.label}_init_tasklet", {f"__i{i}": f"0:{symstr(s)}" for i, s in enumerate(out_tensor.shape)}, {}, '__out = 0', @@ -34,6 +34,7 @@ def expansion(node, parent_state, parent_sdfg): external_edges=True) state = sdfg.add_state(f"{node.label}_state") + sdfg.add_edge(init_state, state, dace.InterstateEdge()) outer_map_shape = list([s for i, s in enumerate(left_tensor.shape) if i not in node.left_axes]) outer_map_shape.extend([s for i, s in enumerate(right_tensor.shape) if i not in node.right_axes]) @@ -85,6 +86,8 @@ class ExpandCuTensor(ExpandTransformation): For more information, see https://developer.nvidia.com/cutensor. 
""" + environments = [] + @library.node class TensorDot(nodes.LibraryNode): @@ -101,7 +104,7 @@ class TensorDot(nodes.LibraryNode): right_axes = properties.ListProperty(element_type=int, default=[], desc="Right tensor's contracting modes") permutation = properties.ListProperty(element_type=int, allow_none=True, default=None, desc="Permutation of the output tensor") - def __init__(self, name, left_axes=[], right_axes=[], permutation=NOne, *args, **kwargs): + def __init__(self, name, left_axes=[], right_axes=[], permutation=None, *args, **kwargs): super().__init__(name, *args, inputs={"_left_tensor", "_right_tensor"}, outputs={"_out_tensor"}, **kwargs) self.left_axes = left_axes self.right_axes = right_axes @@ -113,7 +116,7 @@ def validate(self, sdfg, state): :return: A triple (left_tensor, right_tensor, out_tensor) for the data descriptors in the parent SDFG. """ - left_tensor, right_tensor, out_tensor = None, None + left_tensor, right_tensor, out_tensor = None, None, None for e in state.out_edges(self): if e.src_conn == "_out_tensor": out_tensor = sdfg.arrays[e.data.data] From 9e5cab7da736b2f9d2474c4488b63f49c32c3365 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 10 Mar 2022 13:34:04 +0100 Subject: [PATCH 209/392] Added first tensordot test. --- tests/numpy/linalg_test.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/numpy/linalg_test.py b/tests/numpy/linalg_test.py index 6b3e24e58e..b10d2fad56 100644 --- a/tests/numpy/linalg_test.py +++ b/tests/numpy/linalg_test.py @@ -66,7 +66,20 @@ def test_linalg_cholesky(): assert relative_error(val, ref) < 1e-10 +def test_tensordot_0(): + + @dace.program + def tensordot_0(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='pure'): + assert(np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) + + if __name__ == "__main__": test_linalg_inv() test_linalg_solve() test_linalg_cholesky() + test_tensordot_0() From 1ca95d3c271c98e98f78faa45a1a37b923fb27c7 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 10 Mar 2022 13:36:39 +0100 Subject: [PATCH 210/392] Added second tensordot test. 
--- tests/numpy/linalg_test.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/numpy/linalg_test.py b/tests/numpy/linalg_test.py index b10d2fad56..7ac362f63e 100644 --- a/tests/numpy/linalg_test.py +++ b/tests/numpy/linalg_test.py @@ -78,8 +78,21 @@ def tensordot_0(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3 assert(np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) +def test_tensordot_1(): + + @dace.program + def tensordot_1(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B, axes=([0, 3], [4, 2])) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='pure'): + assert(np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) + + if __name__ == "__main__": test_linalg_inv() test_linalg_solve() test_linalg_cholesky() test_tensordot_0() + test_tensordot_1() From 3511ece22ad9d3ca8decfb6a80b15b61e96777ec Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 10 Mar 2022 16:39:56 +0100 Subject: [PATCH 211/392] Added TTGT implementation. --- dace/libraries/linalg/nodes/tensordot.py | 71 ++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index dab2e96e62..62eb47504d 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -3,6 +3,7 @@ import dace import multiprocessing from dace import library, nodes, properties +from dace.data import _prod from dace.libraries.blas import blas_helpers from dace.symbolic import symstr from dace.transformation.transformation import ExpandTransformation @@ -78,6 +79,76 @@ class ExpandTTGT(ExpandTransformation): environments = [] + @staticmethod + def expansion(node, parent_state, parent_sdfg): + left_tensor, right_tensor, out_tensor = node.validate(parent_sdfg, parent_state) + + sdfg = dace.SDFG(f"{node.label}_sdfg") + _, left_arr = sdfg.add_array("_left_tensor", left_tensor.shape, left_tensor.dtype, left_tensor.storage, strides=left_tensor.strides) + _, right_arr = sdfg.add_array("_right_tensor", right_tensor.shape, right_tensor.dtype, right_tensor.storage, strides=right_tensor.strides) + _, out_arr = sdfg.add_array("_out_tensor", out_tensor.shape, out_tensor.dtype, out_tensor.storage, strides=out_tensor.strides) + + from dace.frontend.python.replacements import _transpose + # NOTE: We use the numpy.transpose replacement because: + # (1) It will return the tensor itself if transposition is uncessary. + # (2) It will use matrix transpose operation for 2-mode tensors. 
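Illustrative cross-check for the explicit-axes case used here: mode 0 of the left tensor pairs with mode 4 of the right tensor and mode 3 with mode 2, and the result keeps the remaining left modes (in order) followed by the remaining right modes (in order).

    import numpy as np

    A = np.random.rand(3, 3, 3, 3, 3, 3)
    B = np.random.rand(3, 3, 3, 3, 3, 3)

    ref = np.tensordot(A, B, axes=([0, 3], [4, 2]))
    # p and q are the contracted index pairs; the output lists A's free modes, then B's.
    alt = np.einsum('pbcqef,ghqipk->bcefghik', A, B)
    assert np.allclose(ref, alt)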
+ state = sdfg.add_state(f"{node.label}_inp_transpose_state", is_start_state=True) + left_axes = [i for i in range(left_arr.shape) if i not in node.left_axes] + left_axes.extend(node.left_axes) + left_tt = _transpose(None, sdfg, state, "_left_tensor", left_axes) + left_tt_arr = sdfg.arrays[left_tt] + right_axes = node.right_axes + right_axes.extend([i for i in range(right_arr.shape) if i not in node.right_axes]) + right_tt = _transpose(None, sdfg, state, "_right_tensor", right_axes) + right_tt_arr = sdfg.arrays[right_tt] + + from dace.libraries.blas import Gemm + prv_state = state + state = sdfg.add_state(f"{node.label}_gemm_state") + sdfg.add_edge(prv_state, state, dace.InterstateEdge()) + left_shape = [_prod(left_tt_arr.shape[:-len(node.left_axes)]), _prod(left_tt_arr.shape[len(left_tt_arr.shape)-len(node.left_axes):])] + left_strides = [left_tt_arr.strides[-len(node.left_axes)-1], left_tt_arr.strides[-1]] + left_vname, left_view = sdfg.add_view(left_tt, left_shape, left_tt_arr.dtype, left_tt_arr.storage, strides=left_strides, find_new_name=True) + left_anode = state.add_read(left_tt) + left_vnode = state.add_access(left_vname) + state.add_edge(left_anode, None, left_vnode, 'views', dace.Memlet.from_array(left_tt, left_tt_arr)) + right_shape = [_prod(right_tt_arr.shape[0:len(node.left_axes)]), _prod(right_tt_arr.shape[len(node.left_axes):]), ] + right_strides = [right_tt_arr.strides[len(node.right_axes)-1], right_tt_arr.strides[-1]] + right_vname, right_view = sdfg.add_view(right_tt, right_shape, right_tt_arr.dtype, right_tt_arr.storage, strides=right_strides, find_new_name=True) + right_anode = state.add_read(right_tt) + right_vnode = state.add_access(right_vname) + state.add_edge(right_anode, None, right_vnode, 'views', dace.Memlet.from_array(right_tt, right_tt_arr)) + tasklet = Gemm(cin=False) + state.add_edge(left_vnode, None, tasklet, '_a', dace.Memlet.from_array(left_vname, left_view)) + state.add_edge(right_vnode, None, tasklet, '_b', dace.Memlet.from_array(right_vname, right_view)) + + # Output handling + out_shape = [left_shape[0], right_shape[1]] + if node.permutation and node.permutation != list(range(len(node.permutation))): + dot_shape = [s for i, s in enumerate(left_tensor.shape) if i not in node.left_axes] + dot_shape.extend([s for i, s in enumerate(right_tensor.shape) if i not in node.right_axes]) + dot_name, dot_arr = sdfg.add_temp_transient(dot_shape, out_arr.dtype, out_arr.storage) + out_strides = [dot_arr.strides[len(left_tt_arr.shape)-len(node.left_axes)-1], dot_arr.strides[-1]] + dot_vname, dot_view = sdfg.add_view('__gemm_out', out_shape, dot_arr.dtype, dot_arr.storage, strides=out_strides, find_new_name=True) + dot_anode = state.add_access(dot_name) + dot_vnode = state.add_access(dot_vname) + state.add_edge(tasklet, '_c', dot_vnode, None, dace.Memlet.from_array(dot_vname, dot_view)) + state.add_edge(dot_vnode, 'views', dot_anode, None, dace.Memlet.from_array(dot_name, dot_arr)) + out_node = state.add_write('_out_tensor') + from dace.libraries.ttranspose import TensorTranspose + tasklet = TensorTranspose('_TensorTranspose', node.permutation) + state.add_edge(dot_anode, None, tasklet, '_inp_tensor', dace.Memlet.from_array(dot_name, dot_arr)) + state.add_edge(tasklet, '_out_tensor', out_node, None, dace.Memlet.from_array('_out_tensor', out_arr)) + else: + out_strides = [out_arr.strides[len(left_tt_arr.shape)-len(node.left_axes)-1], out_arr.strides[-1]] + out_vname, out_view = sdfg.add_view('__gemm_out', out_shape, out_arr.dtype, out_arr.storage, strides=out_strides, 
find_new_name=True) + out_anode = state.add_access(out_name) + out_vnode = state.add_access(out_vname) + state.add_edge(tasklet, '_c', out_vnode, None, dace.Memlet.from_array(out_vname, out_view)) + state.add_edge(out_vnode, 'views', out_anode, None, dace.Memlet.from_array('_out_tensor', out_arr)) + + return sdfg + @library.expansion class ExpandCuTensor(ExpandTransformation): From f7b589c28b2b04970d3566327d02c27fcb766647 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 10 Mar 2022 16:52:14 +0100 Subject: [PATCH 212/392] Small fixes. --- dace/libraries/linalg/nodes/tensordot.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index 62eb47504d..7261195886 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -93,12 +93,12 @@ def expansion(node, parent_state, parent_sdfg): # (1) It will return the tensor itself if transposition is uncessary. # (2) It will use matrix transpose operation for 2-mode tensors. state = sdfg.add_state(f"{node.label}_inp_transpose_state", is_start_state=True) - left_axes = [i for i in range(left_arr.shape) if i not in node.left_axes] + left_axes = [i for i in range(len(left_arr.shape)) if i not in node.left_axes] left_axes.extend(node.left_axes) left_tt = _transpose(None, sdfg, state, "_left_tensor", left_axes) left_tt_arr = sdfg.arrays[left_tt] - right_axes = node.right_axes - right_axes.extend([i for i in range(right_arr.shape) if i not in node.right_axes]) + right_axes = list(node.right_axes) + right_axes.extend([i for i in range(len(right_arr.shape)) if i not in node.right_axes]) right_tt = _transpose(None, sdfg, state, "_right_tensor", right_axes) right_tt_arr = sdfg.arrays[right_tt] @@ -112,13 +112,13 @@ def expansion(node, parent_state, parent_sdfg): left_anode = state.add_read(left_tt) left_vnode = state.add_access(left_vname) state.add_edge(left_anode, None, left_vnode, 'views', dace.Memlet.from_array(left_tt, left_tt_arr)) - right_shape = [_prod(right_tt_arr.shape[0:len(node.left_axes)]), _prod(right_tt_arr.shape[len(node.left_axes):]), ] + right_shape = [_prod(right_tt_arr.shape[0:len(node.right_axes)]), _prod(right_tt_arr.shape[len(node.right_axes):]), ] right_strides = [right_tt_arr.strides[len(node.right_axes)-1], right_tt_arr.strides[-1]] right_vname, right_view = sdfg.add_view(right_tt, right_shape, right_tt_arr.dtype, right_tt_arr.storage, strides=right_strides, find_new_name=True) right_anode = state.add_read(right_tt) right_vnode = state.add_access(right_vname) state.add_edge(right_anode, None, right_vnode, 'views', dace.Memlet.from_array(right_tt, right_tt_arr)) - tasklet = Gemm(cin=False) + tasklet = Gemm('_GEMM_', cin=False) state.add_edge(left_vnode, None, tasklet, '_a', dace.Memlet.from_array(left_vname, left_view)) state.add_edge(right_vnode, None, tasklet, '_b', dace.Memlet.from_array(right_vname, right_view)) @@ -142,7 +142,7 @@ def expansion(node, parent_state, parent_sdfg): else: out_strides = [out_arr.strides[len(left_tt_arr.shape)-len(node.left_axes)-1], out_arr.strides[-1]] out_vname, out_view = sdfg.add_view('__gemm_out', out_shape, out_arr.dtype, out_arr.storage, strides=out_strides, find_new_name=True) - out_anode = state.add_access(out_name) + out_anode = state.add_access('_out_tensor') out_vnode = state.add_access(out_vname) state.add_edge(tasklet, '_c', out_vnode, None, dace.Memlet.from_array(out_vname, out_view)) state.add_edge(out_vnode, 'views', 
out_anode, None, dace.Memlet.from_array('_out_tensor', out_arr)) From 970128d343b3c20de767f73b6d7c62d6ec716e78 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 10 Mar 2022 16:52:31 +0100 Subject: [PATCH 213/392] Added tests for TTGT expansion. --- tests/numpy/linalg_test.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/numpy/linalg_test.py b/tests/numpy/linalg_test.py index 7ac362f63e..0fbc63210c 100644 --- a/tests/numpy/linalg_test.py +++ b/tests/numpy/linalg_test.py @@ -78,6 +78,18 @@ def tensordot_0(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3 assert(np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) +def test_tensordot_01(): + + @dace.program + def tensordot_0(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='TTGT'): + assert(np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) + + def test_tensordot_1(): @dace.program @@ -90,9 +102,23 @@ def tensordot_1(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3 assert(np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) +def test_tensordot_11(): + + @dace.program + def tensordot_1(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B, axes=([0, 3], [4, 2])) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='TTGT'): + assert(np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) + + if __name__ == "__main__": test_linalg_inv() test_linalg_solve() test_linalg_cholesky() test_tensordot_0() test_tensordot_1() + test_tensordot_01() + test_tensordot_11() From ead72c14113ecdb298555e8464f5ed7a3918f136 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 16 Mar 2022 12:24:57 +0100 Subject: [PATCH 214/392] TTGT expansion of TensorDot will now use the transA and transB parameters of GEMM when possible. --- dace/libraries/linalg/nodes/tensordot.py | 66 +++++++++++++++++++----- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index 7261195886..e71629c1a7 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -93,37 +93,75 @@ def expansion(node, parent_state, parent_sdfg): # (1) It will return the tensor itself if transposition is uncessary. # (2) It will use matrix transpose operation for 2-mode tensors. 
state = sdfg.add_state(f"{node.label}_inp_transpose_state", is_start_state=True) - left_axes = [i for i in range(len(left_arr.shape)) if i not in node.left_axes] - left_axes.extend(node.left_axes) - left_tt = _transpose(None, sdfg, state, "_left_tensor", left_axes) - left_tt_arr = sdfg.arrays[left_tt] - right_axes = list(node.right_axes) - right_axes.extend([i for i in range(len(right_arr.shape)) if i not in node.right_axes]) - right_tt = _transpose(None, sdfg, state, "_right_tensor", right_axes) - right_tt_arr = sdfg.arrays[right_tt] + + if node.left_axes == list(range(len(node.left_axes))): + transA = True + else: + transA = False + if node.right_axes == list(range(len(right_arr.shape)-len(node.right_axes), len(right_arr.shape))): + transB = True + else: + transB = False + + if transA: + left_tt = "_left_tensor" + left_tt_arr = left_arr + else: + left_axes = [i for i in range(len(left_arr.shape)) if i not in node.left_axes] + left_axes.extend(node.left_axes) + left_tt = _transpose(None, sdfg, state, "_left_tensor", left_axes) + left_tt_arr = sdfg.arrays[left_tt] + + if transB: + right_tt = "_right_tensor" + right_tt_arr = right_arr + else: + right_axes = list(node.right_axes) + right_axes.extend([i for i in range(len(right_arr.shape)) if i not in node.right_axes]) + right_tt = _transpose(None, sdfg, state, "_right_tensor", right_axes) + right_tt_arr = sdfg.arrays[right_tt] from dace.libraries.blas import Gemm prv_state = state state = sdfg.add_state(f"{node.label}_gemm_state") sdfg.add_edge(prv_state, state, dace.InterstateEdge()) - left_shape = [_prod(left_tt_arr.shape[:-len(node.left_axes)]), _prod(left_tt_arr.shape[len(left_tt_arr.shape)-len(node.left_axes):])] - left_strides = [left_tt_arr.strides[-len(node.left_axes)-1], left_tt_arr.strides[-1]] + + if transA: + left_shape = [_prod(left_tt_arr.shape[:len(node.left_axes)]), _prod(left_tt_arr.shape[len(node.left_axes):])] + left_strides = [left_tt_arr.strides[len(node.left_axes)-1], left_tt_arr.strides[-1]] + else: + left_shape = [_prod(left_tt_arr.shape[:-len(node.left_axes)]), _prod(left_tt_arr.shape[len(left_tt_arr.shape)-len(node.left_axes):])] + left_strides = [left_tt_arr.strides[-len(node.left_axes)-1], left_tt_arr.strides[-1]] left_vname, left_view = sdfg.add_view(left_tt, left_shape, left_tt_arr.dtype, left_tt_arr.storage, strides=left_strides, find_new_name=True) left_anode = state.add_read(left_tt) left_vnode = state.add_access(left_vname) state.add_edge(left_anode, None, left_vnode, 'views', dace.Memlet.from_array(left_tt, left_tt_arr)) - right_shape = [_prod(right_tt_arr.shape[0:len(node.right_axes)]), _prod(right_tt_arr.shape[len(node.right_axes):]), ] - right_strides = [right_tt_arr.strides[len(node.right_axes)-1], right_tt_arr.strides[-1]] + + if transB: + right_shape = [_prod(right_tt_arr.shape[:-len(node.right_axes)]), _prod(right_tt_arr.shape[len(right_tt_arr.shape)-len(node.right_axes):])] + right_strides = [right_tt_arr.strides[-len(node.right_axes)-1], right_tt_arr.strides[-1]] + else: + right_shape = [_prod(right_tt_arr.shape[0:len(node.right_axes)]), _prod(right_tt_arr.shape[len(node.right_axes):])] + right_strides = [right_tt_arr.strides[len(node.right_axes)-1], right_tt_arr.strides[-1]] right_vname, right_view = sdfg.add_view(right_tt, right_shape, right_tt_arr.dtype, right_tt_arr.storage, strides=right_strides, find_new_name=True) right_anode = state.add_read(right_tt) right_vnode = state.add_access(right_vname) state.add_edge(right_anode, None, right_vnode, 'views', dace.Memlet.from_array(right_tt, 
right_tt_arr)) - tasklet = Gemm('_GEMM_', cin=False) + + tasklet = Gemm('_GEMM_', cin=False, transA=transA, transB=transB) state.add_edge(left_vnode, None, tasklet, '_a', dace.Memlet.from_array(left_vname, left_view)) state.add_edge(right_vnode, None, tasklet, '_b', dace.Memlet.from_array(right_vname, right_view)) # Output handling - out_shape = [left_shape[0], right_shape[1]] + out_shape = [] + if transA: + out_shape.append(left_shape[1]) + else: + out_shape.append(left_shape[0]) + if transB: + out_shape.append(right_shape[0]) + else: + out_shape.append(right_shape[1]) if node.permutation and node.permutation != list(range(len(node.permutation))): dot_shape = [s for i, s in enumerate(left_tensor.shape) if i not in node.left_axes] dot_shape.extend([s for i, s in enumerate(right_tensor.shape) if i not in node.right_axes]) From 251e7f6c12673f19ceb0d8bbcf7315589beaecce Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 16 Mar 2022 13:29:00 +0100 Subject: [PATCH 215/392] Linalg library doesn't have environments. --- dace/libraries/linalg/nodes/tensordot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index e71629c1a7..c356241a4f 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -8,7 +8,6 @@ from dace.symbolic import symstr from dace.transformation.transformation import ExpandTransformation from numbers import Number -from .. import environments @library.expansion From ecaffd7ca207ca559d7d4be8931424ea45d8e0dd Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 21 Mar 2022 15:54:56 +0100 Subject: [PATCH 216/392] Added special case where TensorTranspose can be resolved as MatrixTranspose. 
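The special case applies when the requested axes are a cyclic rotation of the tensor's
modes, e.g. (2, 3, 0, 1) for a 4-mode tensor: the transposition then reduces to a plain
2D matrix transpose sandwiched between two reshapes, so the matrix-transpose path can be
reused instead of the generic tensor-transpose node. A minimal NumPy sketch of the idea
(the helper name and the shapes are illustrative only, not part of the patch):

    import numpy as np

    def rotated_transpose(a, axes):
        # `axes` must be a cyclic rotation of range(a.ndim), e.g. [2, 3, 0, 1].
        idx = axes.index(0)               # how many trailing modes moved to the front
        k = a.ndim - idx                  # axes == [k, ..., ndim-1, 0, ..., k-1]
        rows = int(np.prod(a.shape[:k]))  # leading modes, flattened
        cols = int(np.prod(a.shape[k:]))  # trailing modes, flattened
        matrix = a.reshape(rows, cols)    # 2D view in row-major order
        return matrix.T.reshape([a.shape[i] for i in axes])

    a = np.arange(2 * 3 * 4 * 5, dtype=np.float32).reshape(2, 3, 4, 5)
    assert np.array_equal(rotated_transpose(a, [2, 3, 0, 1]),
                          np.transpose(a, [2, 3, 0, 1]))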
--- dace/frontend/python/replacements.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index e908dfd962..a269e69fe3 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -784,8 +784,12 @@ def _transpose(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, inpname: str, a state.add_edge(acc1, None, tasklet, '_inp', Memlet.from_array(inpname, arr1)) state.add_edge(tasklet, '_out', acc2, None, Memlet.from_array(outname, arr2)) else: # tensor transpose - if len(axes) != len(arr1.shape) or sorted(axes) != list(range(len(arr1.shape))): - raise ValueError("axes don't match array") + modes = len(arr1.shape) + idx = axes.index(0) + if axes[idx:] == list(range(modes-idx)) and axes[:idx] == list(range(axes[-1] + 1, modes)): + matrix = _ndarray_reshape(pv, sdfg, state, inpname, [data._prod(arr1.shape[:idx]), data._prod(arr1.shape[idx:])]) + trans_matrix = _transpose(pv, sdfg, state, matrix) + return _ndarray_reshape(pv, sdfg, state, trans_matrix, [arr1.shape[i] for i in axes]) read = state.add_read(inpname) write = state.add_write(outname) From 4804abe22b471ecd60bcce8d1dbc94b049095e89 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 21 Mar 2022 18:11:21 +0100 Subject: [PATCH 217/392] Fixed row/col computation --- dace/frontend/python/replacements.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index a269e69fe3..a45313dbbe 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -787,7 +787,9 @@ def _transpose(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, inpname: str, a modes = len(arr1.shape) idx = axes.index(0) if axes[idx:] == list(range(modes-idx)) and axes[:idx] == list(range(axes[-1] + 1, modes)): - matrix = _ndarray_reshape(pv, sdfg, state, inpname, [data._prod(arr1.shape[:idx]), data._prod(arr1.shape[idx:])]) + rows = data._prod([arr1.shape[axes[i]] for i in range(idx, len(arr1.shape))]) + cols = data._prod([arr1.shape[axes[i]] for i in range(idx)]) + matrix = _ndarray_reshape(pv, sdfg, state, inpname, [rows, cols]) trans_matrix = _transpose(pv, sdfg, state, matrix) return _ndarray_reshape(pv, sdfg, state, trans_matrix, [arr1.shape[i] for i in axes]) From 5b66cf08f91373123ecfaa951407d371d2f14c1e Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 4 May 2022 14:26:48 +0200 Subject: [PATCH 218/392] Started working on cuTENSOR and ExpandCuTensor for TensorDot. --- .../lapack/environments/cusolverdn.py | 2 +- .../libraries/linalg/environments/__init__.py | 2 + .../libraries/linalg/environments/cutensor.py | 39 +++++++++++ dace/libraries/linalg/include/dace_cutensor.h | 66 +++++++++++++++++++ dace/libraries/linalg/nodes/tensordot.py | 57 +++++++++++++++- 5 files changed, 164 insertions(+), 2 deletions(-) create mode 100644 dace/libraries/linalg/environments/__init__.py create mode 100644 dace/libraries/linalg/environments/cutensor.py create mode 100644 dace/libraries/linalg/include/dace_cutensor.h diff --git a/dace/libraries/lapack/environments/cusolverdn.py b/dace/libraries/lapack/environments/cusolverdn.py index fbec753b64..5b27e421a7 100644 --- a/dace/libraries/lapack/environments/cusolverdn.py +++ b/dace/libraries/lapack/environments/cusolverdn.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. import dace.library diff --git a/dace/libraries/linalg/environments/__init__.py b/dace/libraries/linalg/environments/__init__.py new file mode 100644 index 0000000000..f8678e6b8e --- /dev/null +++ b/dace/libraries/linalg/environments/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +from .cutensor import * diff --git a/dace/libraries/linalg/environments/cutensor.py b/dace/libraries/linalg/environments/cutensor.py new file mode 100644 index 0000000000..8576085d3c --- /dev/null +++ b/dace/libraries/linalg/environments/cutensor.py @@ -0,0 +1,39 @@ +# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library + + +@dace.library.environment +class cuTensor: + + cmake_minimum_version = None + cmake_packages = ["CUDA"] + cmake_variables = {} + cmake_includes = [] + cmake_libraries = ["cutensor"] + cmake_compile_flags = [] + cmake_link_flags = [] + cmake_files = [] + + headers = {'frame': ["../include/dace_cutensor.h"], 'cuda': ["../include/dace_cutensor.h"]} + state_fields = ["dace::linalg::CuTensorHandle cutensor_handle;"] + init_code = "" + finalize_code = "" + dependencies = [] + + @staticmethod + def handle_setup_code(node): + location = node.location + if not location or "gpu" not in node.location: + location = 0 + else: + try: + location = int(location["gpu"]) + except ValueError: + raise ValueError("Invalid GPU identifier: {}".format(location)) + + code = """\ +const int __dace_cuda_device = {location}; +cutensorHandle_t &__dace_tensor_handle = __state->cutensor_handle.Get(__dace_cuda_device); +// cutensorSetStream(__dace_tensor_handle, __dace_current_stream);\n""" + + return code.format(location=location) diff --git a/dace/libraries/linalg/include/dace_cutensor.h b/dace/libraries/linalg/include/dace_cutensor.h new file mode 100644 index 0000000000..016e445830 --- /dev/null +++ b/dace/libraries/linalg/include/dace_cutensor.h @@ -0,0 +1,66 @@ +// Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +#pragma once + +#include +#include + +#include // size_t +#include // std::runtime_error +#include // std::to_string +#include + +namespace dace { + +namespace linalg { + +static void CheckCuTensorError(cutensorStatus_t const& status) { + if (status != CUTENSOR_STATUS_SUCCESS) { + throw std::runtime_error("cuTENSOR failed with error code: " + std::string(cutensorGetErrorString(status))); + } +} + +static cutensorHandle_t CreateCuTensorHandle(int device) { + if (cudaSetDevice(device) != cudaSuccess) { + throw std::runtime_error("Failed to set CUDA device."); + } + cutensorHandle_t handle; + CheckCuTensorError(cutensorInit(&handle)); + return handle; +} + +/** + * cuTENSOR wrapper class for DaCe. Once constructed, the class can be used to + * get or create a cuTENSOR library handle (cutensorHandle_t) for a given + * GPU ID. The class is constructed when the cuTENSOR DaCe library is used. 
+ **/ +class CuTensorHandle { + public: + CuTensorHandle() = default; + CuTensorHandle(CuTensorHandle const&) = delete; + + cutensorHandle_t& Get(int device) { + auto f = handles_.find(device); + if (f == handles_.end()) { + // Lazily construct new cuSolverDn handle if the specified key does not + // yet exist + auto handle = CreateCuTensorHandle(device); + f = handles_.emplace(device, handle).first; + } + return f->second; + } + + ~CuTensorHandle() { + // NOTE: It seems that the cuTENSOR API is missing a method of destroying a cuTENSOR handle + // for (auto& h : handles_) { + // CheckCuTensorError(cutensorDestroy(h.second)); + // } + } + + CuTensorHandle& operator=(CuTensorHandle const&) = delete; + + std::unordered_map handles_; +}; + +} // namespace linalg + +} // namespace dace diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index c356241a4f..9f3f9769f7 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -8,6 +8,7 @@ from dace.symbolic import symstr from dace.transformation.transformation import ExpandTransformation from numbers import Number +import dace.libraries.linalg.environments as environments @library.expansion @@ -194,7 +195,61 @@ class ExpandCuTensor(ExpandTransformation): For more information, see https://developer.nvidia.com/cutensor. """ - environments = [] + environments = [environments.cuTensor] + + @staticmethod + def expansion(node, parent_state, parent_sdfg): + left_tensor, right_tensor, out_tensor = node.validate(parent_sdfg, parent_state) + + dtype = out_tensor.dtype.base_type + veclen = out_tensor.dtype.veclen + + func, cuda_type, _ = blas_helpers.cublas_type_metadata(dtype) + func = func + 'getrf' + + alpha = f"({cuda_type})1.0" + beta = f"({cuda_type})0.0" + + left_modes = list(range(len(left_tensor.shape))) + right_modes = [node.left_axes[node.right_axes.index(i)] if i in node.right_axes else len(left_tensor.shape) + i + for i in range(len(right_tensor.shape))] + out_modes = [i in left_modes if i not in node.left_axes] + out_modes = out_modes.extend([i for i in right_modes if i not in node.left_axes]) + if node.permutation and node.permutation != list(range(len(node.permutation))): + out_modes = [node.permutation[i] for i in out_modes] + + modes = f""" + std::vector modeA{{{','.join(left_modes)}}}; + std::vector modeB{{{','.join(right_modes)}}}; + std::vector modeC{{{','.join(out_modes)}}}; + """ + + code = "" + # code = (environments.cuTensor.handle_setup_code(node) + f""" + # int __dace_workspace_size = 0; + # {cuda_type}* __dace_workspace; + # cusolverDn{func}_bufferSize( + # __dace_cusolverDn_handle, {rows_x}, {cols_x}, ({cuda_type}*)_xin, + # {stride_x}, &__dace_workspace_size); + # cudaMalloc<{cuda_type}>( + # &__dace_workspace, + # sizeof({cuda_type}) * __dace_workspace_size); + # cusolverDn{func}( + # __dace_cusolverDn_handle, {rows_x}, {cols_x}, ({cuda_type}*)_xin, + # {stride_x}, __dace_workspace, _ipiv, _res); + # cudaFree(__dace_workspace); + # """) + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP) + conn = tasklet.out_connectors + conn = {c: (dace.dtypes.pointer(dace.int32) if c == '_res' else t) for c, t in conn.items()} + tasklet.out_connectors = conn + + return tasklet @library.node From e838341de16072a2217c0b9b81821e45c6a7d9d6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 5 May 2022 16:52:14 +0200 Subject: [PATCH 219/392] ExpandCuTensor 
compiles --- .../libraries/linalg/environments/cutensor.py | 4 +- dace/libraries/linalg/nodes/tensordot.py | 101 +++++++++++++----- dace/transformation/auto/auto_optimize.py | 2 +- tests/numpy/linalg_test.py | 13 +++ 4 files changed, 92 insertions(+), 28 deletions(-) diff --git a/dace/libraries/linalg/environments/cutensor.py b/dace/libraries/linalg/environments/cutensor.py index 8576085d3c..de34e0564e 100644 --- a/dace/libraries/linalg/environments/cutensor.py +++ b/dace/libraries/linalg/environments/cutensor.py @@ -33,7 +33,7 @@ def handle_setup_code(node): code = """\ const int __dace_cuda_device = {location}; -cutensorHandle_t &__dace_tensor_handle = __state->cutensor_handle.Get(__dace_cuda_device); -// cutensorSetStream(__dace_tensor_handle, __dace_current_stream);\n""" +cutensorHandle_t &__dace_cutensor_handle = __state->cutensor_handle.Get(__dace_cuda_device); +// cutensorSetStream(__dace_cutensor_handle, __dace_current_stream);\n""" return code.format(location=location) diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index 9f3f9769f7..902b0521e8 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -202,52 +202,103 @@ def expansion(node, parent_state, parent_sdfg): left_tensor, right_tensor, out_tensor = node.validate(parent_sdfg, parent_state) dtype = out_tensor.dtype.base_type - veclen = out_tensor.dtype.veclen - func, cuda_type, _ = blas_helpers.cublas_type_metadata(dtype) + cuda_dtype = blas_helpers.dtype_to_cudadatatype(dtype) + compute_type = f"CUTENSOR_COMPUTE{cuda_dtype[cuda_dtype.rfind('_'):]}" func = func + 'getrf' alpha = f"({cuda_type})1.0" beta = f"({cuda_type})0.0" + abtext = f""" + {cuda_type} alpha = {alpha}; + {cuda_type} beta = {beta}; + """ left_modes = list(range(len(left_tensor.shape))) right_modes = [node.left_axes[node.right_axes.index(i)] if i in node.right_axes else len(left_tensor.shape) + i for i in range(len(right_tensor.shape))] - out_modes = [i in left_modes if i not in node.left_axes] - out_modes = out_modes.extend([i for i in right_modes if i not in node.left_axes]) + out_modes = [i for i in left_modes if i not in node.left_axes] + out_modes.extend([i for i in right_modes if i not in node.left_axes]) if node.permutation and node.permutation != list(range(len(node.permutation))): out_modes = [node.permutation[i] for i in out_modes] modes = f""" - std::vector modeA{{{','.join(left_modes)}}}; - std::vector modeB{{{','.join(right_modes)}}}; - std::vector modeC{{{','.join(out_modes)}}}; + std::vector modeA{{{','.join(str(m) for m in left_modes)}}}; + std::vector modeB{{{','.join(str(m) for m in right_modes)}}}; + std::vector modeC{{{','.join(str(m) for m in out_modes)}}}; + """ + + extents = "std::unordered_map extent;\n" + for i, s in zip(left_modes, left_tensor.shape): + extents += f"extent[{i}] = {s};\n" + for i, s in zip(right_modes, left_tensor.shape): + if i in node.right_axes: + continue + extents += f"extent[{i}] = {s};\n" + extents += f""" + std::vector extentA; + for (auto mode : modeA) extentA.push_back(extent[mode]); + std::vector extentB; + for (auto mode : modeB) extentB.push_back(extent[mode]); + std::vector extentC; + for (auto mode : modeC) extentC.push_back(extent[mode]); + """ + + tdesc = f""" + cutensorTensorDescriptor_t descA, descB, descC; + cutensorInitTensorDescriptor( + &__dace_cutensor_handle, &descA, modeA.size(), extentA.data(), NULL, {cuda_dtype}, CUTENSOR_OP_IDENTITY); + cutensorInitTensorDescriptor( + &__dace_cutensor_handle, 
&descB, modeB.size(), extentB.data(), NULL, {cuda_dtype}, CUTENSOR_OP_IDENTITY); + cutensorInitTensorDescriptor( + &__dace_cutensor_handle, &descC, modeA.size(), extentA.data(), NULL, {cuda_dtype}, CUTENSOR_OP_IDENTITY); + """ + + cdesc = f""" + uint32_t alignmentRequirementA, alignmentRequirementB, alignmentRequirementC; + cutensorGetAlignmentRequirement(&__dace_cutensor_handle, _left_tensor, &descA, &alignmentRequirementA); + cutensorGetAlignmentRequirement(&__dace_cutensor_handle, _right_tensor, &descB, &alignmentRequirementB); + cutensorGetAlignmentRequirement(&__dace_cutensor_handle, _out_tensor, &descC, &alignmentRequirementC); + cutensorContractionDescriptor_t desc; + cutensorInitContractionDescriptor( + &__dace_cutensor_handle, &desc, + &descA, modeA.data(), alignmentRequirementA, + &descB, modeB.data(), alignmentRequirementB, + &descC, modeC.data(), alignmentRequirementC, + &descC, modeC.data(), alignmentRequirementC, + {compute_type}); + """ + + workspace = """ + cutensorContractionFind_t find; + cutensorInitContractionFind(&__dace_cutensor_handle, &find, CUTENSOR_ALGO_DEFAULT); + size_t worksize = 0; + cutensorContractionGetWorkspace( + &__dace_cutensor_handle, &desc, &find, CUTENSOR_WORKSPACE_RECOMMENDED, &worksize); + void *work = nullptr; + if (worksize > 0) cudaMalloc(&work, worksize); + """ + + execute = """ + cutensorContractionPlan_t plan; + cutensorInitContractionPlan(&__dace_cutensor_handle, &plan, &desc, &find, worksize); + cutensorContraction( + &__dace_cutensor_handle, &plan, + (void*)&alpha, _left_tensor, _right_tensor, (void*)&beta, _out_tensor, _out_tensor, + work, worksize, __dace_current_stream); + if (work) cudaFree(work); """ - code = "" - # code = (environments.cuTensor.handle_setup_code(node) + f""" - # int __dace_workspace_size = 0; - # {cuda_type}* __dace_workspace; - # cusolverDn{func}_bufferSize( - # __dace_cusolverDn_handle, {rows_x}, {cols_x}, ({cuda_type}*)_xin, - # {stride_x}, &__dace_workspace_size); - # cudaMalloc<{cuda_type}>( - # &__dace_workspace, - # sizeof({cuda_type}) * __dace_workspace_size); - # cusolverDn{func}( - # __dace_cusolverDn_handle, {rows_x}, {cols_x}, ({cuda_type}*)_xin, - # {stride_x}, __dace_workspace, _ipiv, _res); - # cudaFree(__dace_workspace); - # """) + code = f"{environments.cuTensor.handle_setup_code(node)}{abtext}{modes}{extents}{tdesc}{cdesc}{workspace}{execute}" tasklet = dace.sdfg.nodes.Tasklet(node.name, node.in_connectors, node.out_connectors, code, language=dace.dtypes.Language.CPP) - conn = tasklet.out_connectors - conn = {c: (dace.dtypes.pointer(dace.int32) if c == '_res' else t) for c, t in conn.items()} - tasklet.out_connectors = conn + # conn = tasklet.out_connectors + # conn = {c: (dace.dtypes.pointer(dace.int32) if c == '_res' else t) for c, t in conn.items()} + # tasklet.out_connectors = conn return tasklet diff --git a/dace/transformation/auto/auto_optimize.py b/dace/transformation/auto/auto_optimize.py index 6177e9e38e..8e572a5c07 100644 --- a/dace/transformation/auto/auto_optimize.py +++ b/dace/transformation/auto/auto_optimize.py @@ -330,7 +330,7 @@ def find_fast_library(device: dtypes.DeviceType) -> List[str]: backend = 'none' if backend == 'cuda': - return ['cuBLAS', 'cuSolverDn', 'GPUAuto', 'CUB', 'pure'] + return ['cuBLAS', 'cuSolverDn', 'GPUAuto', 'cuTENSOR', 'CUB', 'pure'] elif backend == 'hip': return ['rocBLAS', 'GPUAuto', 'pure'] else: diff --git a/tests/numpy/linalg_test.py b/tests/numpy/linalg_test.py index 0fbc63210c..256dfc736f 100644 --- a/tests/numpy/linalg_test.py +++ 
b/tests/numpy/linalg_test.py @@ -89,6 +89,18 @@ def tensordot_0(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3 with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='TTGT'): assert(np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) +@pytest.mark.gpu +def test_tensordot_02(): + + @dace.program(device=dace.dtypes.DeviceType.GPU) + def tensordot_0(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='cuTENSOR'): + assert(np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) + def test_tensordot_1(): @@ -122,3 +134,4 @@ def tensordot_1(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3 test_tensordot_1() test_tensordot_01() test_tensordot_11() + test_tensordot_02() From ad53fe2adaa9f1b02fa13aba67cc82108c7834fa Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 11 May 2022 08:51:06 +0200 Subject: [PATCH 220/392] Fixed strides in cuTENSOR tensordot. --- .../libraries/linalg/environments/cutensor.py | 2 +- dace/libraries/linalg/nodes/tensordot.py | 51 +++++++---- tests/numpy/linalg_test.py | 85 +++++++++++++++++++ 3 files changed, 119 insertions(+), 19 deletions(-) diff --git a/dace/libraries/linalg/environments/cutensor.py b/dace/libraries/linalg/environments/cutensor.py index de34e0564e..8d883d1f20 100644 --- a/dace/libraries/linalg/environments/cutensor.py +++ b/dace/libraries/linalg/environments/cutensor.py @@ -11,7 +11,7 @@ class cuTensor: cmake_includes = [] cmake_libraries = ["cutensor"] cmake_compile_flags = [] - cmake_link_flags = [] + cmake_link_flags = ["-L /users/aziogas/libcutensor/lib/11 -lcutensor"] cmake_files = [] headers = {'frame': ["../include/dace_cutensor.h"], 'cuda': ["../include/dace_cutensor.h"]} diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index 902b0521e8..b241410d4d 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -220,7 +220,7 @@ def expansion(node, parent_state, parent_sdfg): out_modes = [i for i in left_modes if i not in node.left_axes] out_modes.extend([i for i in right_modes if i not in node.left_axes]) if node.permutation and node.permutation != list(range(len(node.permutation))): - out_modes = [node.permutation[i] for i in out_modes] + out_modes = [out_modes[i] for i in node.permutation] modes = f""" std::vector modeA{{{','.join(str(m) for m in left_modes)}}}; @@ -244,49 +244,64 @@ def expansion(node, parent_state, parent_sdfg): for (auto mode : modeC) extentC.push_back(extent[mode]); """ + extents += f""" + std::vector stridesA{{{','.join(str(s) for s in left_tensor.strides)}}}; + std::vector stridesB{{{','.join(str(s) for s in right_tensor.strides)}}}; + std::vector stridesC{{{','.join(str(s) for s in out_tensor.strides)}}}; + """ + tdesc = f""" cutensorTensorDescriptor_t descA, descB, descC; - cutensorInitTensorDescriptor( - &__dace_cutensor_handle, &descA, modeA.size(), extentA.data(), NULL, {cuda_dtype}, CUTENSOR_OP_IDENTITY); - cutensorInitTensorDescriptor( - &__dace_cutensor_handle, &descB, modeB.size(), extentB.data(), NULL, {cuda_dtype}, CUTENSOR_OP_IDENTITY); - cutensorInitTensorDescriptor( - &__dace_cutensor_handle, &descC, modeA.size(), extentA.data(), NULL, {cuda_dtype}, 
CUTENSOR_OP_IDENTITY); + dace::linalg::CheckCuTensorError(cutensorInitTensorDescriptor( + &__dace_cutensor_handle, &descA, modeA.size(), extentA.data(), stridesA.data(), {cuda_dtype}, CUTENSOR_OP_IDENTITY)); + dace::linalg::CheckCuTensorError(cutensorInitTensorDescriptor( + &__dace_cutensor_handle, &descB, modeB.size(), extentB.data(), stridesB.data(), {cuda_dtype}, CUTENSOR_OP_IDENTITY)); + dace::linalg::CheckCuTensorError(cutensorInitTensorDescriptor( + &__dace_cutensor_handle, &descC, modeC.size(), extentC.data(), stridesC.data(), {cuda_dtype}, CUTENSOR_OP_IDENTITY)); + // printf("Tensor descriptors created!\\n"); """ cdesc = f""" uint32_t alignmentRequirementA, alignmentRequirementB, alignmentRequirementC; - cutensorGetAlignmentRequirement(&__dace_cutensor_handle, _left_tensor, &descA, &alignmentRequirementA); - cutensorGetAlignmentRequirement(&__dace_cutensor_handle, _right_tensor, &descB, &alignmentRequirementB); - cutensorGetAlignmentRequirement(&__dace_cutensor_handle, _out_tensor, &descC, &alignmentRequirementC); + dace::linalg::CheckCuTensorError(cutensorGetAlignmentRequirement(&__dace_cutensor_handle, _left_tensor, &descA, &alignmentRequirementA)); + dace::linalg::CheckCuTensorError(cutensorGetAlignmentRequirement(&__dace_cutensor_handle, _right_tensor, &descB, &alignmentRequirementB)); + dace::linalg::CheckCuTensorError(cutensorGetAlignmentRequirement(&__dace_cutensor_handle, _out_tensor, &descC, &alignmentRequirementC)); cutensorContractionDescriptor_t desc; - cutensorInitContractionDescriptor( + dace::linalg::CheckCuTensorError(cutensorInitContractionDescriptor( &__dace_cutensor_handle, &desc, &descA, modeA.data(), alignmentRequirementA, &descB, modeB.data(), alignmentRequirementB, &descC, modeC.data(), alignmentRequirementC, &descC, modeC.data(), alignmentRequirementC, - {compute_type}); + {compute_type})); + // printf("Memory alignment and coontraction descriptor created!\\n"); """ workspace = """ cutensorContractionFind_t find; - cutensorInitContractionFind(&__dace_cutensor_handle, &find, CUTENSOR_ALGO_DEFAULT); + dace::linalg::CheckCuTensorError(cutensorInitContractionFind(&__dace_cutensor_handle, &find, CUTENSOR_ALGO_DEFAULT)); size_t worksize = 0; - cutensorContractionGetWorkspace( - &__dace_cutensor_handle, &desc, &find, CUTENSOR_WORKSPACE_RECOMMENDED, &worksize); + dace::linalg::CheckCuTensorError(cutensorContractionGetWorkspace( + &__dace_cutensor_handle, &desc, &find, CUTENSOR_WORKSPACE_RECOMMENDED, &worksize)); void *work = nullptr; if (worksize > 0) cudaMalloc(&work, worksize); + // printf("Workspace created!\\n"); """ execute = """ cutensorContractionPlan_t plan; - cutensorInitContractionPlan(&__dace_cutensor_handle, &plan, &desc, &find, worksize); - cutensorContraction( + dace::linalg::CheckCuTensorError(cutensorInitContractionPlan(&__dace_cutensor_handle, &plan, &desc, &find, worksize)); + cutensorStatus_t err; + err = cutensorContraction( &__dace_cutensor_handle, &plan, - (void*)&alpha, _left_tensor, _right_tensor, (void*)&beta, _out_tensor, _out_tensor, + (void*)&alpha, _right_tensor, _left_tensor, (void*)&beta, _out_tensor, _out_tensor, work, worksize, __dace_current_stream); + cudaStreamSynchronize(__dace_current_stream); + if(err != CUTENSOR_STATUS_SUCCESS) { + printf("ERROR: %s\\n", cutensorGetErrorString(err)); + } if (work) cudaFree(work); + // printf("Contraction executed!\\n"); """ code = f"{environments.cuTensor.handle_setup_code(node)}{abtext}{modes}{extents}{tdesc}{cdesc}{workspace}{execute}" diff --git a/tests/numpy/linalg_test.py 
b/tests/numpy/linalg_test.py index 256dfc736f..5e6e65474a 100644 --- a/tests/numpy/linalg_test.py +++ b/tests/numpy/linalg_test.py @@ -126,6 +126,88 @@ def tensordot_1(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3 assert(np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) +@pytest.mark.gpu +def test_tensordot_12(): + + @dace.program(device=dace.dtypes.DeviceType.GPU) + def tensordot_1(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B, axes=([0, 3], [4, 2])) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='cuTENSOR'): + assert(np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) + + +def test_tensordot_2(): + + @dace.program + def tensordot_2a(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[7, 6, 5, 4, 3, 2, 1, 0]) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[7, 6, 5, 4, 3, 2, 1, 0]) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='pure'): + assert(np.allclose(tensordot_2a(A.copy(), B.copy()), ref)) + + @dace.program + def tensordot_2b(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[0, 7, 1, 6, 2, 5, 3, 4]) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[0, 7, 1, 6, 2, 5, 3, 4]) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='pure'): + assert(np.allclose(tensordot_2b(A.copy(), B.copy()), ref)) + + +def test_tensordot_21(): + + @dace.program + def tensordot_2a(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[7, 6, 5, 4, 3, 2, 1, 0]) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[7, 6, 5, 4, 3, 2, 1, 0]) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='TTGT'): + assert(np.allclose(tensordot_2a(A.copy(), B.copy()), ref)) + + @dace.program + def tensordot_2b(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[0, 7, 1, 6, 2, 5, 3, 4]) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[0, 7, 1, 6, 2, 5, 3, 4]) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='TTGT'): + assert(np.allclose(tensordot_2b(A.copy(), B.copy()), ref)) + + +def test_tensordot_22(): + + @dace.program(device=dace.dtypes.DeviceType.GPU) + def tensordot_2a(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[7, 6, 5, 4, 3, 2, 1, 0]) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) 
+ B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[7, 6, 5, 4, 3, 2, 1, 0]) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='cuTENSOR'): + assert(np.allclose(tensordot_2a(A.copy(), B.copy()), ref)) + + @dace.program(device=dace.dtypes.DeviceType.GPU) + def tensordot_2b(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): + return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[0, 7, 1, 6, 2, 5, 3, 4]) + + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) + ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[0, 7, 1, 6, 2, 5, 3, 4]) + with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='cuTENSOR'): + assert(np.allclose(tensordot_2b(A.copy(), B.copy()), ref)) + + if __name__ == "__main__": test_linalg_inv() test_linalg_solve() @@ -135,3 +217,6 @@ def tensordot_1(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3 test_tensordot_01() test_tensordot_11() test_tensordot_02() + test_tensordot_12() + test_tensordot_21() + test_tensordot_22() From 3221aadd5b89d1532990c91b5522f32503d859f6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 12 May 2022 20:24:34 +0200 Subject: [PATCH 221/392] Small fixed for cuTENSOR.tensordot --- dace/libraries/linalg/environments/cutensor.py | 2 +- dace/libraries/linalg/nodes/tensordot.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dace/libraries/linalg/environments/cutensor.py b/dace/libraries/linalg/environments/cutensor.py index 8d883d1f20..e3fd1ed486 100644 --- a/dace/libraries/linalg/environments/cutensor.py +++ b/dace/libraries/linalg/environments/cutensor.py @@ -11,7 +11,7 @@ class cuTensor: cmake_includes = [] cmake_libraries = ["cutensor"] cmake_compile_flags = [] - cmake_link_flags = ["-L /users/aziogas/libcutensor/lib/11 -lcutensor"] + cmake_link_flags = ["-L /scratch/snx3000/aziogas/libcutensor/lib/11 -lcutensor"] cmake_files = [] headers = {'frame': ["../include/dace_cutensor.h"], 'cuda': ["../include/dace_cutensor.h"]} diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index b241410d4d..cd3fa6532a 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -231,7 +231,7 @@ def expansion(node, parent_state, parent_sdfg): extents = "std::unordered_map extent;\n" for i, s in zip(left_modes, left_tensor.shape): extents += f"extent[{i}] = {s};\n" - for i, s in zip(right_modes, left_tensor.shape): + for i, s in zip(right_modes, right_tensor.shape): if i in node.right_axes: continue extents += f"extent[{i}] = {s};\n" @@ -294,7 +294,7 @@ def expansion(node, parent_state, parent_sdfg): cutensorStatus_t err; err = cutensorContraction( &__dace_cutensor_handle, &plan, - (void*)&alpha, _right_tensor, _left_tensor, (void*)&beta, _out_tensor, _out_tensor, + (void*)&alpha, _left_tensor, _right_tensor, (void*)&beta, _out_tensor, _out_tensor, work, worksize, __dace_current_stream); cudaStreamSynchronize(__dace_current_stream); if(err != CUTENSOR_STATUS_SUCCESS) { @@ -362,8 +362,8 @@ def validate(self, sdfg, state): if left_tensor.dtype != right_tensor.dtype or left_tensor.dtype != out_tensor.dtype: raise TypeError("The datatype of the input and output tensors must match.") - if left_tensor.storage != right_tensor.storage or left_tensor.storage != 
out_tensor.storage: - raise ValueError("The storage of the input and output tensors must match.") + # if left_tensor.storage != right_tensor.storage or left_tensor.storage != out_tensor.storage: + # raise ValueError("The storage of the input and output tensors must match.") if any(a >= len(left_tensor.shape) or a < 0 for a in self.left_axes): raise ValueError("Axes for left tensor are out-of-bounds.") From 1dddd8fa058030a08b95afa5bccbabc2f5fd64ef Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 9 Jun 2022 21:08:17 +0200 Subject: [PATCH 222/392] libcutensor.lib must now be on the user's (LD_)LIBRARY_PATH --- dace/libraries/linalg/environments/cutensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/libraries/linalg/environments/cutensor.py b/dace/libraries/linalg/environments/cutensor.py index e3fd1ed486..8504460985 100644 --- a/dace/libraries/linalg/environments/cutensor.py +++ b/dace/libraries/linalg/environments/cutensor.py @@ -11,7 +11,7 @@ class cuTensor: cmake_includes = [] cmake_libraries = ["cutensor"] cmake_compile_flags = [] - cmake_link_flags = ["-L /scratch/snx3000/aziogas/libcutensor/lib/11 -lcutensor"] + cmake_link_flags = ["-L -lcutensor"] cmake_files = [] headers = {'frame': ["../include/dace_cutensor.h"], 'cuda': ["../include/dace_cutensor.h"]} From fbefd886fceaf055398f22c4803fa2badb95a644 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 4 May 2022 13:03:06 +0200 Subject: [PATCH 223/392] Enabled optional and non-standard extension to `numpy.tensordot` that allows permutation of the output. --- dace/frontend/python/replacements.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index a45313dbbe..b12ae82452 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -4548,7 +4548,15 @@ def _inv(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, inp_op: str): @oprepo.replaces('dace.tensordot') @oprepo.replaces('numpy.tensordot') -def _tensordot(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op_a: str, op_b: str, axes: Union[int, Sequence[int]] = 2): +def _tensordot(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + op_a: str, + op_b: str, + axes: Union[int, Sequence[int]] = 2, + out_axes: Sequence[int] = None): + + # NOTE: `out_axes` is a non-standard extension to `numpy.tensordot`, allowing trasposition of the output for op in (op_a, op_b): if not isinstance(op, str) or not op in sdfg.arrays.keys(): @@ -4576,13 +4584,19 @@ def _tensordot(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op_a: str, op dot_shape = [s for i, s in enumerate(arr_a.shape) if i not in left_axes] dot_shape.extend([s for i, s in enumerate(arr_b.shape) if i not in right_axes]) + + if out_axes: + if list(sorted(out_axes)) != list(range(len(dot_shape))): + raise ValueError("Output axes is not a permutation of the output's modes.") + dot_shape = [dot_shape[i] for i in out_axes] + op_c, arr_c = sdfg.add_temp_transient(dot_shape, arr_a.dtype, storage=arr_a.storage) from dace.libraries.linalg import TensorDot a = state.add_read(op_a) b = state.add_read(op_b) c = state.add_write(op_c) - tasklet = TensorDot("_TensorDot_", left_axes, right_axes) + tasklet = TensorDot("_TensorDot_", left_axes, right_axes, out_axes) state.add_edge(a, None, tasklet, '_left_tensor', Memlet.from_array(op_a, arr_a)) state.add_edge(b, None, tasklet, '_right_tensor', Memlet.from_array(op_b, arr_b)) 
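    # The result of the TensorDot library node is written to the temporary transient
    # through its '_out_tensor' connector (edge below).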
state.add_edge(tasklet, '_out_tensor', c, None, Memlet.from_array(op_c, arr_c)) From 3ba8499b996699378dff9a55d4328e432dc17024 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 11 Jul 2023 13:29:18 +0200 Subject: [PATCH 224/392] Clean-up --- dace/frontend/python/replacements.py | 11 +- .../libraries/lapack/environments/__init__.py | 2 +- .../lapack/environments/cusolverdn.py | 2 +- .../lapack/include/dace_cusolverdn.h | 2 +- dace/libraries/linalg/__init__.py | 2 +- .../libraries/linalg/environments/cutensor.py | 2 +- dace/libraries/linalg/include/dace_cutensor.h | 2 +- dace/libraries/linalg/nodes/__init__.py | 2 +- dace/libraries/linalg/nodes/tensordot.py | 180 ++++++++++++------ dace/libraries/ttranspose/__init__.py | 4 +- .../ttranspose/environments/__init__.py | 2 +- .../libraries/ttranspose/environments/hptt.py | 6 +- dace/libraries/ttranspose/nodes/__init__.py | 4 +- dace/libraries/ttranspose/nodes/ttranspose.py | 45 +++-- tests/numpy/linalg_test.py | 73 +++---- tests/numpy/transpose_test.py | 8 +- 16 files changed, 215 insertions(+), 132 deletions(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index b12ae82452..5047a74858 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace import ast @@ -783,10 +783,11 @@ def _transpose(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, inpname: str, a state.add_node(tasklet) state.add_edge(acc1, None, tasklet, '_inp', Memlet.from_array(inpname, arr1)) state.add_edge(tasklet, '_out', acc2, None, Memlet.from_array(outname, arr2)) - else: # tensor transpose + else: # Tensor transpose modes = len(arr1.shape) idx = axes.index(0) - if axes[idx:] == list(range(modes-idx)) and axes[:idx] == list(range(axes[-1] + 1, modes)): + # Special case of tensor transposition: matrix transpose + reshape + if axes[idx:] == list(range(modes - idx)) and axes[:idx] == list(range(axes[-1] + 1, modes)): rows = data._prod([arr1.shape[axes[i]] for i in range(idx, len(arr1.shape))]) cols = data._prod([arr1.shape[axes[i]] for i in range(idx)]) matrix = _ndarray_reshape(pv, sdfg, state, inpname, [rows, cols]) @@ -4555,7 +4556,7 @@ def _tensordot(pv: 'ProgramVisitor', op_b: str, axes: Union[int, Sequence[int]] = 2, out_axes: Sequence[int] = None): - + # NOTE: `out_axes` is a non-standard extension to `numpy.tensordot`, allowing trasposition of the output for op in (op_a, op_b): @@ -4581,7 +4582,7 @@ def _tensordot(pv: 'ProgramVisitor', raise ValueError("The input tensors must have the same number of contracting modes.") if any(arr_a.shape[l] != arr_b.shape[r] for l, r in zip(left_axes, right_axes)): raise ValueError("The input tensors' contracting modes must have the same length.") - + dot_shape = [s for i, s in enumerate(arr_a.shape) if i not in left_axes] dot_shape.extend([s for i, s in enumerate(arr_b.shape) if i not in right_axes]) diff --git a/dace/libraries/lapack/environments/__init__.py b/dace/libraries/lapack/environments/__init__.py index 62825a9c3a..800e454986 100644 --- a/dace/libraries/lapack/environments/__init__.py +++ b/dace/libraries/lapack/environments/__init__.py @@ -1,2 +1,2 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
from .cusolverdn import * diff --git a/dace/libraries/lapack/environments/cusolverdn.py b/dace/libraries/lapack/environments/cusolverdn.py index 5b27e421a7..c92c8bf3e7 100644 --- a/dace/libraries/lapack/environments/cusolverdn.py +++ b/dace/libraries/lapack/environments/cusolverdn.py @@ -1,4 +1,4 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace.library diff --git a/dace/libraries/lapack/include/dace_cusolverdn.h b/dace/libraries/lapack/include/dace_cusolverdn.h index 6f22e67105..2da65ffa2f 100644 --- a/dace/libraries/lapack/include/dace_cusolverdn.h +++ b/dace/libraries/lapack/include/dace_cusolverdn.h @@ -1,4 +1,4 @@ -// Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +// Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. #pragma once #include diff --git a/dace/libraries/linalg/__init__.py b/dace/libraries/linalg/__init__.py index f4994b4275..2673dc66cb 100644 --- a/dace/libraries/linalg/__init__.py +++ b/dace/libraries/linalg/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. from dace.library import register_library from .nodes import * diff --git a/dace/libraries/linalg/environments/cutensor.py b/dace/libraries/linalg/environments/cutensor.py index 8504460985..e3572a0673 100644 --- a/dace/libraries/linalg/environments/cutensor.py +++ b/dace/libraries/linalg/environments/cutensor.py @@ -1,4 +1,4 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace.library diff --git a/dace/libraries/linalg/include/dace_cutensor.h b/dace/libraries/linalg/include/dace_cutensor.h index 016e445830..8079892285 100644 --- a/dace/libraries/linalg/include/dace_cutensor.h +++ b/dace/libraries/linalg/include/dace_cutensor.h @@ -1,4 +1,4 @@ -// Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +// Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. #pragma once #include diff --git a/dace/libraries/linalg/nodes/__init__.py b/dace/libraries/linalg/nodes/__init__.py index 7d3c935dbf..5df44ba4a5 100644 --- a/dace/libraries/linalg/nodes/__init__.py +++ b/dace/libraries/linalg/nodes/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. from .inv import Inv from .solve import Solve from .cholesky import Cholesky diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index cd3fa6532a..7ab588d78c 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -1,20 +1,18 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import collections import dace -import multiprocessing from dace import library, nodes, properties from dace.data import _prod from dace.libraries.blas import blas_helpers from dace.symbolic import symstr from dace.transformation.transformation import ExpandTransformation -from numbers import Number import dace.libraries.linalg.environments as environments @library.expansion class ExpandPure(ExpandTransformation): """ Implements the pure expansion of TensorDot library node. """ - + environments = [] @staticmethod @@ -22,17 +20,31 @@ def expansion(node, parent_state, parent_sdfg): left_tensor, right_tensor, out_tensor = node.validate(parent_sdfg, parent_state) sdfg = dace.SDFG(f"{node.label}_sdfg") - _, left_arr = sdfg.add_array("_left_tensor", left_tensor.shape, left_tensor.dtype, left_tensor.storage, strides=left_tensor.strides) - _, right_arr = sdfg.add_array("_right_tensor", right_tensor.shape, right_tensor.dtype, right_tensor.storage, strides=right_tensor.strides) - _, out_arr = sdfg.add_array("_out_tensor", out_tensor.shape, out_tensor.dtype, out_tensor.storage, strides=out_tensor.strides) - + _, left_arr = sdfg.add_array("_left_tensor", + left_tensor.shape, + left_tensor.dtype, + left_tensor.storage, + strides=left_tensor.strides) + _, right_arr = sdfg.add_array("_right_tensor", + right_tensor.shape, + right_tensor.dtype, + right_tensor.storage, + strides=right_tensor.strides) + _, out_arr = sdfg.add_array("_out_tensor", + out_tensor.shape, + out_tensor.dtype, + out_tensor.storage, + strides=out_tensor.strides) + init_state = sdfg.add_state(f"{node.label}_init", is_start_state=True) - init_state.add_mapped_tasklet(f"{node.label}_init_tasklet", - {f"__i{i}": f"0:{symstr(s)}" for i, s in enumerate(out_tensor.shape)}, - {}, - '__out = 0', - {'__out': dace.Memlet(expr=f"_out_tensor[{','.join(['__i%d' % i for i in range(len(out_tensor.shape))])}]")}, - external_edges=True) + init_state.add_mapped_tasklet( + f"{node.label}_init_tasklet", {f"__i{i}": f"0:{symstr(s)}" + for i, s in enumerate(out_tensor.shape)}, {}, + '__out = 0', { + '__out': + dace.Memlet(expr=f"_out_tensor[{','.join(['__i%d' % i for i in range(len(out_tensor.shape))])}]") + }, + external_edges=True) state = sdfg.add_state(f"{node.label}_state") sdfg.add_edge(init_state, state, dace.InterstateEdge()) @@ -40,17 +52,17 @@ def expansion(node, parent_state, parent_sdfg): outer_map_shape = list([s for i, s in enumerate(left_tensor.shape) if i not in node.left_axes]) outer_map_shape.extend([s for i, s in enumerate(right_tensor.shape) if i not in node.right_axes]) outer_map_params = [f"__oi{i}" for i in range(len(outer_map_shape))] - outer_map_rng = {i: f"0:{symstr(s)}"for i, s in zip(outer_map_params, outer_map_shape)} + outer_map_rng = {i: f"0:{symstr(s)}" for i, s in zip(outer_map_params, outer_map_shape)} inner_map_shape = list([left_tensor.shape[i] for i in node.left_axes]) inner_map_params = [f"__ii{i}" for i in range(len(inner_map_shape))] - inner_map_rng = {i: f"0:{symstr(s)}"for i, s in zip(inner_map_params, inner_map_shape)} + inner_map_rng = {i: f"0:{symstr(s)}" for i, s in zip(inner_map_params, inner_map_shape)} - left_idx = outer_map_params[:len(left_tensor.shape)-len(node.left_axes)] + left_idx = outer_map_params[:len(left_tensor.shape) - len(node.left_axes)] left_dict = {j: inner_map_params[i] for i, j in enumerate(node.left_axes)} left_sorted_dict = collections.OrderedDict(sorted(left_dict.items())) for k, v in left_sorted_dict.items(): left_idx.insert(k, v) - right_idx = 
outer_map_params[len(left_tensor.shape)-len(node.left_axes):] + right_idx = outer_map_params[len(left_tensor.shape) - len(node.left_axes):] right_dict = {j: inner_map_params[i] for i, j in enumerate(node.right_axes)} right_sorted_dict = collections.OrderedDict(sorted(right_dict.items())) for k, v in right_sorted_dict.items(): @@ -65,7 +77,14 @@ def expansion(node, parent_state, parent_sdfg): inputs = {"_left": left_mem, "_right": right_mem} outputs = {"_out": out_mem} code = f"_out = _left * _right" - state.add_mapped_tasklet(f"{node.label}_tasklet", {**outer_map_rng, **inner_map_rng}, inputs, code, outputs, external_edges=True) + state.add_mapped_tasklet(f"{node.label}_tasklet", { + **outer_map_rng, + **inner_map_rng + }, + inputs, + code, + outputs, + external_edges=True) return sdfg @@ -76,7 +95,7 @@ class ExpandTTGT(ExpandTransformation): Expands the TensorDot library node to TensorTranspose + GEMM operations. TTGT stands for Transpose-Transpose-GEMM-Transpose. """ - + environments = [] @staticmethod @@ -84,9 +103,21 @@ def expansion(node, parent_state, parent_sdfg): left_tensor, right_tensor, out_tensor = node.validate(parent_sdfg, parent_state) sdfg = dace.SDFG(f"{node.label}_sdfg") - _, left_arr = sdfg.add_array("_left_tensor", left_tensor.shape, left_tensor.dtype, left_tensor.storage, strides=left_tensor.strides) - _, right_arr = sdfg.add_array("_right_tensor", right_tensor.shape, right_tensor.dtype, right_tensor.storage, strides=right_tensor.strides) - _, out_arr = sdfg.add_array("_out_tensor", out_tensor.shape, out_tensor.dtype, out_tensor.storage, strides=out_tensor.strides) + _, left_arr = sdfg.add_array("_left_tensor", + left_tensor.shape, + left_tensor.dtype, + left_tensor.storage, + strides=left_tensor.strides) + _, right_arr = sdfg.add_array("_right_tensor", + right_tensor.shape, + right_tensor.dtype, + right_tensor.storage, + strides=right_tensor.strides) + _, out_arr = sdfg.add_array("_out_tensor", + out_tensor.shape, + out_tensor.dtype, + out_tensor.storage, + strides=out_tensor.strides) from dace.frontend.python.replacements import _transpose # NOTE: We use the numpy.transpose replacement because: @@ -98,11 +129,11 @@ def expansion(node, parent_state, parent_sdfg): transA = True else: transA = False - if node.right_axes == list(range(len(right_arr.shape)-len(node.right_axes), len(right_arr.shape))): + if node.right_axes == list(range(len(right_arr.shape) - len(node.right_axes), len(right_arr.shape))): transB = True else: transB = False - + if transA: left_tt = "_left_tensor" left_tt_arr = left_arr @@ -125,25 +156,47 @@ def expansion(node, parent_state, parent_sdfg): prv_state = state state = sdfg.add_state(f"{node.label}_gemm_state") sdfg.add_edge(prv_state, state, dace.InterstateEdge()) - + if transA: - left_shape = [_prod(left_tt_arr.shape[:len(node.left_axes)]), _prod(left_tt_arr.shape[len(node.left_axes):])] - left_strides = [left_tt_arr.strides[len(node.left_axes)-1], left_tt_arr.strides[-1]] + left_shape = [ + _prod(left_tt_arr.shape[:len(node.left_axes)]), + _prod(left_tt_arr.shape[len(node.left_axes):]) + ] + left_strides = [left_tt_arr.strides[len(node.left_axes) - 1], left_tt_arr.strides[-1]] else: - left_shape = [_prod(left_tt_arr.shape[:-len(node.left_axes)]), _prod(left_tt_arr.shape[len(left_tt_arr.shape)-len(node.left_axes):])] - left_strides = [left_tt_arr.strides[-len(node.left_axes)-1], left_tt_arr.strides[-1]] - left_vname, left_view = sdfg.add_view(left_tt, left_shape, left_tt_arr.dtype, left_tt_arr.storage, strides=left_strides, find_new_name=True) + 
left_shape = [ + _prod(left_tt_arr.shape[:-len(node.left_axes)]), + _prod(left_tt_arr.shape[len(left_tt_arr.shape) - len(node.left_axes):]) + ] + left_strides = [left_tt_arr.strides[-len(node.left_axes) - 1], left_tt_arr.strides[-1]] + left_vname, left_view = sdfg.add_view(left_tt, + left_shape, + left_tt_arr.dtype, + left_tt_arr.storage, + strides=left_strides, + find_new_name=True) left_anode = state.add_read(left_tt) left_vnode = state.add_access(left_vname) state.add_edge(left_anode, None, left_vnode, 'views', dace.Memlet.from_array(left_tt, left_tt_arr)) if transB: - right_shape = [_prod(right_tt_arr.shape[:-len(node.right_axes)]), _prod(right_tt_arr.shape[len(right_tt_arr.shape)-len(node.right_axes):])] - right_strides = [right_tt_arr.strides[-len(node.right_axes)-1], right_tt_arr.strides[-1]] + right_shape = [ + _prod(right_tt_arr.shape[:-len(node.right_axes)]), + _prod(right_tt_arr.shape[len(right_tt_arr.shape) - len(node.right_axes):]) + ] + right_strides = [right_tt_arr.strides[-len(node.right_axes) - 1], right_tt_arr.strides[-1]] else: - right_shape = [_prod(right_tt_arr.shape[0:len(node.right_axes)]), _prod(right_tt_arr.shape[len(node.right_axes):])] - right_strides = [right_tt_arr.strides[len(node.right_axes)-1], right_tt_arr.strides[-1]] - right_vname, right_view = sdfg.add_view(right_tt, right_shape, right_tt_arr.dtype, right_tt_arr.storage, strides=right_strides, find_new_name=True) + right_shape = [ + _prod(right_tt_arr.shape[0:len(node.right_axes)]), + _prod(right_tt_arr.shape[len(node.right_axes):]) + ] + right_strides = [right_tt_arr.strides[len(node.right_axes) - 1], right_tt_arr.strides[-1]] + right_vname, right_view = sdfg.add_view(right_tt, + right_shape, + right_tt_arr.dtype, + right_tt_arr.storage, + strides=right_strides, + find_new_name=True) right_anode = state.add_read(right_tt) right_vnode = state.add_access(right_vname) state.add_edge(right_anode, None, right_vnode, 'views', dace.Memlet.from_array(right_tt, right_tt_arr)) @@ -166,8 +219,13 @@ def expansion(node, parent_state, parent_sdfg): dot_shape = [s for i, s in enumerate(left_tensor.shape) if i not in node.left_axes] dot_shape.extend([s for i, s in enumerate(right_tensor.shape) if i not in node.right_axes]) dot_name, dot_arr = sdfg.add_temp_transient(dot_shape, out_arr.dtype, out_arr.storage) - out_strides = [dot_arr.strides[len(left_tt_arr.shape)-len(node.left_axes)-1], dot_arr.strides[-1]] - dot_vname, dot_view = sdfg.add_view('__gemm_out', out_shape, dot_arr.dtype, dot_arr.storage, strides=out_strides, find_new_name=True) + out_strides = [dot_arr.strides[len(left_tt_arr.shape) - len(node.left_axes) - 1], dot_arr.strides[-1]] + dot_vname, dot_view = sdfg.add_view('__gemm_out', + out_shape, + dot_arr.dtype, + dot_arr.storage, + strides=out_strides, + find_new_name=True) dot_anode = state.add_access(dot_name) dot_vnode = state.add_access(dot_vname) state.add_edge(tasklet, '_c', dot_vnode, None, dace.Memlet.from_array(dot_vname, dot_view)) @@ -178,8 +236,13 @@ def expansion(node, parent_state, parent_sdfg): state.add_edge(dot_anode, None, tasklet, '_inp_tensor', dace.Memlet.from_array(dot_name, dot_arr)) state.add_edge(tasklet, '_out_tensor', out_node, None, dace.Memlet.from_array('_out_tensor', out_arr)) else: - out_strides = [out_arr.strides[len(left_tt_arr.shape)-len(node.left_axes)-1], out_arr.strides[-1]] - out_vname, out_view = sdfg.add_view('__gemm_out', out_shape, out_arr.dtype, out_arr.storage, strides=out_strides, find_new_name=True) + out_strides = [out_arr.strides[len(left_tt_arr.shape) - 
len(node.left_axes) - 1], out_arr.strides[-1]] + out_vname, out_view = sdfg.add_view('__gemm_out', + out_shape, + out_arr.dtype, + out_arr.storage, + strides=out_strides, + find_new_name=True) out_anode = state.add_access('_out_tensor') out_vnode = state.add_access(out_vname) state.add_edge(tasklet, '_c', out_vnode, None, dace.Memlet.from_array(out_vname, out_view)) @@ -215,8 +278,10 @@ def expansion(node, parent_state, parent_sdfg): """ left_modes = list(range(len(left_tensor.shape))) - right_modes = [node.left_axes[node.right_axes.index(i)] if i in node.right_axes else len(left_tensor.shape) + i - for i in range(len(right_tensor.shape))] + right_modes = [ + node.left_axes[node.right_axes.index(i)] if i in node.right_axes else len(left_tensor.shape) + i + for i in range(len(right_tensor.shape)) + ] out_modes = [i for i in left_modes if i not in node.left_axes] out_modes.extend([i for i in right_modes if i not in node.left_axes]) if node.permutation and node.permutation != list(range(len(node.permutation))): @@ -311,9 +376,6 @@ def expansion(node, parent_state, parent_sdfg): node.out_connectors, code, language=dace.dtypes.Language.CPP) - # conn = tasklet.out_connectors - # conn = {c: (dace.dtypes.pointer(dace.int32) if c == '_res' else t) for c, t in conn.items()} - # tasklet.out_connectors = conn return tasklet @@ -322,23 +384,22 @@ def expansion(node, parent_state, parent_sdfg): class TensorDot(nodes.LibraryNode): """ Implements tensor dot-product. """ - implementations = { - "pure": ExpandPure, - "TTGT": ExpandTTGT, - "cuTENSOR": ExpandCuTensor - } + implementations = {"pure": ExpandPure, "TTGT": ExpandTTGT, "cuTENSOR": ExpandCuTensor} default_implementation = None left_axes = properties.ListProperty(element_type=int, default=[], desc="Left tensor's contracting modes") right_axes = properties.ListProperty(element_type=int, default=[], desc="Right tensor's contracting modes") - permutation = properties.ListProperty(element_type=int, allow_none=True, default=None, desc="Permutation of the output tensor") + permutation = properties.ListProperty(element_type=int, + allow_none=True, + default=None, + desc="Permutation of the output tensor") def __init__(self, name, left_axes=[], right_axes=[], permutation=None, *args, **kwargs): super().__init__(name, *args, inputs={"_left_tensor", "_right_tensor"}, outputs={"_out_tensor"}, **kwargs) self.left_axes = left_axes self.right_axes = right_axes self.permutation = permutation - + def validate(self, sdfg, state): """ Validates the tensor dot-product operation. @@ -361,7 +422,8 @@ def validate(self, sdfg, state): raise ValueError("Missing the output tensor.") if left_tensor.dtype != right_tensor.dtype or left_tensor.dtype != out_tensor.dtype: - raise TypeError("The datatype of the input and output tensors must match.") + raise TypeError("The datatype of the input and output tensors must match.") + # TODO: Check disabled due to causing issues with CUDA + MPI. Revisit in the future. 
# if left_tensor.storage != right_tensor.storage or left_tensor.storage != out_tensor.storage: # raise ValueError("The storage of the input and output tensors must match.") @@ -373,14 +435,15 @@ def validate(self, sdfg, state): raise ValueError("The input tensors must have the same number of contracting modes.") if any(left_tensor.shape[l] != right_tensor.shape[r] for l, r in zip(self.left_axes, self.right_axes)): raise ValueError("The input tensors' contracting modes must have the same length.") - + dot_shape = [s for i, s in enumerate(left_tensor.shape) if i not in self.left_axes] dot_shape.extend([s for i, s in enumerate(right_tensor.shape) if i not in self.right_axes]) out_shape = list(out_tensor.shape) if len(dot_shape) != len(out_shape): raise ValueError("The intermediate (dot-product) and output tensors must have the same number of modes..") - + # # We check if the output shape is a permutation of a dot-product shape. + # TODO: Check disabled due to causing issues with valid test cases. Revisit in the future. # # NOTE: Since the shapes may be symbolic, we cannot just sort and compare them. # for s in out_shape: # try: @@ -391,7 +454,6 @@ def validate(self, sdfg, state): # if dot_shape: # raise ValueError("The output tensor shape is not a permutation of the dot-product shape.") - if not self.permutation: if dot_shape != out_shape: raise ValueError("The shapes of the intermediate (dot-product) and output tensors must match.") @@ -399,11 +461,13 @@ def validate(self, sdfg, state): # NOTE: If the output tensor is transposed, then the permutation must be given explicitely. The permutation # can only be inferred if each tensor mode has different length, which should never be assumed. if len(out_tensor.shape) != len(self.permutation): - raise ValueError("The permutation list property must have as many elements as the number of output tensor modes.") + raise ValueError( + "The permutation list property must have as many elements as the number of output tensor modes.") if sorted(self.permutation) != list(range(len(out_tensor.shape))): raise ValueError("The permutation list property is not a perimutation of the output tensor's modes.") transposed_shape = [dot_shape[p] for p in self.permutation] if transposed_shape != list(out_tensor.shape): - raise ValueError("The permutation of the intermediate (dot-product) shape does not match the output shape.") + raise ValueError( + "The permutation of the intermediate (dot-product) shape does not match the output shape.") return left_tensor, right_tensor, out_tensor diff --git a/dace/libraries/ttranspose/__init__.py b/dace/libraries/ttranspose/__init__.py index 6c49f26fa6..868af39ac6 100644 --- a/dace/libraries/ttranspose/__init__.py +++ b/dace/libraries/ttranspose/__init__.py @@ -1,6 +1,6 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. from dace.library import register_library from .nodes import * from .environments import * -register_library(__name__, "ttranspose") \ No newline at end of file +register_library(__name__, "ttranspose") diff --git a/dace/libraries/ttranspose/environments/__init__.py b/dace/libraries/ttranspose/environments/__init__.py index 0c6487def2..267e46a5e4 100644 --- a/dace/libraries/ttranspose/environments/__init__.py +++ b/dace/libraries/ttranspose/environments/__init__.py @@ -1,2 +1,2 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. 
All rights reserved. from .hptt import * diff --git a/dace/libraries/ttranspose/environments/hptt.py b/dace/libraries/ttranspose/environments/hptt.py index 67a6057ed0..2ee7695437 100644 --- a/dace/libraries/ttranspose/environments/hptt.py +++ b/dace/libraries/ttranspose/environments/hptt.py @@ -1,4 +1,4 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import ctypes import os from dace import config, library @@ -27,7 +27,7 @@ def cmake_includes(): return [os.path.join(os.environ['HPTT_ROOT'], 'include')] else: return [] - + @staticmethod def cmake_libraries(): if 'HPTT_ROOT' in os.environ: @@ -36,5 +36,5 @@ def cmake_libraries(): libfile = os.path.join(os.environ['HPTT_ROOT'], 'lib', prefix + 'hptt.' + suffix) if os.path.isfile(libfile): return [libfile] - + return ['hptt'] diff --git a/dace/libraries/ttranspose/nodes/__init__.py b/dace/libraries/ttranspose/nodes/__init__.py index 0babde8a78..a6d558f2ed 100644 --- a/dace/libraries/ttranspose/nodes/__init__.py +++ b/dace/libraries/ttranspose/nodes/__init__.py @@ -1,2 +1,2 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserv -from .ttranspose import TensorTranspose +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserv +from .ttranspose import TensorTranspose diff --git a/dace/libraries/ttranspose/nodes/ttranspose.py b/dace/libraries/ttranspose/nodes/ttranspose.py index c8ecd00136..9d87a84343 100644 --- a/dace/libraries/ttranspose/nodes/ttranspose.py +++ b/dace/libraries/ttranspose/nodes/ttranspose.py @@ -1,4 +1,4 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace import multiprocessing from dace import library, nodes, properties @@ -12,7 +12,7 @@ @library.expansion class ExpandPure(ExpandTransformation): """ Implements the pure expansion of TensorTranspose library node. """ - + environments = [] @staticmethod @@ -20,12 +20,20 @@ def expansion(node, parent_state, parent_sdfg): inp_tensor, out_tensor = node.validate(parent_sdfg, parent_state) sdfg = dace.SDFG(f"{node.label}_sdfg") - _, inp_arr = sdfg.add_array("_inp_tensor", inp_tensor.shape, inp_tensor.dtype, inp_tensor.storage, strides=inp_tensor.strides) - _, out_arr = sdfg.add_array("_out_tensor", out_tensor.shape, out_tensor.dtype, out_tensor.storage, strides=out_tensor.strides) - - state = sdfg.add_state(f"{node.label}_state") + _, inp_arr = sdfg.add_array("_inp_tensor", + inp_tensor.shape, + inp_tensor.dtype, + inp_tensor.storage, + strides=inp_tensor.strides) + _, out_arr = sdfg.add_array("_out_tensor", + out_tensor.shape, + out_tensor.dtype, + out_tensor.storage, + strides=out_tensor.strides) + + state = sdfg.add_state(f"{node.label}_state") map_params = [f"__i{i}" for i in range(len(inp_arr.shape))] - map_rng = {i: f"0:{s}"for i, s in zip(map_params, inp_arr.shape)} + map_rng = {i: f"0:{s}" for i, s in zip(map_params, inp_arr.shape)} inp_mem = dace.Memlet(expr=f"_inp_tensor[{','.join(map_params)}]") out_mem = dace.Memlet(expr=f"_out_tensor[{','.join([map_params[i] for i in node.axes])}]") inputs = {"_inp": inp_mem} @@ -45,7 +53,7 @@ class ExpandHPTT(ExpandTransformation): Implements the TensorTranspose library node using the High-Performance Tensor Transpose Library (HPTT). For more information, see https://github.com/springer13/hptt. 
""" - + environments = [environments.HPTT] @staticmethod @@ -65,10 +73,10 @@ def expansion(node, parent_state, parent_sdfg): """ tasklet = nodes.Tasklet(node.name, - node.in_connectors, - node.out_connectors, - code, - language=dace.dtypes.Language.CPP) + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP) return tasklet @@ -77,10 +85,7 @@ def expansion(node, parent_state, parent_sdfg): class TensorTranspose(nodes.LibraryNode): """ Implements out-of-place tensor transpositions. """ - implementations = { - "pure": ExpandPure, - "HPTT": ExpandHPTT - } + implementations = {"pure": ExpandPure, "HPTT": ExpandHPTT} default_implementation = None axes = properties.ListProperty(element_type=int, default=[], desc="Permutation of input tensor's modes") @@ -92,7 +97,7 @@ def __init__(self, name, axes=[], alpha=1, beta=0, *args, **kwargs): self.axes = axes self.alpha = alpha self.beta = beta - + def validate(self, sdfg, state): """ Validates the tensor transposition operation. @@ -114,17 +119,17 @@ def validate(self, sdfg, state): if inp_tensor.dtype != out_tensor.dtype: raise ValueError("The datatype of the input and output tensors must match.") - + if inp_tensor.storage != out_tensor.storage: raise ValueError("The storage of the input and output tensors must match.") - + if len(inp_tensor.shape) != len(out_tensor.shape): raise ValueError("The input and output tensors must have the same number of modes.") if len(inp_tensor.shape) != len(self.axes): raise ValueError("The axes list property must have as many elements as the number of tensor modes.") if sorted(self.axes) != list(range(len(inp_tensor.shape))): raise ValueError("The axes list property is not a perimutation of the input tensor's modes.") - + transposed_shape = [inp_tensor.shape[t] for t in self.axes] if transposed_shape != list(out_tensor.shape): raise ValueError("The permutation of the input shape does not match the output shape.") diff --git a/tests/numpy/linalg_test.py b/tests/numpy/linalg_test.py index 5e6e65474a..9211d84130 100644 --- a/tests/numpy/linalg_test.py +++ b/tests/numpy/linalg_test.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import dace import numpy as np import pytest @@ -71,11 +71,11 @@ def test_tensordot_0(): @dace.program def tensordot_0(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='pure'): - assert(np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) + assert (np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) def test_tensordot_01(): @@ -83,23 +83,26 @@ def test_tensordot_01(): @dace.program def tensordot_0(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='TTGT'): - assert(np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) + assert (np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) + -@pytest.mark.gpu +# TODO: Enable after fixing cuTENSOR in CI +#@pytest.mark.gpu +@pytest.mark.skip def test_tensordot_02(): @dace.program(device=dace.dtypes.DeviceType.GPU) def tensordot_0(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='cuTENSOR'): - assert(np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) + assert (np.allclose(tensordot_0(A.copy(), B.copy()), tensordot_0.f(A, B))) def test_tensordot_1(): @@ -107,11 +110,11 @@ def test_tensordot_1(): @dace.program def tensordot_1(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B, axes=([0, 3], [4, 2])) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='pure'): - assert(np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) + assert (np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) def test_tensordot_11(): @@ -119,24 +122,26 @@ def test_tensordot_11(): @dace.program def tensordot_1(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B, axes=([0, 3], [4, 2])) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='TTGT'): - assert(np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) + assert (np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) -@pytest.mark.gpu +# TODO: Enable after fixing cuTENSOR in CI +#@pytest.mark.gpu +@pytest.mark.skip def test_tensordot_12(): @dace.program(device=dace.dtypes.DeviceType.GPU) def tensordot_1(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B, axes=([0, 3], [4, 2])) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) with dace.config.set_temporary('library', 'linalg', 
'default_implementation', value='cuTENSOR'): - assert(np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) + assert (np.allclose(tensordot_1(A.copy(), B.copy()), tensordot_1.f(A, B))) def test_tensordot_2(): @@ -144,22 +149,22 @@ def test_tensordot_2(): @dace.program def tensordot_2a(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[7, 6, 5, 4, 3, 2, 1, 0]) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[7, 6, 5, 4, 3, 2, 1, 0]) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='pure'): - assert(np.allclose(tensordot_2a(A.copy(), B.copy()), ref)) - + assert (np.allclose(tensordot_2a(A.copy(), B.copy()), ref)) + @dace.program def tensordot_2b(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[0, 7, 1, 6, 2, 5, 3, 4]) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[0, 7, 1, 6, 2, 5, 3, 4]) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='pure'): - assert(np.allclose(tensordot_2b(A.copy(), B.copy()), ref)) + assert (np.allclose(tensordot_2b(A.copy(), B.copy()), ref)) def test_tensordot_21(): @@ -167,45 +172,48 @@ def test_tensordot_21(): @dace.program def tensordot_2a(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[7, 6, 5, 4, 3, 2, 1, 0]) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[7, 6, 5, 4, 3, 2, 1, 0]) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='TTGT'): - assert(np.allclose(tensordot_2a(A.copy(), B.copy()), ref)) - + assert (np.allclose(tensordot_2a(A.copy(), B.copy()), ref)) + @dace.program def tensordot_2b(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[0, 7, 1, 6, 2, 5, 3, 4]) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[0, 7, 1, 6, 2, 5, 3, 4]) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='TTGT'): - assert(np.allclose(tensordot_2b(A.copy(), B.copy()), ref)) + assert (np.allclose(tensordot_2b(A.copy(), B.copy()), ref)) +# TODO: Enable after fixing cuTENSOR in CI +#@pytest.mark.gpu +@pytest.mark.skip def test_tensordot_22(): @dace.program(device=dace.dtypes.DeviceType.GPU) def tensordot_2a(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[7, 6, 5, 4, 3, 2, 1, 0]) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[7, 6, 5, 4, 3, 2, 1, 0]) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='cuTENSOR'): - assert(np.allclose(tensordot_2a(A.copy(), 
B.copy()), ref)) - + assert (np.allclose(tensordot_2a(A.copy(), B.copy()), ref)) + @dace.program(device=dace.dtypes.DeviceType.GPU) def tensordot_2b(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, 3, 3]): return np.tensordot(A, B, axes=([0, 3], [4, 2]), out_axes=[0, 7, 1, 6, 2, 5, 3, 4]) - + A = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) B = np.arange(3**6, dtype=np.float32).reshape(3, 3, 3, 3, 3, 3) ref = np.transpose(np.tensordot(A, B, axes=([0, 3], [4, 2])), axes=[0, 7, 1, 6, 2, 5, 3, 4]) with dace.config.set_temporary('library', 'linalg', 'default_implementation', value='cuTENSOR'): - assert(np.allclose(tensordot_2b(A.copy(), B.copy()), ref)) + assert (np.allclose(tensordot_2b(A.copy(), B.copy()), ref)) if __name__ == "__main__": @@ -213,10 +221,11 @@ def tensordot_2b(A: dace.float32[3, 3, 3, 3, 3, 3], B: dace.float32[3, 3, 3, 3, test_linalg_solve() test_linalg_cholesky() test_tensordot_0() - test_tensordot_1() test_tensordot_01() - test_tensordot_11() test_tensordot_02() + test_tensordot_1() + test_tensordot_11() test_tensordot_12() + test_tensordot_2() test_tensordot_21() test_tensordot_22() diff --git a/tests/numpy/transpose_test.py b/tests/numpy/transpose_test.py index df2c9ff2e5..c56a8a0cda 100644 --- a/tests/numpy/transpose_test.py +++ b/tests/numpy/transpose_test.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import numpy as np import dace import pytest @@ -26,10 +26,12 @@ def test_transpose_axes1(A: dace.float32[10, 5, 3, 2]): def test_transpose_axes2(A: dace.float32[10, 5, 3, 2]): return np.transpose(A, axes=[3, 0, 2]) + @compare_numpy_output() def test_transpose_none(A: dace.float32[10, 5, 3, 2]): return np.transpose(A) + @compare_numpy_output() def test_transpose_no(A: dace.float32[10, 5, 3, 2]): return np.transpose(A, axes=[0, 1, 2, 3]) @@ -46,7 +48,9 @@ def test_transpose(): assert rel_error <= 1e-5 -@pytest.mark.hptt +# TODO: Enable after fixing HPTT in CI +# @pytest.mark.hptt +@pytest.mark.skip def test_hptt(): with dace.config.set_temporary('library', 'ttranspose', 'default_implementation', value='HPTT'): test_transpose_axes0() From 50f867254d275fc4341b3c363327d68dc1ac27b3 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 11 Jul 2023 15:30:22 +0200 Subject: [PATCH 225/392] Increased priority of the "views" connector. --- dace/sdfg/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 36084472d2..8d251efd89 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -841,9 +841,15 @@ def get_view_edge(state: SDFGState, view: nd.AccessNode) -> gr.MultiConnectorEdg return out_edge if not src_is_data and not dst_is_data: return None + + # Check if there is a 'views' connector + if in_edge.dst_conn and in_edge.dst_conn == 'views': + return in_edge + if out_edge.src_conn and out_edge.src_conn == 'views': + return out_edge - # If both sides lead to access nodes, if one memlet's data points to the - # view it cannot point to the viewed node. + # TODO: This sounds arbitrary and is not well communicated to the frontends. Revisit in the future. + # If both sides lead to access nodes, if one memlet's data points to the view it cannot point to the viewed node. 
if in_edge.data.data == view.data and out_edge.data.data != view.data: return out_edge if in_edge.data.data != view.data and out_edge.data.data == view.data: @@ -851,12 +857,6 @@ def get_view_edge(state: SDFGState, view: nd.AccessNode) -> gr.MultiConnectorEdg if in_edge.data.data == view.data and out_edge.data.data == view.data: return None - # Check if there is a 'views' connector - if in_edge.dst_conn and in_edge.dst_conn == 'views': - return in_edge - if out_edge.src_conn and out_edge.src_conn == 'views': - return out_edge - # If both memlets' data are the respective access nodes, the access # node at the highest scope is the one that is viewed. if isinstance(in_edge.src, nd.EntryNode): From 478b87af7cdee877115d95a5c85fb9534da82ac8 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 11 Jul 2023 15:31:23 +0200 Subject: [PATCH 226/392] Reworked algorithm to find multiple sources and sinks. Attempt to rename only when there is a single source and sink node. --- .../subgraph/subgraph_fusion.py | 139 ++++++++++-------- 1 file changed, 79 insertions(+), 60 deletions(-) diff --git a/dace/transformation/subgraph/subgraph_fusion.py b/dace/transformation/subgraph/subgraph_fusion.py index 5cd693eb03..75487b0186 100644 --- a/dace/transformation/subgraph/subgraph_fusion.py +++ b/dace/transformation/subgraph/subgraph_fusion.py @@ -1144,7 +1144,10 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s # Try to remove intermediate nodes that are not contained in the subgraph # by reconnecting their adjacent edges to nodes outside the subgraph. + # NOTE: Currently limited to cases where there is a single source and sink + # if there are multiple intermediate accesses for the same data. + # Sort intermediate nodes by data name intermediate_data = dict() for acc in intermediate_nodes: if acc.data in intermediate_data: @@ -1152,109 +1155,125 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s else: intermediate_data[acc.data] = [acc] - filtered_intermediate_nodes = [] - intermediate_nodes_deps = dict() - for _, accesses in intermediate_data.items(): - if len(accesses) == 1: - filtered_intermediate_nodes.append(accesses[0]) - else: - accesses_copy = list(accesses) - access_dict = {a: [] for a in accesses} - for acc in accesses: - other_accesses = [a for a in accesses_copy if a != acc] - for other_acc in other_accesses: - if nx.has_path(graph.nx, other_acc, acc): - accesses_copy.remove(other_acc) - access_dict[acc].append(other_acc) - access_dict[acc].extend(access_dict[other_acc]) - del access_dict[other_acc] - for acc in accesses_copy: - filtered_intermediate_nodes.append(acc) - intermediate_nodes_deps[acc] = [a for a in access_dict[acc]] + filtered_intermediate_data = dict() + intermediate_sources = dict() + intermediate_sinks = dict() + for dname, accesses in intermediate_data.items(): + + sources = set(accesses) + sinks = set(accesses) + + # Find sinks + for acc0 in accesses: + for acc1 in set(sinks): + if acc0 is acc1: + continue + if nx.has_path(graph.nx, acc0, acc1): + sinks.remove(acc0) + break + if len(sinks) > 1: + continue + # Find sources + for acc0 in accesses: + for acc1 in set(sources): + if acc0 is acc1: + continue + if nx.has_path(graph.nx, acc1, acc0): + sources.remove(acc0) + break + if len(sources) > 1: + continue + + filtered_intermediate_data[dname] = accesses + intermediate_sources[dname] = sources + intermediate_sinks[dname] = sinks + + edges_to_remove = set() - for node in filtered_intermediate_nodes: + 
for dname, accesses in filtered_intermediate_data.items(): + # Checking if data are contained in the subgraph - if not subgraph_contains_data[node.data]: + if not subgraph_contains_data[dname]: # Find existing outer access nodes inode, onode = None, None for e in graph.in_edges(global_map_entry): - if isinstance(e.src, nodes.AccessNode) and node.data == e.src.data: + if isinstance(e.src, nodes.AccessNode) and dname == e.src.data: inode = e.src break for e in graph.out_edges(global_map_exit): - if isinstance(e.dst, nodes.AccessNode) and node.data == e.dst.data: + if isinstance(e.dst, nodes.AccessNode) and dname == e.dst.data: onode = e.dst break - to_remove = set() - # Compute the union of all incoming subsets. # TODO: Do we expect this operation to ever fail? in_subset: subsets.Subset = None - accesses = [node] + intermediate_nodes_deps[node] + first_subset: subsets.Subset = None for acc in accesses: for ie in graph.in_edges(acc): if in_subset: in_subset = subsets.union(in_subset, ie.data.dst_subset) else: in_subset = ie.data.dst_subset - # for ie in graph.in_edges(node): - # if in_subset: - # in_subset = subsets.union(in_subset, ie.data.dst_subset) - # else: - # in_subset = ie.data.dst_subset + first_subset = ie.data.dst_subset # Create transient data corresponding to the union of the incoming subsets. - desc = sdfg.arrays[node.data] - name, _ = sdfg.add_temp_transient(in_subset.bounding_box_size(), desc.dtype, desc.storage) + desc = sdfg.arrays[dname] + new_name, _ = sdfg.add_temp_transient(in_subset.bounding_box_size(), desc.dtype, desc.storage) - # Reconnect incoming edges through the transient data. for acc in accesses: - new_node = graph.add_access(name) + acc.data = new_name + # Reconnect incoming edges through the transient data. for ie in graph.in_edges(acc): - mem = Memlet(data=name, + mem = Memlet(data=new_name, subset=ie.data.dst_subset.offset_new(in_subset, True), other_subset=ie.data.src_subset) - new_edge = graph.add_edge(ie.src, ie.src_conn, new_node, None, mem) - to_remove.add(ie) + # new_edge = graph.add_edge(ie.src, ie.src_conn, new_node, None, mem) + ie.data = mem # Update memlet paths. - for e in graph.memlet_path(new_edge): - if e.data.data == node.data: - e.data.data = name + for e in graph.memlet_path(ie): + if e.data.data == dname: + e.data.data = new_name e.data.dst_subset.offset(in_subset, True) # Reconnect outgoing edges through the transient data. for oe in graph.out_edges(acc): if in_subset.covers(oe.data.src_subset): - mem = Memlet(data=name, + mem = Memlet(data=new_name, subset=oe.data.src_subset.offset_new(in_subset, True), other_subset=oe.data.dst_subset) - new_edge = graph.add_edge(new_node, None, oe.dst, oe.dst_conn, mem) + # new_edge = graph.add_edge(new_node, None, oe.dst, oe.dst_conn, mem) + oe.data = mem # Update memlet paths. - for e in graph.memlet_path(new_edge): - if e.data.data == node.data: - e.data.data = name + for e in graph.memlet_path(oe): + if e.data.data == dname: + e.data.data = new_name e.data.src_subset.offset(in_subset, True) else: + # NOTE: For debugging purposes + intersect = subsets.intersects(in_subset, oe.data.src_subset) + if intersect is None: + warnings.warn(f'{dname}[{in_subset}] may intersect with {dname}[{oe.data.src_subset}]') + elif intersect: + raise ValueError(f'{dname}[{in_subset}] intersects with {dname}[{oe.data.src_subset}]') # If the outgoing subset is not covered by the transient data, connect to the outer input node. 
if not inode: - inode = graph.add_access(node.data) + inode = graph.add_access(dname) graph.add_memlet_path(inode, global_map_entry, oe.dst, memlet=oe.data, dst_conn=oe.dst_conn) - to_remove.add(oe) + edges_to_remove.add(oe) + # Connect transient data to the outer output node. - if acc is node: + if acc in intermediate_sinks[dname]: if not onode: - onode = graph.add_access(node.data) - graph.add_memlet_path(new_node, - global_map_exit, - onode, - memlet=Memlet(data=node.data, subset=in_subset), - src_conn=None) - - for e in to_remove: - graph.remove_edge(e) - if to_remove: - graph.remove_nodes_from(accesses) + onode = graph.add_access(dname) + graph.add_memlet_path(acc, + global_map_exit, + onode, + memlet=Memlet(data=dname, subset=in_subset), + src_conn=None) + + for e in edges_to_remove: + graph.remove_edge(e) From eff7ba8fd42a5203245f4c9fe002ce9437165aaf Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 11 Jul 2023 15:31:44 +0200 Subject: [PATCH 227/392] Updated test. --- .../subgraph_fusion/intermediate_mimo_test.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/transformations/subgraph_fusion/intermediate_mimo_test.py b/tests/transformations/subgraph_fusion/intermediate_mimo_test.py index 850c4aca07..1996d3c4a5 100644 --- a/tests/transformations/subgraph_fusion/intermediate_mimo_test.py +++ b/tests/transformations/subgraph_fusion/intermediate_mimo_test.py @@ -123,6 +123,19 @@ def sdmi_accesses(ZSOLQA: dace.float64[1, 5, 5], ZEPSEC: dace.float64, ZQX: dace sdfg = sdmi_accesses.to_sdfg(simplify=True) assert len(sdfg.states()) == 1 + rng = np.random.default_rng(42) + ZSOLQA = rng.random((1, 5, 5)) + ZEPSEC = rng.random() + ZQX = rng.random((1, 137, 5)) + ref_LLINDEX3 = rng.random((1, 5, 5)) > 0.5 + ref_ZRATIO = rng.random((1, 5)) + ref_ZSINKSUM = rng.random((1, 5)) + val_LLINDEX3 = ref_LLINDEX3.copy() + val_ZRATIO = ref_ZRATIO.copy() + val_ZSINKSUM = ref_ZSINKSUM.copy() + + sdfg(ZSOLQA=ZSOLQA, ZEPSEC=ZEPSEC, ZQX=ZQX, LLINDEX3=ref_LLINDEX3, ZRATIO=ref_ZRATIO, ZSINKSUM=ref_ZSINKSUM) + graph = sdfg.states()[0] subgraph = SubgraphView(graph, [node for node in graph.nodes()]) @@ -136,9 +149,13 @@ def sdmi_accesses(ZSOLQA: dace.float64[1, 5, 5], ZEPSEC: dace.float64, ZQX: dace assert sf.can_be_applied(sdfg, subgraph) == True sf.apply(sdfg) - sdfg.view() + sdfg(ZSOLQA=ZSOLQA, ZEPSEC=ZEPSEC, ZQX=ZQX, LLINDEX3=val_LLINDEX3, ZRATIO=val_ZRATIO, ZSINKSUM=val_ZSINKSUM) + + assert np.allclose(ref_LLINDEX3, val_LLINDEX3) + assert np.allclose(ref_ZRATIO, val_ZRATIO) + assert np.allclose(ref_ZSINKSUM, val_ZSINKSUM) if __name__ == '__main__': - # test_mimo() + test_mimo() test_single_data_multiple_intermediate_accesses() From f81238c7349411082216f23b58384b8d2e856a58 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 11 Jul 2023 15:32:33 +0200 Subject: [PATCH 228/392] Clean up --- .../subgraph/subgraph_fusion.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/dace/transformation/subgraph/subgraph_fusion.py b/dace/transformation/subgraph/subgraph_fusion.py index 75487b0186..a56336fa8d 100644 --- a/dace/transformation/subgraph/subgraph_fusion.py +++ b/dace/transformation/subgraph/subgraph_fusion.py @@ -1154,7 +1154,7 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s intermediate_data[acc.data].append(acc) else: intermediate_data[acc.data] = [acc] - + filtered_intermediate_data = dict() intermediate_sources = dict() intermediate_sinks = dict() @@ -1189,7 
+1189,7 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s intermediate_sinks[dname] = sinks edges_to_remove = set() - + for dname, accesses in filtered_intermediate_data.items(): # Checking if data are contained in the subgraph @@ -1214,7 +1214,7 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s if in_subset: in_subset = subsets.union(in_subset, ie.data.dst_subset) else: - in_subset = ie.data.dst_subset + in_subset = ie.data.dst_subset first_subset = ie.data.dst_subset # Create transient data corresponding to the union of the incoming subsets. @@ -1225,11 +1225,11 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s acc.data = new_name - # Reconnect incoming edges through the transient data. + # Reconnect incoming edges through the transient data. for ie in graph.in_edges(acc): mem = Memlet(data=new_name, - subset=ie.data.dst_subset.offset_new(in_subset, True), - other_subset=ie.data.src_subset) + subset=ie.data.dst_subset.offset_new(in_subset, True), + other_subset=ie.data.src_subset) # new_edge = graph.add_edge(ie.src, ie.src_conn, new_node, None, mem) ie.data = mem # Update memlet paths. @@ -1242,8 +1242,8 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s for oe in graph.out_edges(acc): if in_subset.covers(oe.data.src_subset): mem = Memlet(data=new_name, - subset=oe.data.src_subset.offset_new(in_subset, True), - other_subset=oe.data.dst_subset) + subset=oe.data.src_subset.offset_new(in_subset, True), + other_subset=oe.data.dst_subset) # new_edge = graph.add_edge(new_node, None, oe.dst, oe.dst_conn, mem) oe.data = mem # Update memlet paths. @@ -1264,16 +1264,15 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s graph.add_memlet_path(inode, global_map_entry, oe.dst, memlet=oe.data, dst_conn=oe.dst_conn) edges_to_remove.add(oe) - # Connect transient data to the outer output node. if acc in intermediate_sinks[dname]: if not onode: onode = graph.add_access(dname) graph.add_memlet_path(acc, - global_map_exit, - onode, - memlet=Memlet(data=dname, subset=in_subset), - src_conn=None) + global_map_exit, + onode, + memlet=Memlet(data=dname, subset=in_subset), + src_conn=None) for e in edges_to_remove: graph.remove_edge(e) From ae1270e6a8f207089be3f361cbddc9f1a3d4e006 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 11 Jul 2023 17:44:41 +0200 Subject: [PATCH 229/392] Added support for interstate edges to RefineNestedAccess. 
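In practice, read memlets that appear only on the interstate edges of the nested SDFG (e.g. a branch condition such as `select[i, j]`) are now also collected as refinement candidates, so the corresponding outer memlets can be narrowed to the elements actually read. A condensed sketch of the pattern this enables, adapted from the test added in the following commit (array and symbol names follow that test):

    import dace
    from dace.transformation.interstate import RefineNestedAccess

    i, j = dace.symbol('i'), dace.symbol('j')

    @dace.program
    def inner(A: dace.int32[5, 5], B: dace.int32[5, 5], select: dace.bool[5, 5]):
        if select[i, j]:  # read happens on an interstate edge (branch condition)
            B[i, j] = A[j, i]
        else:
            B[i, j] = A[i, j]

    # With `inner` nested inside an i/j map, applying
    # sdfg.apply_transformations_repeated(RefineNestedAccess) now also narrows
    # the outer memlet of `select` to the single element read per iteration.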
--- dace/transformation/interstate/sdfg_nesting.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index b33ad43a3b..a25d819a2d 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -956,7 +956,21 @@ def _candidates( continue in_candidates[e.data.data] = (e.data, nstate, set(range(len(e.data.subset)))) - # TODO: Check in_candidates in interstate edges as well + # Check interstate edges for candidates + for e in nsdfg.sdfg.edges(): + for m in e.data.get_read_memlets(nsdfg.sdfg.arrays): + # If more than one unique element detected, remove from candidates + if m.data in in_candidates: + memlet, ns, indices = in_candidates[m.data] + # Try to find dimensions in which there is a mismatch and remove them from list + for i, (s1, s2) in enumerate(zip(m.subset, memlet.subset)): + if s1 != s2 and i in indices: + indices.remove(i) + if len(indices) == 0: + ignore.add(m.data) + in_candidates[m.data] = (memlet, ns, indices) + continue + in_candidates[m.data] = (m, None, set(range(len(m.subset)))) # Check in/out candidates for cand in in_candidates.keys() & out_candidates.keys(): @@ -986,7 +1000,7 @@ def _check_cand(candidates, outer_edges): continue # Check w.r.t. loops - if len(nstate.ranges) > 0: + if nstate is not None and len(nstate.ranges) > 0: # Re-annotate loop ranges, in case someone changed them # TODO: Move out of here! for ns in nsdfg.sdfg.states(): From eb3dd0558d33d7855ee8fc8bf49d8daa937f2bc9 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 11 Jul 2023 17:45:15 +0200 Subject: [PATCH 230/392] Added tests. --- .../refine_nested_access_test.py | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 tests/transformations/refine_nested_access_test.py diff --git a/tests/transformations/refine_nested_access_test.py b/tests/transformations/refine_nested_access_test.py new file mode 100644 index 0000000000..30d2a3e77e --- /dev/null +++ b/tests/transformations/refine_nested_access_test.py @@ -0,0 +1,101 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Tests for the RefineNestedAccess transformation. 
""" + +import dace +import numpy as np + +from dace.transformation.interstate import RefineNestedAccess + + +def test_refine_dataflow(): + + i = dace.symbol('i') + j = dace.symbol('j') + + @dace.program + def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5]): + B[i, j] = A[j, i] + + sdfg = dace.SDFG('refine_dataflow') + sdfg.add_array('A', [5, 5], dace.int32) + sdfg.add_array('B', [5, 5], dace.int32) + + state = sdfg.add_state() + A = state.add_access('A') + B = state.add_access('B') + me, mx = state.add_map('m', dict(i='0:5', j='0:5')) + nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(), sdfg, {'A'}, {'B'}, {'i': 'i', 'j': 'j'}) + state.add_memlet_path(A, me, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) + state.add_memlet_path(nsdfg, mx, B, src_conn='B', memlet=dace.Memlet.from_array('B', sdfg.arrays['B'])) + + num = sdfg.apply_transformations_repeated(RefineNestedAccess) + assert num == 1 + + for edge in state.out_edges(me): + assert edge.data.subset == dace.subsets.Range([(j, j, 1), (i, i, 1)]) + for edge in state.in_edges(mx): + assert edge.data.subset == dace.subsets.Range([(i, i, 1), (j, j, 1)]) + + A = np.arange(25, dtype=np.int32).reshape(5, 5).copy() + B = np.empty((5, 5), dtype=np.int32) + sdfg(A=A, B=B) + assert np.allclose(B, A.T) + + +def test_refine_interstate(): + + i = dace.symbol('i') + j = dace.symbol('j') + + @dace.program + def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5], select: dace.bool[5, 5]): + if select[i, j]: + B[i, j] = A[j, i] + else: + B[i, j] = A[i, j] + + sdfg = dace.SDFG('refine_dataflow') + sdfg.add_array('A', [5, 5], dace.int32) + sdfg.add_array('B', [5, 5], dace.int32) + sdfg.add_array('select', [5, 5], dace.bool) + + state = sdfg.add_state() + A = state.add_access('A') + B = state.add_access('B') + select = state.add_access('select') + me, mx = state.add_map('m', dict(i='0:5', j='0:5')) + nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(), sdfg, {'A', 'select'}, {'B'}, {'i': 'i', 'j': 'j'}) + state.add_memlet_path(A, me, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) + state.add_memlet_path(select, + me, + nsdfg, + dst_conn='select', + memlet=dace.Memlet.from_array('select', sdfg.arrays['select'])) + state.add_memlet_path(nsdfg, mx, B, src_conn='B', memlet=dace.Memlet.from_array('B', sdfg.arrays['B'])) + + num = sdfg.apply_transformations_repeated(RefineNestedAccess) + assert num == 1 + + for edge in state.out_edges(me): + if edge.data.data == 'A': + expr = dace.symbolic.pystr_to_symbolic('Max(i, j)') + assert edge.data.subset == dace.subsets.Range([(0, expr, 1), (0, expr, 1)]) + else: + assert edge.data.subset == dace.subsets.Range([(i, i, 1), (j, j, 1)]) + for edge in state.in_edges(mx): + assert edge.data.subset == dace.subsets.Range([(i, i, 1), (j, j, 1)]) + + A = np.arange(25, dtype=np.int32).reshape(5, 5).copy() + B = np.empty((5, 5), dtype=np.int32) + select = np.empty((5, 5), dtype=np.bool_) + select[:] = True + upper = np.triu(select, k=0) + sdfg(A=A, B=B, select=upper) + lower = np.tril(A, k=0) + diag = np.diag(np.diag(A)) + assert np.allclose(B, lower.T + lower - diag) + + +if __name__ == '__main__': + test_refine_dataflow() + test_refine_interstate() From 641e5e0c85c4278c8e7c7933216f747413e7a3a6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 11 Jul 2023 17:48:25 +0200 Subject: [PATCH 231/392] Rollback push to master. 
--- .../transformation/interstate/sdfg_nesting.py | 18 +--- .../refine_nested_access_test.py | 101 ------------------ 2 files changed, 1 insertion(+), 118 deletions(-) delete mode 100644 tests/transformations/refine_nested_access_test.py diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index a25d819a2d..1d4a7eb2e5 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -956,22 +956,6 @@ def _candidates( continue in_candidates[e.data.data] = (e.data, nstate, set(range(len(e.data.subset)))) - # Check interstate edges for candidates - for e in nsdfg.sdfg.edges(): - for m in e.data.get_read_memlets(nsdfg.sdfg.arrays): - # If more than one unique element detected, remove from candidates - if m.data in in_candidates: - memlet, ns, indices = in_candidates[m.data] - # Try to find dimensions in which there is a mismatch and remove them from list - for i, (s1, s2) in enumerate(zip(m.subset, memlet.subset)): - if s1 != s2 and i in indices: - indices.remove(i) - if len(indices) == 0: - ignore.add(m.data) - in_candidates[m.data] = (memlet, ns, indices) - continue - in_candidates[m.data] = (m, None, set(range(len(m.subset)))) - # Check in/out candidates for cand in in_candidates.keys() & out_candidates.keys(): s1, nstate1, ind1 = in_candidates[cand] @@ -1000,7 +984,7 @@ def _check_cand(candidates, outer_edges): continue # Check w.r.t. loops - if nstate is not None and len(nstate.ranges) > 0: + if len(nstate.ranges) > 0: # Re-annotate loop ranges, in case someone changed them # TODO: Move out of here! for ns in nsdfg.sdfg.states(): diff --git a/tests/transformations/refine_nested_access_test.py b/tests/transformations/refine_nested_access_test.py deleted file mode 100644 index 30d2a3e77e..0000000000 --- a/tests/transformations/refine_nested_access_test.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -""" Tests for the RefineNestedAccess transformation. 
""" - -import dace -import numpy as np - -from dace.transformation.interstate import RefineNestedAccess - - -def test_refine_dataflow(): - - i = dace.symbol('i') - j = dace.symbol('j') - - @dace.program - def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5]): - B[i, j] = A[j, i] - - sdfg = dace.SDFG('refine_dataflow') - sdfg.add_array('A', [5, 5], dace.int32) - sdfg.add_array('B', [5, 5], dace.int32) - - state = sdfg.add_state() - A = state.add_access('A') - B = state.add_access('B') - me, mx = state.add_map('m', dict(i='0:5', j='0:5')) - nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(), sdfg, {'A'}, {'B'}, {'i': 'i', 'j': 'j'}) - state.add_memlet_path(A, me, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) - state.add_memlet_path(nsdfg, mx, B, src_conn='B', memlet=dace.Memlet.from_array('B', sdfg.arrays['B'])) - - num = sdfg.apply_transformations_repeated(RefineNestedAccess) - assert num == 1 - - for edge in state.out_edges(me): - assert edge.data.subset == dace.subsets.Range([(j, j, 1), (i, i, 1)]) - for edge in state.in_edges(mx): - assert edge.data.subset == dace.subsets.Range([(i, i, 1), (j, j, 1)]) - - A = np.arange(25, dtype=np.int32).reshape(5, 5).copy() - B = np.empty((5, 5), dtype=np.int32) - sdfg(A=A, B=B) - assert np.allclose(B, A.T) - - -def test_refine_interstate(): - - i = dace.symbol('i') - j = dace.symbol('j') - - @dace.program - def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5], select: dace.bool[5, 5]): - if select[i, j]: - B[i, j] = A[j, i] - else: - B[i, j] = A[i, j] - - sdfg = dace.SDFG('refine_dataflow') - sdfg.add_array('A', [5, 5], dace.int32) - sdfg.add_array('B', [5, 5], dace.int32) - sdfg.add_array('select', [5, 5], dace.bool) - - state = sdfg.add_state() - A = state.add_access('A') - B = state.add_access('B') - select = state.add_access('select') - me, mx = state.add_map('m', dict(i='0:5', j='0:5')) - nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(), sdfg, {'A', 'select'}, {'B'}, {'i': 'i', 'j': 'j'}) - state.add_memlet_path(A, me, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) - state.add_memlet_path(select, - me, - nsdfg, - dst_conn='select', - memlet=dace.Memlet.from_array('select', sdfg.arrays['select'])) - state.add_memlet_path(nsdfg, mx, B, src_conn='B', memlet=dace.Memlet.from_array('B', sdfg.arrays['B'])) - - num = sdfg.apply_transformations_repeated(RefineNestedAccess) - assert num == 1 - - for edge in state.out_edges(me): - if edge.data.data == 'A': - expr = dace.symbolic.pystr_to_symbolic('Max(i, j)') - assert edge.data.subset == dace.subsets.Range([(0, expr, 1), (0, expr, 1)]) - else: - assert edge.data.subset == dace.subsets.Range([(i, i, 1), (j, j, 1)]) - for edge in state.in_edges(mx): - assert edge.data.subset == dace.subsets.Range([(i, i, 1), (j, j, 1)]) - - A = np.arange(25, dtype=np.int32).reshape(5, 5).copy() - B = np.empty((5, 5), dtype=np.int32) - select = np.empty((5, 5), dtype=np.bool_) - select[:] = True - upper = np.triu(select, k=0) - sdfg(A=A, B=B, select=upper) - lower = np.tril(A, k=0) - diag = np.diag(np.diag(A)) - assert np.allclose(B, lower.T + lower - diag) - - -if __name__ == '__main__': - test_refine_dataflow() - test_refine_interstate() From 27d6f767cbd1b9a9a9aa0b9770d7306f6f658f4c Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 11 Jul 2023 17:51:58 +0200 Subject: [PATCH 232/392] Updated comments. 
--- dace/transformation/interstate/sdfg_nesting.py | 2 +- tests/transformations/refine_nested_access_test.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index a25d819a2d..b7989c558f 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -956,7 +956,7 @@ def _candidates( continue in_candidates[e.data.data] = (e.data, nstate, set(range(len(e.data.subset)))) - # Check interstate edges for candidates + # Check read memlets in interstate edges for candidates for e in nsdfg.sdfg.edges(): for m in e.data.get_read_memlets(nsdfg.sdfg.arrays): # If more than one unique element detected, remove from candidates diff --git a/tests/transformations/refine_nested_access_test.py b/tests/transformations/refine_nested_access_test.py index 30d2a3e77e..a08c8f60c7 100644 --- a/tests/transformations/refine_nested_access_test.py +++ b/tests/transformations/refine_nested_access_test.py @@ -1,6 +1,5 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ Tests for the RefineNestedAccess transformation. """ - import dace import numpy as np From 243001b7c5f918682f3160d71be21fc19b5e0322 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 11 Jul 2023 09:44:21 -0700 Subject: [PATCH 233/392] Fix Einsum replacement for transposed outputs --- dace/frontend/common/einsum.py | 10 +++++++++- tests/numpy/einsum_test.py | 26 +++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/dace/frontend/common/einsum.py b/dace/frontend/common/einsum.py index b9c3fff6b4..f678cdea58 100644 --- a/dace/frontend/common/einsum.py +++ b/dace/frontend/common/einsum.py @@ -199,7 +199,7 @@ def _create_einsum_internal(sdfg: SDFG, if init_output is None: init_output = (beta != 1.0) - + if alpha is None: alpha = 1.0 if beta is None: @@ -373,6 +373,14 @@ def _create_einsum_internal(sdfg: SDFG, strides['sCN'] = 1 strides['sCB'] = strides['sCM'] = strides['N'] + # Transposed output, swap order + if strides['sCM'] == 1: + strides['sCM'], strides['sCN'] = strides['sCN'], strides['sCM'] + strides['M'], strides['N'] = strides['N'], strides['M'] + (strides['sAM'], strides['sAK'], strides['sAB'], strides['sBK'], strides['sBN'], strides['sBB']) = \ + (strides['sBN'], strides['sBK'], strides['sBB'], strides['sAK'], strides['sAM'], strides['sAB']) + a, b = b, a + # Create nested SDFG for GEMM nsdfg = create_batch_gemm_sdfg(dtype, strides, alpha, beta) diff --git a/tests/numpy/einsum_test.py b/tests/numpy/einsum_test.py index d3267653d9..2128d26565 100644 --- a/tests/numpy/einsum_test.py +++ b/tests/numpy/einsum_test.py @@ -8,6 +8,7 @@ def test_general_einsum(): + @dace.program def einsumtest(A: dace.float64[M, N], B: dace.float64[N, M], C: dace.float64[M]): return np.einsum('ij,ji,i->', A, B, C) @@ -20,6 +21,7 @@ def einsumtest(A: dace.float64[M, N], B: dace.float64[N, M], C: dace.float64[M]) def test_matmul(): + @dace.program def einsumtest(A: dace.float64[M, N], B: dace.float64[N, M]): return np.einsum('ik,kj', A, B) @@ -30,6 +32,7 @@ def einsumtest(A: dace.float64[M, N], B: dace.float64[N, M]): def test_batch_matmul(): + @dace.program def einsumtest(A: dace.float64[4, M, N], B: dace.float64[4, N, M]): return np.einsum('bik,bkj->bij', A, B) @@ -40,6 +43,7 @@ def einsumtest(A: dace.float64[4, M, N], B: dace.float64[4, N, M]): def test_opteinsum_sym(): + @dace.program def einsumtest(A: dace.float64[N, N, N, N], B: 
dace.float64[N, N, N, N], C: dace.float64[N, N, N, N], D: dace.float64[N, N, N, N], E: dace.float64[N, N, N, N]): @@ -175,6 +179,7 @@ def tester(A, B): sdfg(A, B) assert np.allclose(B, np.einsum('ijk->', A)) + def test_lift_einsum_reduce_partial(): from dace.libraries.standard.nodes.reduce import Reduce from dace.libraries.blas.nodes.einsum import Einsum @@ -197,7 +202,7 @@ def tester(A, B): # Specialize to ensure Reduce node is there sdfg.expand_library_nodes(recursive=False) rnode = next(node for node, _ in sdfg.all_nodes_recursive() if isinstance(node, Reduce)) - assert tuple(rnode.axes) == (1,) + assert tuple(rnode.axes) == (1, ) sdfg(A, B) assert np.allclose(B, np.einsum('ijk->ik', A)) @@ -297,6 +302,24 @@ def tester(A, B): assert np.allclose(sdfg(A, B), C) +def test_c_transposed(): + N, F_in, F_out = 2, 3, 3 + + @dace.program + def fn(a, b, c): + c[:] = np.einsum('nm,nf->fm', a, b) + + a = np.random.rand(N, F_in) + b = np.random.rand(N, F_out) + c_expected = np.zeros((F_out, F_in)) + c = np.zeros((F_out, F_in)) + + fn.f(a, b, c_expected) + fn(a, b, c) + + assert np.allclose(c, c_expected) + + if __name__ == '__main__': test_general_einsum() test_matmul() @@ -312,3 +335,4 @@ def tester(A, B): test_lift_einsum_beta() test_lift_einsum_alpha_beta(False) test_lift_einsum_alpha_beta(True) + test_c_transposed() From 8d1647ee2eb7117a3baf9cb3d778afb24253181f Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 11 Jul 2023 20:33:22 +0200 Subject: [PATCH 234/392] Don't simplify inner SDFG. --- tests/transformations/refine_nested_access_test.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/transformations/refine_nested_access_test.py b/tests/transformations/refine_nested_access_test.py index a08c8f60c7..725a438ae5 100644 --- a/tests/transformations/refine_nested_access_test.py +++ b/tests/transformations/refine_nested_access_test.py @@ -23,7 +23,7 @@ def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5]): A = state.add_access('A') B = state.add_access('B') me, mx = state.add_map('m', dict(i='0:5', j='0:5')) - nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(), sdfg, {'A'}, {'B'}, {'i': 'i', 'j': 'j'}) + nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(simplify=False), sdfg, {'A'}, {'B'}, {'i': 'i', 'j': 'j'}) state.add_memlet_path(A, me, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) state.add_memlet_path(nsdfg, mx, B, src_conn='B', memlet=dace.Memlet.from_array('B', sdfg.arrays['B'])) @@ -63,7 +63,10 @@ def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5], select: dace.bool[5, 5] B = state.add_access('B') select = state.add_access('select') me, mx = state.add_map('m', dict(i='0:5', j='0:5')) - nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(), sdfg, {'A', 'select'}, {'B'}, {'i': 'i', 'j': 'j'}) + nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(simplify=False), sdfg, {'A', 'select'}, {'B'}, { + 'i': 'i', + 'j': 'j' + }) state.add_memlet_path(A, me, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) state.add_memlet_path(select, me, From e67aa8eb56d055967ce8f4dd74dfffc893db304b Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:17:36 +0200 Subject: [PATCH 235/392] The COMM_WORLD communicator object does not have its name changes to mpi4py.MPI.COMM_WORLD any longer. All (mpi4py) communicators are now allowed to pass as-is through preprocessing. 
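The practical consequence is that communicator objects other than COMM_WORLD that live in a program's closure no longer trigger a syntax error during preprocessing; they reach the ProgramVisitor unchanged and are handled by the method replacements added in the following commits. A minimal sketch of the newly accepted pattern, mirroring the test added later in this series (requires mpi4py; only meaningful when run with multiple ranks):

    import dace
    import numpy as np
    from mpi4py import MPI

    commworld = MPI.COMM_WORLD
    new_comm = commworld.Split(commworld.Get_rank() % 2, 0)

    @dace.program
    def external_comm_bcast(A: dace.int32[10]):
        # `new_comm` is a plain mpi4py communicator captured from the closure;
        # preprocessing now passes it through instead of rejecting it.
        new_comm.Bcast(A)

    A = np.arange(10, dtype=np.int32)
    # In the accompanying tests, the compiled SDFG is then invoked with the
    # communicator's Fortran handle, e.g. func(A=A, new_comm=new_comm.py2f()).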
--- dace/frontend/python/preprocessing.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 03f07d0050..ea312a18c0 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -1525,17 +1525,10 @@ def visit_Name(self, node: ast.Name) -> Union[ast.Name, ast.Attribute]: obj = self.globals[node.id] if isinstance(obj, self.MPI.Comm): lattr = ast.Attribute(ast.Name(id='mpi4py', ctx=ast.Load), attr='MPI') - if obj is self.MPI.COMM_WORLD: - newnode = ast.copy_location(ast.Attribute(value=lattr, attr='COMM_WORLD'), node) - newnode.parent = node.parent - return newnode - elif obj is self.MPI.COMM_NULL: + if obj is self.MPI.COMM_NULL: newnode = ast.copy_location(ast.Attribute(value=lattr, attr='COMM_NULL'), node) newnode.parent = node.parent return newnode - else: - raise DaceSyntaxError('Only the COMM_WORLD and COMM_NULL mpi4py.MPI communicators can be used ' - 'directly inside a DaCe Python program.') return node def visit_Attribute(self, node: ast.Attribute) -> ast.Attribute: From 2741579000d87a2429ae4c7b805398fb5590c411 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:21:22 +0200 Subject: [PATCH 236/392] All (mpi4py) communicators in the global context are now registered in the ProgramVisitor's defined variables. When calling a method on an object, if the object is not in the ProgramVisitor's current/outer scope variables, pass to the method a tuple with the object's name and the object itself. --- dace/frontend/python/newast.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 5147dc01fe..ce2a9e06e1 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1312,11 +1312,11 @@ def defined(self): # MPI-related stuff result.update({k: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids}) - # try: - # from mpi4py import MPI - # result.update({k: v for k, v in self.globals.items() if v is MPI.COMM_WORLD}) - # except: - # pass + try: + from mpi4py import MPI + result.update({k: v for k, v in self.globals.items() if isinstance(v, MPI.Comm)}) + except: + pass return result @@ -4369,8 +4369,11 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): # Add object as first argument if modname in self.variables.keys(): arg = self.variables[modname] - else: + elif modname in self.scope_vars.keys(): arg = self.scope_vars[modname] + else: + # Fallback to (name, object) + arg = (modname, self.defined[modname]) args.append(arg) # Otherwise, try to find a default implementation for the SDFG elif not found_ufunc: From 6fe26c46a00f438d0ce7c8430d313c4e8b9280c6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:23:14 +0200 Subject: [PATCH 237/392] The Bcast LibraryNode can now accept as a string the name of a variable that holds the Fortran int handle of a communicator. 
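The intended use, as wired up by the Intracomm.Bcast replacement in the next commit, is that the frontend registers an int32 scalar under the communicator's name and passes that name as `fcomm`; at call time the scalar receives the handle produced by `comm.py2f()`, and the expansion converts it back with `MPI_Comm_f2c` before calling `MPI_Bcast`. A rough sketch of the new node-level interface (not a complete, validated SDFG; the data names are illustrative):

    import dace
    from dace.libraries.mpi.nodes.bcast import Bcast

    sdfg = dace.SDFG('fcomm_bcast')
    sdfg.add_array('A', [10], dace.int32)
    # Scalar that will hold the communicator's Fortran integer handle (comm.py2f()).
    sdfg.add_scalar('my_comm', dace.int32)
    state = sdfg.add_state()

    # Passing the scalar's name as `fcomm` makes the MPI expansion emit
    # `MPI_Comm __comm = MPI_Comm_f2c(my_comm);` and broadcast over that communicator.
    bcast_node = Bcast('_Bcast_', fcomm='my_comm')
    state.add_node(bcast_node)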
--- dace/libraries/mpi/nodes/bcast.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dace/libraries/mpi/nodes/bcast.py b/dace/libraries/mpi/nodes/bcast.py index c39ef91980..bf3729ff38 100644 --- a/dace/libraries/mpi/nodes/bcast.py +++ b/dace/libraries/mpi/nodes/bcast.py @@ -42,11 +42,16 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs): if isinstance(buffer, dace.data.Scalar): ref = "&" + init = "" comm = "MPI_COMM_WORLD" if node.grid: comm = f"__state->{node.grid}_comm" + elif node.fcomm: + init = f"MPI_Comm __comm = MPI_Comm_f2c({node.fcomm});" + comm = "__comm" code = f""" + {init} MPI_Bcast({ref}_inbuffer, {count_str}, {mpi_dtype_str}, _root, {comm}); _outbuffer = _inbuffer;""" tasklet = dace.sdfg.nodes.Tasklet(node.name, @@ -67,10 +72,12 @@ class Bcast(MPINode): default_implementation = "MPI" grid = dace.properties.Property(dtype=str, allow_none=True, default=None) + fcomm = dace.properties.Property(dtype=str, allow_none=True, default=None) - def __init__(self, name, grid=None, *args, **kwargs): + def __init__(self, name, grid=None, fcomm=None, *args, **kwargs): super().__init__(name, *args, inputs={"_inbuffer", "_root"}, outputs={"_outbuffer"}, **kwargs) self.grid = grid + self.fcomm = fcomm def validate(self, sdfg, state): """ From af624c3770bcec7d6efbaf64863f7ee0d9db4129 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:25:32 +0200 Subject: [PATCH 238/392] Replacements for COMM_WORLD were removed. Instead, the Intracomm's class method replacements should now trigger. Added (experimental) support for calling Bcast from a Cart/Intracomm object defined in CPython. --- dace/frontend/common/distr.py | 84 +++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index af08623083..72fe176ac0 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1,6 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
from numbers import Integral, Number -from typing import Sequence, Union +from typing import Sequence, Tuple, Union import dace from dace import dtypes, symbolic @@ -58,7 +58,8 @@ def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: """ from mpi4py import MPI - if icomm != MPI.COMM_WORLD: + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _cart_create(pv, sdfg, state, dims) @@ -155,18 +156,19 @@ def _comm_neq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Co ##### MPI Collectives -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Bcast') +# @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Bcast') @oprepo.replaces('dace.comm.Bcast') def _bcast(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, root: Union[str, sp.Expr, Number] = 0, - grid: str = None): + grid: str = None, + fcomm: str = None): from dace.libraries.mpi.nodes.bcast import Bcast - libnode = Bcast('_Bcast_', grid) + libnode = Bcast('_Bcast_', grid, fcomm) desc = sdfg.arrays[buffer] in_buffer = state.add_read(buffer) out_buffer = state.add_write(buffer) @@ -185,19 +187,23 @@ def _bcast(pv: ProgramVisitor, return None +@oprepo.replaces_method('Cartcomm', 'Bcast') @oprepo.replaces_method('Intracomm', 'Bcast') def _intracomm_bcast(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, - icomm: 'Intracomm', + comm: Tuple[str, 'Comm'], buffer: str, root: Union[str, sp.Expr, Number] = 0): """ Equivalent to `dace.comm.Bcast(buffer, root)`. """ from mpi4py import MPI - if icomm != MPI.COMM_WORLD: - raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') - return _bcast(pv, sdfg, state, buffer, root) + comm_name, comm_obj = comm + if comm_obj == MPI.COMM_WORLD: + return _bcast(pv, sdfg, state, buffer, root) + # NOTE: Highly experimental + sdfg.add_scalar(comm_name, dace.int32) + return _bcast(pv, sdfg, state, buffer, root, fcomm=comm_name) @oprepo.replaces_method('ProcessGrid', 'Bcast') @@ -248,7 +254,6 @@ def _Reduce(pv: ProgramVisitor, return None -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Alltoall') @oprepo.replaces('dace.comm.Alltoall') def _alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, inbuffer: str, outbuffer: str, grid: str = None): @@ -271,7 +276,8 @@ def _intracomm_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icom """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer)`. """ from mpi4py import MPI - if icomm != MPI.COMM_WORLD: + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _alltoall(pv, sdfg, state, inp_buffer, out_buffer) @@ -284,7 +290,6 @@ def _pgrid_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: s return _alltoall(pv, sdfg, state, inp_buffer, out_buffer, grid=pgrid) -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') def _allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): @@ -306,7 +311,8 @@ def _intracomm_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, ico """ Equivalent to `dace.comm.Allreduce(out_buffer, op)`. 
""" from mpi4py import MPI - if icomm != MPI.COMM_WORLD: + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') if inp_buffer != MPI.IN_PLACE: raise ValueError('DaCe currently supports in-place Allreduce only.') @@ -391,7 +397,6 @@ def _gather(pv: ProgramVisitor, ##### Point-To-Point Communication -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Send') @oprepo.replaces('dace.comm.Send') def _send(pv: ProgramVisitor, sdfg: SDFG, @@ -464,7 +469,27 @@ def _send(pv: ProgramVisitor, return None -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Isend') +@oprepo.replaces_method('Intracomm', 'Send') +def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + """ Equivalent to `dace.comm.end(buffer, dst, tag)`. """ + + from mpi4py import MPI + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + return _send(pv, sdfg, state, buffer, dst, tag) + + +@oprepo.replaces_method('ProcessGrid', 'Send') +def _pgrid_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, buffer: str, + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + """ Equivalent to `dace.comm.Send(buffer, dst, tag, grid=pgrid)`. """ + + raise NotImplementedError('ProcessGrid.Send is not supported yet.') + # return _send(pv, sdfg, state, buffer, dst, tag, grid=pgrid) + + @oprepo.replaces('dace.comm.Isend') def _isend(pv: ProgramVisitor, sdfg: SDFG, @@ -571,7 +596,8 @@ def _intracomm_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req)`. """ from mpi4py import MPI - if icomm != MPI.COMM_WORLD: + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') req, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) _isend(pv, sdfg, state, buffer, dst, tag, req) @@ -589,7 +615,6 @@ def _pgrid_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, return req -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Recv') @oprepo.replaces('dace.comm.Recv') def _recv(pv: ProgramVisitor, sdfg: SDFG, @@ -662,7 +687,27 @@ def _recv(pv: ProgramVisitor, return None -@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Irecv') +@oprepo.replaces_method('Intracomm', 'Recv') +def _intracomm_Recv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, + src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + """ Equivalent to `dace.comm.Recv(buffer, src, tagq)`. """ + + from mpi4py import MPI + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: + raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') + return _recv(pv, sdfg, state, buffer, src, tag) + + +@oprepo.replaces_method('ProcessGrid', 'Recv') +def _pgrid_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, buffer: str, + src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + """ Equivalent to `dace.comm.Recv(buffer, dst, tag, grid=pgrid)`. 
""" + + raise NotImplementedError('ProcessGrid.Recv is not supported yet.') + # return _recv(pv, sdfg, state, buffer, src, tag, req, grid=pgrid) + + @oprepo.replaces('dace.comm.Irecv') def _irecv(pv: ProgramVisitor, sdfg: SDFG, @@ -767,7 +812,8 @@ def _intracomm_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: """ Equivalent to `dace.comm.Irecv(buffer, src, tag, req)`. """ from mpi4py import MPI - if icomm != MPI.COMM_WORLD: + icomm_name, icomm_obj = icomm + if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') req, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) _irecv(pv, sdfg, state, buffer, src, tag, req) From fe22182ceccb4ae286802480fdba8f5b6d598bb6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:26:19 +0200 Subject: [PATCH 239/392] Added two new Bcast tests for COMM_WORLD and Intracomm object. --- tests/library/mpi/mpi4py_test.py | 72 ++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index a81294c47f..e99768be5c 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -6,6 +6,76 @@ import pytest +@pytest.mark.mpi +def test_comm_world_bcast(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def comm_world_bcast(A: dace.int32[10]): + commworld.Bcast(A) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = comm_world_bcast.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + if rank == 0: + A = np.arange(10, dtype=np.int32) + A_ref = A.copy() + else: + A = np.zeros((10, ), dtype=np.int32) + A_ref = A.copy() + + func(A=A) + comm_world_bcast.f(A_ref) + + assert(np.array_equal(A, A_ref)) + + +@pytest.mark.mpi +def test_external_comm_bcast(): + + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + new_comm = commworld.Split(rank % 2, 0) + + @dace.program + def external_comm_bcast(A: dace.int32[10]): + new_comm.Bcast(A) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = external_comm_bcast.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + if rank == 0: + A = np.arange(10, dtype=np.int32) + A_ref = A.copy() + elif rank == 1: + A = np.arange(10, 20, dtype=np.int32) + A_ref = A.copy() + else: + A = np.zeros((10, ), dtype=np.int32) + A_ref = A.copy() + + func(A=A, new_comm=new_comm.py2f()) + external_comm_bcast.f(A_ref) + + assert(np.array_equal(A, A_ref)) + @pytest.mark.mpi def test_process_grid_bcast(): @@ -258,6 +328,8 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): if __name__ == "__main__": + test_comm_world_bcast() + test_external_comm_bcast() test_process_grid_bcast() test_sub_grid_bcast() test_3mm() From 6c5ffa1d77fdabac4adc80b817e4e9a3e5000050 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:49:10 +0200 Subject: [PATCH 240/392] Restored replacements needed for full name of COMM_WORLD. Cleaned up duplicate methods. 
--- dace/frontend/common/distr.py | 84 +++++------------------------------ 1 file changed, 10 insertions(+), 74 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 72fe176ac0..68b6f120d8 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -156,7 +156,7 @@ def _comm_neq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Co ##### MPI Collectives -# @oprepo.replaces('mpi4py.MPI.COMM_WORLD.Bcast') +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Bcast') @oprepo.replaces('dace.comm.Bcast') def _bcast(pv: ProgramVisitor, sdfg: SDFG, @@ -224,6 +224,7 @@ def _mpi4py_to_MPI(MPI, op): raise NotImplementedError +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Reduce') @oprepo.replaces('dace.comm.Reduce') def _Reduce(pv: ProgramVisitor, sdfg: SDFG, @@ -254,6 +255,7 @@ def _Reduce(pv: ProgramVisitor, return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Alltoall') @oprepo.replaces('dace.comm.Alltoall') def _alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, inbuffer: str, outbuffer: str, grid: str = None): @@ -290,6 +292,7 @@ def _pgrid_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: s return _alltoall(pv, sdfg, state, inp_buffer, out_buffer, grid=pgrid) +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Allreduce') @oprepo.replaces('dace.comm.Allreduce') def _allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op: str, grid: str = None): @@ -334,6 +337,7 @@ def _pgrid_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: return _allreduce(pv, sdfg, state, out_buffer, op, grid=pgrid) +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Scatter') @oprepo.replaces('dace.comm.Scatter') def _scatter(pv: ProgramVisitor, sdfg: SDFG, @@ -364,6 +368,7 @@ def _scatter(pv: ProgramVisitor, return None +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Gather') @oprepo.replaces('dace.comm.Gather') def _gather(pv: ProgramVisitor, sdfg: SDFG, @@ -397,6 +402,7 @@ def _gather(pv: ProgramVisitor, ##### Point-To-Point Communication +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Send') @oprepo.replaces('dace.comm.Send') def _send(pv: ProgramVisitor, sdfg: SDFG, @@ -490,6 +496,7 @@ def _pgrid_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, # return _send(pv, sdfg, state, buffer, dst, tag, grid=pgrid) +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Isend') @oprepo.replaces('dace.comm.Isend') def _isend(pv: ProgramVisitor, sdfg: SDFG, @@ -615,6 +622,7 @@ def _pgrid_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, return req +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Recv') @oprepo.replaces('dace.comm.Recv') def _recv(pv: ProgramVisitor, sdfg: SDFG, @@ -708,6 +716,7 @@ def _pgrid_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, # return _recv(pv, sdfg, state, buffer, src, tag, req, grid=pgrid) +@oprepo.replaces('mpi4py.MPI.COMM_WORLD.Irecv') @oprepo.replaces('dace.comm.Irecv') def _irecv(pv: ProgramVisitor, sdfg: SDFG, @@ -891,79 +900,6 @@ def _wait(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, request: str): return None -@oprepo.replaces('dace.comm.Cart_create') -def _cart_create(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, dims: ShapeType): - """ Creates a process-grid and adds it to the DaCe program. The process-grid is implemented with [MPI_Cart_create](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_create.html). - - :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. 
- :return: Name of the new process-grid descriptor. - """ - pgrid_name = sdfg.add_pgrid(dims) - - # Dummy tasklet adds MPI variables to the program's state. - from dace.libraries.mpi import Dummy - tasklet = Dummy(pgrid_name, [ - f'MPI_Comm {pgrid_name}_comm;', - f'MPI_Group {pgrid_name}_group;', - f'int {pgrid_name}_coords[{len(dims)}];', - f'int {pgrid_name}_dims[{len(dims)}];', - f'int {pgrid_name}_rank;', - f'int {pgrid_name}_size;', - f'bool {pgrid_name}_valid;', - ]) - - state.add_node(tasklet) - - # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. - _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) - wnode = state.add_write(pgrid_name) - state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) - - return pgrid_name - - -@oprepo.replaces('dace.comm.Cart_sub') -def _cart_sub(pv: ProgramVisitor, - sdfg: SDFG, - state: SDFGState, - parent_grid: str, - color: Sequence[Union[Integral, bool]], - exact_grid: RankType = None): - """ Partitions the `parent_grid` to lower-dimensional sub-grids and adds them to the DaCe program. - The sub-grids are implemented with [MPI_Cart_sub](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_sub.html). - - :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`). - :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped (see `remain_dims` input of `MPI_Cart_sub`). - :param exact_grid: [DEVELOPER] If set then, out of all the sub-grids created, only the one that contains the rank with id `exact_grid` will be utilized for collective communication. - :return: Name of the new sub-grid descriptor. - """ - pgrid_name = sdfg.add_pgrid(parent_grid=parent_grid, color=color, exact_grid=exact_grid) - - # Count sub-grid dimensions. - pgrid_ndims = sum([bool(c) for c in color]) - - # Dummy tasklet adds MPI variables to the program's state. - from dace.libraries.mpi import Dummy - tasklet = Dummy(pgrid_name, [ - f'MPI_Comm {pgrid_name}_comm;', - f'MPI_Group {pgrid_name}_group;', - f'int {pgrid_name}_coords[{pgrid_ndims}];', - f'int {pgrid_name}_dims[{pgrid_ndims}];', - f'int {pgrid_name}_rank;', - f'int {pgrid_name}_size;', - f'bool {pgrid_name}_valid;', - ]) - - state.add_node(tasklet) - - # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations. 
- _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True) - wnode = state.add_write(pgrid_name) - state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal)) - - return pgrid_name - - @oprepo.replaces('dace.comm.Subarray') def _subarray(pv: ProgramVisitor, sdfg: SDFG, From c3b1a4b0c4f09d1c894540c02ef7d2d51cde4fff Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 15:58:19 +0200 Subject: [PATCH 241/392] Further clean up --- dace/frontend/python/newast.py | 7 ------- dace/frontend/python/replacements.py | 1 - dace/libraries/mpi/nodes/isend.py | 1 + tests/library/mpi/mpi4py_test.py | 4 ++-- tests/library/mpi/mpi_send_recv_test.py | 2 +- 5 files changed, 4 insertions(+), 11 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index ce2a9e06e1..fef2d989d5 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1150,13 +1150,6 @@ def __init__(self, # Indirections self.indirections = dict() - # Add mpi4py.MPI.COMM_WORLD aliases to variables - # try: - # from mpi4py import MPI - # self.variables.update({k: "MPI_COMM_WORLD" for k, v in self.globals.items() if v is MPI.COMM_WORLD}) - # except: - # pass - @classmethod def progress_count(cls) -> int: """ Returns the number of parsed SDFGs so far within this run. """ diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 30c92be81f..a681f48ba6 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -295,7 +295,6 @@ def _numpy_full(pv: ProgramVisitor, else: is_data = True vtype = sdfg.arrays[fill_value].dtype - # raise mem_parser.DaceSyntaxError(pv, None, "Fill value {f} must be a number!".format(f=fill_value)) dtype = dtype or vtype name, _ = sdfg.add_temp_transient(shape, dtype) diff --git a/dace/libraries/mpi/nodes/isend.py b/dace/libraries/mpi/nodes/isend.py index 95b3de3ae7..8de4035515 100644 --- a/dace/libraries/mpi/nodes/isend.py +++ b/dace/libraries/mpi/nodes/isend.py @@ -97,6 +97,7 @@ def validate(self, sdfg, state): if e.src_conn == "_request": req = sdfg.arrays[e.data.data] + # TODO: Should we expect any integer type here and cast to int32 later?. Investigate further in the future. 
# if dest.dtype.base_type != dace.dtypes.int32: # raise ValueError("Destination must be an integer!") # if tag.dtype.base_type != dace.dtypes.int32: diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index e99768be5c..1bbeae627f 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -91,8 +91,8 @@ def pgrid_bcast(A: dace.int32[10]): if pgrid != MPI.COMM_NULL: pgrid.Bcast(A) - if size < 2: - raise ValueError("Please run this test with at least two processes.") + # if size < 2: + # raise ValueError("Please run this test with at least two processes.") sdfg = None if rank == 0: diff --git a/tests/library/mpi/mpi_send_recv_test.py b/tests/library/mpi/mpi_send_recv_test.py index ec094e7cf5..bf39c955d3 100644 --- a/tests/library/mpi/mpi_send_recv_test.py +++ b/tests/library/mpi/mpi_send_recv_test.py @@ -103,7 +103,7 @@ def test_dace_send_recv(): sdfg = None if rank == 0: sdfg = dace_send_recv.to_sdfg(simplify=True) - # disable openMP section for blocking + # Disable OpenMP section to allow blocking sdfg.openmp_sections = False mpi_sdfg = utils.distributed_compile(sdfg, comm) From acfde5f27d454b03ce98271a28def14185a361e4 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 17:48:20 +0200 Subject: [PATCH 242/392] MapFusion creates new intermediate data in the presence of NestedSDFGs. --- dace/transformation/dataflow/map_fusion.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/dace/transformation/dataflow/map_fusion.py b/dace/transformation/dataflow/map_fusion.py index 7075befa19..9a0dd0e313 100644 --- a/dace/transformation/dataflow/map_fusion.py +++ b/dace/transformation/dataflow/map_fusion.py @@ -443,9 +443,16 @@ def fuse_nodes(self, sdfg, graph, edge, new_dst, new_dst_conn, other_edges=None) graph.node_id(edge.dst), edge.dst_conn, ) - # Add intermediate memory between subgraphs. If a scalar, - # uses direct connection. If an array, adds a transient node - if edge.data.subset.num_elements() == 1: + # Add intermediate memory between subgraphs. + # If a scalar, uses direct connection. If an array, adds a transient node. + # NOTE: If any of the src/dst nodes is a nested SDFG, treat it as an array. + is_scalar = edge.data.subset.num_elements() == 1 + accesses = ( + [graph.memlet_path(e1)[0].src for e0 in graph.in_edges(access_node) for e1 in graph.memlet_tree(e0)] + + [graph.memlet_path(e1)[-1].dst for e0 in graph.out_edges(access_node) for e1 in graph.memlet_tree(e0)]) + if any(isinstance(a, nodes.NestedSDFG) for a in accesses): + is_scalar = False + if is_scalar: local_name, _ = sdfg.add_scalar( local_name, dtype=access_node.desc(graph).dtype, @@ -520,5 +527,7 @@ def fuse_nodes(self, sdfg, graph, edge, new_dst, new_dst_conn, other_edges=None) # Modify data and memlets on all surrounding edges to match array for neighbor in graph.all_edges(local_node): for e in graph.memlet_tree(neighbor): + if e.data.data == local_name: + continue e.data.data = local_name e.data.subset.offset(old_edge.data.subset, negative=True) From 6677fe7d14e5102029d1c586a5fea72f7c8a3055 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 17:48:36 +0200 Subject: [PATCH 243/392] Added tests. 
--- tests/transformations/mapfusion_test.py | 61 +++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/transformations/mapfusion_test.py b/tests/transformations/mapfusion_test.py index 6035b885e5..653fb9d120 100644 --- a/tests/transformations/mapfusion_test.py +++ b/tests/transformations/mapfusion_test.py @@ -214,6 +214,65 @@ def inner_product(A: dace.float32[N], B: dace.float32[N], out: dace.float32[1]): assert np.allclose(val[0], ref) +def test_fusion_with_nested_sdfg_0(): + + @dace.program + def fusion_with_nested_sdfg_0(A: dace.int32[10], B: dace.int32[10], C: dace.int32[10]): + tmp = np.empty([10], dtype=np.int32) + for i in dace.map[0:10]: + if C[i] < 0: + tmp[i] = B[i] - A[i] + else: + tmp[i] = B[i] + A[i] + for i in dace.map[0:10]: + A[i] = tmp[i] * 2 + + sdfg = fusion_with_nested_sdfg_0.to_sdfg(simplify=True) + sdfg.apply_transformations(MapFusion) + + for sd in sdfg.all_sdfgs_recursive(): + if sd is not sdfg: + node = sd.parent_nsdfg_node + state = sd.parent + for e0 in state.out_edges(node): + for e1 in state.memlet_tree(e0): + dst = state.memlet_path(e1)[-1].dst + assert isinstance(dst, dace.nodes.AccessNode) + + +def test_fusion_with_nested_sdfg_1(): + + @dace.program + def fusion_with_nested_sdfg_1(A: dace.int32[10], B: dace.int32[10], C: dace.int32[10]): + tmp = np.empty([10], dtype=np.int32) + for i in dace.map[0:10]: + with dace.tasklet: + a << A[i] + b << B[i] + t >> tmp[i] + t = b - a + for i in dace.map[0:10]: + if C[i] < 0: + A[i] = tmp[i] * 2 + else: + B[i] = tmp[i] * 2 + + sdfg = fusion_with_nested_sdfg_1.to_sdfg(simplify=True) + sdfg.apply_transformations(MapFusion) + + if len(sdfg.states()) != 1: + return + + for sd in sdfg.all_sdfgs_recursive(): + if sd is not sdfg: + node = sd.parent_nsdfg_node + state = sd.parent + for e0 in state.in_edges(node): + for e1 in state.memlet_tree(e0): + src = state.memlet_path(e1)[0].src + assert isinstance(src, dace.nodes.AccessNode) + + if __name__ == '__main__': test_fusion_simple() test_multiple_fusions() @@ -221,3 +280,5 @@ def inner_product(A: dace.float32[N], B: dace.float32[N], out: dace.float32[1]): test_fusion_with_transient() test_fusion_with_inverted_indices() test_fusion_with_empty_memlet() + test_fusion_with_nested_sdfg_0() + test_fusion_with_nested_sdfg_1() From 684642d62654d42ad16242a59bda327eb6c276a1 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 17:53:47 +0200 Subject: [PATCH 244/392] Added validation error. --- dace/sdfg/validation.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index abad1e7907..f3b76b2683 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -562,6 +562,12 @@ def validate_state(state: 'dace.sdfg.SDFGState', src_node = path[0].src dst_node = path[-1].dst + # NestedSDFGs must connect to AccessNodes + if isinstance(src_node, nd.NestedSDFG) and not isinstance(dst_node, nd.AccessNode): + raise InvalidSDFGEdgeError("Nested SDFG source nodes must be AccessNodes", sdfg, state_id, eid) + if isinstance(dst_node, nd.NestedSDFG) and not isinstance(src_node, nd.AccessNode): + raise InvalidSDFGEdgeError("Nested SDFG destination nodes must be AccessNodes", sdfg, state_id, eid) + # Set up memlet-specific SDFG context memlet_context = copy.copy(context) for pe in path: From 61c43941f5c4a732ac8a4b9b4f81bdedebf48d5a Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 18:16:24 +0200 Subject: [PATCH 245/392] Unpack visited condition's test. 
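For context, a small illustration of the case this unpacking handles; the temporary name '__tmp0' is hypothetical:

    # Visiting an if-test such as `bool(a)` may return a single-element list
    # instead of a plain data name; '__tmp0' below is a hypothetical example.
    parsed_node = ['__tmp0']
    if isinstance(parsed_node, (list, tuple)) and len(parsed_node) == 1:
        parsed_node = parsed_node[0]  # unwrap so the string/array checks apply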
--- dace/frontend/python/newast.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 52a6862083..2283b433bd 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -2352,6 +2352,8 @@ def _visit_test(self, node: ast.Expr): # Visit test-condition if not is_test_simple: parsed_node = self.visit(node) + if isinstance(parsed_node, (list, tuple)) and len(parsed_node) == 1: + parsed_node = parsed_node[0] if isinstance(parsed_node, str) and parsed_node in self.sdfg.arrays: datadesc = self.sdfg.arrays[parsed_node] if isinstance(datadesc, data.Array): From 77896f7db85b6bbf04ae0be62d3c644b59b9dde Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 18:21:59 +0200 Subject: [PATCH 246/392] Special case for code converting to bool. --- dace/frontend/python/replacements.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 3586d40374..528aef1ec8 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -4370,6 +4370,8 @@ def _datatype_converter(sdfg: SDFG, state: SDFGState, arg: UfuncInput, dtype: dt 'outputs': ['__out'], 'code': "__out = dace.{}(__inp)".format(dtype.to_string()) } + if dtype in (dace.bool, dace.bool_): + impl['code'] = "__out = dace.bool_(__inp)" tasklet_params = _set_tasklet_params(impl, [arg]) # Visitor input only needed when `has_where == True`. From 346cfdee7eba3a6cb88ef4e0f870a316c6cac3b9 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 18:25:31 +0200 Subject: [PATCH 247/392] Added test. --- tests/python_frontend/conditionals_test.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/python_frontend/conditionals_test.py b/tests/python_frontend/conditionals_test.py index 03058c7bf8..994a45ed80 100644 --- a/tests/python_frontend/conditionals_test.py +++ b/tests/python_frontend/conditionals_test.py @@ -161,6 +161,19 @@ def if_return_chain(i: dace.int64): assert if_return_chain(15)[0] == 4 +def test_if_test_call(): + + @dace.program + def if_test_call(a, b): + if bool(a): + return a + else: + return b + + assert if_test_call(0, 2)[0] == if_test_call.f(0, 2) + assert if_test_call(1, 2)[0] == if_test_call.f(1, 2) + + if __name__ == "__main__": test_simple_if() test_call_if() @@ -169,3 +182,4 @@ def if_return_chain(i: dace.int64): test_call_while() test_if_return_both() test_if_return_chain() + test_if_test_call() From 247ce2eab3d3211e012738283c12cc7f65b9aa48 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 18:48:12 +0200 Subject: [PATCH 248/392] Check should not trigger for edges with empty Memlets.
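For reference, a short sketch of what an empty memlet is; this is illustrative only and not part of the change:

    import dace

    m = dace.Memlet()    # an empty memlet: no data container and no subset
    assert m.is_empty()  # such edges only express ordering, so the AccessNode
                         # requirement added in the previous commit must not apply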
--- dace/sdfg/validation.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index f3b76b2683..3bac646479 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -563,10 +563,11 @@ def validate_state(state: 'dace.sdfg.SDFGState', dst_node = path[-1].dst # NestedSDFGs must connect to AccessNodes - if isinstance(src_node, nd.NestedSDFG) and not isinstance(dst_node, nd.AccessNode): - raise InvalidSDFGEdgeError("Nested SDFG source nodes must be AccessNodes", sdfg, state_id, eid) - if isinstance(dst_node, nd.NestedSDFG) and not isinstance(src_node, nd.AccessNode): - raise InvalidSDFGEdgeError("Nested SDFG destination nodes must be AccessNodes", sdfg, state_id, eid) + if not e.data.is_empty(): + if isinstance(src_node, nd.NestedSDFG) and not isinstance(dst_node, nd.AccessNode): + raise InvalidSDFGEdgeError("Nested SDFG source nodes must be AccessNodes", sdfg, state_id, eid) + if isinstance(dst_node, nd.NestedSDFG) and not isinstance(src_node, nd.AccessNode): + raise InvalidSDFGEdgeError("Nested SDFG destination nodes must be AccessNodes", sdfg, state_id, eid) # Set up memlet-specific SDFG context memlet_context = copy.copy(context) From c3f953522d6d5acf969d55dba729a937b412a08e Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 12 Jul 2023 21:36:28 +0200 Subject: [PATCH 249/392] Drop connectors/arguments from (nested) Program/SDFG call, if the connector is not in the SDFG's arrays. --- dace/frontend/python/newast.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 52a6862083..e1629e20c6 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -3740,6 +3740,15 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no for arg in args_to_remove: args.remove(arg) + # Drop args that are not in the SDFG + filtered_args = [] + for conn, arg in args: + if conn not in sdfg.arrays: + warnings.warn(f'Connector {conn} not found in SDFG; dropping it') + else: + filtered_args.append((conn, arg)) + args = filtered_args + # Change connector names updated_args = [] arrays_before = list(sdfg.arrays.items()) @@ -3829,6 +3838,12 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no for k, v in argdict.items() if self._is_outputnode(sdfg, k) } + # If an argument does not register as input nor as output, put it in the inputs. + # This may happen with input arguments that are used to set a promoted scalar. + for k, v in argdict.items(): + if k not in inputs.keys() and k not in outputs.keys(): + inputs[k] = v + # Add closure to global inputs/outputs (e.g., if processed as part of a map) for arrname in closure_arrays.keys(): if arrname not in names_to_replace: @@ -3840,13 +3855,6 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no if narrname in outputs: self.outputs[arrname] = (state, outputs[narrname], []) - # If an argument does not register as input nor as output, - # put it in the inputs. - # This may happen with input argument that are used to set - # a promoted scalar. - for k, v in argdict.items(): - if k not in inputs.keys() and k not in outputs.keys(): - inputs[k] = v # Unset parent inputs/read accesses that # turn out to be outputs/write accesses. 
for memlet in outputs.values(): From 866e38f605cde529acbb598d496c8271e2cdc463 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 13 Jul 2023 13:39:24 +0200 Subject: [PATCH 250/392] Ensure that the access node exists in the SDFGState. --- dace/transformation/passes/array_elimination.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dace/transformation/passes/array_elimination.py b/dace/transformation/passes/array_elimination.py index e313f7bf66..d1b80c2327 100644 --- a/dace/transformation/passes/array_elimination.py +++ b/dace/transformation/passes/array_elimination.py @@ -170,6 +170,9 @@ def remove_redundant_copies(self, sdfg: SDFG, state: SDFGState, removable_data: for anode in access_nodes[aname]: if anode in removed_nodes: continue + if anode not in state.nodes(): + removed_nodes.add(anode) + continue if state.out_degree(anode) == 1: succ = state.successors(anode)[0] From c53b95b297337060da79d159561c9a312b6d63e8 Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 13 Jul 2023 13:39:57 +0200 Subject: [PATCH 251/392] Added get_free_symbols_by_indices methods to Memlet and Ranges --- dace/memlet.py | 21 +++++++++++++++++++ dace/subsets.py | 16 ++++++++++++++ .../transformation/interstate/sdfg_nesting.py | 2 +- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/dace/memlet.py b/dace/memlet.py index 71a246224b..35b689381d 100644 --- a/dace/memlet.py +++ b/dace/memlet.py @@ -522,6 +522,27 @@ def free_symbols(self) -> Set[str]: result |= self.dst_subset.free_symbols return result + def get_free_symbols_by_indices(self, indices_src: List[int], indices_dst: List[int]) -> Set[str]: + """ + Returns set of free symbols used in this edges properties but only taking certain indices of the src and dst + subset into account + + :param indices_src: The indices of the src subset to take into account + :type indices_src: List[int] + :param indices_dst: The indices of the dst subset to take into account + :type indices_dst: List[int] + :return: The set of free symbols + :rtype: Set[str] + """ + # Symbolic properties are in volume, and the two subsets + result = set() + result |= set(map(str, self.volume.free_symbols)) + if self.src_subset: + result |= self.src_subset.get_free_symbols_by_indices(indices_src) + if self.dst_subset: + result |= self.dst_subset.get_free_symbols_by_indices(indices_dst) + return result + def get_stride(self, sdfg: 'dace.sdfg.SDFG', map: 'dace.sdfg.nodes.Map', dim: int = -1) -> 'dace.symbolic.SymExpr': """ Returns the stride of the underlying memory when traversing a Map. diff --git a/dace/subsets.py b/dace/subsets.py index 05918edf9b..f8b66a565d 100644 --- a/dace/subsets.py +++ b/dace/subsets.py @@ -360,6 +360,22 @@ def free_symbols(self) -> Set[str]: result |= symbolic.symlist(d).keys() return result + def get_free_symbols_by_indices(self, indices: List[int]) -> Set[str]: + """ + Get set of free symbols by only looking at the dimension given by the indices list + + :param indices: The indices of the dimensions to look at + :type indices: List[int] + :return: The set of free symbols + :rtype: Set[str] + """ + result = set() + for i, dim in enumerate(self.ranges): + if i in indices: + for d in dim: + result |= symbolic.symlist(d).keys() + return result + def reorder(self, order): """ Re-orders the dimensions in-place according to a permutation list. 
diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index 3ae7d51db8..33247ac863 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -1022,7 +1022,7 @@ def _check_cand(candidates, outer_edges): # If there are any symbols here that are not defined # in "defined_symbols" - missing_symbols = (memlet.free_symbols - set(nsdfg.symbol_mapping.keys())) + missing_symbols = (memlet.get_free_symbols_by_indices(list(indices), list(indices)) - set(nsdfg.symbol_mapping.keys())) if missing_symbols: ignore.add(cname) continue From aaffdf227cadfcfa26afc03c27aafec597b3f45f Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 13 Jul 2023 14:33:41 +0200 Subject: [PATCH 252/392] Added testcase --- .../refine_nested_access_test.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/transformations/refine_nested_access_test.py b/tests/transformations/refine_nested_access_test.py index 725a438ae5..d6d0921da4 100644 --- a/tests/transformations/refine_nested_access_test.py +++ b/tests/transformations/refine_nested_access_test.py @@ -98,6 +98,38 @@ def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5], select: dace.bool[5, 5] assert np.allclose(B, lower.T + lower - diag) +def test_free_sybmols_only_by_indices(): + i = dace.symbol('i') + idx_a = dace.symbol('idx_a') + idx_b = dace.symbol('idx_b') + sdfg = dace.SDFG('refine_free_symbols_only_by_indices') + sdfg.add_array('A', [5], dace.int32) + sdfg.add_array('B', [5, 5], dace.int32) + + @dace.program + def inner_sdfg(A: dace.int32[5], B: dace.int32[5, 5], idx_a: int, idx_b: int): + if A[i] > 0.5: + B[i, idx_a] = 1 + else: + B[i, idx_b] = 0 + + state = sdfg.add_state() + A = state.add_access('A') + B = state.add_access('B') + map_entry, map_exit = state.add_map('map', dict(i='0:5')) + nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(simplify=False), sdfg, {'A'}, {'B'}, {'i': 'i'}) + state.add_memlet_path(A, map_entry, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) + state.add_memlet_path(nsdfg, map_exit, B, src_conn='B', memlet=dace.Memlet.from_array('B', sdfg.arrays['B'])) + + num = sdfg.apply_transformations_repeated(RefineNestedAccess) + assert num == 1 + + assert len(state.in_edges(map_exit)) == 1 + edge = state.in_edges(map_exit)[0] + assert edge.data.subset == dace.subsets.Range([(i, i, 1), (0, 4, 1)]) + + if __name__ == '__main__': test_refine_dataflow() test_refine_interstate() + test_free_sybmols_only_by_indices() From 7ae787d7db85b6bbf04ae0be62d3c644b59b9dde Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 14 Jul 2023 14:27:29 +0200 Subject: [PATCH 253/392] Don't add child SDFG's closure arrays to parent SDFG's arrays and to child SDFG's arguments if the array is not actually in the child SDFG. --- dace/frontend/python/newast.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index e1629e20c6..23e1bd9134 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -3653,6 +3653,11 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no # If the symbol is a callback, but is not used in the nested SDFG, skip it continue + # NOTE: Is it possible that an array in the SDFG's closure is not in the SDFG? + # NOTE: Perhaps its use was simplified/optimized away? 
+ if aname not in sdfg.arrays: + continue + # First, we do an inverse lookup on the already added closure arrays for `arr`. is_new_arr = True for k, v in self.nested_closure_arrays.items(): @@ -3740,15 +3745,6 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no for arg in args_to_remove: args.remove(arg) - # Drop args that are not in the SDFG - filtered_args = [] - for conn, arg in args: - if conn not in sdfg.arrays: - warnings.warn(f'Connector {conn} not found in SDFG; dropping it') - else: - filtered_args.append((conn, arg)) - args = filtered_args - # Change connector names updated_args = [] arrays_before = list(sdfg.arrays.items()) From 4ec4451fd11319064f849b8bc3d7138080bab929 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 14 Jul 2023 15:05:05 +0200 Subject: [PATCH 254/392] Added missing check for "None" state. --- dace/transformation/interstate/sdfg_nesting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index 33247ac863..1b9324546a 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -1000,7 +1000,7 @@ def _check_cand(candidates, outer_edges): continue # Check w.r.t. loops - if len(nstate.ranges) > 0: + if nstate is not None and len(nstate.ranges) > 0: # Re-annotate loop ranges, in case someone changed them # TODO: Move out of here! for ns in nsdfg.sdfg.states(): From c6040b0bc27b3ebf815241f7437b51a6f7e7538a Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 14 Jul 2023 18:29:58 +0200 Subject: [PATCH 255/392] Yield the edge regardless if the destination has been visited. --- dace/sdfg/graph.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/graph.py b/dace/sdfg/graph.py index eee98afa8b..5c93149529 100644 --- a/dace/sdfg/graph.py +++ b/dace/sdfg/graph.py @@ -346,11 +346,13 @@ def dfs_edges(self, parent, children = stack[-1] try: e = next(children) + to_yield = condition is None or condition(e.src, e.dst, e.data) if e.dst not in visited: visited.add(e.dst) - if condition is None or condition(e.src, e.dst, e.data): - yield e + if to_yield: stack.append((e.dst, self.out_edges(e.dst).__iter__())) + if to_yield: + yield e except StopIteration: stack.pop() From 0867aaa064247a38e6d654634f7391ecc59065ff Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 14 Jul 2023 18:30:14 +0200 Subject: [PATCH 256/392] Added test. 
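For context, a small sketch of the behavioral change being tested; the toy graph and the expected order are illustrative assumptions:

    from dace.sdfg.graph import OrderedDiGraph

    g = OrderedDiGraph()
    for n in 'abc':
        g.add_node(n)
    g.add_edge('a', 'b', None)
    g.add_edge('b', 'a', None)  # edge back to an already-visited node
    g.add_edge('a', 'c', None)

    # Previously the edge ('b', 'a') was dropped because its destination had
    # already been visited; now every reachable edge is yielded exactly once.
    print([(e.src, e.dst) for e in g.dfs_edges('a')])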
--- tests/graph_test.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/graph_test.py b/tests/graph_test.py index 3346a7a786..9313b2e3cc 100644 --- a/tests/graph_test.py +++ b/tests/graph_test.py @@ -105,6 +105,15 @@ def test_ordered_multidigraph(self): self.assertEqual(next(bfs_edges), e3) self.assertEqual(next(bfs_edges), e6) self.assertEqual(next(bfs_edges), e7) + + def test_dfs_edges(self): + + sdfg = dace.SDFG('test_dfs_edges') + before, _, _ = sdfg.add_loop(sdfg.add_state(), sdfg.add_state(), sdfg.add_state(), 'i', '0', 'i < 10', 'i + 1') + + visited_edges = list(sdfg.dfs_edges(before)) + assert len(visited_edges) == len(set(visited_edges)) + assert all(e in visited_edges for e in sdfg.edges()) if __name__ == "__main__": From 686562bc56a75498fa38a6dd0c8b0850f915074b Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Sat, 15 Jul 2023 12:22:45 +0200 Subject: [PATCH 257/392] InterstateEdge.new_symbols now uses a similar algorithm to InterstateEdge.free_symbols to return only newly defined symbols instead of all the LHS. --- dace/sdfg/sdfg.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 3abef05dc9..18763e385a 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -292,7 +292,19 @@ def new_symbols(self, sdfg, symbols) -> Dict[str, dtypes.typeclass]: else: alltypes = symbols - return {k: infer_expr_type(v, alltypes) for k, v in self.assignments.items()} + inferred_lhs_symbols = {k: infer_expr_type(v, alltypes) for k, v in self.assignments.items()} + + # Symbols in assignment keys are candidate newly defined symbols + lhs_symbols = set() + # Symbols already defined + rhs_symbols = set() + for lhs, rhs in self.assignments.items(): + rhs_symbols |= symbolic.free_symbols_and_functions(rhs) + # Only add LHS to the set of candidate newly defined symbols if it has not been defined yet + if lhs not in rhs_symbols: + lhs_symbols.add(lhs) + + return {k: v for k, v in inferred_lhs_symbols.items() if k in lhs_symbols} def get_read_memlets(self, arrays: Dict[str, dt.Data]) -> List[mm.Memlet]: """ From 7918903489dd00b01ac8fa3bcb28af8da6a645ff Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Sat, 15 Jul 2023 12:49:27 +0200 Subject: [PATCH 258/392] Moved Transpose LibraryNode to STD --- dace/frontend/python/replacements.py | 4 ++-- dace/libraries/blas/nodes/__init__.py | 3 +-- dace/libraries/linalg/nodes/cholesky.py | 7 ++++--- dace/libraries/linalg/nodes/solve.py | 7 ++++--- dace/libraries/linalg/nodes/tensordot.py | 3 ++- dace/libraries/standard/nodes/__init__.py | 3 ++- .../libraries/{blas => standard}/nodes/transpose.py | 13 ++++++------- .../dataflow/matrix_product_transpose.py | 9 +++++---- tests/library/lapack_getrf_test.py | 13 +++++++------ tests/library/lapack_getrs_test.py | 13 +++++++------ tests/library/lapack_potrf_test.py | 13 +++++++------ tests/transformations/redundant_copy_test.py | 4 ++-- 12 files changed, 49 insertions(+), 43 deletions(-) rename dace/libraries/{blas => standard}/nodes/transpose.py (96%) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 8e8d1bb5e8..0a2a12bd35 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -778,8 +778,8 @@ def _transpose(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, inpname: str, a if axes == (1, 0): # Special case for 2D transposition acc1 = state.add_read(inpname) acc2 = state.add_write(outname) - import dace.libraries.blas # 
Avoid import loop - tasklet = dace.libraries.blas.Transpose('_Transpose_', restype) + import dace.libraries.standard # Avoid import loop + tasklet = dace.libraries.standard.Transpose('_Transpose_', restype) state.add_node(tasklet) state.add_edge(acc1, None, tasklet, '_inp', Memlet.from_array(inpname, arr1)) state.add_edge(tasklet, '_out', acc2, None, Memlet.from_array(outname, arr2)) diff --git a/dace/libraries/blas/nodes/__init__.py b/dace/libraries/blas/nodes/__init__.py index 0f27f8f463..75a217fede 100644 --- a/dace/libraries/blas/nodes/__init__.py +++ b/dace/libraries/blas/nodes/__init__.py @@ -1,11 +1,10 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. from .matmul import MatMul from .dot import Dot from .gemv import Gemv from .gemm import Gemm from .ger import Ger from .batched_matmul import BatchedMatMul -from .transpose import Transpose from .axpy import Axpy from .einsum import Einsum diff --git a/dace/libraries/linalg/nodes/cholesky.py b/dace/libraries/linalg/nodes/cholesky.py index 38dbc12189..55411f08ef 100644 --- a/dace/libraries/linalg/nodes/cholesky.py +++ b/dace/libraries/linalg/nodes/cholesky.py @@ -1,11 +1,12 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import copy import dace.library import dace.properties import dace.sdfg.nodes -from dace import dtypes, Memlet -from dace.libraries.blas import Transpose + +from dace import Memlet from dace.libraries.lapack import Potrf +from dace.libraries.standard import Transpose from dace.transformation.transformation import ExpandTransformation from dace.libraries.lapack import environments from dace.libraries.blas import environments as blas_environments diff --git a/dace/libraries/linalg/nodes/solve.py b/dace/libraries/linalg/nodes/solve.py index 8fb755c161..5849ed410e 100644 --- a/dace/libraries/linalg/nodes/solve.py +++ b/dace/libraries/linalg/nodes/solve.py @@ -1,13 +1,14 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import copy import dace import dace.library import dace.properties import dace.sdfg.nodes import numpy as np + from dace import Memlet -from dace.libraries.blas.nodes import Transpose -from dace.libraries.lapack.nodes import Getrf, Getrs +from dace.libraries.lapack import Getrf, Getrs +from dace.libraries.standard import Transpose from dace.transformation.transformation import ExpandTransformation from dace.libraries.lapack import environments from dace.libraries.blas import environments as blas_environments diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index 7ab588d78c..6e20b676d1 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -1,12 +1,13 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import collections import dace +import dace.libraries.linalg.environments as environments + from dace import library, nodes, properties from dace.data import _prod from dace.libraries.blas import blas_helpers from dace.symbolic import symstr from dace.transformation.transformation import ExpandTransformation -import dace.libraries.linalg.environments as environments @library.expansion diff --git a/dace/libraries/standard/nodes/__init__.py b/dace/libraries/standard/nodes/__init__.py index fd31c7b7e4..3ad0d505d1 100644 --- a/dace/libraries/standard/nodes/__init__.py +++ b/dace/libraries/standard/nodes/__init__.py @@ -1,4 +1,5 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. from .code import CodeLibraryNode from .gearbox import Gearbox from .reduce import Reduce +from .transpose import Transpose diff --git a/dace/libraries/blas/nodes/transpose.py b/dace/libraries/standard/nodes/transpose.py similarity index 96% rename from dace/libraries/blas/nodes/transpose.py rename to dace/libraries/standard/nodes/transpose.py index 21785f5175..8f4b55b6be 100644 --- a/dace/libraries/blas/nodes/transpose.py +++ b/dace/libraries/standard/nodes/transpose.py @@ -1,13 +1,12 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import functools from copy import deepcopy as dc -from dace.config import Config import dace.library import dace.properties import dace.sdfg.nodes from dace.libraries.blas import blas_helpers +from dace.libraries.blas import environments as blas_environments from dace.transformation.transformation import ExpandTransformation -from .. import environments import warnings @@ -96,7 +95,7 @@ def expansion(node, state, sdfg): @dace.library.expansion class ExpandTransposeMKL(ExpandTransformation): - environments = [environments.intel_mkl.IntelMKL] + environments = [blas_environments.intel_mkl.IntelMKL] @staticmethod def expansion(node, state, sdfg): @@ -136,7 +135,7 @@ def expansion(node, state, sdfg): @dace.library.expansion class ExpandTransposeOpenBLAS(ExpandTransformation): - environments = [environments.openblas.OpenBLAS] + environments = [blas_environments.openblas.OpenBLAS] @staticmethod def expansion(node, state, sdfg): @@ -173,7 +172,7 @@ def expansion(node, state, sdfg): @dace.library.expansion class ExpandTransposeCuBLAS(ExpandTransformation): - environments = [environments.cublas.cuBLAS] + environments = [blas_environments.cublas.cuBLAS] @staticmethod def expansion(node, state, sdfg, **kwargs): @@ -192,7 +191,7 @@ def expansion(node, state, sdfg, **kwargs): beta = f"__state->cublas_handle.Constants(__dace_cuda_device).{factort}Zero()" _, _, (m, n) = _get_transpose_input(node, state, sdfg) - code = (environments.cublas.cuBLAS.handle_setup_code(node) + f"""cublas{func}( + code = (blas_environments.cublas.cuBLAS.handle_setup_code(node) + f"""cublas{func}( __dace_cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, {m}, {n}, {alpha}, ({cdtype}*)_inp, {n}, {beta}, ({cdtype}*)_inp, {m}, ({cdtype}*)_out, {m}); """) diff --git a/dace/transformation/dataflow/matrix_product_transpose.py b/dace/transformation/dataflow/matrix_product_transpose.py index c97f40f540..5d8bf04fb6 100644 --- a/dace/transformation/dataflow/matrix_product_transpose.py +++ b/dace/transformation/dataflow/matrix_product_transpose.py @@ -17,10 +17,11 @@ class MatrixProductTranspose(transformation.SingleStateTransformation): 
T(A) @ T(B) = T(B @ A) """ import dace.libraries.blas as blas # Avoid slow imports + import dace.libraries.standard as std # Avoid slow imports - transpose_a = transformation.PatternNode(blas.Transpose) + transpose_a = transformation.PatternNode(std.Transpose) at = transformation.PatternNode(nodes.AccessNode) - transpose_b = transformation.PatternNode(blas.Transpose) + transpose_b = transformation.PatternNode(std.Transpose) bt = transformation.PatternNode(nodes.AccessNode) a_times_b = transformation.PatternNode(blas.MatMul) @@ -57,7 +58,7 @@ def match_to_str(self, graph): return f"{transpose_a.name} -> {a_times_b.name} <- {transpose_b.name}" def apply(self, graph: SDFGState, sdfg: SDFG): - import dace.libraries.blas as blas + import dace.libraries.standard as std transpose_a = self.transpose_a _at = self.at @@ -82,7 +83,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG): break tmp_name, tmp_arr = sdfg.add_temp_transient(shape, a_times_b.dtype) tmp_acc = graph.add_access(tmp_name) - transpose_c = blas.Transpose('_Transpose_', a_times_b.dtype) + transpose_c = std.Transpose('_Transpose_', a_times_b.dtype) for edge in graph.out_edges(a_times_b): _, _, dst, dst_conn, memlet = edge graph.remove_edge(edge) diff --git a/tests/library/lapack_getrf_test.py b/tests/library/lapack_getrf_test.py index 04bf9cf4f8..ffe66f9c1d 100644 --- a/tests/library/lapack_getrf_test.py +++ b/tests/library/lapack_getrf_test.py @@ -1,11 +1,12 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace -from dace.memlet import Memlet -import dace.libraries.blas as blas import dace.libraries.lapack as lapack +import dace.libraries.standard as std import numpy as np import pytest +from dace.memlet import Memlet + ############################################################################### @@ -34,9 +35,9 @@ def make_sdfg(implementation, dtype, storage=dace.StorageType.Default): xo = state.add_access("x" + suffix) xin = state.add_access("xt" + suffix) xout = state.add_access("xt" + suffix) - transpose_in = blas.nodes.transpose.Transpose("transpose_in", dtype=dtype) + transpose_in = std.Transpose("transpose_in", dtype=dtype) transpose_in.implementation = "cuBLAS" - transpose_out = blas.nodes.transpose.Transpose("transpose_out", dtype=dtype) + transpose_out = std.Transpose("transpose_out", dtype=dtype) transpose_out.implementation = "cuBLAS" state.add_nedge(xhi, xi, Memlet.from_array(*xhost_arr)) state.add_nedge(xo, xho, Memlet.from_array(*xhost_arr)) @@ -50,7 +51,7 @@ def make_sdfg(implementation, dtype, storage=dace.StorageType.Default): pivots = state.add_access("pivots" + suffix) result = state.add_access("result" + suffix) - getrf_node = lapack.nodes.getrf.Getrf("getrf") + getrf_node = lapack.Getrf("getrf") getrf_node.implementation = implementation state.add_memlet_path(xin, getrf_node, dst_conn="_xin", memlet=Memlet.simple(xin, "0:n, 0:n", num_accesses=n * n)) diff --git a/tests/library/lapack_getrs_test.py b/tests/library/lapack_getrs_test.py index 6b3f9d3d67..97a2a35a6b 100644 --- a/tests/library/lapack_getrs_test.py +++ b/tests/library/lapack_getrs_test.py @@ -1,11 +1,12 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import dace -from dace.memlet import Memlet -import dace.libraries.blas as blas import dace.libraries.lapack as lapack +import dace.libraries.standard as std import numpy as np import pytest +from dace.memlet import Memlet + ############################################################################### @@ -39,7 +40,7 @@ def make_sdfg(implementation, dtype, storage=dace.StorageType.Default): Bho = state.add_read("B") Bin = state.add_access("B" + suffix) Bout = state.add_access("B" + suffix) - transpose_in = blas.nodes.transpose.Transpose("transpose_in", dtype=dtype) + transpose_in = std.Transpose("transpose_in", dtype=dtype) transpose_in.implementation = "cuBLAS" state.add_nedge(Ahi, Ai, Memlet.from_array(*Ahost_arr)) state.add_nedge(Bhi, Bin, Memlet.from_array(*Bhost_arr)) @@ -55,9 +56,9 @@ def make_sdfg(implementation, dtype, storage=dace.StorageType.Default): res_getrf = state.add_access("result_getrf" + suffix) res_getrs = state.add_access("result_getrs" + suffix) - getrf_node = lapack.nodes.getrf.Getrf("getrf") + getrf_node = lapack.Getrf("getrf") getrf_node.implementation = implementation - getrs_node = lapack.nodes.getrs.Getrs("getrs") + getrs_node = lapack.Getrs("getrs") getrs_node.implementation = implementation state.add_memlet_path(Ain, getrf_node, dst_conn="_xin", memlet=Memlet.simple(Ain, "0:n, 0:n", num_accesses=n * n)) diff --git a/tests/library/lapack_potrf_test.py b/tests/library/lapack_potrf_test.py index a977d347ae..7f7f6714d1 100644 --- a/tests/library/lapack_potrf_test.py +++ b/tests/library/lapack_potrf_test.py @@ -1,11 +1,12 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace -from dace.memlet import Memlet -import dace.libraries.blas as blas import dace.libraries.lapack as lapack +import dace.libraries.standard as std import numpy as np import pytest +from dace.memlet import Memlet + ############################################################################### @@ -40,9 +41,9 @@ def make_sdfg(implementation, dtype, storage=dace.StorageType.Default): xo = state.add_access("x" + suffix) xin = state.add_access("xt" + suffix) xout = state.add_access("xt" + suffix) - transpose_in = blas.nodes.transpose.Transpose("transpose_in", dtype=dtype) + transpose_in = std.Transpose("transpose_in", dtype=dtype) transpose_in.implementation = "cuBLAS" - transpose_out = blas.nodes.transpose.Transpose("transpose_out", dtype=dtype) + transpose_out = std.Transpose("transpose_out", dtype=dtype) transpose_out.implementation = "cuBLAS" state.add_nedge(xhi, xi, Memlet.from_array(*xhost_arr)) state.add_nedge(xo, xho, Memlet.from_array(*xhost_arr)) @@ -55,7 +56,7 @@ def make_sdfg(implementation, dtype, storage=dace.StorageType.Default): xout = state.add_access("x" + suffix) result = state.add_access("result" + suffix) - potrf_node = lapack.nodes.potrf.Potrf("potrf") + potrf_node = lapack.Potrf("potrf") potrf_node.implementation = implementation state.add_memlet_path(xin, potrf_node, dst_conn="_xin", memlet=Memlet.simple(xin, "0:n, 0:n", num_accesses=n * n)) diff --git a/tests/transformations/redundant_copy_test.py b/tests/transformations/redundant_copy_test.py index 494149fae0..ecf25e07d4 100644 --- a/tests/transformations/redundant_copy_test.py +++ b/tests/transformations/redundant_copy_test.py @@ -1,10 +1,10 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import numpy as np import pytest import dace from dace import nodes -from dace.libraries.blas import Transpose +from dace.libraries.standard import Transpose from dace.transformation.dataflow import (RedundantArray, RedundantSecondArray, RedundantArrayCopying, RedundantArrayCopyingIn) From ab4b79fb95ef7f4aea3aef1e2758c916d70e443c Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Sat, 15 Jul 2023 13:08:47 +0200 Subject: [PATCH 259/392] Move TensorTranspose LibraryNode to STD. --- dace/frontend/python/replacements.py | 2 +- dace/libraries/linalg/nodes/tensordot.py | 2 +- dace/libraries/standard/environments/__init__.py | 5 +++-- .../libraries/{ttranspose => standard}/environments/hptt.py | 1 - dace/libraries/standard/nodes/__init__.py | 1 + dace/libraries/{ttranspose => standard}/nodes/ttranspose.py | 0 dace/libraries/ttranspose/__init__.py | 6 ------ dace/libraries/ttranspose/environments/__init__.py | 2 -- dace/libraries/ttranspose/nodes/__init__.py | 2 -- 9 files changed, 6 insertions(+), 15 deletions(-) rename dace/libraries/{ttranspose => standard}/environments/hptt.py (98%) rename dace/libraries/{ttranspose => standard}/nodes/ttranspose.py (100%) delete mode 100644 dace/libraries/ttranspose/__init__.py delete mode 100644 dace/libraries/ttranspose/environments/__init__.py delete mode 100644 dace/libraries/ttranspose/nodes/__init__.py diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 0a2a12bd35..29aea390f3 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -796,7 +796,7 @@ def _transpose(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, inpname: str, a read = state.add_read(inpname) write = state.add_write(outname) - from dace.libraries.ttranspose import TensorTranspose + from dace.libraries.standard import TensorTranspose tasklet = TensorTranspose('_TensorTranspose', axes or list(range(len(arr1.shape)))) state.add_node(tasklet) state.add_edge(read, None, tasklet, '_inp_tensor', Memlet.from_array(inpname, arr1)) diff --git a/dace/libraries/linalg/nodes/tensordot.py b/dace/libraries/linalg/nodes/tensordot.py index 6e20b676d1..99fbc5bdc5 100644 --- a/dace/libraries/linalg/nodes/tensordot.py +++ b/dace/libraries/linalg/nodes/tensordot.py @@ -232,7 +232,7 @@ def expansion(node, parent_state, parent_sdfg): state.add_edge(tasklet, '_c', dot_vnode, None, dace.Memlet.from_array(dot_vname, dot_view)) state.add_edge(dot_vnode, 'views', dot_anode, None, dace.Memlet.from_array(dot_name, dot_arr)) out_node = state.add_write('_out_tensor') - from dace.libraries.ttranspose import TensorTranspose + from dace.libraries.standard import TensorTranspose tasklet = TensorTranspose('_TensorTranspose', node.permutation) state.add_edge(dot_anode, None, tasklet, '_inp_tensor', dace.Memlet.from_array(dot_name, dot_arr)) state.add_edge(tasklet, '_out_tensor', out_node, None, dace.Memlet.from_array('_out_tensor', out_arr)) diff --git a/dace/libraries/standard/environments/__init__.py b/dace/libraries/standard/environments/__init__.py index a0204b85ad..d8f585ebd9 100644 --- a/dace/libraries/standard/environments/__init__.py +++ b/dace/libraries/standard/environments/__init__.py @@ -1,2 +1,3 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -from .cuda import CUDA \ No newline at end of file +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+from .cuda import CUDA +from .hptt import HPTT diff --git a/dace/libraries/ttranspose/environments/hptt.py b/dace/libraries/standard/environments/hptt.py similarity index 98% rename from dace/libraries/ttranspose/environments/hptt.py rename to dace/libraries/standard/environments/hptt.py index 2ee7695437..9d65da7111 100644 --- a/dace/libraries/ttranspose/environments/hptt.py +++ b/dace/libraries/standard/environments/hptt.py @@ -1,5 +1,4 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -import ctypes import os from dace import config, library diff --git a/dace/libraries/standard/nodes/__init__.py b/dace/libraries/standard/nodes/__init__.py index 3ad0d505d1..c367a4a109 100644 --- a/dace/libraries/standard/nodes/__init__.py +++ b/dace/libraries/standard/nodes/__init__.py @@ -3,3 +3,4 @@ from .gearbox import Gearbox from .reduce import Reduce from .transpose import Transpose +from .ttranspose import TensorTranspose diff --git a/dace/libraries/ttranspose/nodes/ttranspose.py b/dace/libraries/standard/nodes/ttranspose.py similarity index 100% rename from dace/libraries/ttranspose/nodes/ttranspose.py rename to dace/libraries/standard/nodes/ttranspose.py diff --git a/dace/libraries/ttranspose/__init__.py b/dace/libraries/ttranspose/__init__.py deleted file mode 100644 index 868af39ac6..0000000000 --- a/dace/libraries/ttranspose/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -from dace.library import register_library -from .nodes import * -from .environments import * - -register_library(__name__, "ttranspose") diff --git a/dace/libraries/ttranspose/environments/__init__.py b/dace/libraries/ttranspose/environments/__init__.py deleted file mode 100644 index 267e46a5e4..0000000000 --- a/dace/libraries/ttranspose/environments/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -from .hptt import * diff --git a/dace/libraries/ttranspose/nodes/__init__.py b/dace/libraries/ttranspose/nodes/__init__.py deleted file mode 100644 index a6d558f2ed..0000000000 --- a/dace/libraries/ttranspose/nodes/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserv -from .ttranspose import TensorTranspose From 97c2b8a4d1d97afac1ec74c3cbb40ecde82bb364 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Sat, 15 Jul 2023 13:09:50 +0200 Subject: [PATCH 260/392] Remove ttranspose library from configuration schema. --- dace/config_schema.yml | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/dace/config_schema.yml b/dace/config_schema.yml index ea862e5237..e378b6c1f2 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -1016,18 +1016,3 @@ required: description: > Force the default implementation, even if an implementation has been explicitly set on a node. - ttranspose: - type: dict - title: ttranspose - description: Built-in Tensor-Transpose DaCe library. - required: - default_implementation: - type: str - default: pure - description: Default implementation of Tensor-Transpose library nodes. - override: - type: bool - default: false - description: > - Force the default implementation, even if an - implementation has been explicitly set on a node. 
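For illustration, a minimal sketch (not part of these patches) of how a TensorTranspose node is constructed from its new home in the standard library. The import path, positional axes argument, and the '_inp_tensor'/'_out_tensor' connector names follow the usage shown in replacements.py and tensordot.py above; the data names, shapes, and permutation are made up for the example.

    import dace
    from dace.libraries.standard import TensorTranspose

    sdfg = dace.SDFG('ttranspose_example')
    state = sdfg.add_state()
    iname, idesc = sdfg.add_array('inp', [4, 3, 2], dace.float64)
    oname, odesc = sdfg.add_array('out', [2, 4, 3], dace.float64)

    rnode = state.add_read(iname)
    wnode = state.add_write(oname)
    # Permutation [2, 0, 1]: out[i, j, k] = inp[j, k, i], hence the [2, 4, 3] output shape.
    node = TensorTranspose('ttranspose', [2, 0, 1])
    state.add_node(node)
    state.add_edge(rnode, None, node, '_inp_tensor', dace.Memlet.from_array(iname, idesc))
    state.add_edge(node, '_out_tensor', wnode, None, dace.Memlet.from_array(oname, odesc))
    sdfg.validate()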
From 39397dedd2b11e90876d9f9b7d99977800e25e2d Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Sat, 15 Jul 2023 13:23:59 +0200 Subject: [PATCH 261/392] Transpose LibraryNodes now have their default implementation set to 'pure'. --- dace/libraries/standard/nodes/transpose.py | 2 +- dace/libraries/standard/nodes/ttranspose.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/libraries/standard/nodes/transpose.py b/dace/libraries/standard/nodes/transpose.py index 8f4b55b6be..9963fc823b 100644 --- a/dace/libraries/standard/nodes/transpose.py +++ b/dace/libraries/standard/nodes/transpose.py @@ -215,7 +215,7 @@ class Transpose(dace.sdfg.nodes.LibraryNode): "OpenBLAS": ExpandTransposeOpenBLAS, "cuBLAS": ExpandTransposeCuBLAS } - default_implementation = None + default_implementation = 'pure' dtype = dace.properties.TypeClassProperty(allow_none=True) diff --git a/dace/libraries/standard/nodes/ttranspose.py b/dace/libraries/standard/nodes/ttranspose.py index 9d87a84343..e11012e3ad 100644 --- a/dace/libraries/standard/nodes/ttranspose.py +++ b/dace/libraries/standard/nodes/ttranspose.py @@ -86,7 +86,7 @@ class TensorTranspose(nodes.LibraryNode): """ Implements out-of-place tensor transpositions. """ implementations = {"pure": ExpandPure, "HPTT": ExpandHPTT} - default_implementation = None + default_implementation = 'pure' axes = properties.ListProperty(element_type=int, default=[], desc="Permutation of input tensor's modes") alpha = properties.Property(dtype=Number, default=1, desc="Input tensor scaling factor") From f5727cfa723ffaaed2d038176d09cd4fa9c9ec8f Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Sat, 15 Jul 2023 15:56:49 +0200 Subject: [PATCH 262/392] Added method that tests equality between two symbolic expressions. --- dace/symbolic.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/dace/symbolic.py b/dace/symbolic.py index ec2c9806c2..01440d465e 100644 --- a/dace/symbolic.py +++ b/dace/symbolic.py @@ -1337,3 +1337,27 @@ def inequal_symbols(a: Union[sympy.Expr, Any], b: Union[sympy.Expr, Any]) -> boo # We subtract and compare to zero according to the SymPy documentation # (https://docs.sympy.org/latest/tutorial/gotchas.html). return (a - b).simplify() != 0 + + +def equal(a: SymbolicType, b: SymbolicType, is_length: bool = True) -> Union[bool, None]: + """ + Compares 2 symbolic expressions and returns True if they are equal, False if they are inequal, + and None if the comparison is inconclusive. + + :param a: First symbolic expression. + :param b: Second symbolic expression. + :param is_length: If True, the assumptions that a, b are integers and positive are made. + """ + + args = [arg.expr if isinstance(arg, SymExpr) else arg for arg in (a, b)] + + if any([args is None for args in args]): + return False + + facts = [] + if is_length: + for arg in args: + facts += [sympy.Q.integer(arg), sympy.Q.positive(arg)] + + with sympy.assuming(*facts): + return sympy.ask(sympy.Q.is_true(sympy.Eq(*args))) From 92872fe4d5c41a39e7dbc3e7d2707b671e3f5f82 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Sat, 15 Jul 2023 15:57:25 +0200 Subject: [PATCH 263/392] The frontend's @ replacement now makes use of the new symbolic method. 
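The user-visible effect is that '@' between operands with distinct shape symbols no longer raises when equality of the contracted dimensions cannot be decided; parsing only warns, and the program still runs when the runtime sizes agree. A minimal sketch, mirroring the test_mm_symbolic test added two patches later (the function name is illustrative):

    import dace
    import numpy as np

    M, K, N, O = (dace.symbol(s) for s in 'MKNO')

    @dace.program
    def mm(a: dace.float64[M, K], b: dace.float64[O, N]):
        return a @ b

    a = np.random.rand(32, 33)
    b = np.random.rand(33, 34)
    c = mm(a, b)  # K and O both bind to 33 at call time; parsing only emits a UserWarning
    assert np.allclose(c, a @ b)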
--- dace/frontend/python/replacements.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 528aef1ec8..1fcf1d321e 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -2133,10 +2133,13 @@ def _matmult(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, op1: str, op if len(arr1.shape) > 1 and len(arr2.shape) > 1: # matrix * matrix if len(arr1.shape) > 3 or len(arr2.shape) > 3: - raise SyntaxError('Matrix multiplication of tensors of dimensions > 3 ' - 'not supported') + raise SyntaxError('Matrix multiplication of tensors of dimensions > 3 not supported') - if arr1.shape[-1] != arr2.shape[-2]: + res = symbolic.equal(arr1.shape[-1], arr2.shape[-2]) + if res is None: + warnings.warn(f'Last mode of first tensor/matrix {arr1.shape[-1]} and second-last mode of ' + f'second tensor/matrix {arr2.shape[-2]} may not match', UserWarning) + elif not res: raise SyntaxError('Matrix dimension mismatch %s != %s' % (arr1.shape[-1], arr2.shape[-2])) from dace.libraries.blas.nodes.matmul import _get_batchmm_opts @@ -2150,7 +2153,11 @@ def _matmult(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, op1: str, op elif len(arr1.shape) == 2 and len(arr2.shape) == 1: # matrix * vector - if arr1.shape[1] != arr2.shape[0]: + res = symbolic.equal(arr1.shape[-1], arr2.shape[0]) + if res is None: + warnings.warn(f'Number of matrix columns {arr1.shape[-1]} and length of vector {arr2.shape[0]} ' + f'may not match', UserWarning) + elif not res: raise SyntaxError("Number of matrix columns {} must match" "size of vector {}.".format(arr1.shape[1], arr2.shape[0])) @@ -2158,7 +2165,11 @@ def _matmult(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, op1: str, op elif len(arr1.shape) == 1 and len(arr2.shape) == 2: # vector * matrix - if arr1.shape[0] != arr2.shape[0]: + res = symbolic.equal(arr1.shape[0], arr2.shape[0]) + if res is None: + warnings.warn(f'Length of vector {arr1.shape[0]} and number of matrix rows {arr2.shape[0]} ' + f'may not match', UserWarning) + elif not res: raise SyntaxError("Size of vector {} must match number of matrix " "rows {} must match".format(arr1.shape[0], arr2.shape[0])) @@ -2166,7 +2177,11 @@ def _matmult(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, op1: str, op elif len(arr1.shape) == 1 and len(arr2.shape) == 1: # vector * vector - if arr1.shape[0] != arr2.shape[0]: + res = symbolic.equal(arr1.shape[0], arr2.shape[0]) + if res is None: + warnings.warn(f'Length of first vector {arr1.shape[0]} and length of second vector {arr2.shape[0]} ' + f'may not match', UserWarning) + elif not res: raise SyntaxError("Vectors in vector product must have same size: " "{} vs. {}".format(arr1.shape[0], arr2.shape[0])) From adf32d76ca2ea136e4f450533b01fd34c86891cf Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Sat, 15 Jul 2023 15:58:06 +0200 Subject: [PATCH 264/392] Added partial support for the new symbolic method to matmul-related library nodes.
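As a reminder of the semantics these checks rely on, symbolic.equal is three-valued: it returns True or False when (in)equality is provable under the positive-integer assumptions and None otherwise, and only the None case downgrades the old error to a warning. A small sketch (symbol names are illustrative; the expected results assume standard SymPy ask() behavior):

    import dace
    from dace import symbolic

    N = dace.symbol('N')
    M = dace.symbol('M')

    print(symbolic.equal(N, N))      # expected True: provably equal
    print(symbolic.equal(N, N + 1))  # expected False: provably different
    print(symbolic.equal(N, M))      # expected None: inconclusive, so validation warns instead of raising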
--- dace/libraries/blas/nodes/batched_matmul.py | 23 +++++++---- dace/libraries/blas/nodes/gemm.py | 45 +++++++++++++++------ dace/libraries/blas/nodes/matmul.py | 15 ++++--- 3 files changed, 57 insertions(+), 26 deletions(-) diff --git a/dace/libraries/blas/nodes/batched_matmul.py b/dace/libraries/blas/nodes/batched_matmul.py index 1c9e3dc2c5..1ced5b0cba 100644 --- a/dace/libraries/blas/nodes/batched_matmul.py +++ b/dace/libraries/blas/nodes/batched_matmul.py @@ -1,8 +1,7 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. from copy import deepcopy as dc from dace import dtypes, memlet as mm, properties, data as dt -from typing import Any, Dict, Optional -from dace.symbolic import symstr +from dace.symbolic import symstr, equal import dace.library import dace.properties from dace.frontend.common import op_repository as oprepo @@ -12,6 +11,7 @@ to_cublas_computetype) from dace.libraries.blas.nodes.matmul import (_get_matmul_operands, _get_batchmm_opts, _get_codegen_gemm_opts) from .. import environments +import warnings @dace.library.expansion @@ -28,8 +28,12 @@ def make_sdfg(node, parent_state, parent_sdfg): cdesc = parent_sdfg.arrays[outedge.data.data] bopt = _get_batchmm_opts(shape_a, strides_a, shape_b, strides_b, cdesc.shape, cdesc.strides) - if shape_a[-1] != shape_b[-2]: - raise SyntaxError('Matrix sizes must match') + res = equal(shape_a[-1], shape_b[-2]) + if res is None: + warnings.warn(f"First matrix columns {shape_a[-1]} may not match second matrix rows {shape_b[-2]}", + UserWarning) + elif not res: + raise SyntaxError("Matrix sizes must match") if bopt: shape_c = (bopt['b'], shape_a[-2], shape_b[-1]) else: @@ -436,9 +440,12 @@ def validate(self, sdfg, state): raise ValueError("Batched matrix-matrix product only supported on matrices") if len(size1) != 3: raise ValueError("Batched matrix-matrix product only supported on matrices") - if size0[-1] != size1[-2]: - raise ValueError("Inputs to matrix-matrix product " - "must agree in the k-dimension") + res = equal(size0[-1], size1[-2]) + if res is None: + warnings.warn(f'First tensor\'s last mode {size0[-1]} and second tensor\'s second-last mode {size1[-2]} ' + f'may not match', UserWarning) + elif not res: + raise ValueError("Inputs to matrix-matrix product must agree in the k-dimension") out_subset = dc(out_memlet.subset) out_subset.squeeze() size2 = out_subset.size() diff --git a/dace/libraries/blas/nodes/gemm.py b/dace/libraries/blas/nodes/gemm.py index 767cd53429..2db2055ae5 100644 --- a/dace/libraries/blas/nodes/gemm.py +++ b/dace/libraries/blas/nodes/gemm.py @@ -1,8 +1,7 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. from copy import deepcopy as dc -from typing import Any, Dict, Optional from dace import dtypes, memlet as mm, properties, data as dt -from dace.symbolic import symstr +from dace.symbolic import symstr, equal import dace.library from dace import SDFG, SDFGState from dace.frontend.common import op_repository as oprepo @@ -13,7 +12,7 @@ from dace.libraries.blas.nodes.matmul import (_get_matmul_operands, _get_codegen_gemm_opts) from .. 
import environments import numpy as np -from numbers import Number +import warnings def _is_complex(dtype): @@ -65,7 +64,13 @@ def make_sdfg(node, parent_state, parent_sdfg): else: trans_shape_b = shape_b - if (len(trans_shape_a) != 2 or len(trans_shape_b) != 2 or trans_shape_a[1] != trans_shape_b[0]): + if len(trans_shape_a) != 2 or len(trans_shape_b) != 2: + raise SyntaxError("Matrix sizes must match") + res = equal(trans_shape_a[1], trans_shape_b[0]) + if res is None: + warnings.warn(f"First matrix columns {trans_shape_a[1]} may not match " + f"second matrix rows {trans_shape_b[0]}", UserWarning) + elif not res: raise SyntaxError("Matrix sizes must match") M, K, N = trans_shape_a[0], trans_shape_a[1], trans_shape_b[1] shape_c = (M, N) @@ -1032,19 +1037,33 @@ def validate(self, sdfg, state): # Function is symmetric, edge order does not matter if len(size0) != 2 or len(size1) != 2: raise ValueError("matrix-matrix product only supported on matrices") - if size0[1] != size1[0]: - raise ValueError("Inputs to matrix-matrix product " - "must agree in the k-dimension") + res = equal(size0[1], size1[0]) + if res is None: + warnings.warn(f'First matrix columns {size0[1]} and second matrix rows {size1[0]} may not match', + UserWarning) + elif not res: + raise ValueError("Inputs to matrix-matrix product must agree in the k-dimension") out_subset = dc(out_memlet.subset) out_subset.squeeze() size3 = out_subset.size() - if size2 is not None and size2 != size3: - raise ValueError("Input C matrix must match output matrix.") + if size2 is not None: + res = [equal(s0, s1) for s0, s1 in zip(size2, size3)] + fail = any([r is False for r in res]) + success = all([r is True for r in res]) + if fail: + raise ValueError("Input C matrix must match output matrix.") + elif not success: + warnings.warn(f"Size of input C matrix {size2} may not match output matrix size {size3}", UserWarning) if len(size3) != 2: raise ValueError("matrix-matrix product only supported on matrices") - if len(size3) == 2 and list(size3) != [size0[-2], size1[-1]]: - raise ValueError("Output to matrix-matrix product must agree in the m and n " - "dimensions") + if len(size3) == 2: + res = [equal(s0, s1) for s0, s1 in zip(size3, [size0[-2], size1[-1]])] + fail = any([r is False for r in res]) + success = all([r is True for r in res]) + if fail: + raise ValueError("Output to matrix-matrix product must agree in the m and n dimensions") + elif not success: + warnings.warn(f'Size of output {size3} may not match input {size0} @ {size1}', UserWarning) # Numpy replacement diff --git a/dace/libraries/blas/nodes/matmul.py b/dace/libraries/blas/nodes/matmul.py index 185beee1a0..f0767a0473 100644 --- a/dace/libraries/blas/nodes/matmul.py +++ b/dace/libraries/blas/nodes/matmul.py @@ -1,8 +1,8 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import dace -from dace import properties +from dace import properties, symbolic from copy import deepcopy as dc -from typing import Any, Dict, Optional +from typing import Any, Dict import warnings @@ -58,8 +58,13 @@ def _get_batchmm_opts(a_shape, a_strides, b_shape, b_strides, c_shape, c_strides batch = a_shape[0] stride_a = a_strides[0] if len(b_shape) == 3: - if batch and batch != b_shape[0]: - raise ValueError('Batch size mismatch for matrix multiplication') + if batch is not None: + res = symbolic.equal(batch, b_shape[0]) + if res is None: + warnings.warn(f'Batch size of first tensor ({batch}) may not match second tensor ({b_shape[0]})', + UserWarning) + elif not res: + raise ValueError('Batch size mismatch for matrix multiplication') batch = b_shape[0] stride_b = b_strides[0] if c_shape and len(c_shape) == 3: From 03245d405b5ad1774aec66c7e5cc3002465e546a Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Sat, 15 Jul 2023 15:58:31 +0200 Subject: [PATCH 265/392] Added new tests. --- tests/library/gemm_test.py | 53 ++++++++++++++++++++++- tests/numpy/matrix_multiplication_test.py | 26 ++++++++++- 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/tests/library/gemm_test.py b/tests/library/gemm_test.py index 0754121e98..df60d1aa43 100644 --- a/tests/library/gemm_test.py +++ b/tests/library/gemm_test.py @@ -12,6 +12,8 @@ M = dace.symbol('M') K = dace.symbol('K') N = dace.symbol('N') +L = dace.symbol('L') +O = dace.symbol('O') @pytest.mark.parametrize( @@ -171,8 +173,55 @@ def params_generator(grid): "misconfigured, skipping test for {}.".format(implementation)) +def test_gemm_symbolic(): + sdfg = dace.SDFG("gemm") + state = sdfg.add_state() + A, A_arr = sdfg.add_array("A", [M, K], dace.float64) + B, B_arr = sdfg.add_array("B", [L, N], dace.float64) + C, C_arr = sdfg.add_array("C", [O, N], dace.float64) + + rA = state.add_read("A") + rB = state.add_read("B") + wC = state.add_write("C") + + libnode = Gemm('_Gemm_', transA=False, transB=False, alpha=1.0, beta=0.0) + state.add_node(libnode) + + state.add_edge(rA, None, libnode, '_a', dace.Memlet.from_array(A, A_arr)) + state.add_edge(rB, None, libnode, '_b', dace.Memlet.from_array(B, B_arr)) + state.add_edge(libnode, '_c', wC, None, dace.Memlet.from_array(C, C_arr)) + + sdfg.validate() + + +def test_gemm_symbolic_1(): + sdfg = dace.SDFG("gemm") + state = sdfg.add_state() + A, A_arr = sdfg.add_array("A", [M, K], dace.float64) + B, B_arr = sdfg.add_array("B", [K + 2, N], dace.float64) + C, C_arr = sdfg.add_array("C", [M, N], dace.float64) + + rA = state.add_read("A") + rB = state.add_read("B") + wC = state.add_write("C") + + libnode = Gemm('_Gemm_', transA=False, transB=False, alpha=1.0, beta=0.0) + state.add_node(libnode) + + state.add_edge(rA, None, libnode, '_a', dace.Memlet.from_array(A, A_arr)) + state.add_edge(rB, None, libnode, '_b', dace.Memlet.from_array(B, B_arr)) + state.add_edge(libnode, '_c', wC, None, dace.Memlet.from_array(C, C_arr)) + + try: + sdfg.validate() + except dace.sdfg.InvalidSDFGError: + pass + + if __name__ == "__main__": if len(sys.argv) > 1 and sys.argv[1] == 'gpu': test_library_gemm('cuBLAS') - test_library_gemm('pure') - test_library_gemm('MKL') + # test_library_gemm('pure') + # test_library_gemm('MKL') + test_gemm_symbolic() + test_gemm_symbolic_1() diff --git a/tests/numpy/matrix_multiplication_test.py b/tests/numpy/matrix_multiplication_test.py index 1fa23c7268..a825d5d0ab 100644 --- a/tests/numpy/matrix_multiplication_test.py +++ b/tests/numpy/matrix_multiplication_test.py @@ -1,9 +1,9 
@@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import unittest import dace import numpy as np -B, M, N, K = tuple(dace.symbol(k) for k in 'BMNK') +B, M, N, K, L, O = tuple(dace.symbol(k) for k in 'BMNKLO') class MatrixMultiplication(unittest.TestCase): @@ -39,6 +39,28 @@ def mmmtest(a: dace.float64[M, K], b: dace.float64[B, K, N]): c = mmmtest(a, b) self.assertEqual(list(c.shape), [3, 34, 31]) self.assertTrue(np.allclose(c, a @ b)) + + def test_mm_symbolic(self): + @dace.program + def mmtest_symbolic(a: dace.float64[M, K], b: dace.float64[O, N]): + return a @ b + + a = np.random.rand(32, 33) + b = np.random.rand(33, 34) + c = mmtest_symbolic(a, b) + self.assertEqual(list(c.shape), [32, 34]) + self.assertTrue(np.allclose(c, a @ b)) + + def test_mmm_batch_symbolic(self): + @dace.program + def mmmtest_symbolic(a: dace.float64[B, M, K], b: dace.float64[L, O, N]): + return a @ b + + a = np.random.rand(3, 34, 32) + b = np.random.rand(3, 32, 31) + c = mmmtest_symbolic(a, b) + self.assertEqual(list(c.shape), [3, 34, 31]) + self.assertTrue(np.allclose(c, a @ b)) if __name__ == '__main__': From adf32d76ca2ea136e4f450533b01fd34c86891cf Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 07:56:22 -0700 Subject: [PATCH 266/392] Support None sets in SetProperty --- dace/properties.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dace/properties.py b/dace/properties.py index 0e8a010d71..6e883f8549 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -894,11 +894,18 @@ def from_json(self, l, sdfg=None): return set(l) def __get__(self, obj, objtype=None): + val = super(SetProperty, self).__get__(obj, objtype) + if val is None: + return val + # Copy to avoid changes in the set at callee to be reflected in # the node directly - return set(super(SetProperty, self).__get__(obj, objtype)) + return set(val) def __set__(self, obj, val): + if val is None: + return super(SetProperty, self).__set__(obj, val) + # Check for uniqueness if len(val) != len(set(val)): dups = set([x for x in val if val.count(x) > 1]) From 5d7c67b0e13e73452e19845725c922fbadf88a39 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 07:56:37 -0700 Subject: [PATCH 267/392] Filter symbols in PruneSymbols --- dace/transformation/passes/prune_symbols.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dace/transformation/passes/prune_symbols.py b/dace/transformation/passes/prune_symbols.py index 26530b87dc..05220763a9 100644 --- a/dace/transformation/passes/prune_symbols.py +++ b/dace/transformation/passes/prune_symbols.py @@ -23,6 +23,7 @@ class RemoveUnusedSymbols(ppl.Pass): CATEGORY: str = 'Simplification' recursive = properties.Property(dtype=bool, default=True, desc='Prune nested SDFGs recursively') + symbols = properties.SetProperty(element_type=str, allow_none=True, desc='Limit considered symbols to this set') def modifies(self) -> ppl.Modifies: return ppl.Modifies.Symbols @@ -43,11 +44,13 @@ def apply_pass(self, sdfg: SDFG, _) -> Optional[Set[Tuple[int, str]]]: """ result: Set[str] = set() + symbols_to_consider = self.symbols or set(sdfg.symbols.keys()) + # Compute used symbols used_symbols = self.used_symbols(sdfg) # Remove unused symbols - for sym in set(sdfg.symbols.keys()) - used_symbols: + for sym in symbols_to_consider - used_symbols: sdfg.remove_symbol(sym) result.add(sym) From 25839427a4e7415bd10bd37155aa364abc7d03d5 Mon Sep 17 00:00:00 
2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 07:57:35 -0700 Subject: [PATCH 268/392] Persistent fusion: Remove now-unused symbols and do not remove used scalars --- .../subgraph/gpu_persistent_fusion.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/dace/transformation/subgraph/gpu_persistent_fusion.py b/dace/transformation/subgraph/gpu_persistent_fusion.py index 1cf93469bb..df8511a288 100644 --- a/dace/transformation/subgraph/gpu_persistent_fusion.py +++ b/dace/transformation/subgraph/gpu_persistent_fusion.py @@ -45,7 +45,7 @@ class GPUPersistentKernel(SubgraphTransformation): validate = Property( desc="Validate the sdfg and the nested sdfg", dtype=bool, - default=True, + default=False, ) include_in_assignment = Property( @@ -172,10 +172,15 @@ def apply(self, sdfg: SDFG): # create sdfg for kernel and fill it with states and edges from # ssubgraph dfg will be nested at the end kernel_sdfg = SDFG('{}kernel'.format(self.kernel_prefix + '_' if self.kernel_prefix != '' else '')) + new_symbols = set() edges = subgraph.edges() for edge in edges: kernel_sdfg.add_edge(edge.src, edge.dst, edge.data) + for k in entry_edge.data.assignments: + new_symbols.add(k) + if k in sdfg.symbols and k not in kernel_sdfg.symbols: + kernel_sdfg.add_symbol(k, sdfg.symbols[k]) # Setting entry node in nested SDFG if no entry guard was created if entry_guard_state is None: @@ -187,6 +192,7 @@ def apply(self, sdfg: SDFG): # remove the now nested nodes from the outer sdfg and make sure the # launch state is properly connected to remaining states sdfg.remove_nodes_from(subgraph.nodes()) + other_states = sdfg.nodes() if entry_state_out is not None \ and len(sdfg.edges_between(entry_state_out, launch_state)) == 0: @@ -199,13 +205,16 @@ def apply(self, sdfg: SDFG): # Handle data for kernel kernel_data = set(node.data for state in kernel_sdfg for node in state.nodes() if isinstance(node, nodes.AccessNode)) + other_data = set(node.data for state in other_states for node in state.nodes() + if isinstance(node, nodes.AccessNode)) # move Streams and Register data into the nested SDFG # normal data will be added as kernel argument kernel_args = [] for data in kernel_data: - if (isinstance(sdfg.arrays[data], dace.data.Stream) or - (isinstance(sdfg.arrays[data], dace.data.Array) and sdfg.arrays[data].storage == StorageType.Register)): + if data not in other_data and (isinstance(sdfg.arrays[data], dace.data.Stream) or + (isinstance(sdfg.arrays[data], dace.data.Array) + and sdfg.arrays[data].storage == StorageType.Register)): kernel_sdfg.add_datadesc(data, sdfg.arrays[data]) del sdfg.arrays[data] else: @@ -266,6 +275,12 @@ def apply(self, sdfg: SDFG): src_conn=arg, memlet=Memlet.from_array(arg, sdfg.arrays[arg])) + # Remove no-longer-used symbols in parent SDFG + from dace.transformation.passes.prune_symbols import RemoveUnusedSymbols + p = RemoveUnusedSymbols() + p.symbols = new_symbols + p.apply_pass(sdfg, {}) + # Transformation is done if self.validate: sdfg.validate() From 174eb8ed8f64f6a6a7fa54b4893a0d20ff96b933 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 09:07:59 -0700 Subject: [PATCH 269/392] CUDA codegen: persistent free tasklet write changes generated scope --- dace/codegen/targets/cuda.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 8f0139f8fb..ee49f04d03 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1306,10 
+1306,31 @@ def generate_devicelevel_state(self, sdfg, state, function_stream, callsite_stre for c in components: has_map = any(isinstance(node, dace.nodes.MapEntry) for node in c.nodes()) + # If a global is modified, execute once per global state, + # if a shared memory element is modified, execute once per block, + # if a local scalar is modified, execute in every thread. if not has_map: - callsite_stream.write("if (blockIdx.x == 0 " - "&& threadIdx.x == 0) " - "{ // sub-graph begin", sdfg, state.node_id) + written_nodes = [n for n in c if state.in_degree(n) > 0 and isinstance(n, dace.nodes.AccessNode)] + + # The order of the branching below matters - it reduces the scope with every detected write + write_scope = 'thread' # General case acts in every thread + if any(sdfg.arrays[n.data].storage in (dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned) + for n in written_nodes): + write_scope = 'grid' + if any(sdfg.arrays[n.data].storage == dtypes.StorageType.GPU_Shared for n in written_nodes): + write_scope = 'block' + if any(sdfg.arrays[n.data].storage == dtypes.StorageType.Register for n in written_nodes): + write_scope = 'thread' + + if write_scope == 'grid': + callsite_stream.write("if (blockIdx.x == 0 " + "&& threadIdx.x == 0) " + "{ // sub-graph begin", sdfg, state.node_id) + elif write_scope == 'block': + callsite_stream.write("if (threadIdx.x == 0) " + "{ // sub-graph begin", sdfg, state.node_id) + else: + callsite_stream.write("{ // subgraph begin", sdfg, state.node_id) else: callsite_stream.write("{ // subgraph begin", sdfg, state.node_id) From aae25a772793e2e9ddbe538b1d4325090c39c90e Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 09:08:17 -0700 Subject: [PATCH 270/392] Handle empty inputs/outputs --- .../subgraph/gpu_persistent_fusion.py | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/dace/transformation/subgraph/gpu_persistent_fusion.py b/dace/transformation/subgraph/gpu_persistent_fusion.py index df8511a288..ff4812d0af 100644 --- a/dace/transformation/subgraph/gpu_persistent_fusion.py +++ b/dace/transformation/subgraph/gpu_persistent_fusion.py @@ -213,8 +213,8 @@ def apply(self, sdfg: SDFG): kernel_args = [] for data in kernel_data: if data not in other_data and (isinstance(sdfg.arrays[data], dace.data.Stream) or - (isinstance(sdfg.arrays[data], dace.data.Array) - and sdfg.arrays[data].storage == StorageType.Register)): + (isinstance(sdfg.arrays[data], dace.data.Array) and sdfg.arrays[data].storage + in (StorageType.Register, StorageType.GPU_Shared))): kernel_sdfg.add_datadesc(data, sdfg.arrays[data]) del sdfg.arrays[data] else: @@ -257,23 +257,29 @@ def apply(self, sdfg: SDFG): ) nested_sdfg.schedule = ScheduleType.GPU_Persistent + # If no inputs or outputs were given, connect with an empty memlet + if not kernel_args_read: + launch_state.add_nedge(map_entry, nested_sdfg, dace.Memlet()) + if not kernel_args_write: + launch_state.add_nedge(nested_sdfg, map_exit, dace.Memlet()) + # Create and connect read only data access nodes for arg in kernel_args_read: read_node = launch_state.add_read(arg) - launch_state.add_memlet_path(read_node, - map_entry, - nested_sdfg, - dst_conn=arg, - memlet=Memlet.from_array(arg, sdfg.arrays[arg])) + launch_state.add_edge_pair(map_entry, + nested_sdfg, + read_node, + internal_connector=arg, + internal_memlet=Memlet.from_array(arg, sdfg.arrays[arg])) # Create and connect writable data access nodes for arg in kernel_args_write: write_node = launch_state.add_write(arg) - 
launch_state.add_memlet_path(nested_sdfg, - map_exit, - write_node, - src_conn=arg, - memlet=Memlet.from_array(arg, sdfg.arrays[arg])) + launch_state.add_edge_pair(map_exit, + nested_sdfg, + write_node, + internal_connector=arg, + internal_memlet=Memlet.from_array(arg, sdfg.arrays[arg])) # Remove no-longer-used symbols in parent SDFG from dace.transformation.passes.prune_symbols import RemoveUnusedSymbols @@ -318,6 +324,12 @@ def is_gpu_state(sdfg: SDFG, state: SDFGState) -> bool: @staticmethod def get_entry_states(sdfg: SDFG, subgraph): + """ + Returns a 2-tuple of the (internal, external) states inside and outside of the SDFG, + around which the new nested SDFG will be created. The first element will be a set + of source nodes in the internal SDFG; and the second element will be a set of + predecessor nodes to the nested SDFG. + """ entry_states_in = set() entry_states_out = set() @@ -333,6 +345,12 @@ def get_entry_states(sdfg: SDFG, subgraph): @staticmethod def get_exit_states(sdfg: SDFG, subgraph): + """ + Returns a 2-tuple of the (internal, external) states inside and outside of the SDFG, + around which the new nested SDFG will be created. The first element will be a set + of sink nodes in the internal SDFG; and the second element will be a set of + successor nodes to the nested SDFG. + """ exit_states_in = set() exit_states_out = set() From f11e5c44e0a796e5af1702dd08cfe32bb787af9b Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 09:08:28 -0700 Subject: [PATCH 271/392] Add test --- tests/persistent_fusion_cudatest.py | 266 ++++++++++++++++------------ 1 file changed, 152 insertions(+), 114 deletions(-) diff --git a/tests/persistent_fusion_cudatest.py b/tests/persistent_fusion_cudatest.py index ac05761bee..415162e4f8 100644 --- a/tests/persistent_fusion_cudatest.py +++ b/tests/persistent_fusion_cudatest.py @@ -10,118 +10,152 @@ N = dace.symbol('N') nnz = dace.symbol('nnz') -bfs = dace.SDFG('bfs') -# Inputs to the BFS SDFG -bfs.add_array('col_index', shape=[nnz], dtype=dace.int32) -bfs.add_array('row_index', shape=[N + 1], dtype=dace.int32) -bfs.add_scalar('root', dtype=dace.int32) -bfs.add_array('result', shape=[N], dtype=dace.int32) +def _make_sdfg(): + bfs = dace.SDFG('bfs') -# Transients fot interstate data transfers -# TODO: Replace may_alias with better code generation -bfs.add_transient('count1', shape=[1], dtype=dace.int32, may_alias=True) -bfs.add_transient('frontier1', shape=[N], dtype=dace.int32, may_alias=True) + # Inputs to the BFS SDFG + bfs.add_array('col_index', shape=[nnz], dtype=dace.int32) + bfs.add_array('row_index', shape=[N + 1], dtype=dace.int32) + bfs.add_scalar('root', dtype=dace.int32) + bfs.add_array('result', shape=[N], dtype=dace.int32) -bfs.add_transient('count2', shape=[1], dtype=dace.int32, may_alias=True) -bfs.add_transient('frontier2', shape=[N], dtype=dace.int32, may_alias=True) + # Transients fot interstate data transfers + # TODO: Replace may_alias with better code generation + bfs.add_transient('count1', shape=[1], dtype=dace.int32, may_alias=True) + bfs.add_transient('frontier1', shape=[N], dtype=dace.int32, may_alias=True) -# Transient streams to accommodate dynamic size of frontier arrays -bfs.add_stream('stream1', dtype=dace.int32, transient=True, buffer_size=N) -bfs.add_stream('stream2', dtype=dace.int32, transient=True, buffer_size=N) + bfs.add_transient('count2', shape=[1], dtype=dace.int32, may_alias=True) + bfs.add_transient('frontier2', shape=[N], dtype=dace.int32, may_alias=True) -# Transients needed for update states 
-bfs.add_transient('temp_ids1', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) -bfs.add_transient('temp_ide1', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) + # Transient streams to accommodate dynamic size of frontier arrays + bfs.add_stream('stream1', dtype=dace.int32, transient=True, buffer_size=N) + bfs.add_stream('stream2', dtype=dace.int32, transient=True, buffer_size=N) -bfs.add_transient('temp_ids2', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) -bfs.add_transient('temp_ide2', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) + # Transients needed for update states + bfs.add_transient('temp_ids1', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) + bfs.add_transient('temp_ide1', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) -# Adding states -# init data -s_init = bfs.add_state('init') + bfs.add_transient('temp_ids2', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) + bfs.add_transient('temp_ide2', shape=[1], dtype=dace.int32, storage=dace.StorageType.Register) -# copy of the states because we don't want to copy the data -s_reset1 = bfs.add_state('reset1') -s_update1 = bfs.add_state('update1') + # Adding states + # init data + s_init = bfs.add_state('init') -s_reset2 = bfs.add_state('reset2') -s_update2 = bfs.add_state('update2') + # copy of the states because we don't want to copy the data + s_reset1 = bfs.add_state('reset1') + s_update1 = bfs.add_state('update1') -# end state to make transformation work -s_end = bfs.add_state('end') + s_reset2 = bfs.add_state('reset2') + s_update2 = bfs.add_state('update2') -# Connecting states with appropriate conditions and depth updates -bfs.add_edge(s_init, s_reset1, dace.InterstateEdge(None, {'depth': '1'})) -bfs.add_edge(s_reset1, s_update1, dace.InterstateEdge(None)) -bfs.add_edge(s_update1, s_reset2, dace.InterstateEdge('count2[0] > 0', {'depth': 'depth + 1'})) -bfs.add_edge(s_update1, s_end, dace.InterstateEdge('count2[0] <= 0')) -bfs.add_edge(s_reset2, s_update2, dace.InterstateEdge(None)) -bfs.add_edge(s_update2, s_reset1, dace.InterstateEdge('count1[0] > 0', {'depth': 'depth + 1'})) -bfs.add_edge(s_update2, s_end, dace.InterstateEdge('count1[0] <= 0')) + # end state to make transformation work + s_end = bfs.add_state('end') -# ----------------------------- -# Helper functions to init data -# ----------------------------- + # Connecting states with appropriate conditions and depth updates + bfs.add_edge(s_init, s_reset1, dace.InterstateEdge(None, {'depth': '1'})) + bfs.add_edge(s_reset1, s_update1, dace.InterstateEdge(None)) + bfs.add_edge(s_update1, s_reset2, dace.InterstateEdge('count2[0] > 0', {'depth': 'depth + 1'})) + bfs.add_edge(s_update1, s_end, dace.InterstateEdge('count2[0] <= 0')) + bfs.add_edge(s_reset2, s_update2, dace.InterstateEdge(None)) + bfs.add_edge(s_update2, s_reset1, dace.InterstateEdge('count1[0] > 0', {'depth': 'depth + 1'})) + bfs.add_edge(s_update2, s_end, dace.InterstateEdge('count1[0] <= 0')) + # ============================================================= + # State: init + # Filling init state with init of result, frontier1, and count1 -def init_scalar(state, node, value): - tasklet = state.add_tasklet('set_%s' % node.data, {}, {'out'}, ''' -out = %d - ''' % value) + root_in = s_init.add_read('root') - state.add_memlet_path(tasklet, node, src_conn='out', memlet=dace.Memlet.simple(node.data, '0')) + count1_out = s_init.add_write('count1') + result_out = s_init.add_write('result') + frontier_out = 
s_init.add_write('frontier1') + + s_init.add_memlet_path(root_in, frontier_out, memlet=dace.Memlet.simple(root_in.data, '0', other_subset_str='0')) + + tasklet = s_init.add_tasklet( + 'set_count1', + {}, + {'out'}, + 'out = 1', + ) + + s_init.add_memlet_path(tasklet, count1_out, src_conn='out', memlet=dace.Memlet.simple(count1_out.data, '0')) + + map_entry, map_exit = s_init.add_map( + 'set_result_map', + dict(i='0:N'), + ) + + tasklet = s_init.add_tasklet('set_result', {'root_idx'}, {'result_out'}, 'result_out = 0 if i == root_idx else -1') + + s_init.add_memlet_path(root_in, map_entry, tasklet, dst_conn='root_idx', memlet=dace.Memlet.simple(root_in.data, '0')) + s_init.add_memlet_path(tasklet, + map_exit, + result_out, + src_conn='result_out', + memlet=dace.Memlet.simple(result_out.data, 'i')) -# ============================================================= -# State: init -# Filling init state with init of result, frontier1, and count1 + # ------------------------------------------------------------- -root_in = s_init.add_read('root') + # ============================================================= + # State: reset + # Filling reset states, respective count is reset to 0 -count1_out = s_init.add_write('count1') -result_out = s_init.add_write('result') -frontier_out = s_init.add_write('frontier1') + count2_out = s_reset1.add_write('count2') + init_scalar(s_reset1, count2_out, 0) -s_init.add_memlet_path(root_in, frontier_out, memlet=dace.Memlet.simple(root_in.data, '0', other_subset_str='0')) + count1_out = s_reset2.add_write('count1') + init_scalar(s_reset2, count1_out, 0) -tasklet = s_init.add_tasklet( - 'set_count1', - {}, - {'out'}, - 'out = 1', -) + # ------------------------------------------------------------- -s_init.add_memlet_path(tasklet, count1_out, src_conn='out', memlet=dace.Memlet.simple(count1_out.data, '0')) + # Filling update states, only difference is which frontier/count they read/write from/to -map_entry, map_exit = s_init.add_map( - 'set_result_map', - dict(i='0:N'), -) + front_in = s_update1.add_read('frontier1') + count_in = s_update1.add_read('count1') -tasklet = s_init.add_tasklet('set_result', {'root_idx'}, {'result_out'}, 'result_out = 0 if i == root_idx else -1') + front_out = s_update1.add_write('frontier2') + count_out = s_update1.add_write('count2') -s_init.add_memlet_path(root_in, map_entry, tasklet, dst_conn='root_idx', memlet=dace.Memlet.simple(root_in.data, '0')) + stream2_io = s_update1.add_access('stream2') -s_init.add_memlet_path(tasklet, - map_exit, - result_out, - src_conn='result_out', - memlet=dace.Memlet.simple(result_out.data, 'i')) + temp_ids1_io = s_update1.add_access('temp_ids1') + temp_ide1_io = s_update1.add_access('temp_ide1') -# ------------------------------------------------------------- + fill_update_state(s_update1, front_in, count_in, front_out, count_out, stream2_io, temp_ids1_io, temp_ide1_io) -# ============================================================= -# State: reset -# Filling reset states, respective count is reset to 0 + front_in = s_update2.add_read('frontier2') + count_in = s_update2.add_read('count2') -count2_out = s_reset1.add_write('count2') -init_scalar(s_reset1, count2_out, 0) + front_out = s_update2.add_write('frontier1') + count_out = s_update2.add_write('count1') -count1_out = s_reset2.add_write('count1') -init_scalar(s_reset2, count1_out, 0) + stream1_io = s_update2.add_access('stream1') -# ------------------------------------------------------------- + temp_ids2_io = s_update2.add_access('temp_ids2') + 
temp_ide2_io = s_update2.add_access('temp_ide2') + + fill_update_state(s_update2, front_in, count_in, front_out, count_out, stream1_io, temp_ids2_io, temp_ide2_io) + + # validate and generate sdfg + bfs.fill_scope_connectors() + bfs.validate() + return bfs, s_init + +# ----------------------------- +# Helper functions to init data +# ----------------------------- + + +def init_scalar(state, node, value): + tasklet = state.add_tasklet('set_%s' % node.data, {}, {'out'}, ''' +out = %d + ''' % value) + + state.add_memlet_path(tasklet, node, src_conn='out', memlet=dace.Memlet.simple(node.data, '0')) # Here the state is duplicated so the memory doesn't have to be copied from one to another @@ -233,42 +267,10 @@ def fill_update_state(state, front_in, front_in_count, front_out, front_out_coun state.add_memlet_path(s_frontier_io, front_out, memlet=dace.Memlet.simple(front_out.data, '0')) -# Filling update states, only difference is which frontier/count they read/write from/to - -front_in = s_update1.add_read('frontier1') -count_in = s_update1.add_read('count1') - -front_out = s_update1.add_write('frontier2') -count_out = s_update1.add_write('count2') - -stream2_io = s_update1.add_access('stream2') - -temp_ids1_io = s_update1.add_access('temp_ids1') -temp_ide1_io = s_update1.add_access('temp_ide1') - -fill_update_state(s_update1, front_in, count_in, front_out, count_out, stream2_io, temp_ids1_io, temp_ide1_io) - -front_in = s_update2.add_read('frontier2') -count_in = s_update2.add_read('count2') - -front_out = s_update2.add_write('frontier1') -count_out = s_update2.add_write('count1') - -stream1_io = s_update2.add_access('stream1') - -temp_ids2_io = s_update2.add_access('temp_ids2') -temp_ide2_io = s_update2.add_access('temp_ide2') - -fill_update_state(s_update2, front_in, count_in, front_out, count_out, stream1_io, temp_ids2_io, temp_ide2_io) - -# validate and generate sdfg -bfs.fill_scope_connectors() -bfs.validate() - @pytest.mark.gpu def test_persistent_fusion(): - sdfg = bfs + sdfg, s_init = _make_sdfg() sdfg.apply_gpu_transformations(validate=False, simplify=False) # Only validate after fusion @@ -320,7 +322,43 @@ def test_persistent_fusion(): assert np.allclose(depth, reference), "Result doesn't match!" 
+def test_persistent_fusion_interstate(): + N = dace.symbol('N', dtype=dace.int64) + + + @dace.program(auto_optimize=False, device=dace.DeviceType.GPU) + def func(A: dace.float64[N], B: dace.float64[N]): + a = 10.2 + + for t in range(1, 10): + if t < N: + A[:] = (A + B + a) / 2 + a += 1 + + # Initialization + N = 100 + A = np.random.rand(N) + B = np.random.rand(N) + + sdfg = func.to_sdfg() + sdfg.apply_gpu_transformations() + content_nodes = set(sdfg.nodes()) - {sdfg.start_state, sdfg.sink_nodes()[0]} + subgraph = SubgraphView(sdfg, content_nodes) + + transform = GPUPersistentKernel() + transform.setup_match(subgraph) + transform.kernel_prefix = 'stuff' + transform.apply(sdfg) + + aref = np.copy(A) + func.f(aref, B) + + sdfg(A=A, B=B, N=N) + + assert np.allclose(A, aref) + # Actual execution if __name__ == "__main__": test_persistent_fusion() + test_persistent_fusion_interstate() From 37b65669927c3476ee3bfa2ca60bd44d4999a427 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 09:13:06 -0700 Subject: [PATCH 272/392] Add test --- .../gpu_scalar_execution_context_test.py | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 tests/codegen/gpu_scalar_execution_context_test.py diff --git a/tests/codegen/gpu_scalar_execution_context_test.py b/tests/codegen/gpu_scalar_execution_context_test.py new file mode 100644 index 0000000000..f738bfe26c --- /dev/null +++ b/tests/codegen/gpu_scalar_execution_context_test.py @@ -0,0 +1,91 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" +Tests how code is generated for free tasklets inside a GPU kernel nested SDFG. +""" + +import dace +from dace.sdfg.graph import SubgraphView +from dace.transformation.subgraph import GPUPersistentKernel +import numpy as np +import pytest + + +def _tester(A: dace.float64[64]): + t = 12.3 + for _ in range(5): + A += t + t += 1.01 + + +def _modify_array(sdfg: dace.SDFG, storage: dace.StorageType): + for nsdfg, aname, aval in sdfg.arrays_recursive(): + if aname == 't': + if storage == dace.StorageType.GPU_Shared: + aval = dace.data.Array(aval.dtype, [1], transient=aval.transient) + nsdfg.arrays[aname] = aval + aval.storage = storage + break + else: + raise ValueError('Array not found') + + +def _make_program(storage: dace.StorageType, persistent=False): + sdfg = dace.program(_tester).to_sdfg() + sdfg.apply_gpu_transformations(simplify=False) + _modify_array(sdfg, storage) + + if persistent: + content_nodes = set(sdfg.nodes()) - {sdfg.start_state, sdfg.sink_nodes()[0]} + subgraph = SubgraphView(sdfg, content_nodes) + transform = GPUPersistentKernel() + transform.setup_match(subgraph) + transform.apply(sdfg) + + return sdfg + + +@pytest.mark.gpu +def test_global_scalar_update(): + sdfg = _make_program(dace.StorageType.GPU_Global, True) + a = np.random.rand(64) + aref = np.copy(a) + _tester(aref) + sdfg(a) + assert np.allclose(a, aref) + + +@pytest.mark.gpu +def test_shared_scalar_update(): + sdfg = _make_program(dace.StorageType.GPU_Shared, persistent=True) + + a = np.random.rand(64) + aref = np.copy(a) + _tester(aref) + + # Ensure block size will create at least two thread-blocks + with dace.config.set_temporary('compiler', 'cuda', 'persistent_map_SM_fraction', value=0.0001): + with dace.config.set_temporary('compiler', 'cuda', 'persistent_map_occupancy', value=2): + with dace.config.set_temporary('compiler', 'cuda', 'default_block_size', value='32,1,1'): + sdfg(a) + + assert np.allclose(a, aref) + + +@pytest.mark.gpu +@pytest.mark.parametrize('persistent', 
(False, True)) +def test_register_scalar_update(persistent): + sdfg = _make_program(dace.StorageType.Register, persistent) + + a = np.random.rand(64) + aref = np.copy(a) + _tester(aref) + sdfg(a) + + assert np.allclose(a, aref) + + +if __name__ == '__main__': + test_global_scalar_update() + test_shared_scalar_update() + test_register_scalar_update(False) + test_register_scalar_update(True) From 6ae012ab285d92944bcd036b7edc97a758ce1952 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 11:01:21 -0700 Subject: [PATCH 273/392] Fix test condition --- tests/persistent_fusion_cudatest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/persistent_fusion_cudatest.py b/tests/persistent_fusion_cudatest.py index 415162e4f8..4d2f38ceb1 100644 --- a/tests/persistent_fusion_cudatest.py +++ b/tests/persistent_fusion_cudatest.py @@ -322,6 +322,8 @@ def test_persistent_fusion(): assert np.allclose(depth, reference), "Result doesn't match!" + +@pytest.mark.gpu def test_persistent_fusion_interstate(): N = dace.symbol('N', dtype=dace.int64) From 152b69c9121a2fdd277674fd9e678e413e1bb1f7 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 17 Jul 2023 12:26:17 -0700 Subject: [PATCH 274/392] Handle missing symbols better --- dace/transformation/passes/prune_symbols.py | 8 ++++++-- tests/persistent_fusion_cudatest.py | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dace/transformation/passes/prune_symbols.py b/dace/transformation/passes/prune_symbols.py index 05220763a9..94fcbdbc58 100644 --- a/dace/transformation/passes/prune_symbols.py +++ b/dace/transformation/passes/prune_symbols.py @@ -51,8 +51,9 @@ def apply_pass(self, sdfg: SDFG, _) -> Optional[Set[Tuple[int, str]]]: # Remove unused symbols for sym in symbols_to_consider - used_symbols: - sdfg.remove_symbol(sym) - result.add(sym) + if sym in sdfg.symbols: + sdfg.remove_symbol(sym) + result.add(sym) if self.recursive: # Prune nested SDFGs recursively @@ -62,7 +63,10 @@ def apply_pass(self, sdfg: SDFG, _) -> Optional[Set[Tuple[int, str]]]: for state in sdfg.nodes(): for node in state.nodes(): if isinstance(node, nodes.NestedSDFG): + old_symbols = self.symbols + self.symbols = set() nres = self.apply_pass(node.sdfg, _) + self.symbols = old_symbols if nres: result.update(nres) diff --git a/tests/persistent_fusion_cudatest.py b/tests/persistent_fusion_cudatest.py index 4d2f38ceb1..e193b7431c 100644 --- a/tests/persistent_fusion_cudatest.py +++ b/tests/persistent_fusion_cudatest.py @@ -19,6 +19,7 @@ def _make_sdfg(): bfs.add_array('row_index', shape=[N + 1], dtype=dace.int32) bfs.add_scalar('root', dtype=dace.int32) bfs.add_array('result', shape=[N], dtype=dace.int32) + bfs.add_symbol('depth', dace.int32) # Transients fot interstate data transfers # TODO: Replace may_alias with better code generation From 57abd284500c1990ad2744160eb92aab5d08756d Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 18 Jul 2023 14:41:47 +0200 Subject: [PATCH 275/392] Added NestedDataClassProperty for nested data. --- dace/properties.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/dace/properties.py b/dace/properties.py index 6e883f8549..30a3e0913b 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -1381,6 +1381,45 @@ def from_json(obj, context=None): raise TypeError("Cannot parse type from: {}".format(obj)) +class NestedDataClassProperty(Property): + """ Custom property type for nested data. 
""" + + def __get__(self, obj, objtype=None) -> 'Data': + return super().__get__(obj, objtype) + + @property + def dtype(self): + return pydoc.locate("dace.data.Data") + + @staticmethod + def from_string(s): + dtype = pydoc.locate("dace.data.{}".format(s)) + if dtype is None or not isinstance(dtype, pydoc.locate("dace.data.Data")): + raise ValueError("Not a valid data type: {}".format(s)) + return dtype + + @staticmethod + def to_string(obj): + return obj.to_string() + + def to_json(self, obj): + if obj is None: + return None + return obj.dtype.to_json() + + @staticmethod + def from_json(obj, context=None): + if obj is None: + return None + elif isinstance(obj, str): + return NestedDataClassProperty.from_string(obj) + elif isinstance(obj, dict): + # Let the deserializer handle this + return dace.serialize.from_json(obj) + else: + raise TypeError("Cannot parse type from: {}".format(obj)) + + class LibraryImplementationProperty(Property): """ Property for choosing an implementation type for a library node. On the From 09465d242fbf33036ebf35e1c9b43357c60648ca Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 18 Jul 2023 14:42:33 +0200 Subject: [PATCH 276/392] Added Structures and StructArrays. --- dace/data.py | 121 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 6 deletions(-) diff --git a/dace/data.py b/dace/data.py index 2fc5f334c6..886fed75de 100644 --- a/dace/data.py +++ b/dace/data.py @@ -1,10 +1,10 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import copy as cp import ctypes import functools -import re + from numbers import Number -from typing import Any, Dict, Optional, Sequence, Set, Tuple +from typing import Any, Dict, Optional, Sequence, Set, Tuple, Union import numpy import sympy as sp @@ -17,9 +17,8 @@ import dace.dtypes as dtypes from dace import serialize, symbolic from dace.codegen import cppunparse -from dace.properties import (CodeProperty, DebugInfoProperty, DictProperty, EnumProperty, ListProperty, Property, - ReferenceProperty, ShapeProperty, SubsetProperty, SymbolicProperty, TypeClassProperty, - make_properties) +from dace.properties import (DebugInfoProperty, DictProperty, EnumProperty, ListProperty, NestedDataClassProperty, + Property, ShapeProperty, SymbolicProperty, TypeClassProperty, make_properties) def create_datadescriptor(obj, no_custom_desc=False): @@ -342,6 +341,86 @@ def add(X: dace.float32[10, 10] @ dace.StorageType.GPU_Global): return new_desc +class Structure(Data): + """ Base class for structures. 
""" + + def __init__(self, + shape: Sequence[Union[int, symbolic.SymbolicType]] = None, + transient: bool = False, + storage: dtypes.StorageType = dtypes.StorageType.Default, + location: Dict[str, str] = None, + lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, + debuginfo: dtypes.DebugInfo = None): + fields = { + attr: getattr(self, attr) + for attr in dir(self) if ( + not attr in dir(Data) and + not attr.startswith("_") and + not attr in ('total_size', 'offset', 'start_offset', 'strides'))} + fields_and_types = dict() + symbols = set() + for attr in dir(self): + if (attr in dir(Data) or attr.startswith("__") or + attr in ('total_size', 'offset', 'start_offset', 'strides')): + continue + value = getattr(self, attr) + if isinstance(value, Array): + symbols |= value.free_symbols + fields_and_types[attr] = (dtypes.pointer(value.dtype), str(_prod(value.shape))) + elif isinstance(value, Scalar): + symbols |= value.free_symbols + fields_and_types[attr] = value.dtype + elif isinstance(value, (sp.Basic, symbolic.SymExpr)): + symbols |= value.free_symbols + fields_and_types[attr] = symbolic.symtype(value) + elif isinstance(value, (int, numpy.integer)): + fields_and_types[attr] = dtypes.typeclass(type(value)) + else: + raise TypeError(f"Attribute {attr}'s value {value} has unsupported type: {type(value)}") + for s in symbols: + if str(s) in fields_and_types: + continue + if hasattr(s, "dtype"): + fields_and_types[str(s)] = s.dtype + else: + fields_and_types[str(s)] = dtypes.int32 + dtype = dtypes.struct(self.__class__.__name__, **fields_and_types) + shape = shape or (1,) + super(Structure, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) + + @property + def total_size(self): + return -1 + + @property + def offset(self): + return [0] + + @property + def start_offset(self): + return 0 + + @property + def strides(self): + return [1] + + def as_arg(self, with_types=True, for_call=False, name=None): + if self.storage is dtypes.StorageType.GPU_Global: + return Array(self.dtype, [1]).as_arg(with_types, for_call, name) + if not with_types or for_call: + return name + return self.dtype.as_arg(name) + + def __getitem__(self, s): + """ This is syntactic sugar that allows us to define an array type + with the following syntax: ``Structure[N,M]`` + :return: A ``data.Array`` data descriptor. + """ + if isinstance(s, list) or isinstance(s, tuple): + return StructArray(self, tuple(s)) + return StructArray(self, (s, )) + + @make_properties class Scalar(Data): """ Data descriptor of a scalar value. """ @@ -902,6 +981,36 @@ def free_symbols(self): return result +@make_properties +class StructArray(Array): + """ Array of Structures. 
""" + + stype = NestedDataClassProperty(allow_none=True, default=None) + + def __init__(self, + stype, + shape, + transient=False, + allow_conflicts=False, + storage=dtypes.StorageType.Default, + location=None, + strides=None, + offset=None, + may_alias=False, + lifetime=dtypes.AllocationLifetime.Scope, + alignment=0, + debuginfo=None, + total_size=-1, + start_offset=None, + optional=None, + pool=False): + + self.stype = stype + dtype = stype.dtype + super(StructArray, self).__init__(dtype, shape, transient, allow_conflicts, storage, location, strides, offset, + may_alias, lifetime, alignment, debuginfo, total_size, start_offset, optional, pool) + + @make_properties class View(Array): """ From 51776a1b746126194fc1eebcece20adbe88be302 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 18 Jul 2023 15:09:00 +0200 Subject: [PATCH 277/392] Break array lengths down to their symbolic tokents. --- dace/dtypes.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index dee2283f25..230197bc6f 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -791,6 +791,7 @@ def from_json(json_obj, context=None): return ret def _parse_field_and_types(self, **fields_and_types): + from dace.symbolic import pystr_to_symbolic self._data = dict() self._length = dict() self.bytes = 0 @@ -799,8 +800,12 @@ def _parse_field_and_types(self, **fields_and_types): t, l = v if not isinstance(t, pointer): raise TypeError("Only pointer types may have a length.") - if l not in fields_and_types.keys(): - raise ValueError("Length {} not a field of struct {}".format(l, self.name)) + sym_tokens = pystr_to_symbolic(l).free_symbols + for sym in sym_tokens: + if str(sym) not in fields_and_types.keys(): + raise ValueError(f"Symbol {sym} in {k}'s length {l} is not a field of struct {self.name}") + # if l not in fields_and_types.keys(): + # raise ValueError("Length {} not a field of struct {}".format(l, self.name)) self._data[k] = t self._length[k] = l self.bytes += t.bytes From b23ed86de823398321ef6f620e3db0d3fd7f857b Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 18 Jul 2023 15:11:09 +0200 Subject: [PATCH 278/392] Allow structures to have fields whose name doesn't start with underscore. --- dace/properties.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dace/properties.py b/dace/properties.py index 30a3e0913b..679c0b9596 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import ast from collections import OrderedDict import copy @@ -412,12 +412,12 @@ def initialize_properties(obj, *args, **kwargs): except AttributeError: if not prop.unmapped: raise PropertyError("Property {} is unassigned in __init__ for {}".format(name, cls.__name__)) - # Assert that there are no fields in the object not captured by - # properties, unless they are prefixed with "_" - for name, prop in obj.__dict__.items(): - if (name not in properties and not name.startswith("_") and name not in dir(type(obj))): - raise PropertyError("{} : Variable {} is neither a Property nor " - "an internal variable (prefixed with \"_\")".format(str(type(obj)), name)) + # Assert that there are no fields in the object not captured by properties, unless they are prefixed with "_" + if not isinstance(obj, dace.data.Structure): + for name, prop in obj.__dict__.items(): + if (name not in properties and not name.startswith("_") and name not in dir(type(obj))): + raise PropertyError("{} : Variable {} is neither a Property nor " + "an internal variable (prefixed with \"_\")".format(str(type(obj)), name)) # Replace the __init__ method cls.__init__ = initialize_properties From 777821f0a940bc2f981ef5c04749c0f49968e0d1 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:21:54 +0200 Subject: [PATCH 279/392] Structures now have a "members" dictionary. Their dtype is a pointer to the corresponding dtypes.struct typeclass. --- dace/data.py | 64 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/dace/data.py b/dace/data.py index 886fed75de..0f1ef1f266 100644 --- a/dace/data.py +++ b/dace/data.py @@ -341,42 +341,54 @@ def add(X: dace.float32[10, 10] @ dace.StorageType.GPU_Global): return new_desc +def _arrays_to_json(arrays): + if arrays is None: + return None + return {k: serialize.to_json(v) for k, v in arrays.items()} + + +def _arrays_from_json(obj, context=None): + if obj is None: + return {} + return {k: serialize.from_json(v, context) for k, v in obj.items()} + + +@make_properties class Structure(Data): """ Base class for structures. 
""" + members = Property(dtype=dict, + desc="Dictionary of structure members", + from_json=_arrays_from_json, + to_json=_arrays_to_json) + def __init__(self, - shape: Sequence[Union[int, symbolic.SymbolicType]] = None, + members: Dict[str, Any], transient: bool = False, storage: dtypes.StorageType = dtypes.StorageType.Default, location: Dict[str, str] = None, lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, debuginfo: dtypes.DebugInfo = None): - fields = { - attr: getattr(self, attr) - for attr in dir(self) if ( - not attr in dir(Data) and - not attr.startswith("_") and - not attr in ('total_size', 'offset', 'start_offset', 'strides'))} + self.members = members or {} fields_and_types = dict() symbols = set() - for attr in dir(self): - if (attr in dir(Data) or attr.startswith("__") or - attr in ('total_size', 'offset', 'start_offset', 'strides')): - continue - value = getattr(self, attr) - if isinstance(value, Array): - symbols |= value.free_symbols - fields_and_types[attr] = (dtypes.pointer(value.dtype), str(_prod(value.shape))) - elif isinstance(value, Scalar): - symbols |= value.free_symbols - fields_and_types[attr] = value.dtype - elif isinstance(value, (sp.Basic, symbolic.SymExpr)): - symbols |= value.free_symbols - fields_and_types[attr] = symbolic.symtype(value) - elif isinstance(value, (int, numpy.integer)): - fields_and_types[attr] = dtypes.typeclass(type(value)) + for k, v in members.items(): + if isinstance(v, Structure): + symbols |= v.free_symbols + fields_and_types[k] = (v.dtype, str(v.total_size)) + elif isinstance(v, Array): + symbols |= v.free_symbols + fields_and_types[k] = (dtypes.pointer(v.dtype), str(_prod(v.shape))) + elif isinstance(v, Scalar): + symbols |= v.free_symbols + fields_and_types[k] = v.dtype + elif isinstance(v, (sp.Basic, symbolic.SymExpr)): + symbols |= v.free_symbols + fields_and_types[k] = symbolic.symtype(v) + elif isinstance(v, (int, numpy.integer)): + fields_and_types[k] = dtypes.typeclass(type(v)) else: - raise TypeError(f"Attribute {attr}'s value {value} has unsupported type: {type(value)}") + raise TypeError(f"Attribute {k}'s value {v} has unsupported type: {type(v)}") for s in symbols: if str(s) in fields_and_types: continue @@ -384,8 +396,8 @@ def __init__(self, fields_and_types[str(s)] = s.dtype else: fields_and_types[str(s)] = dtypes.int32 - dtype = dtypes.struct(self.__class__.__name__, **fields_and_types) - shape = shape or (1,) + dtype = dtypes.pointer(dtypes.struct(self.__class__.__name__, **fields_and_types)) + shape = (1,) super(Structure, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) @property From ebf72068e4b27ed777fb835bc75c835980d502d6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:24:37 +0200 Subject: [PATCH 280/392] dtype.structs store their ctype in `_FFI_CTYPES`. --- dace/dtypes.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index 230197bc6f..d01209469f 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ A module that contains various DaCe type definitions. """ from __future__ import print_function import ctypes @@ -654,6 +654,8 @@ def from_json(json_obj, context=None): def as_ctypes(self): """ Returns the ctypes version of the typeclass. 
""" + if isinstance(self._typeclass, struct): + return ctypes.POINTER(self._typeclass.as_ctypes()) return ctypes.POINTER(_FFI_CTYPES[self.type]) def as_numpy_dtype(self): @@ -804,8 +806,6 @@ def _parse_field_and_types(self, **fields_and_types): for sym in sym_tokens: if str(sym) not in fields_and_types.keys(): raise ValueError(f"Symbol {sym} in {k}'s length {l} is not a field of struct {self.name}") - # if l not in fields_and_types.keys(): - # raise ValueError("Length {} not a field of struct {}".format(l, self.name)) self._data[k] = t self._length[k] = l self.bytes += t.bytes @@ -817,16 +817,24 @@ def _parse_field_and_types(self, **fields_and_types): def as_ctypes(self): """ Returns the ctypes version of the typeclass. """ + if self in _FFI_CTYPES: + return _FFI_CTYPES[self] # Populate the ctype fields for the struct class. fields = [] for k, v in self._data.items(): if isinstance(v, pointer): - fields.append((k, ctypes.c_void_p)) # ctypes.POINTER(_FFI_CTYPES[v.type]))) + if isinstance(v._typeclass, struct): + fields.append((k, ctypes.POINTER(v._typeclass.as_ctypes()))) + else: + fields.append((k, ctypes.c_void_p)) + elif isinstance(v, struct): + fields.append((k, v.as_ctypes())) else: fields.append((k, _FFI_CTYPES[v.type])) fields = sorted(fields, key=lambda f: f[0]) # Create new struct class. struct_class = type("NewStructClass", (ctypes.Structure, ), {"_fields_": fields}) + _FFI_CTYPES[self] = struct_class return struct_class def as_numpy_dtype(self): From c52a48257ffbb7933aec3b04fd7029cdafce77a8 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:26:03 +0200 Subject: [PATCH 281/392] Reverted underscore exception for Structures. --- dace/properties.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dace/properties.py b/dace/properties.py index 679c0b9596..2225b6d853 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -413,11 +413,10 @@ def initialize_properties(obj, *args, **kwargs): if not prop.unmapped: raise PropertyError("Property {} is unassigned in __init__ for {}".format(name, cls.__name__)) # Assert that there are no fields in the object not captured by properties, unless they are prefixed with "_" - if not isinstance(obj, dace.data.Structure): - for name, prop in obj.__dict__.items(): - if (name not in properties and not name.startswith("_") and name not in dir(type(obj))): - raise PropertyError("{} : Variable {} is neither a Property nor " - "an internal variable (prefixed with \"_\")".format(str(type(obj)), name)) + for name, prop in obj.__dict__.items(): + if (name not in properties and not name.startswith("_") and name not in dir(type(obj))): + raise PropertyError("{} : Variable {} is neither a Property nor " + "an internal variable (prefixed with \"_\")".format(str(type(obj)), name)) # Replace the __init__ method cls.__init__ = initialize_properties From 40cc858f992d71a49730d934268c31d380d8e82b Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:26:40 +0200 Subject: [PATCH 282/392] Small fixes. 
--- dace/codegen/compiled_sdfg.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index d0d29cfa1e..863e804802 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -452,9 +452,10 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: # GPU scalars are pointers, so this is fine if atype.storage != dtypes.StorageType.GPU_Global: raise TypeError('Passing an array to a scalar (type %s) in argument "%s"' % (atype.dtype.ctype, a)) - elif not isinstance(atype, dt.Array) and not isinstance(atype.dtype, dtypes.callback) and not isinstance( - arg, - (atype.dtype.type, sp.Basic)) and not (isinstance(arg, symbolic.symbol) and arg.dtype == atype.dtype): + elif (not isinstance(atype, (dt.Array, dt.Structure)) and + not isinstance(atype.dtype, dtypes.callback) and + not isinstance(arg, (atype.dtype.type, sp.Basic)) and + not (isinstance(arg, symbolic.symbol) and arg.dtype == atype.dtype)): if isinstance(arg, int) and atype.dtype.type == np.int64: pass elif isinstance(arg, float) and atype.dtype.type == np.float64: @@ -521,7 +522,7 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: # Construct init args, which only consist of the symbols symbols = self._free_symbols initargs = tuple( - actype(arg) if (not isinstance(arg, ctypes._SimpleCData)) else arg + actype(arg) if not isinstance(arg, ctypes._SimpleCData) else arg for arg, actype, atype, aname in callparams if aname in symbols) # Replace arrays with their base host/device pointers @@ -531,7 +532,8 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: try: newargs = tuple( - actype(arg) if (not isinstance(arg, ctypes._SimpleCData)) else arg for arg, actype, atype in newargs) + actype(arg) if not isinstance(arg, (ctypes._SimpleCData)) else arg + for arg, actype, atype in newargs) except TypeError: # Pinpoint bad argument for i, (arg, actype, _) in enumerate(newargs): From dd73aaa8816864958fc4fd547e16d5372519f167 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:27:17 +0200 Subject: [PATCH 283/392] WIP: Replace ',' with '->' to quickly support nested data. --- dace/codegen/targets/cpp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index afbc6fca12..7d54e985f5 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -370,6 +370,8 @@ def make_const(expr: str) -> str: # Register defined variable dispatcher.defined_vars.add(pointer_name, defined_type, typedef, allow_shadowing=True) + expr = expr.replace('.', '->') + return (typedef + ref, pointer_name, expr) From 623a7f88838f0a3bc033333bef28e4de03544d37 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:28:08 +0200 Subject: [PATCH 284/392] Recursively add to arglist nested data descriptors. 
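For illustration, a standalone sketch of the traversal this patch adds (the real helper is nested inside the CPU code generator's constructor); `wrapper_obj` is a hypothetical Structure that nests a 'csr' Structure, as in the structure tests.

    from dace import data

    def visit_structure(struct: data.Structure, args: dict, prefix: str = ''):
        # Walk Structure members recursively and record every leaf Data
        # descriptor under a dotted name (same logic as the helper below).
        for k, v in struct.members.items():
            if isinstance(v, data.Structure):
                visit_structure(v, args, f'{prefix}.{k}')
            elif isinstance(v, data.Data):
                args[f'{prefix}.{k}'] = v

    args = {}
    visit_structure(wrapper_obj, args, 'A')   # wrapper_obj: hypothetical nested Structure
    # -> {'A.csr.indptr': Array, 'A.csr.indices': Array, 'A.csr.data': Array}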
--- dace/codegen/targets/cpu.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index eb7d232966..2759c9744c 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -55,10 +55,30 @@ def __init__(self, frame_codegen, sdfg): # Keep track of generated NestedSDG, and the name of the assigned function self._generated_nested_sdfg = dict() - # Keeps track of generated connectors, so we know how to access them in - # nested scopes + def _visit_structure(struct: data.Structure, args: dict, prefix: str = ''): + for k, v in struct.members.items(): + if isinstance(v, data.Structure): + _visit_structure(v, args, f'{prefix}.{k}') + elif isinstance(v, data.Data): + args[f'{prefix}.{k}'] = v + + # Keeps track of generated connectors, so we know how to access them in nested scopes + arglist = dict(self._frame.arglist) for name, arg_type in self._frame.arglist.items(): - if isinstance(arg_type, data.Scalar): + if isinstance(arg_type, data.Structure): + desc = sdfg.arrays[name] + _visit_structure(arg_type, arglist, name) + elif isinstance(arg_type, data.StructArray): + desc = sdfg.arrays[name] + desc = desc.stype + for attr in dir(desc): + value = getattr(desc, attr) + if isinstance(value, data.Data): + assert attr in sdfg.arrays + arglist[attr] = value + + for name, arg_type in arglist.items(): + if isinstance(arg_type, (data.Scalar, data.Structure)): # GPU global memory is only accessed via pointers # TODO(later): Fix workaround somehow if arg_type.storage is dtypes.StorageType.GPU_Global: From 1e5baddcbda6e0d78bd9526af7e1a0b78627a4e3 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:28:50 +0200 Subject: [PATCH 285/392] Recursively look into nested data to emit definitions. --- dace/codegen/targets/framecode.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 6f302c11ba..be6b85602a 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -150,15 +150,23 @@ def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend: for _, arrname, arr in sdfg.arrays_recursive(): if arr is not None: datatypes.add(arr.dtype) + + def _emit_definitions(dtype: dtypes.typeclass, wrote_something: bool) -> bool: + if isinstance(dtype, dtypes.pointer): + wrote_something = _emit_definitions(dtype._typeclass, wrote_something) + elif isinstance(dtype, dtypes.struct): + for field in dtype.fields.values(): + wrote_something = _emit_definitions(field, wrote_something) + if hasattr(dtype, 'emit_definition'): + if not wrote_something: + global_stream.write("", sdfg) + global_stream.write(dtype.emit_definition(), sdfg) + return wrote_something # Emit unique definitions wrote_something = False for typ in datatypes: - if hasattr(typ, 'emit_definition'): - if not wrote_something: - global_stream.write("", sdfg) - wrote_something = True - global_stream.write(typ.emit_definition(), sdfg) + wrote_something = _emit_definitions(typ, wrote_something) if wrote_something: global_stream.write("", sdfg) From 36d4e826ac769f1cb99ecc3c8fe8206c0690cdab Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:30:21 +0200 Subject: [PATCH 286/392] SDFG data (_arrays) are now stored in a NestedDict. 
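For illustration, a short sketch of the effect (descriptor names follow the nested structure test): dotted names now resolve through Structure members, so nested descriptors are addressable directly via `sdfg.arrays`.

    import dace

    # wrapper_obj: a Structure whose 'csr' member is itself a Structure holding
    # 'indptr'/'indices'/'data' arrays (as in the nested structure test).
    sdfg = dace.SDFG('nested_access')
    sdfg.add_datadesc('A', wrapper_obj)
    indptr_desc = sdfg.arrays['A.csr.indptr']   # __getitem__ walks .members token by token
    assert 'A.csr.data' in sdfg.arrays          # __contains__ follows the same dotted path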
--- dace/sdfg/sdfg.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 18763e385a..6e4c3587f4 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -48,6 +48,35 @@ from dace.codegen.compiled_sdfg import CompiledSDFG +class NestedDict(dict): + + def __init__(self): + super(NestedDict, self).__init__() + + def __getitem__(self, key): + tokens = key.split('.') + token = tokens.pop(0) + result = super(NestedDict, self).__getitem__(token) + while tokens: + token = tokens.pop(0) + result = result.members[token] + return result + + def __contains__(self, key): + tokens = key.split('.') + token = tokens.pop(0) + result = super(NestedDict, self).__contains__(token) + desc = None + while tokens and result: + if desc is None: + desc = super(NestedDict, self).__getitem__(token) + else: + desc = desc.members[token] + token = tokens.pop(0) + result = token in desc.members + return result + + def _arrays_to_json(arrays): if arrays is None: return None @@ -375,7 +404,7 @@ class SDFG(OrderedDiGraph[SDFGState, InterstateEdge]): name = Property(dtype=str, desc="Name of the SDFG") arg_names = ListProperty(element_type=str, desc='Ordered argument names (used for calling conventions).') constants_prop = Property(dtype=dict, default={}, desc="Compile-time constants") - _arrays = Property(dtype=dict, + _arrays = Property(dtype=NestedDict, desc="Data descriptors for this SDFG", to_json=_arrays_to_json, from_json=_arrays_from_json) @@ -456,7 +485,7 @@ def __init__(self, self._sdfg_list = [self] self._start_state: Optional[int] = None self._cached_start_state: Optional[SDFGState] = None - self._arrays = {} # type: Dict[str, dt.Array] + self._arrays = NestedDict() # type: Dict[str, dt.Array] self._labels: Set[str] = set() self.global_code = {'frame': CodeBlock("", dtypes.Language.CPP)} self.init_code = {'frame': CodeBlock("", dtypes.Language.CPP)} From 38a4265a29c64f6100e03f536aecdd09fd160dca Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:31:11 +0200 Subject: [PATCH 287/392] Adjusted the matching check for memlet data and src/dst nodes to not fail for Structures. 
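For illustration, the kind of view edge this adjustment admits, taken from the structure tests: the memlet is named 'A.indptr', which matches neither access node's data ('A' or 'vindptr'), but 'A' is a Structure and 'indptr' is the edge's source connector.

    # A: AccessNode over a Structure, indptr: AccessNode over the 'vindptr' view.
    state.add_edge(A, 'indptr', indptr, 'views',
                   dace.Memlet.from_array('A.indptr', csr_obj.members['indptr']))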
--- dace/sdfg/validation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 3bac646479..c963df9d7e 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -587,9 +587,14 @@ def validate_state(state: 'dace.sdfg.SDFGState', break # Check if memlet data matches src or dst nodes - if (e.data.data is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode)) - and (not isinstance(src_node, nd.AccessNode) or e.data.data != src_node.data) - and (not isinstance(dst_node, nd.AccessNode) or e.data.data != dst_node.data)): + name = e.data.data + if isinstance(src_node, nd.AccessNode) and isinstance(sdfg.arrays[src_node.data], dt.Structure): + name = None + if isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Structure): + name = None + if (name is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode)) + and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn)) + and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn))): raise InvalidSDFGEdgeError( "Memlet data does not match source or destination " "data nodes)", From 479cb2ad240dd167a7b26d2665527e04727cffe6 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 19:32:51 +0200 Subject: [PATCH 288/392] Added tests. --- tests/sdfg/data/structure_test.py | 240 ++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 tests/sdfg/data/structure_test.py diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py new file mode 100644 index 0000000000..3783a98068 --- /dev/null +++ b/tests/sdfg/data/structure_test.py @@ -0,0 +1,240 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+import dace +import numpy as np + +from scipy import sparse + + +def create_structure(name: str, **members) -> dace.data.Structure: + + StructureClass = type(name, (dace.data.Structure, ), {}) + return StructureClass(members) + + +def test_read_structure(): + + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix', + indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz) + + sdfg = dace.SDFG('csr_to_dense') + + sdfg.add_datadesc('A', CSR) + sdfg.add_array('B', [M, N], dace.float32) + + sdfg.add_view('vindptr', CSR.members['indptr'].shape, CSR.members['indptr'].dtype) + sdfg.add_view('vindices', CSR.members['indices'].shape, CSR.members['indices'].dtype) + sdfg.add_view('vdata', CSR.members['data'].shape, CSR.members['data'].dtype) + + state = sdfg.add_state() + + A = state.add_access('A') + B = state.add_access('B') + + indptr = state.add_access('vindptr') + indices = state.add_access('vindices') + data = state.add_access('vdata') + + state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.indptr', CSR.members['indptr'])) + state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.indices', CSR.members['indices'])) + state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.data', CSR.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='vindices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + B = np.zeros((20, 20), dtype=np.float32) + + inpA = CSR.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + N=A.shape[1], + nnz=A.nnz) + + func(A=inpA, B=B, M=20, N=20, nnz=A.nnz) + ref = A.toarray() + + assert np.allclose(B, ref) + + +def test_write_structure(): + + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix', + indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz) + + sdfg = dace.SDFG('dense_to_csr') + + sdfg.add_array('A', [M, N], dace.float32) + sdfg.add_datadesc('B', CSR) + + sdfg.add_view('vindptr', CSR.members['indptr'].shape, CSR.members['indptr'].dtype) + sdfg.add_view('vindices', CSR.members['indices'].shape, CSR.members['indices'].dtype) + sdfg.add_view('vdata', CSR.members['data'].shape, CSR.members['data'].dtype) + + # Make If + if_before = sdfg.add_state('if_before') + if_guard = sdfg.add_state('if_guard') + if_body = sdfg.add_state('if_body') + if_after = sdfg.add_state('if_after') + sdfg.add_edge(if_before, 
if_guard, dace.InterstateEdge()) + sdfg.add_edge(if_guard, if_body, dace.InterstateEdge(condition='A[i, j] != 0')) + sdfg.add_edge(if_body, if_after, dace.InterstateEdge(assignments={'idx': 'idx + 1'})) + sdfg.add_edge(if_guard, if_after, dace.InterstateEdge(condition='A[i, j] == 0')) + A = if_body.add_access('A') + B = if_body.add_access('B') + indices = if_body.add_access('vindices') + data = if_body.add_access('vdata') + if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) + if_body.add_edge(data, 'views', B, 'data', dace.Memlet(data='B.data', subset='0:nnz')) + t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') + if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='vindices', subset='idx')) + if_body.add_edge(indices, 'views', B, 'indices', dace.Memlet(data='B.indices', subset='0:nnz')) + # Make For Loop for j + j_before, j_guard, j_after = sdfg.add_loop(None, + if_before, + None, + 'j', + '0', + 'j < N', + 'j + 1', + loop_end_state=if_after) + # Make For Loop for i + i_before, i_guard, i_after = sdfg.add_loop(None, j_before, None, 'i', '0', 'i < M', 'i + 1', loop_end_state=j_after) + sdfg.start_state = sdfg.node_id(i_before) + i_before_guard = sdfg.edges_between(i_before, i_guard)[0] + i_before_guard.data.assignments['idx'] = '0' + B = i_guard.add_access('B') + indptr = i_guard.add_access('vindptr') + t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') + i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='i')) + i_guard.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) + B = i_after.add_access('B') + indptr = i_after.add_access('vindptr') + t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') + i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) + i_after.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) + + func = sdfg.compile() + + rng = np.random.default_rng(42) + tmp = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + A = tmp.toarray() + B = tmp.tocsr(copy=True) + B.indptr[:] = -1 + B.indices[:] = -1 + B.data[:] = -1 + + outB = CSR.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) + + func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) + + assert np.allclose(A, B.toarray()) + + +def test_read_nested_structure(): + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix', + indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz) + Wrapper = create_structure('WrapperClass', csr=CSR) + + sdfg = dace.SDFG('nested_csr_to_dense') + + sdfg.add_datadesc('A', Wrapper) + sdfg.add_array('B', [M, N], dace.float32) + + spmat = Wrapper.members['csr'] + sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) + sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) + sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) + + state = sdfg.add_state() + + A = state.add_access('A') + B = state.add_access('B') + + indptr = state.add_access('vindptr') + indices = state.add_access('vindices') + data = state.add_access('vdata') + + 
state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) + state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) + state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='vindices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + B = np.zeros((20, 20), dtype=np.float32) + + structclass = CSR.dtype._typeclass.as_ctypes() + inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + K=A.shape[1], + nnz=A.nnz) + import ctypes + inpW = Wrapper.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) + + func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) + ref = A.toarray() + + assert np.allclose(B, ref) + + +if __name__ == "__main__": + test_read_structure() + test_write_structure() + test_read_nested_structure() From 8365ab34926a01d65a67d93d1b1bbaf2e67eac11 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 20:25:26 +0200 Subject: [PATCH 289/392] Serialization fixes. 
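For illustration, a round-trip sketch (assuming an SDFG that registers a Structure under the name 'A', as in the tests): after deserialization, `_arrays` is rebuilt as a NestedDict, so dotted lookups keep working on the restored SDFG.

    json_sdfg = sdfg.to_json()
    sdfg2 = dace.SDFG.from_json(json_sdfg)
    assert 'A.indptr' in sdfg2.arrays   # a plain dict would not resolve the dotted name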
--- dace/sdfg/sdfg.py | 13 ++++++++++--- tests/sdfg/data/structure_test.py | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 6e4c3587f4..b5598870ec 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -50,8 +50,9 @@ class NestedDict(dict): - def __init__(self): - super(NestedDict, self).__init__() + def __init__(self, mapping=None): + mapping = mapping or {} + super(NestedDict, self).__init__(mapping) def __getitem__(self, key): tokens = key.split('.') @@ -89,6 +90,12 @@ def _arrays_from_json(obj, context=None): return {k: dace.serialize.from_json(v, context) for k, v in obj.items()} +def _nested_arrays_from_json(obj, context=None): + if obj is None: + return NestedDict({}) + return NestedDict({k: dace.serialize.from_json(v, context) for k, v in obj.items()}) + + def _replace_dict_keys(d, old, new): if old in d: if new in d: @@ -407,7 +414,7 @@ class SDFG(OrderedDiGraph[SDFGState, InterstateEdge]): _arrays = Property(dtype=NestedDict, desc="Data descriptors for this SDFG", to_json=_arrays_to_json, - from_json=_arrays_from_json) + from_json=_nested_arrays_from_json) symbols = DictProperty(str, dtypes.typeclass, desc="Global symbols for this SDFG") instrument = EnumProperty(dtype=dtypes.InstrumentationType, diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 3783a98068..5348ecaa5a 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -2,12 +2,29 @@ import dace import numpy as np +from dace import serialize +from dace.properties import make_properties from scipy import sparse def create_structure(name: str, **members) -> dace.data.Structure: StructureClass = type(name, (dace.data.Structure, ), {}) + + @staticmethod + def from_json(json_obj, context=None): + if json_obj['type'] != name: + raise TypeError("Invalid data type") + + # Create dummy object + ret = StructureClass({}) + serialize.set_properties_from_json(ret, json_obj, context=context) + + return ret + + setattr(StructureClass, 'from_json', from_json) + StructureClass = make_properties(StructureClass) + return StructureClass(members) From 14ba6655c883f2f0761ca4ccacfb722d82b7eac3 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 19 Jul 2023 20:29:36 +0200 Subject: [PATCH 290/392] Fixed NestedDict for non-str keys. 
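For illustration, a minimal sketch: non-string keys now fall through to a plain single-token lookup instead of failing on `key.split('.')`.

    from dace.sdfg.sdfg import NestedDict

    nd = NestedDict({'A': csr_obj})   # csr_obj: a Structure descriptor as in the tests
    assert 'A.indptr' in nd           # dotted string keys walk Structure members
    assert 42 not in nd               # non-string keys no longer raise AttributeError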
--- dace/sdfg/sdfg.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index b5598870ec..a4c29c2e89 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -55,16 +55,17 @@ def __init__(self, mapping=None): super(NestedDict, self).__init__(mapping) def __getitem__(self, key): - tokens = key.split('.') + tokens = key.split('.') if isinstance(key, str) else [key] token = tokens.pop(0) result = super(NestedDict, self).__getitem__(token) while tokens: token = tokens.pop(0) result = result.members[token] return result + def __contains__(self, key): - tokens = key.split('.') + tokens = key.split('.') if isinstance(key, str) else [key] token = tokens.pop(0) result = super(NestedDict, self).__contains__(token) desc = None From 7343f55e2dca5c06721c7b9bf3448b2ca0f1637e Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 20 Jul 2023 13:00:02 +0200 Subject: [PATCH 291/392] state.read_and_write_sets() take into account when read happens directly after write --- dace/sdfg/state.py | 16 +++++++++++++--- tests/sdfg/state_test.py | 23 +++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 tests/sdfg/state_test.py diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 0796bf00d0..0354dd107b 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -298,10 +298,11 @@ def scope_tree(self) -> 'dace.sdfg.scope.ScopeTree': # Get scopes for node, scopenodes in sdc.items(): + scope_exit_nodes = [v for v in scopenodes if isinstance(v, nd.ExitNode)] if node is None: exit_node = None else: - exit_node = next(v for v in scopenodes if isinstance(v, nd.ExitNode)) + exit_node = next(iter(scope_exit_nodes)) scope = ScopeTree(node, exit_node) result[node] = scope @@ -502,13 +503,22 @@ def _read_and_write_sets(self) -> Tuple[Dict[AnyStr, List[Subset]], Dict[AnyStr, # is read is not counted in the read set for n in utils.dfs_topological_sort(sg, sources=sg.source_nodes()): if isinstance(n, nd.AccessNode): - for e in sg.in_edges(n): + in_edges = sg.in_edges(n) + out_edges = sg.out_edges(n) + # Filter out memlets which go out but the same data is written to the AccessNode by another memlet + for out_edge in out_edges: + for in_edge in in_edges: + if in_edge.data.data == out_edge.data.data and \ + in_edge.data.dst_subset.covers(out_edge.data.src_subset): + out_edges.remove(out_edge) + + for e in in_edges: # skip empty memlets if e.data.is_empty(): continue # Store all subsets that have been written ws[n.data].append(e.data.subset) - for e in sg.out_edges(n): + for e in out_edges: # skip empty memlets if e.data.is_empty(): continue diff --git a/tests/sdfg/state_test.py b/tests/sdfg/state_test.py new file mode 100644 index 0000000000..07e2e8c4c7 --- /dev/null +++ b/tests/sdfg/state_test.py @@ -0,0 +1,23 @@ +import dace + + +def test_read_write_set(): + sdfg = dace.SDFG('graph') + A = sdfg.add_array('A', [10], dace.float64) + B = sdfg.add_array('B', [10], dace.float64) + C = sdfg.add_array('C', [10], dace.float64) + state = sdfg.add_state('state') + task1 = state.add_tasklet('work1', {'A'}, {'B'}, 'B = A + 1') + task2 = state.add_tasklet('work2', {'B'}, {'C'}, 'C = B + 1') + read_a = state.add_access('A') + rw_b = state.add_access('B') + write_c = state.add_access('C') + state.add_memlet_path(read_a, task1, dst_conn='A', memlet=dace.Memlet('A[2]')) + state.add_memlet_path(task1, rw_b, src_conn='B', memlet=dace.Memlet('B[2]')) + state.add_memlet_path(rw_b, task2, dst_conn='B', memlet=dace.Memlet('B[2]')) + 
state.add_memlet_path(task2, write_c, src_conn='C', memlet=dace.Memlet('C[2]')) + + assert 'B' not in state.read_and_write_sets()[0] + +if __name__ == '__main__': + test_read_write_set() From bb00fea35013b66c153d89c5ba31ede70c0120d2 Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 20 Jul 2023 14:07:48 +0200 Subject: [PATCH 292/392] Change RefineNestedAccess to only look at memlets which are in the read and write set --- dace/transformation/interstate/sdfg_nesting.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index 1b9324546a..71d9e22aca 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -925,7 +925,12 @@ def _candidates( continue # For now we only detect one element + read_set, write_set = nstate.read_and_write_sets() for e in nstate.in_edges(dnode): + if e.data.data not in write_set: + # Skip data which is not in the read and write set of the state -> there also won't be a + # connector + continue # If more than one unique element detected, remove from # candidates if e.data.data in out_candidates: @@ -941,6 +946,10 @@ def _candidates( continue out_candidates[e.data.data] = (e.data, nstate, set(range(len(e.data.subset)))) for e in nstate.out_edges(dnode): + if e.data.data not in read_set: + # Skip data which is not in the read and write set of the state -> there also won't be a + # connector + continue # If more than one unique element detected, remove from # candidates if e.data.data in in_candidates: From 80d6f10af1efe172560d64b976c451a91670b2fb Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 14:56:21 +0200 Subject: [PATCH 293/392] Added support for transient Structures. 
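For illustration, a sketch following test_local_structure (M, N, nnz are dace symbols and `sdfg` an existing SDFG): a transient Structure can now be registered like any other descriptor, and the CPU code generator allocates it with `new <struct type>()` plus per-member declarations.

    tmp_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1],
                                       indices=dace.int32[nnz],
                                       data=dace.float32[nnz],
                                       rows=M, cols=N, nnz=nnz),
                                  transient=True)
    sdfg.add_datadesc('tmp', tmp_obj)   # members inherit transient=True in __init__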
--- dace/codegen/targets/cpu.py | 16 ++++++++++++++-- dace/data.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 2759c9744c..7ff91cbc7b 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -286,16 +286,17 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d name = node.data alloc_name = cpp.ptr(name, nodedesc, sdfg, self._frame) name = alloc_name + alloc_name = alloc_name.replace('.', '->') if nodedesc.transient is False: return # Check if array is already allocated - if self._dispatcher.defined_vars.has(alloc_name): + if self._dispatcher.defined_vars.has(name): return # Check if array is already declared - declared = self._dispatcher.declared_arrays.has(alloc_name) + declared = self._dispatcher.declared_arrays.has(name) define_var = self._dispatcher.defined_vars.add if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): @@ -308,6 +309,17 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d if not isinstance(nodedesc.dtype, dtypes.opaque): arrsize_bytes = arrsize * nodedesc.dtype.bytes + if isinstance(nodedesc, data.Structure): + declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type}();\n") + define_var(name, DefinedType.Pointer, nodedesc.ctype) + for k, v in nodedesc.members.items(): + if isinstance(v, data.Data): + ctypedef = dtypes.pointer(v.dtype).ctype if isinstance(v, data.Array) else v.dtype.ctype + defined_type = DefinedType.Scalar if isinstance(v, data.Scalar) else DefinedType.Pointer + self._dispatcher.declared_arrays.add(f"{name}.{k}", defined_type, ctypedef) + self.allocate_array(sdfg, dfg, state_id, nodes.AccessNode(f"{name}.{k}"), v, function_stream, + declaration_stream, allocation_stream) + return if isinstance(nodedesc, data.View): return self.allocate_view(sdfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) if isinstance(nodedesc, data.Reference): diff --git a/dace/data.py b/dace/data.py index 0f1ef1f266..838fc43542 100644 --- a/dace/data.py +++ b/dace/data.py @@ -369,7 +369,10 @@ def __init__(self, location: Dict[str, str] = None, lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, debuginfo: dtypes.DebugInfo = None): + # TODO: Should we make a deep-copy here? self.members = members or {} + for k, v in self.members.items(): + v.transient = transient fields_and_types = dict() symbols = set() for k, v in members.items(): @@ -433,6 +436,31 @@ def __getitem__(self, s): return StructArray(self, (s, )) +@make_properties +class StructureView(Structure): + """ + Data descriptor that acts as a reference (or view) of another structure. + """ + + @staticmethod + def from_json(json_obj, context=None): + if json_obj['type'] != 'StructureView': + raise TypeError("Invalid data type") + + # Create dummy object + ret = StructureView({}) + serialize.set_properties_from_json(ret, json_obj, context=context) + + return ret + + def validate(self): + super().validate() + + # We ensure that allocation lifetime is always set to Scope, since the + # view is generated upon "allocation" + if self.lifetime != dtypes.AllocationLifetime.Scope: + raise ValueError('Only Scope allocation lifetime is supported for Views') + @make_properties class Scalar(Data): """ Data descriptor of a scalar value. 
""" From 9658c2236b7ba154bccbbd3b839944f4f88c2668 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 14:56:40 +0200 Subject: [PATCH 294/392] Edited tests. --- tests/sdfg/data/structure_test.py | 346 +++++++++++++++++++++++++++--- 1 file changed, 321 insertions(+), 25 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 5348ecaa5a..462c6a8e7b 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -7,7 +7,7 @@ from scipy import sparse -def create_structure(name: str, **members) -> dace.data.Structure: +def create_structure(name: str) -> dace.data.Structure: StructureClass = type(name, (dace.data.Structure, ), {}) @@ -25,28 +25,28 @@ def from_json(json_obj, context=None): setattr(StructureClass, 'from_json', from_json) StructureClass = make_properties(StructureClass) - return StructureClass(members) + return StructureClass def test_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix', - indptr=dace.int32[M + 1], + CSR = create_structure('CSRMatrix') + csr_obj = CSR(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz], rows=M, cols=N, - nnz=nnz) + nnz=nnz)) sdfg = dace.SDFG('csr_to_dense') - sdfg.add_datadesc('A', CSR) + sdfg.add_datadesc('A', csr_obj) sdfg.add_array('B', [M, N], dace.float32) - sdfg.add_view('vindptr', CSR.members['indptr'].shape, CSR.members['indptr'].dtype) - sdfg.add_view('vindices', CSR.members['indices'].shape, CSR.members['indices'].dtype) - sdfg.add_view('vdata', CSR.members['data'].shape, CSR.members['data'].dtype) + sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) state = sdfg.add_state() @@ -57,9 +57,9 @@ def test_read_structure(): indices = state.add_access('vindices') data = state.add_access('vdata') - state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.indptr', CSR.members['indptr'])) - state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.indices', CSR.members['indices'])) - state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.data', CSR.members['data'])) + state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.indptr', csr_obj.members['indptr'])) + state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.indices', csr_obj.members['indices'])) + state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.data', csr_obj.members['data'])) ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) @@ -79,7 +79,7 @@ def test_read_structure(): A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) B = np.zeros((20, 20), dtype=np.float32) - inpA = CSR.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], + inpA = csr_obj.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], indices=A.indices.__array_interface__['data'][0], data=A.data.__array_interface__['data'][0], rows=A.shape[0], @@ -97,22 +97,22 @@ def test_read_structure(): def test_write_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix', - indptr=dace.int32[M + 1], + CSR = create_structure('CSRMatrix') + csr_obj = 
CSR(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz], rows=M, cols=N, - nnz=nnz) + nnz=nnz)) sdfg = dace.SDFG('dense_to_csr') sdfg.add_array('A', [M, N], dace.float32) - sdfg.add_datadesc('B', CSR) + sdfg.add_datadesc('B', csr_obj) - sdfg.add_view('vindptr', CSR.members['indptr'].shape, CSR.members['indptr'].dtype) - sdfg.add_view('vindices', CSR.members['indices'].shape, CSR.members['indices'].dtype) - sdfg.add_view('vdata', CSR.members['data'].shape, CSR.members['data'].dtype) + sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) # Make If if_before = sdfg.add_state('if_before') @@ -167,7 +167,7 @@ def test_write_structure(): B.indices[:] = -1 B.data[:] = -1 - outB = CSR.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], + outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], indices=B.indices.__array_interface__['data'][0], data=B.data.__array_interface__['data'][0], rows=tmp.shape[0], @@ -181,7 +181,204 @@ def test_write_structure(): assert np.allclose(A, B.toarray()) +def test_local_structure(): + + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix') + csr_obj = CSR(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz)) + tmp_obj = CSR(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), transient=True) + + sdfg = dace.SDFG('dense_to_csr') + + sdfg.add_array('A', [M, N], dace.float32) + sdfg.add_datadesc('B', csr_obj) + sdfg.add_datadesc('tmp', tmp_obj) + + sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) + + sdfg.add_view('tmp_vindptr', tmp_obj.members['indptr'].shape, tmp_obj.members['indptr'].dtype) + sdfg.add_view('tmp_vindices', tmp_obj.members['indices'].shape, tmp_obj.members['indices'].dtype) + sdfg.add_view('tmp_vdata', tmp_obj.members['data'].shape, tmp_obj.members['data'].dtype) + + # Make If + if_before = sdfg.add_state('if_before') + if_guard = sdfg.add_state('if_guard') + if_body = sdfg.add_state('if_body') + if_after = sdfg.add_state('if_after') + sdfg.add_edge(if_before, if_guard, dace.InterstateEdge()) + sdfg.add_edge(if_guard, if_body, dace.InterstateEdge(condition='A[i, j] != 0')) + sdfg.add_edge(if_body, if_after, dace.InterstateEdge(assignments={'idx': 'idx + 1'})) + sdfg.add_edge(if_guard, if_after, dace.InterstateEdge(condition='A[i, j] == 0')) + A = if_body.add_access('A') + tmp = if_body.add_access('tmp') + indices = if_body.add_access('tmp_vindices') + data = if_body.add_access('tmp_vdata') + if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) + if_body.add_edge(data, 'views', tmp, 'data', dace.Memlet(data='tmp.data', subset='0:nnz')) + t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') + if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='tmp_vindices', subset='idx')) + if_body.add_edge(indices, 'views', tmp, 'indices', dace.Memlet(data='tmp.indices', subset='0:nnz')) + # Make For Loop for j + 
j_before, j_guard, j_after = sdfg.add_loop(None, + if_before, + None, + 'j', + '0', + 'j < N', + 'j + 1', + loop_end_state=if_after) + # Make For Loop for i + i_before, i_guard, i_after = sdfg.add_loop(None, j_before, None, 'i', '0', 'i < M', 'i + 1', loop_end_state=j_after) + sdfg.start_state = sdfg.node_id(i_before) + i_before_guard = sdfg.edges_between(i_before, i_guard)[0] + i_before_guard.data.assignments['idx'] = '0' + tmp = i_guard.add_access('tmp') + indptr = i_guard.add_access('tmp_vindptr') + t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') + i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='tmp_vindptr', subset='i')) + i_guard.add_edge(indptr, 'views', tmp, 'indptr', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + tmp = i_after.add_access('tmp') + indptr = i_after.add_access('tmp_vindptr') + t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') + i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='tmp_vindptr', subset='M')) + i_after.add_edge(indptr, 'views', tmp, 'indptr', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + + set_B = sdfg.add_state('set_B') + sdfg.add_edge(i_after, set_B, dace.InterstateEdge()) + tmp = set_B.add_access('tmp') + tmp_indptr = set_B.add_access('tmp_vindptr') + tmp_indices = set_B.add_access('tmp_vindices') + tmp_data = set_B.add_access('tmp_vdata') + set_B.add_edge(tmp, 'indptr', tmp_indptr, 'views', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + set_B.add_edge(tmp, 'indices', tmp_indices, 'views', dace.Memlet(data='tmp.indices', subset='0:nnz')) + set_B.add_edge(tmp, 'data', tmp_data, 'views', dace.Memlet(data='tmp.data', subset='0:nnz')) + B = set_B.add_access('B') + B_indptr = set_B.add_access('vindptr') + B_indices = set_B.add_access('vindices') + B_data = set_B.add_access('vdata') + set_B.add_edge(B_indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) + set_B.add_edge(B_indices, 'views', B, 'indices', dace.Memlet(data='B.indices', subset='0:nnz')) + set_B.add_edge(B_data, 'views', B, 'data', dace.Memlet(data='B.data', subset='0:nnz')) + set_B.add_edge(tmp_indptr, None, B_indptr, None, dace.Memlet(data='tmp_vindptr', subset='0:M+1')) + set_B.add_edge(tmp_indices, None, B_indices, None, dace.Memlet(data='tmp_vindices', subset='0:nnz')) + t, me, mx = set_B.add_mapped_tasklet('set_data', + {'idx': '0:nnz'}, + {'__inp': dace.Memlet(data='tmp_vdata', subset='idx')}, + '__out = 2 * __inp', + {'__out': dace.Memlet(data='vdata', subset='idx')}, + external_edges=True, + input_nodes={'tmp_vdata': tmp_data}, + output_nodes={'vdata': B_data}) + + + func = sdfg.compile() + + rng = np.random.default_rng(42) + tmp = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + A = tmp.toarray() + B = tmp.tocsr(copy=True) + B.indptr[:] = -1 + B.indices[:] = -1 + B.data[:] = -1 + + outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) + + func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) + + assert np.allclose(A * 2, B.toarray()) + + def test_read_nested_structure(): + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix') + csr_obj = CSR(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz)) + Wrapper = 
create_structure('WrapperClass') + wrapper_obj = Wrapper(dict(csr=csr_obj)) + + sdfg = dace.SDFG('nested_csr_to_dense') + + sdfg.add_datadesc('A', wrapper_obj) + sdfg.add_array('B', [M, N], dace.float32) + + spmat = wrapper_obj.members['csr'] + sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) + sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) + sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) + + state = sdfg.add_state() + + A = state.add_access('A') + B = state.add_access('B') + + indptr = state.add_access('vindptr') + indices = state.add_access('vindices') + data = state.add_access('vdata') + + state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) + state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) + state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='vindices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + B = np.zeros((20, 20), dtype=np.float32) + + structclass = csr_obj.dtype._typeclass.as_ctypes() + inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + K=A.shape[1], + nnz=A.nnz) + import ctypes + inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) + + func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) + ref = A.toarray() + + assert np.allclose(B, ref) + + +def test_read_nested_structure_2(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) CSR = create_structure('CSRMatrix', indptr=dace.int32[M + 1], @@ -190,14 +387,16 @@ def test_read_nested_structure(): rows=M, cols=N, nnz=nnz) + CSRView = dace.data.StructureView(CSR.members, transient=True) Wrapper = create_structure('WrapperClass', csr=CSR) - sdfg = dace.SDFG('nested_csr_to_dense') + sdfg = dace.SDFG('nested_csr_to_dense_2') sdfg.add_datadesc('A', Wrapper) sdfg.add_array('B', [M, N], dace.float32) spmat = Wrapper.members['csr'] + sdfg.add_datadesc('vcsr', CSRView) sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) @@ -207,13 +406,15 @@ def test_read_nested_structure(): A = state.add_access('A') B = state.add_access('B') + csr = state.add_access('vcsr') indptr = 
state.add_access('vindptr') indices = state.add_access('vindices') data = state.add_access('vdata') - state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) - state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) - state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) + state.add_edge(A, 'csr', csr, 'views', dace.Memlet.from_array('A.csr', spmat)) + state.add_edge(csr, 'indptr', indptr, 'views', dace.Memlet.from_array('vcsr.indptr', spmat.members['indptr'])) + state.add_edge(csr, 'indices', indices, 'views', dace.Memlet.from_array('vcsr.indices', spmat.members['indices'])) + state.add_edge(csr, 'data', data, 'views', dace.Memlet.from_array('vcsr.data', spmat.members['data'])) ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) @@ -251,7 +452,102 @@ def test_read_nested_structure(): assert np.allclose(B, ref) +def test_write_nested_structure(): + + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + CSR = create_structure('CSRMatrix') + csr_obj = CSR(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz)) + Wrapper = create_structure('WrapperClass') + wrapper_obj = Wrapper(dict(csr=csr_obj)) + + sdfg = dace.SDFG('dense_to_csr') + + sdfg.add_array('A', [M, N], dace.float32) + sdfg.add_datadesc('B', wrapper_obj) + + spmat = wrapper_obj.members['csr'] + sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) + sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) + sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) + + # Make If + if_before = sdfg.add_state('if_before') + if_guard = sdfg.add_state('if_guard') + if_body = sdfg.add_state('if_body') + if_after = sdfg.add_state('if_after') + sdfg.add_edge(if_before, if_guard, dace.InterstateEdge()) + sdfg.add_edge(if_guard, if_body, dace.InterstateEdge(condition='A[i, j] != 0')) + sdfg.add_edge(if_body, if_after, dace.InterstateEdge(assignments={'idx': 'idx + 1'})) + sdfg.add_edge(if_guard, if_after, dace.InterstateEdge(condition='A[i, j] == 0')) + A = if_body.add_access('A') + B = if_body.add_access('B') + indices = if_body.add_access('vindices') + data = if_body.add_access('vdata') + if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) + if_body.add_edge(data, 'views', B, 'data', dace.Memlet(data='B.csr.data', subset='0:nnz')) + t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') + if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='vindices', subset='idx')) + if_body.add_edge(indices, 'views', B, 'indices', dace.Memlet(data='B.csr.indices', subset='0:nnz')) + # Make For Loop for j + j_before, j_guard, j_after = sdfg.add_loop(None, + if_before, + None, + 'j', + '0', + 'j < N', + 'j + 1', + loop_end_state=if_after) + # Make For Loop for i + i_before, i_guard, i_after = sdfg.add_loop(None, j_before, None, 'i', '0', 'i < M', 'i + 1', loop_end_state=j_after) + sdfg.start_state = sdfg.node_id(i_before) + i_before_guard = sdfg.edges_between(i_before, i_guard)[0] + i_before_guard.data.assignments['idx'] = '0' + B = i_guard.add_access('B') + indptr = i_guard.add_access('vindptr') + t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') + i_guard.add_edge(t, '__out', indptr, None, 
dace.Memlet(data='vindptr', subset='i')) + i_guard.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.csr.indptr', subset='0:M+1')) + B = i_after.add_access('B') + indptr = i_after.add_access('vindptr') + t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') + i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) + i_after.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.csr.indptr', subset='0:M+1')) + + func = sdfg.compile() + + rng = np.random.default_rng(42) + tmp = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + A = tmp.toarray() + B = tmp.tocsr(copy=True) + B.indptr[:] = -1 + B.indices[:] = -1 + B.data[:] = -1 + + outCSR = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) + import ctypes + outW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(outCSR)) + + func(A=A, B=outW, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) + + assert np.allclose(A, B.toarray()) + + if __name__ == "__main__": test_read_structure() test_write_structure() + test_local_structure() test_read_nested_structure() + # test_read_nested_structure_2() + test_write_nested_structure() From b1dbb6b385c5186ac16b5be1ea3d394953c6bf17 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 15:32:40 +0200 Subject: [PATCH 295/392] Structures have name attribute (instead of subclassing). --- dace/data.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/dace/data.py b/dace/data.py index 838fc43542..e424aca66a 100644 --- a/dace/data.py +++ b/dace/data.py @@ -361,9 +361,11 @@ class Structure(Data): desc="Dictionary of structure members", from_json=_arrays_from_json, to_json=_arrays_to_json) + name = Property(dtype=str, desc="Structure name") def __init__(self, members: Dict[str, Any], + name: str = 'Structure', transient: bool = False, storage: dtypes.StorageType = dtypes.StorageType.Default, location: Dict[str, str] = None, @@ -373,6 +375,7 @@ def __init__(self, self.members = members or {} for k, v in self.members.items(): v.transient = transient + self.name = name fields_and_types = dict() symbols = set() for k, v in members.items(): @@ -399,9 +402,20 @@ def __init__(self, fields_and_types[str(s)] = s.dtype else: fields_and_types[str(s)] = dtypes.int32 - dtype = dtypes.pointer(dtypes.struct(self.__class__.__name__, **fields_and_types)) + dtype = dtypes.pointer(dtypes.struct(name, **fields_and_types)) shape = (1,) super(Structure, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) + + @staticmethod + def from_json(json_obj, context=None): + if json_obj['type'] != 'Structure': + raise TypeError("Invalid data type") + + # Create dummy object + ret = Structure({}) + serialize.set_properties_from_json(ret, json_obj, context=context) + + return ret @property def total_size(self): From 5de2ae35d25b9f78eeecb0080504be34b6577cec Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 15:32:59 +0200 Subject: [PATCH 296/392] Updated tests. 
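The tests below switch from the generated Structure subclasses to instantiating dace.data.Structure directly, passing the type name through the new `name` argument. A minimal sketch of the new construction pattern, with the member shapes used throughout these tests:

    import dace
    M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz'))
    csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1],
                                       indices=dace.int32[nnz],
                                       data=dace.float32[nnz],
                                       rows=M, cols=N, nnz=nnz),
                                  name='CSRMatrix')  # replaces the former subclass-based helper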
--- tests/sdfg/data/structure_test.py | 192 +++++++++++++++--------------- 1 file changed, 96 insertions(+), 96 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 462c6a8e7b..b3d72b9d7a 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -1,6 +1,7 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace import numpy as np +import pytest from dace import serialize from dace.properties import make_properties @@ -21,7 +22,7 @@ def from_json(json_obj, context=None): serialize.set_properties_from_json(ret, json_obj, context=context) return ret - + setattr(StructureClass, 'from_json', from_json) StructureClass = make_properties(StructureClass) @@ -31,13 +32,13 @@ def from_json(json_obj, context=None): def test_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix') - csr_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz)) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense') @@ -80,13 +81,13 @@ def test_read_structure(): B = np.zeros((20, 20), dtype=np.float32) inpA = csr_obj.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], - indices=A.indices.__array_interface__['data'][0], - data=A.data.__array_interface__['data'][0], - rows=A.shape[0], - cols=A.shape[1], - M=A.shape[0], - N=A.shape[1], - nnz=A.nnz) + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + N=A.shape[1], + nnz=A.nnz) func(A=inpA, B=B, M=20, N=20, nnz=A.nnz) ref = A.toarray() @@ -97,13 +98,13 @@ def test_read_structure(): def test_write_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix') - csr_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz)) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') sdfg = dace.SDFG('dense_to_csr') @@ -168,13 +169,13 @@ def test_write_structure(): B.data[:] = -1 outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], - indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) @@ -182,23 +183,25 @@ def test_write_structure(): def test_local_structure(): - - M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix') - csr_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz)) - tmp_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), transient=True) - sdfg = dace.SDFG('dense_to_csr') + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + csr_obj = 
dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + tmp_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix', + transient=True) + + sdfg = dace.SDFG('dense_to_csr_local') sdfg.add_array('A', [M, N], dace.float32) sdfg.add_datadesc('B', csr_obj) @@ -273,16 +276,13 @@ def test_local_structure(): set_B.add_edge(B_data, 'views', B, 'data', dace.Memlet(data='B.data', subset='0:nnz')) set_B.add_edge(tmp_indptr, None, B_indptr, None, dace.Memlet(data='tmp_vindptr', subset='0:M+1')) set_B.add_edge(tmp_indices, None, B_indices, None, dace.Memlet(data='tmp_vindices', subset='0:nnz')) - t, me, mx = set_B.add_mapped_tasklet('set_data', - {'idx': '0:nnz'}, + t, me, mx = set_B.add_mapped_tasklet('set_data', {'idx': '0:nnz'}, {'__inp': dace.Memlet(data='tmp_vdata', subset='idx')}, - '__out = 2 * __inp', - {'__out': dace.Memlet(data='vdata', subset='idx')}, + '__out = 2 * __inp', {'__out': dace.Memlet(data='vdata', subset='idx')}, external_edges=True, input_nodes={'tmp_vdata': tmp_data}, output_nodes={'vdata': B_data}) - func = sdfg.compile() rng = np.random.default_rng(42) @@ -294,13 +294,13 @@ def test_local_structure(): B.data[:] = -1 outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], - indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) @@ -309,15 +309,14 @@ def test_local_structure(): def test_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix') - csr_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz)) - Wrapper = create_structure('WrapperClass') - wrapper_obj = Wrapper(dict(csr=csr_obj)) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') sdfg = dace.SDFG('nested_csr_to_dense') @@ -378,24 +377,25 @@ def test_read_nested_structure(): assert np.allclose(B, ref) +@pytest.mark.skip def test_read_nested_structure_2(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix', - indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz) - CSRView = dace.data.StructureView(CSR.members, transient=True) - Wrapper = create_structure('WrapperClass', csr=CSR) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + CSRView = dace.data.StructureView(csr_obj.members, transient=True) + wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') sdfg = dace.SDFG('nested_csr_to_dense_2') - sdfg.add_datadesc('A', Wrapper) + sdfg.add_datadesc('A', wrapper_obj) sdfg.add_array('B', [M, N], dace.float32) - spmat = Wrapper.members['csr'] + spmat = 
wrapper_obj.members['csr'] sdfg.add_datadesc('vcsr', CSRView) sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) @@ -428,13 +428,14 @@ def test_read_nested_structure_2(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) B = np.zeros((20, 20), dtype=np.float32) - structclass = CSR.dtype._typeclass.as_ctypes() + structclass = csr_obj.dtype._typeclass.as_ctypes() inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], indices=A.indices.__array_interface__['data'][0], data=A.data.__array_interface__['data'][0], @@ -444,7 +445,7 @@ def test_read_nested_structure_2(): K=A.shape[1], nnz=A.nnz) import ctypes - inpW = Wrapper.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) + inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) ref = A.toarray() @@ -455,15 +456,14 @@ def test_read_nested_structure_2(): def test_write_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - CSR = create_structure('CSRMatrix') - csr_obj = CSR(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz)) - Wrapper = create_structure('WrapperClass') - wrapper_obj = Wrapper(dict(csr=csr_obj)) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') sdfg = dace.SDFG('dense_to_csr') @@ -529,13 +529,13 @@ def test_write_nested_structure(): B.data[:] = -1 outCSR = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], - indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + indices=B.indices.__array_interface__['data'][0], + data=B.data.__array_interface__['data'][0], + rows=tmp.shape[0], + cols=tmp.shape[1], + M=tmp.shape[0], + N=tmp.shape[1], + nnz=tmp.nnz) import ctypes outW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(outCSR)) @@ -549,5 +549,5 @@ def test_write_nested_structure(): test_write_structure() test_local_structure() test_read_nested_structure() - # test_read_nested_structure_2() + test_read_nested_structure_2() test_write_nested_structure() From 1fbc45f66ebcff4979f7cb05566de56b70e2b1b9 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 20:08:26 +0200 Subject: [PATCH 297/392] Removed nested data connectors. 
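Structure access nodes no longer use per-member connectors for views; the viewed member is addressed only through the memlet data, and the structure-side connector is None. Sketch of the change, excerpted from test_read_structure below (state, A, indptr and csr_obj as defined there):

    # before: member name as a connector on the structure access node
    state.add_edge(A, 'indptr', indptr, 'views',
                   dace.Memlet.from_array('A.indptr', csr_obj.members['indptr']))
    # after: no connector on the structure side; the member is named in the memlet
    state.add_edge(A, None, indptr, 'views',
                   dace.Memlet.from_array('A.indptr', csr_obj.members['indptr']))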
--- tests/sdfg/data/structure_test.py | 56 +++++++++++++++++-------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index b3d72b9d7a..8636dc1602 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -58,9 +58,9 @@ def test_read_structure(): indices = state.add_access('vindices') data = state.add_access('vdata') - state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.indptr', csr_obj.members['indptr'])) - state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.indices', csr_obj.members['indices'])) - state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.data', csr_obj.members['data'])) + state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.indptr', csr_obj.members['indptr'])) + state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.indices', csr_obj.members['indices'])) + state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.data', csr_obj.members['data'])) ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) @@ -74,6 +74,7 @@ def test_read_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -129,10 +130,10 @@ def test_write_structure(): indices = if_body.add_access('vindices') data = if_body.add_access('vdata') if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) - if_body.add_edge(data, 'views', B, 'data', dace.Memlet(data='B.data', subset='0:nnz')) + if_body.add_edge(data, 'views', B, None, dace.Memlet(data='B.data', subset='0:nnz')) t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='vindices', subset='idx')) - if_body.add_edge(indices, 'views', B, 'indices', dace.Memlet(data='B.indices', subset='0:nnz')) + if_body.add_edge(indices, 'views', B, None, dace.Memlet(data='B.indices', subset='0:nnz')) # Make For Loop for j j_before, j_guard, j_after = sdfg.add_loop(None, if_before, @@ -151,13 +152,14 @@ def test_write_structure(): indptr = i_guard.add_access('vindptr') t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='i')) - i_guard.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) + i_guard.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.indptr', subset='0:M+1')) B = i_after.add_access('B') indptr = i_after.add_access('vindptr') t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) - i_after.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) + i_after.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.indptr', subset='0:M+1')) + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -229,10 +231,10 @@ def test_local_structure(): indices = if_body.add_access('tmp_vindices') data = if_body.add_access('tmp_vdata') if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) - if_body.add_edge(data, 'views', tmp, 'data', dace.Memlet(data='tmp.data', 
subset='0:nnz')) + if_body.add_edge(data, 'views', tmp, None, dace.Memlet(data='tmp.data', subset='0:nnz')) t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='tmp_vindices', subset='idx')) - if_body.add_edge(indices, 'views', tmp, 'indices', dace.Memlet(data='tmp.indices', subset='0:nnz')) + if_body.add_edge(indices, 'views', tmp, None, dace.Memlet(data='tmp.indices', subset='0:nnz')) # Make For Loop for j j_before, j_guard, j_after = sdfg.add_loop(None, if_before, @@ -251,12 +253,12 @@ def test_local_structure(): indptr = i_guard.add_access('tmp_vindptr') t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='tmp_vindptr', subset='i')) - i_guard.add_edge(indptr, 'views', tmp, 'indptr', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + i_guard.add_edge(indptr, 'views', tmp, None, dace.Memlet(data='tmp.indptr', subset='0:M+1')) tmp = i_after.add_access('tmp') indptr = i_after.add_access('tmp_vindptr') t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='tmp_vindptr', subset='M')) - i_after.add_edge(indptr, 'views', tmp, 'indptr', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + i_after.add_edge(indptr, 'views', tmp, None, dace.Memlet(data='tmp.indptr', subset='0:M+1')) set_B = sdfg.add_state('set_B') sdfg.add_edge(i_after, set_B, dace.InterstateEdge()) @@ -264,16 +266,16 @@ def test_local_structure(): tmp_indptr = set_B.add_access('tmp_vindptr') tmp_indices = set_B.add_access('tmp_vindices') tmp_data = set_B.add_access('tmp_vdata') - set_B.add_edge(tmp, 'indptr', tmp_indptr, 'views', dace.Memlet(data='tmp.indptr', subset='0:M+1')) - set_B.add_edge(tmp, 'indices', tmp_indices, 'views', dace.Memlet(data='tmp.indices', subset='0:nnz')) - set_B.add_edge(tmp, 'data', tmp_data, 'views', dace.Memlet(data='tmp.data', subset='0:nnz')) + set_B.add_edge(tmp, None, tmp_indptr, 'views', dace.Memlet(data='tmp.indptr', subset='0:M+1')) + set_B.add_edge(tmp, None, tmp_indices, 'views', dace.Memlet(data='tmp.indices', subset='0:nnz')) + set_B.add_edge(tmp, None, tmp_data, 'views', dace.Memlet(data='tmp.data', subset='0:nnz')) B = set_B.add_access('B') B_indptr = set_B.add_access('vindptr') B_indices = set_B.add_access('vindices') B_data = set_B.add_access('vdata') - set_B.add_edge(B_indptr, 'views', B, 'indptr', dace.Memlet(data='B.indptr', subset='0:M+1')) - set_B.add_edge(B_indices, 'views', B, 'indices', dace.Memlet(data='B.indices', subset='0:nnz')) - set_B.add_edge(B_data, 'views', B, 'data', dace.Memlet(data='B.data', subset='0:nnz')) + set_B.add_edge(B_indptr, 'views', B, None, dace.Memlet(data='B.indptr', subset='0:M+1')) + set_B.add_edge(B_indices, 'views', B, None, dace.Memlet(data='B.indices', subset='0:nnz')) + set_B.add_edge(B_data, 'views', B, None, dace.Memlet(data='B.data', subset='0:nnz')) set_B.add_edge(tmp_indptr, None, B_indptr, None, dace.Memlet(data='tmp_vindptr', subset='0:M+1')) set_B.add_edge(tmp_indices, None, B_indices, None, dace.Memlet(data='tmp_vindices', subset='0:nnz')) t, me, mx = set_B.add_mapped_tasklet('set_data', {'idx': '0:nnz'}, @@ -283,6 +285,7 @@ def test_local_structure(): input_nodes={'tmp_vdata': tmp_data}, output_nodes={'vdata': B_data}) + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -337,9 +340,9 @@ def test_read_nested_structure(): indices = state.add_access('vindices') data = state.add_access('vdata') - 
state.add_edge(A, 'indptr', indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) - state.add_edge(A, 'indices', indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) - state.add_edge(A, 'data', data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) + state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) + state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) + state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) @@ -353,6 +356,7 @@ def test_read_nested_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -429,6 +433,7 @@ def test_read_nested_structure_2(): state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') sdfg.view() + return func = sdfg.compile() rng = np.random.default_rng(42) @@ -489,10 +494,10 @@ def test_write_nested_structure(): indices = if_body.add_access('vindices') data = if_body.add_access('vdata') if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='i, j', other_subset='idx')) - if_body.add_edge(data, 'views', B, 'data', dace.Memlet(data='B.csr.data', subset='0:nnz')) + if_body.add_edge(data, 'views', B, None, dace.Memlet(data='B.csr.data', subset='0:nnz')) t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='vindices', subset='idx')) - if_body.add_edge(indices, 'views', B, 'indices', dace.Memlet(data='B.csr.indices', subset='0:nnz')) + if_body.add_edge(indices, 'views', B, None, dace.Memlet(data='B.csr.indices', subset='0:nnz')) # Make For Loop for j j_before, j_guard, j_after = sdfg.add_loop(None, if_before, @@ -511,13 +516,14 @@ def test_write_nested_structure(): indptr = i_guard.add_access('vindptr') t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='i')) - i_guard.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.csr.indptr', subset='0:M+1')) + i_guard.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.csr.indptr', subset='0:M+1')) B = i_after.add_access('B') indptr = i_after.add_access('vindptr') t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) - i_after.add_edge(indptr, 'views', B, 'indptr', dace.Memlet(data='B.csr.indptr', subset='0:M+1')) + i_after.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.csr.indptr', subset='0:M+1')) + sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -549,5 +555,5 @@ def test_write_nested_structure(): test_write_structure() test_local_structure() test_read_nested_structure() - test_read_nested_structure_2() + # test_read_nested_structure_2() test_write_nested_structure() From 6fa7e53ea4c39752c60b386895a6c9ba4a542b80 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 20:27:41 +0200 Subject: [PATCH 298/392] Added support for direct access to nested 
data. --- dace/codegen/targets/cpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 7ff91cbc7b..137de75c55 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -1169,6 +1169,7 @@ def memlet_definition(self, if not types: types = self._dispatcher.defined_vars.get(ptr, is_global=True) var_type, ctypedef = types + ptr = ptr.replace('.', '->') if fpga.is_fpga_array(desc): decouple_array_interfaces = Config.get_bool("compiler", "xilinx", "decouple_array_interfaces") From 71d7c3db0f2391b79281a89732b64d0d4b861e14 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 20:28:20 +0200 Subject: [PATCH 299/392] WIP: Add nested data free symbols to SDFG. --- dace/sdfg/sdfg.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index a4c29c2e89..1f385a4b75 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2005,10 +2005,20 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str raise NameError(f'Array or Stream with name "{name}" already exists in SDFG') self._arrays[name] = datadesc + def _add_symbols(desc: dt.Data): + if isinstance(desc, dt.Structure): + for v in desc.members.values(): + if isinstance(v, dt.Data): + _add_symbols(v) + for sym in desc.free_symbols: + if sym.name not in self.symbols: + self.add_symbol(sym.name, sym.dtype) + # Add free symbols to the SDFG global symbol storage - for sym in datadesc.free_symbols: - if sym.name not in self.symbols: - self.add_symbol(sym.name, sym.dtype) + # for sym in datadesc.free_symbols: + # if sym.name not in self.symbols: + # self.add_symbol(sym.name, sym.dtype) + _add_symbols(datadesc) return name From e0a4409ff4a2b909f901f1a1592d3b9669387807 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 20:29:13 +0200 Subject: [PATCH 300/392] Added test for direct nested data access. 
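The new test reads structure members through access nodes named with dotted paths (e.g. 'A.indptr'), without creating intermediate view containers. Excerpted pattern from the test added below (the map entries ime and jme are defined there):

    indptr = state.add_access('A.indptr')
    state.add_memlet_path(indptr, ime, jme,
                          memlet=dace.Memlet(data='A.indptr', subset='i'), dst_conn='start')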
--- tests/sdfg/data/structure_test.py | 82 ++++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 6 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 8636dc1602..3116a5764a 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -550,10 +550,80 @@ def test_write_nested_structure(): assert np.allclose(A, B.toarray()) +def test_direct_read_structure(): + + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + + sdfg = dace.SDFG('csr_to_dense_direct') + + sdfg.add_datadesc('A', csr_obj) + sdfg.add_array('B', [M, N], dace.float32) + + # sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + # sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + # sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) + + state = sdfg.add_state() + + # A = state.add_access('A') + indptr = state.add_access('A.indptr') + indices = state.add_access('A.indices') + data = state.add_access('A.data') + B = state.add_access('B') + + # indptr = state.add_access('vindptr') + # indices = state.add_access('vindices') + # data = state.add_access('vdata') + + # state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.indptr', csr_obj.members['indptr'])) + # state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.indices', csr_obj.members['indices'])) + # state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.data', csr_obj.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='A.indptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='A.indptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='A.indices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='A.data', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + + sdfg.view() + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + B = np.zeros((20, 20), dtype=np.float32) + + inpA = csr_obj.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + N=A.shape[1], + nnz=A.nnz) + + func(A=inpA, B=B, M=20, N=20, nnz=A.nnz) + ref = A.toarray() + + assert np.allclose(B, ref) + + if __name__ == "__main__": - test_read_structure() - test_write_structure() - test_local_structure() - test_read_nested_structure() - # test_read_nested_structure_2() - test_write_nested_structure() + # test_read_structure() + # test_write_structure() + # test_local_structure() + # test_read_nested_structure() + # test_write_nested_structure() + test_direct_read_structure() From 
0593ea4f1a86951b210c727c95931ca3664f7423 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 20 Jul 2023 20:55:42 +0200 Subject: [PATCH 301/392] Added test for direct double-nested data accesses. --- tests/sdfg/data/structure_test.py | 75 +++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 3116a5764a..91429e8bbc 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -620,6 +620,80 @@ def test_direct_read_structure(): assert np.allclose(B, ref) +def test_direct_read_nested_structure(): + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], + indices=dace.int32[nnz], + data=dace.float32[nnz], + rows=M, + cols=N, + nnz=nnz), + name='CSRMatrix') + wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') + + sdfg = dace.SDFG('nested_csr_to_dense_direct') + + sdfg.add_datadesc('A', wrapper_obj) + sdfg.add_array('B', [M, N], dace.float32) + + spmat = wrapper_obj.members['csr'] + sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) + sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) + sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) + + state = sdfg.add_state() + + # A = state.add_access('A') + indptr = state.add_access('A.csr.indptr') + indices = state.add_access('A.csr.indices') + data = state.add_access('A.csr.data') + B = state.add_access('B') + + # indptr = state.add_access('vindptr') + # indices = state.add_access('vindices') + # data = state.add_access('vdata') + + # state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) + # state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) + # state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='A.csr.indptr', subset='i'), dst_conn='start') + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='A.csr.indptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='A.csr.indices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='A.csr.data', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') + + sdfg.view() + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + B = np.zeros((20, 20), dtype=np.float32) + + structclass = csr_obj.dtype._typeclass.as_ctypes() + inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], + indices=A.indices.__array_interface__['data'][0], + data=A.data.__array_interface__['data'][0], + rows=A.shape[0], + cols=A.shape[1], + M=A.shape[0], + K=A.shape[1], + nnz=A.nnz) + import ctypes + inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) + + func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) + ref = 
A.toarray() + + assert np.allclose(B, ref) + + if __name__ == "__main__": # test_read_structure() # test_write_structure() @@ -627,3 +701,4 @@ def test_direct_read_structure(): # test_read_nested_structure() # test_write_nested_structure() test_direct_read_structure() + test_direct_read_nested_structure() From 7d29defb511ffc7ba5cb88d440b8e45bc7988e99 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 20 Jul 2023 15:28:42 -0700 Subject: [PATCH 302/392] Relax test for inter-state edges in default schedules --- dace/sdfg/validation.py | 17 ++++++++++++----- tests/sdfg/disallowed_access_test.py | 23 +++++++++++++++++++++++ 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 3bac646479..4fbc808fdd 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -42,7 +42,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context """ # Avoid import loop from dace.codegen.targets import fpga - from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga + from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga, is_in_scope references = references or set() @@ -171,10 +171,17 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context for memlet in ise_memlets: container = memlet.data if not _accessible(sdfg, container, context): - eid = sdfg.edge_id(edge) - raise InvalidSDFGInterstateEdgeError( - f'Trying to read an inaccessible data container "{container}" ' - f'(Storage: {sdfg.arrays[container].storage}) in host code interstate edge', sdfg, eid) + # Check context w.r.t. maps + in_default_scope = False + if sdfg.parent_nsdfg_node is not None: + if is_in_scope(sdfg.parent_sdfg, sdfg.parent, sdfg.parent_nsdfg_node, + [dtypes.ScheduleType.Default]): + in_default_scope = True + if not in_default_scope: + eid = sdfg.edge_id(edge) + raise InvalidSDFGInterstateEdgeError( + f'Trying to read an inaccessible data container "{container}" ' + f'(Storage: {sdfg.arrays[container].storage}) in host code interstate edge', sdfg, eid) # Add edge symbols into defined symbols symbols.update(issyms) diff --git a/tests/sdfg/disallowed_access_test.py b/tests/sdfg/disallowed_access_test.py index 8700e34db5..520481ea46 100644 --- a/tests/sdfg/disallowed_access_test.py +++ b/tests/sdfg/disallowed_access_test.py @@ -40,6 +40,7 @@ def test_gpu_access_on_host_interstate_invalid(): @pytest.mark.gpu def test_gpu_access_on_host_tasklet(): + @dace.program def tester(a: dace.float64[20] @ dace.StorageType.GPU_Global): for i in dace.map[0:20] @ dace.ScheduleType.CPU_Multicore: @@ -49,7 +50,29 @@ def tester(a: dace.float64[20] @ dace.StorageType.GPU_Global): tester.to_sdfg(validate=True) +@pytest.mark.gpu +def test_gpu_access_on_device_interstate_edge_default(): + sdfg = dace.SDFG('tester') + sdfg.add_array('A', [20], dace.float64, storage=dace.StorageType.GPU_Global) + state = sdfg.add_state() + + me, mx = state.add_map('test', dict(i='0:20')) + + nsdfg = dace.SDFG('nester') + nsdfg.add_array('A', [20], dace.float64, storage=dace.StorageType.GPU_Global) + state1 = nsdfg.add_state() + state2 = nsdfg.add_state() + nsdfg.add_edge(state1, state2, dace.InterstateEdge(assignments=dict(s='A[4]'))) + + nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'A'}, {}) + state.add_memlet_path(state.add_read('A'), me, nsdfg_node, dst_conn='A', memlet=dace.Memlet('A[0:20]')) + state.add_nedge(nsdfg_node, mx, dace.Memlet()) + + sdfg.validate() + + if __name__ == '__main__': 
test_gpu_access_on_host_interstate_ok() test_gpu_access_on_host_interstate_invalid() test_gpu_access_on_host_tasklet() + test_gpu_access_on_device_interstate_edge_default() From cbe344c15dfd5668d0ddb5a419279fc12c0ef1da Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 20 Jul 2023 15:34:17 -0700 Subject: [PATCH 303/392] Lazy-evaluate in_default_scope --- dace/sdfg/validation.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 4fbc808fdd..aa7674ca45 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -111,6 +111,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context # Check if SDFG is located within a GPU kernel context['in_gpu'] = is_devicelevel_gpu(sdfg, None, None) context['in_fpga'] = is_devicelevel_fpga(sdfg, None, None) + in_default_scope = None # Check every state separately start_state = sdfg.start_state @@ -172,12 +173,13 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context container = memlet.data if not _accessible(sdfg, container, context): # Check context w.r.t. maps - in_default_scope = False - if sdfg.parent_nsdfg_node is not None: - if is_in_scope(sdfg.parent_sdfg, sdfg.parent, sdfg.parent_nsdfg_node, - [dtypes.ScheduleType.Default]): - in_default_scope = True - if not in_default_scope: + if in_default_scope is None: # Lazy-evaluate in_default_scope + in_default_scope = False + if sdfg.parent_nsdfg_node is not None: + if is_in_scope(sdfg.parent_sdfg, sdfg.parent, sdfg.parent_nsdfg_node, + [dtypes.ScheduleType.Default]): + in_default_scope = True + if in_default_scope is False: eid = sdfg.edge_id(edge) raise InvalidSDFGInterstateEdgeError( f'Trying to read an inaccessible data container "{container}" ' @@ -226,9 +228,17 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context for memlet in ise_memlets: container = memlet.data if not _accessible(sdfg, container, context): - raise InvalidSDFGInterstateEdgeError( - f'Trying to read an inaccessible data container "{container}" ' - f'(Storage: {sdfg.arrays[container].storage}) in host code interstate edge', sdfg, eid) + # Check context w.r.t. 
maps + if in_default_scope is None: # Lazy-evaluate in_default_scope + in_default_scope = False + if sdfg.parent_nsdfg_node is not None: + if is_in_scope(sdfg.parent_sdfg, sdfg.parent, sdfg.parent_nsdfg_node, + [dtypes.ScheduleType.Default]): + in_default_scope = True + if in_default_scope is False: + raise InvalidSDFGInterstateEdgeError( + f'Trying to read an inaccessible data container "{container}" ' + f'(Storage: {sdfg.arrays[container].storage}) in host code interstate edge', sdfg, eid) except InvalidSDFGError as ex: # If the SDFG is invalid, save it From 08aac34a8fdd6cd38860d5907e7bca925248a20a Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Fri, 21 Jul 2023 08:47:39 +0200 Subject: [PATCH 304/392] Add review changes --- dace/sdfg/state.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 0354dd107b..8059609c36 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -298,11 +298,10 @@ def scope_tree(self) -> 'dace.sdfg.scope.ScopeTree': # Get scopes for node, scopenodes in sdc.items(): - scope_exit_nodes = [v for v in scopenodes if isinstance(v, nd.ExitNode)] if node is None: exit_node = None else: - exit_node = next(iter(scope_exit_nodes)) + exit_node = next(v for v in scopenodes if isinstance(v, nd.ExitNode)) scope = ScopeTree(node, exit_node) result[node] = scope @@ -506,10 +505,10 @@ def _read_and_write_sets(self) -> Tuple[Dict[AnyStr, List[Subset]], Dict[AnyStr, in_edges = sg.in_edges(n) out_edges = sg.out_edges(n) # Filter out memlets which go out but the same data is written to the AccessNode by another memlet - for out_edge in out_edges: - for in_edge in in_edges: - if in_edge.data.data == out_edge.data.data and \ - in_edge.data.dst_subset.covers(out_edge.data.src_subset): + for out_edge in list(out_edges): + for in_edge in list(in_edges): + if (in_edge.data.data == out_edge.data.data and + in_edge.data.dst_subset.covers(out_edge.data.src_subset)): out_edges.remove(out_edge) for e in in_edges: From 851f1fa4f7041c4c3e1be564ae7c92929bb2dbc3 Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Fri, 21 Jul 2023 08:48:05 +0200 Subject: [PATCH 305/392] Apply suggestions from code review Co-authored-by: alexnick83 <31545860+alexnick83@users.noreply.github.com> --- tests/sdfg/state_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/sdfg/state_test.py b/tests/sdfg/state_test.py index 07e2e8c4c7..c5cb953c4d 100644 --- a/tests/sdfg/state_test.py +++ b/tests/sdfg/state_test.py @@ -1,3 +1,4 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace From 9a1db886905fb037e92b96b0fbefa5c5074ba73d Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Fri, 21 Jul 2023 08:49:45 +0200 Subject: [PATCH 306/392] Added myself to AUTHORS file --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 573f142cf9..48cb4c05ec 100644 --- a/AUTHORS +++ b/AUTHORS @@ -36,5 +36,6 @@ Reid Wahl Yihang Luo Alexandru Calotoiu Phillip Lane +Samuel Martin and other contributors listed in https://github.com/spcl/dace/graphs/contributors From 0df9c3518c6d1ff307314a39dcbc8621423e3af4 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:17:02 +0200 Subject: [PATCH 307/392] Added free-symbols and repr. 
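Structure now exposes a free_symbols property that unions the free symbols of its members, and a __repr__ of the form "Name (member: desc, ...)". A small usage sketch, assuming the CSRMatrix layout used in the tests (exact output abbreviated):

    import dace
    M, nnz = dace.symbol('M'), dace.symbol('nnz')
    csr = dace.data.Structure(dict(indptr=dace.int32[M + 1],
                                   indices=dace.int32[nnz],
                                   data=dace.float32[nnz]),
                              name='CSRMatrix')
    syms = csr.free_symbols   # expected to contain M and nnz, collected from member shapes
    print(csr)                # CSRMatrix (indptr: <desc>, indices: <desc>, data: <desc>)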
--- dace/data.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/dace/data.py b/dace/data.py index e424aca66a..b54a4f9efb 100644 --- a/dace/data.py +++ b/dace/data.py @@ -364,7 +364,7 @@ class Structure(Data): name = Property(dtype=str, desc="Structure name") def __init__(self, - members: Dict[str, Any], + members: Dict[str, Data], name: str = 'Structure', transient: bool = False, storage: dtypes.StorageType = dtypes.StorageType.Default, @@ -432,6 +432,17 @@ def start_offset(self): @property def strides(self): return [1] + + @property + def free_symbols(self) -> Set[symbolic.SymbolicType]: + """ Returns a set of undefined symbols in this data descriptor. """ + result = set(self.symbols.keys()) + for k, v in self.members.items(): + result |= v.free_symbols + return result + + def __repr__(self): + return f"{self.name} ({', '.join([f'{k}: {v}' for k, v in self.members.items()])})" def as_arg(self, with_types=True, for_call=False, name=None): if self.storage is dtypes.StorageType.GPU_Global: From 909c1aaafd76622cecd4972cd2b3718caf2c261f Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:18:17 +0200 Subject: [PATCH 308/392] Recursively add free symbols from nested data. --- dace/sdfg/sdfg.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 1f385a4b75..ae85bff5d1 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2015,9 +2015,6 @@ def _add_symbols(desc: dt.Data): self.add_symbol(sym.name, sym.dtype) # Add free symbols to the SDFG global symbol storage - # for sym in datadesc.free_symbols: - # if sym.name not in self.symbols: - # self.add_symbol(sym.name, sym.dtype) _add_symbols(datadesc) return name From e2b0d8b410e699692c1bf4863ae36a0b6f932e27 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:23:48 +0200 Subject: [PATCH 309/392] Updated tests. 
--- tests/sdfg/data/structure_test.py | 234 +++--------------------------- 1 file changed, 22 insertions(+), 212 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 91429e8bbc..2646fe3d03 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -8,36 +8,10 @@ from scipy import sparse -def create_structure(name: str) -> dace.data.Structure: - - StructureClass = type(name, (dace.data.Structure, ), {}) - - @staticmethod - def from_json(json_obj, context=None): - if json_obj['type'] != name: - raise TypeError("Invalid data type") - - # Create dummy object - ret = StructureClass({}) - serialize.set_properties_from_json(ret, json_obj, context=context) - - return ret - - setattr(StructureClass, 'from_json', from_json) - StructureClass = make_properties(StructureClass) - - return StructureClass - - def test_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense') @@ -83,14 +57,9 @@ def test_read_structure(): inpA = csr_obj.dtype._typeclass.as_ctypes()(indptr=A.indptr.__array_interface__['data'][0], indices=A.indices.__array_interface__['data'][0], - data=A.data.__array_interface__['data'][0], - rows=A.shape[0], - cols=A.shape[1], - M=A.shape[0], - N=A.shape[1], - nnz=A.nnz) + data=A.data.__array_interface__['data'][0]) - func(A=inpA, B=B, M=20, N=20, nnz=A.nnz) + func(A=inpA, B=B, M=A.shape[0], N=A.shape[1], nnz=A.nnz) ref = A.toarray() assert np.allclose(B, ref) @@ -99,12 +68,7 @@ def test_read_structure(): def test_write_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') sdfg = dace.SDFG('dense_to_csr') @@ -172,12 +136,7 @@ def test_write_structure(): outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + data=B.data.__array_interface__['data'][0]) func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) @@ -187,19 +146,9 @@ def test_write_structure(): def test_local_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') - tmp_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + tmp_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix', transient=True) @@ -298,12 +247,7 @@ def test_local_structure(): outB = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], 
indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + data=B.data.__array_interface__['data'][0]) func(A=A, B=outB, M=tmp.shape[0], N=tmp.shape[1], nnz=tmp.nnz) @@ -312,12 +256,7 @@ def test_local_structure(): def test_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -366,93 +305,11 @@ def test_read_nested_structure(): structclass = csr_obj.dtype._typeclass.as_ctypes() inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], indices=A.indices.__array_interface__['data'][0], - data=A.data.__array_interface__['data'][0], - rows=A.shape[0], - cols=A.shape[1], - M=A.shape[0], - K=A.shape[1], - nnz=A.nnz) + data=A.data.__array_interface__['data'][0]) import ctypes inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) - func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) - ref = A.toarray() - - assert np.allclose(B, ref) - - -@pytest.mark.skip -def test_read_nested_structure_2(): - M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), - name='CSRMatrix') - CSRView = dace.data.StructureView(csr_obj.members, transient=True) - wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') - - sdfg = dace.SDFG('nested_csr_to_dense_2') - - sdfg.add_datadesc('A', wrapper_obj) - sdfg.add_array('B', [M, N], dace.float32) - - spmat = wrapper_obj.members['csr'] - sdfg.add_datadesc('vcsr', CSRView) - sdfg.add_view('vindptr', spmat.members['indptr'].shape, spmat.members['indptr'].dtype) - sdfg.add_view('vindices', spmat.members['indices'].shape, spmat.members['indices'].dtype) - sdfg.add_view('vdata', spmat.members['data'].shape, spmat.members['data'].dtype) - - state = sdfg.add_state() - - A = state.add_access('A') - B = state.add_access('B') - - csr = state.add_access('vcsr') - indptr = state.add_access('vindptr') - indices = state.add_access('vindices') - data = state.add_access('vdata') - - state.add_edge(A, 'csr', csr, 'views', dace.Memlet.from_array('A.csr', spmat)) - state.add_edge(csr, 'indptr', indptr, 'views', dace.Memlet.from_array('vcsr.indptr', spmat.members['indptr'])) - state.add_edge(csr, 'indices', indices, 'views', dace.Memlet.from_array('vcsr.indices', spmat.members['indices'])) - state.add_edge(csr, 'data', data, 'views', dace.Memlet.from_array('vcsr.data', spmat.members['data'])) - - ime, imx = state.add_map('i', dict(i='0:M')) - jme, jmx = state.add_map('idx', dict(idx='start:stop')) - jme.add_in_connector('start') - jme.add_in_connector('stop') - t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') - - state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i'), dst_conn='start') - state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i+1'), dst_conn='stop') - state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='vindices', subset='idx'), dst_conn='j') - state.add_memlet_path(data, ime, jme, t, 
memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') - state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') - - sdfg.view() - return - func = sdfg.compile() - - rng = np.random.default_rng(42) - A = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) - B = np.zeros((20, 20), dtype=np.float32) - - structclass = csr_obj.dtype._typeclass.as_ctypes() - inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], - indices=A.indices.__array_interface__['data'][0], - data=A.data.__array_interface__['data'][0], - rows=A.shape[0], - cols=A.shape[1], - M=A.shape[0], - K=A.shape[1], - nnz=A.nnz) - import ctypes - inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) - - func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) + func(A=inpW, B=B, M=A.shape[0], N=A.shape[1], nnz=A.nnz) ref = A.toarray() assert np.allclose(B, ref) @@ -461,12 +318,7 @@ def test_read_nested_structure_2(): def test_write_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -536,12 +388,7 @@ def test_write_nested_structure(): outCSR = csr_obj.dtype._typeclass.as_ctypes()(indptr=B.indptr.__array_interface__['data'][0], indices=B.indices.__array_interface__['data'][0], - data=B.data.__array_interface__['data'][0], - rows=tmp.shape[0], - cols=tmp.shape[1], - M=tmp.shape[0], - N=tmp.shape[1], - nnz=tmp.nnz) + data=B.data.__array_interface__['data'][0]) import ctypes outW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(outCSR)) @@ -553,12 +400,7 @@ def test_write_nested_structure(): def test_direct_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense_direct') @@ -566,26 +408,13 @@ def test_direct_read_structure(): sdfg.add_datadesc('A', csr_obj) sdfg.add_array('B', [M, N], dace.float32) - # sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) - # sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) - # sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) - state = sdfg.add_state() - # A = state.add_access('A') indptr = state.add_access('A.indptr') indices = state.add_access('A.indices') data = state.add_access('A.data') B = state.add_access('B') - # indptr = state.add_access('vindptr') - # indices = state.add_access('vindices') - # data = state.add_access('vdata') - - # state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.indptr', csr_obj.members['indptr'])) - # state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.indices', csr_obj.members['indices'])) - # state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.data', csr_obj.members['data'])) - ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) 
jme.add_in_connector('start') @@ -622,12 +451,7 @@ def test_direct_read_structure(): def test_direct_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], - indices=dace.int32[nnz], - data=dace.float32[nnz], - rows=M, - cols=N, - nnz=nnz), + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -643,20 +467,11 @@ def test_direct_read_nested_structure(): state = sdfg.add_state() - # A = state.add_access('A') indptr = state.add_access('A.csr.indptr') indices = state.add_access('A.csr.indices') data = state.add_access('A.csr.data') B = state.add_access('B') - # indptr = state.add_access('vindptr') - # indices = state.add_access('vindices') - # data = state.add_access('vdata') - - # state.add_edge(A, None, indptr, 'views', dace.Memlet.from_array('A.csr.indptr', spmat.members['indptr'])) - # state.add_edge(A, None, indices, 'views', dace.Memlet.from_array('A.csr.indices', spmat.members['indices'])) - # state.add_edge(A, None, data, 'views', dace.Memlet.from_array('A.csr.data', spmat.members['data'])) - ime, imx = state.add_map('i', dict(i='0:M')) jme, jmx = state.add_map('idx', dict(idx='start:stop')) jme.add_in_connector('start') @@ -679,26 +494,21 @@ def test_direct_read_nested_structure(): structclass = csr_obj.dtype._typeclass.as_ctypes() inpCSR = structclass(indptr=A.indptr.__array_interface__['data'][0], indices=A.indices.__array_interface__['data'][0], - data=A.data.__array_interface__['data'][0], - rows=A.shape[0], - cols=A.shape[1], - M=A.shape[0], - K=A.shape[1], - nnz=A.nnz) + data=A.data.__array_interface__['data'][0]) import ctypes inpW = wrapper_obj.dtype._typeclass.as_ctypes()(csr=ctypes.pointer(inpCSR)) - func(A=inpW, B=B, M=20, N=20, nnz=A.nnz) + func(A=inpW, B=B, M=A.shape[0], N=A.shape[1], nnz=A.nnz) ref = A.toarray() assert np.allclose(B, ref) if __name__ == "__main__": - # test_read_structure() - # test_write_structure() - # test_local_structure() - # test_read_nested_structure() - # test_write_nested_structure() + test_read_structure() + test_write_structure() + test_local_structure() + test_read_nested_structure() + test_write_nested_structure() test_direct_read_structure() test_direct_read_nested_structure() From 52afc7250b02fb4b85eb3a62bf5104dce9a72995 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:24:14 +0200 Subject: [PATCH 310/392] Scrapped structure private symbols for now. --- dace/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/data.py b/dace/data.py index b54a4f9efb..9d3b6b86f3 100644 --- a/dace/data.py +++ b/dace/data.py @@ -436,7 +436,7 @@ def strides(self): @property def free_symbols(self) -> Set[symbolic.SymbolicType]: """ Returns a set of undefined symbols in this data descriptor. """ - result = set(self.symbols.keys()) + result = set() for k, v in self.members.items(): result |= v.free_symbols return result From 09246442f6e456b4b090651b895be53e3414a512 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:26:10 +0200 Subject: [PATCH 311/392] Updated tests. 
--- tests/sdfg/data/structure_test.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 2646fe3d03..02b8f0c174 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -48,7 +48,6 @@ def test_read_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -123,7 +122,6 @@ def test_write_structure(): i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) i_after.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.indptr', subset='0:M+1')) - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -234,7 +232,6 @@ def test_local_structure(): input_nodes={'tmp_vdata': tmp_data}, output_nodes={'vdata': B_data}) - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -295,7 +292,6 @@ def test_read_nested_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -375,7 +371,6 @@ def test_write_nested_structure(): i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) i_after.add_edge(indptr, 'views', B, None, dace.Memlet(data='B.csr.indptr', subset='0:M+1')) - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -427,7 +422,6 @@ def test_direct_read_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='A.data', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) @@ -484,7 +478,6 @@ def test_direct_read_nested_structure(): state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='A.csr.data', subset='idx'), dst_conn='__val') state.add_memlet_path(t, jmx, imx, B, memlet=dace.Memlet(data='B', subset='0:M, 0:N', volume=1), src_conn='__out') - sdfg.view() func = sdfg.compile() rng = np.random.default_rng(42) From 8296a6de765b2209cbd644b6017d68304016ef3c Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 21 Jul 2023 18:29:06 +0200 Subject: [PATCH 312/392] Added setitem. --- dace/sdfg/sdfg.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index ae85bff5d1..23964dbe41 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -62,8 +62,12 @@ def __getitem__(self, key): token = tokens.pop(0) result = result.members[token] return result - + def __setitem__(self, key, val): + if isinstance(key, str) and '.' 
in key: + raise KeyError('NestedDict does not support setting nested keys') + super(NestedDict, self).__setitem__(key, val) + def __contains__(self, key): tokens = key.split('.') if isinstance(key, str) else [key] token = tokens.pop(0) From e468a0ed965ae4d9b46a9b74d5f11fe66bf5406d Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Mon, 24 Jul 2023 12:05:06 +0200 Subject: [PATCH 313/392] Added testcase and fix --- dace/sdfg/state.py | 1 + tests/sdfg/state_test.py | 30 +++++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 8059609c36..c354cd9d1f 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -510,6 +510,7 @@ def _read_and_write_sets(self) -> Tuple[Dict[AnyStr, List[Subset]], Dict[AnyStr, if (in_edge.data.data == out_edge.data.data and in_edge.data.dst_subset.covers(out_edge.data.src_subset)): out_edges.remove(out_edge) + break for e in in_edges: # skip empty memlets diff --git a/tests/sdfg/state_test.py b/tests/sdfg/state_test.py index c5cb953c4d..48dea04d0b 100644 --- a/tests/sdfg/state_test.py +++ b/tests/sdfg/state_test.py @@ -4,9 +4,9 @@ def test_read_write_set(): sdfg = dace.SDFG('graph') - A = sdfg.add_array('A', [10], dace.float64) - B = sdfg.add_array('B', [10], dace.float64) - C = sdfg.add_array('C', [10], dace.float64) + sdfg.add_array('A', [10], dace.float64) + sdfg.add_array('B', [10], dace.float64) + sdfg.add_array('C', [10], dace.float64) state = sdfg.add_state('state') task1 = state.add_tasklet('work1', {'A'}, {'B'}, 'B = A + 1') task2 = state.add_tasklet('work2', {'B'}, {'C'}, 'C = B + 1') @@ -20,5 +20,29 @@ def test_read_write_set(): assert 'B' not in state.read_and_write_sets()[0] + +def test_read_write_set_y_formation(): + sdfg = dace.SDFG('graph') + state = sdfg.add_state('state') + sdfg.add_array('A', [2], dace.float64) + sdfg.add_array('B', [2], dace.float64) + sdfg.add_array('C', [2], dace.float64) + task1 = state.add_tasklet('work1', {'A'}, {'B'}, 'B = A + 1') + task2 = state.add_tasklet('work2', {'B'}, {'C'}, 'C += B + 1') + task3 = state.add_tasklet('work3', {'A'}, {'B'}, 'B = A + 2') + read_a = state.add_access('A') + rw_b = state.add_access('B') + write_c = state.add_access('C') + state.add_memlet_path(read_a, task1, dst_conn='A', memlet=dace.Memlet(data='A', subset='0')) + state.add_memlet_path(read_a, task3, dst_conn='A', memlet=dace.Memlet(data='A', subset='1')) + state.add_memlet_path(task1, rw_b, src_conn='B', memlet=dace.Memlet(data='B', subset='0')) + state.add_memlet_path(task3, rw_b, src_conn='B', memlet=dace.Memlet(data='B', subset='0')) + state.add_memlet_path(rw_b, task2, dst_conn='B', memlet=dace.Memlet(data='B', subset='0')) + state.add_memlet_path(task2, write_c, src_conn='C', memlet=dace.Memlet(data='C', subset='0')) + + assert 'B' not in state.read_and_write_sets()[0] + + if __name__ == '__main__': test_read_write_set() + test_read_write_set_y_formation() From e68c3423834fb5479f22c039de7fe167e57d3f37 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 25 Jul 2023 23:25:36 +0000 Subject: [PATCH 314/392] Bump certifi from 2023.5.7 to 2023.7.22 Bumps [certifi](https://github.com/certifi/python-certifi) from 2023.5.7 to 2023.7.22. - [Commits](https://github.com/certifi/python-certifi/compare/2023.05.07...2023.07.22) --- updated-dependencies: - dependency-name: certifi dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index da67189b70..33cd58a0bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ aenum==3.1.12 astunparse==1.6.3 blinker==1.6.2 -certifi==2023.5.7 +certifi==2023.7.22 charset-normalizer==3.1.0 click==8.1.3 dill==0.3.6 From e150571e453b5a4e646941ce650afd73b7632a20 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 13:44:58 +0200 Subject: [PATCH 315/392] Added test. --- tests/sdfg/validation/nested_sdfg_test.py | 38 +++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/sdfg/validation/nested_sdfg_test.py diff --git a/tests/sdfg/validation/nested_sdfg_test.py b/tests/sdfg/validation/nested_sdfg_test.py new file mode 100644 index 0000000000..127543fc95 --- /dev/null +++ b/tests/sdfg/validation/nested_sdfg_test.py @@ -0,0 +1,38 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import numpy as np +import dace + + +def test_inout_connector(): + + sdfg = dace.SDFG("test_inout_connector") + sdfg.add_array("A", [1], dace.int32) + sdfg.add_array("B", [1], dace.int32) + + nsdfg = dace.SDFG("nested_sdfg") + nsdfg.add_array("C", [1], dace.int32) + + nstate = nsdfg.add_state() + read_c = nstate.add_access("C") + write_c = nstate.add_access("C") + tasklet = nstate.add_tasklet("tasklet", {"__inp"}, {"__out"}, "__out = __inp + 5") + nstate.add_edge(read_c, None, tasklet, '__inp', dace.Memlet.from_array('C', nsdfg.arrays['C'])) + nstate.add_edge(tasklet, '__out', write_c, None, dace.Memlet.from_array('C', nsdfg.arrays['C'])) + + state = sdfg.add_state() + read_a = state.add_access("A") + write_b = state.add_access("B") + tasklet = state.add_nested_sdfg(nsdfg, sdfg, {"C"}, {"C"}) + state.add_edge(read_a, None, tasklet, 'C', dace.Memlet.from_array('A', sdfg.arrays['A'])) + state.add_edge(tasklet, 'C', write_b, None, dace.Memlet.from_array('B', sdfg.arrays['B'])) + + try: + sdfg.validate() + except dace.sdfg.InvalidSDFGError: + return + + assert False, "SDFG should not validate" + + +if __name__ == "__main__": + test_inout_connector() From 16b64ccba063066c74782598e98ea9e05fe077ea Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 13:45:26 +0200 Subject: [PATCH 316/392] Removed unneeded import. --- tests/sdfg/validation/nested_sdfg_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/sdfg/validation/nested_sdfg_test.py b/tests/sdfg/validation/nested_sdfg_test.py index 127543fc95..d4d34d0a15 100644 --- a/tests/sdfg/validation/nested_sdfg_test.py +++ b/tests/sdfg/validation/nested_sdfg_test.py @@ -1,5 +1,4 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -import numpy as np import dace From dda325a4284a01fb780112f4ac8a5d43eac191cf Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 13:55:38 +0200 Subject: [PATCH 317/392] Added inout connector validation. 
--- dace/sdfg/nodes.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 5c270153e1..f9ccda46e1 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -634,6 +634,23 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context raise NameError('Data descriptor "%s" not found in nested SDFG connectors' % dname) if dname in connectors and desc.transient: raise NameError('"%s" is a connector but its corresponding array is transient' % dname) + + # Validate inout connectors + inout_connectors = self.in_connectors.keys() & self.out_connectors.keys() + for conn in inout_connectors: + inputs = set() + outputs = set() + for edge in state.in_edges_by_connector(self, conn): + src = state.memlet_path(edge)[0].src + if isinstance(src, AccessNode): + inputs.add(src.data) + for edge in state.out_edges_by_connector(self, conn): + dst = state.memlet_path(edge)[-1].dst + if isinstance(dst, AccessNode): + outputs.add(dst.data) + if len(inputs - outputs) > 0: + raise ValueError(f"Inout connector {conn} is connected to different input ({inputs}) and " + f"output ({outputs}) arrays") # Validate undefined symbols symbols = set(k for k in self.sdfg.free_symbols if k not in connectors) From b72c72249361225aeb6cfd1565de16937c9497cf Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 13:56:00 +0200 Subject: [PATCH 318/392] Added test. --- tests/sdfg/validation/nested_sdfg_test.py | 40 ++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/tests/sdfg/validation/nested_sdfg_test.py b/tests/sdfg/validation/nested_sdfg_test.py index d4d34d0a15..398a1635ef 100644 --- a/tests/sdfg/validation/nested_sdfg_test.py +++ b/tests/sdfg/validation/nested_sdfg_test.py @@ -2,9 +2,40 @@ import dace -def test_inout_connector(): +def test_inout_connector_validation_success(): - sdfg = dace.SDFG("test_inout_connector") + sdfg = dace.SDFG("test_inout_connector_validation_success") + sdfg.add_array("A", [1], dace.int32) + sdfg.add_array("B", [1], dace.int32) + + nsdfg = dace.SDFG("nested_sdfg") + nsdfg.add_array("C", [1], dace.int32) + + nstate = nsdfg.add_state() + read_c = nstate.add_access("C") + write_c = nstate.add_access("C") + tasklet = nstate.add_tasklet("tasklet", {"__inp"}, {"__out"}, "__out = __inp + 5") + nstate.add_edge(read_c, None, tasklet, '__inp', dace.Memlet.from_array('C', nsdfg.arrays['C'])) + nstate.add_edge(tasklet, '__out', write_c, None, dace.Memlet.from_array('C', nsdfg.arrays['C'])) + + state = sdfg.add_state() + read_b = state.add_access("B") + write_b = state.add_access("B") + tasklet = state.add_nested_sdfg(nsdfg, sdfg, {"C"}, {"C"}) + state.add_edge(read_b, None, tasklet, 'C', dace.Memlet.from_array('B', sdfg.arrays['B'])) + state.add_edge(tasklet, 'C', write_b, None, dace.Memlet.from_array('B', sdfg.arrays['B'])) + + try: + sdfg.validate() + except dace.sdfg.InvalidSDFGError: + assert False, "SDFG should validate" + + return + + +def test_inout_connector_validation_fail(): + + sdfg = dace.SDFG("test_inout_connector_validation_fail") sdfg.add_array("A", [1], dace.int32) sdfg.add_array("B", [1], dace.int32) @@ -29,9 +60,10 @@ def test_inout_connector(): sdfg.validate() except dace.sdfg.InvalidSDFGError: return - + assert False, "SDFG should not validate" if __name__ == "__main__": - test_inout_connector() + test_inout_connector_validation_success() + test_inout_connector_validation_fail() From 438dafdc9325eccbcfff488be579f144ba2e07b0 Mon Sep 17 00:00:00 2001 
From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 15:32:50 +0200 Subject: [PATCH 319/392] SubgraphFusion doesn't remove intermediate nodes whose data have also output accesses. --- dace/transformation/subgraph/subgraph_fusion.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dace/transformation/subgraph/subgraph_fusion.py b/dace/transformation/subgraph/subgraph_fusion.py index a56336fa8d..1ff286b85c 100644 --- a/dace/transformation/subgraph/subgraph_fusion.py +++ b/dace/transformation/subgraph/subgraph_fusion.py @@ -1146,10 +1146,15 @@ def change_data(transient_array, shape, strides, total_size, offset, lifetime, s # by reconnecting their adjacent edges to nodes outside the subgraph. # NOTE: Currently limited to cases where there is a single source and sink # if there are multiple intermediate accesses for the same data. + # NOTE: Currently limited to intermediate data that do not have a separate output node + + # Filter out outputs + output_data = set([n.data for n in out_nodes]) + true_intermediate_nodes = set([n for n in intermediate_nodes if n.data not in output_data]) # Sort intermediate nodes by data name intermediate_data = dict() - for acc in intermediate_nodes: + for acc in true_intermediate_nodes: if acc.data in intermediate_data: intermediate_data[acc.data].append(acc) else: From 4b14a733d10e7a121bd4a02ab2810e353b403272 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 17:25:54 +0200 Subject: [PATCH 320/392] Added utility methods for a finding (one of) the sources and destinations of a memlet path across nested SDFG levels. --- dace/sdfg/utils.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 8d251efd89..7eef600180 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1797,3 +1797,29 @@ def get_thread_local_data(sdfg: SDFG) -> List[str]: if not sdfg.arrays[name].transient: warnings.warn(f'Found thread-local data "{name}" that is not transient.') return result + + +def get_global_memlet_path_src(sdfg: SDFG, state: SDFGState, edge: MultiConnectorEdge) -> nd.Node: + src = state.memlet_path(edge)[0].src + if isinstance(src, nd.AccessNode) and not sdfg.arrays[src.data].transient and sdfg.parent is not None: + psdfg = sdfg.parent_sdfg + pstate = sdfg.parent + pnode = sdfg.parent_nsdfg_node + pedges = list(pstate.in_edges_by_connector(pnode, src.data)) + if len(pedges) > 0: + pedge = pedges[0] + return get_global_memlet_path_src(psdfg, pstate, pedge) + return src + + +def get_global_memlet_path_dst(sdfg: SDFG, state: SDFGState, edge: MultiConnectorEdge) -> nd.Node: + dst = state.memlet_path(edge)[-1].dst + if isinstance(dst, nd.AccessNode) and not sdfg.arrays[dst.data].transient and sdfg.parent is not None: + psdfg = sdfg.parent_sdfg + pstate = sdfg.parent + pnode = sdfg.parent_nsdfg_node + pedges = list(pstate.out_edges_by_connector(pnode, dst.data)) + if len(pedges) > 0: + pedge = pedges[0] + return get_global_memlet_path_dst(psdfg, pstate, pedge) + return dst From 7fb6757ff3df535580158e5df9d04a6c4cf41c57 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 17:26:25 +0200 Subject: [PATCH 321/392] Amended validation to use new utility methods. 
--- dace/sdfg/nodes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index f9ccda46e1..6ba84d919e 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -636,16 +636,17 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context raise NameError('"%s" is a connector but its corresponding array is transient' % dname) # Validate inout connectors + from dace.sdfg import utils # Avoids circular import inout_connectors = self.in_connectors.keys() & self.out_connectors.keys() for conn in inout_connectors: inputs = set() outputs = set() for edge in state.in_edges_by_connector(self, conn): - src = state.memlet_path(edge)[0].src + src = utils.get_global_memlet_path_src(sdfg, state, edge) if isinstance(src, AccessNode): inputs.add(src.data) for edge in state.out_edges_by_connector(self, conn): - dst = state.memlet_path(edge)[-1].dst + dst = utils.get_global_memlet_path_dst(sdfg, state, edge) if isinstance(dst, AccessNode): outputs.add(dst.data) if len(inputs - outputs) > 0: From 70198d5fae51ff3884966a7c13ed475e0216ebc9 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 20:29:18 +0200 Subject: [PATCH 322/392] Added comm-comparison tests. --- .../mpi4py/comm_comparison_test.py | 200 ++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 tests/python_frontend/mpi4py/comm_comparison_test.py diff --git a/tests/python_frontend/mpi4py/comm_comparison_test.py b/tests/python_frontend/mpi4py/comm_comparison_test.py new file mode 100644 index 0000000000..e7d74e5981 --- /dev/null +++ b/tests/python_frontend/mpi4py/comm_comparison_test.py @@ -0,0 +1,200 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Tests comparison operators with communicator objects. 
""" +import dace +import numpy as np +import pytest + + +@pytest.mark.mpi +def test_eq_commworld_0(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + + @dace.program + def eq_commworld_0(out: dace.bool[1]): + out[0] = comm == MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + eq_commworld_0(res) + assert res[0] == (comm == MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_eq_commworld_1(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + comm2 = comm.Dup() + + @dace.program + def eq_commworld_1(out: dace.bool[1]): + out[0] = comm2 == MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + eq_commworld_1(res) + assert res[0] == (comm2 == MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_eq_commworld_2(): + + from mpi4py import MPI + + @dace.program + def eq_commworld_2(out: dace.bool[1]): + out[0] = MPI.COMM_NULL == MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + eq_commworld_2(res) + assert res[0] == (MPI.COMM_NULL == MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_noteq_commworld_0(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + + @dace.program + def noteq_commworld_0(out: dace.bool[1]): + out[0] = comm != MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + noteq_commworld_0(res) + assert res[0] == (comm != MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_noteq_commworld_1(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + comm2 = comm.Dup() + + @dace.program + def noteq_commworld_1(out: dace.bool[1]): + out[0] = comm2 != MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + noteq_commworld_1(res) + assert res[0] == (comm2 != MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_noteq_commworld_2(): + + from mpi4py import MPI + + @dace.program + def noteq_commworld_2(out: dace.bool[1]): + out[0] = MPI.COMM_NULL != MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + noteq_commworld_2(res) + assert res[0] == (MPI.COMM_NULL != MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_is_commworld_0(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + + @dace.program + def is_commworld_0(out: dace.bool[1]): + out[0] = comm is MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + is_commworld_0(res) + assert res[0] == (comm is MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_is_commworld_1(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + comm2 = comm.Dup() + + @dace.program + def is_commworld_1(out: dace.bool[1]): + out[0] = comm2 is MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + is_commworld_1(res) + assert res[0] == (comm2 is MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_is_commworld_2(): + + from mpi4py import MPI + + @dace.program + def is_commworld_2(out: dace.bool[1]): + out[0] = MPI.COMM_NULL is MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + is_commworld_2(res) + assert res[0] == (MPI.COMM_NULL is MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_isnot_commworld_0(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + + @dace.program + def isnot_commworld_0(out: dace.bool[1]): + out[0] = comm is MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + isnot_commworld_0(res) + assert res[0] == (comm is MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_isnot_commworld_1(): + + from mpi4py import MPI + comm = MPI.COMM_WORLD + comm2 = comm.Dup() + + @dace.program + def isnot_commworld_1(out: dace.bool[1]): + out[0] = comm2 is not MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + isnot_commworld_1(res) + assert res[0] == (comm2 is not MPI.COMM_WORLD) + + +@pytest.mark.mpi +def test_isnot_commworld_2(): + + from 
mpi4py import MPI + + @dace.program + def isnot_commworld_2(out: dace.bool[1]): + out[0] = MPI.COMM_NULL is not MPI.COMM_WORLD + + res = np.zeros((1,), dtype=np.bool_) + isnot_commworld_2(res) + assert res[0] == (MPI.COMM_NULL is not MPI.COMM_WORLD) + + +if __name__ == "__main__": + test_eq_commworld_0() + test_eq_commworld_1() + test_eq_commworld_2() + test_noteq_commworld_0() + test_noteq_commworld_1() + test_noteq_commworld_2() + test_is_commworld_0() + test_is_commworld_1() + test_is_commworld_2() + test_isnot_commworld_0() + test_isnot_commworld_1() + test_isnot_commworld_2() From 727afa7d9a2dfcf0cd0ac3a81b1841a8a773fc54 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 20:30:47 +0200 Subject: [PATCH 323/392] Refactored communicator comparsion replacements. --- dace/frontend/common/distr.py | 68 ++++++++++++++++------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 68b6f120d8..b6868d3289 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1,16 +1,15 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -from numbers import Integral, Number -from typing import Sequence, Tuple, Union - +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace +import itertools +import sympy as sp + from dace import dtypes, symbolic from dace.frontend.common import op_repository as oprepo +from dace.frontend.python.replacements import _define_local_scalar from dace.memlet import Memlet from dace.sdfg import SDFG, SDFGState - -import sympy as sp - -from dace.frontend.python.replacements import _define_local_scalar +from numbers import Integral, Number +from typing import Sequence, Tuple, Union ShapeType = Sequence[Union[Integral, str, symbolic.symbol, symbolic.SymExpr, symbolic.sympy.Basic]] RankType = Union[Integral, str, symbolic.symbol, symbolic.SymExpr, symbolic.sympy.Basic] @@ -117,40 +116,33 @@ def _pgrid_sub(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, parent_grid: return _cart_sub(pv, sdfg, state, parent_grid, color) -@oprepo.replaces_operator('ProcessGrid', 'Eq', otherclass='Comm') -@oprepo.replaces_operator('ProcessGrid', 'Is', otherclass='Comm') -def _pgrid_eq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: str, op2: 'Comm'): - from mpi4py import MPI - if op2 is MPI.COMM_WORLD or op2 is MPI.COMM_NULL: - return False - return True - - -@oprepo.replaces_operator('Comm', 'Eq', otherclass='ProcessGrid') -@oprepo.replaces_operator('Comm', 'Is', otherclass='ProcessGrid') -def _comm_eq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'str'): - from mpi4py import MPI - if op1 is MPI.COMM_WORLD or op1 is MPI.COMM_NULL: - return False - return True - +# TODO: Revisit after discussing how "immutable" mpi4py communicators are during the program's execution. 
+for left_cls, right_cls in itertools.product(['Comm', 'Cartcomm', 'Intracomm'], repeat=2): -@oprepo.replaces_operator('ProcessGrid', 'NotEq', otherclass='Comm') -@oprepo.replaces_operator('ProcessGrid', 'IsNot', otherclass='Comm') -def _pgrid_neq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: str, op2: 'Comm'): - from mpi4py import MPI - if op2 is MPI.COMM_WORLD or op2 is MPI.COMM_NULL: - return True - return False + @oprepo.replaces_operator(left_cls, 'Eq', otherclass=right_cls) + def _eq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): + return op1 == op2 + + @oprepo.replaces_operator(left_cls, 'NotEq', otherclass=right_cls) + def _noteq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): + return op1 != op2 + + @oprepo.replaces_operator(left_cls, 'Is', otherclass=right_cls) + def _is_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): + return op1 is op2 + + @oprepo.replaces_operator(left_cls, 'IsNot', otherclass=right_cls) + def _isnot_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): + return op1 is not op2 -@oprepo.replaces_operator('Comm', 'NotEq', otherclass='ProcessGrid') -@oprepo.replaces_operator('Comm', 'IsNot', otherclass='ProcessGrid') -def _comm_neq_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'str'): - from mpi4py import MPI - if op1 is MPI.COMM_WORLD or op1 is MPI.COMM_NULL: +for cls_a, cls_b, op in itertools.product(['ProcessGrid'], ['Comm', 'Cartcomm', 'Intracomm'], ['Eq', 'NotEq', 'Is', 'IsNot']): + @oprepo.replaces_operator(cls_a, op, otherclass=cls_b) + @oprepo.replaces_operator(cls_b, op, otherclass=cls_a) + def _op_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: Union[str, 'Comm'], op2: Union[str, 'Comm']): + if op in ('Eq', 'Is'): + return False return True - return False ##### MPI Collectives From ac177bdc9a63709366fd8c857cc5007a950dbbb5 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 20:31:43 +0200 Subject: [PATCH 324/392] Addressed review comments. 
--- dace/frontend/python/newast.py | 2 +- dace/frontend/python/preprocessing.py | 2 +- dace/libraries/mpi/nodes/alltoall.py | 2 +- dace/libraries/mpi/nodes/isend.py | 7 ++----- tests/library/mpi/mpi4py_test.py | 2 +- tests/library/mpi/mpi_alltoall_test.py | 2 +- 6 files changed, 7 insertions(+), 10 deletions(-) diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 31cb8907c1..853316e097 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1308,7 +1308,7 @@ def defined(self): try: from mpi4py import MPI result.update({k: v for k, v in self.globals.items() if isinstance(v, MPI.Comm)}) - except: + except (ImportError, ModuleNotFoundError): pass return result diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index ea312a18c0..6a4ea89394 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -1553,7 +1553,7 @@ def visit_BinOp(self, node: ast.BinOp) -> ast.BinOp: if isinstance(node.op, ast.Mod): left = self.generic_visit(node.left) right = self.generic_visit(node.right) - newleft = ast.copy_location(ast.BinOp(left=left, op=ast.Add(), right=copy.deepcopy(right)), left) + newleft = ast.copy_location(ast.BinOp(left=left, op=ast.Add(), right=astutils.copy_tree(right)), left) node.left = newleft return node return self.generic_visit(node) diff --git a/dace/libraries/mpi/nodes/alltoall.py b/dace/libraries/mpi/nodes/alltoall.py index 92be24ce45..bb64740f50 100644 --- a/dace/libraries/mpi/nodes/alltoall.py +++ b/dace/libraries/mpi/nodes/alltoall.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace.library import dace.properties import dace.sdfg.nodes diff --git a/dace/libraries/mpi/nodes/isend.py b/dace/libraries/mpi/nodes/isend.py index 8de4035515..39951dd0d0 100644 --- a/dace/libraries/mpi/nodes/isend.py +++ b/dace/libraries/mpi/nodes/isend.py @@ -97,11 +97,8 @@ def validate(self, sdfg, state): if e.src_conn == "_request": req = sdfg.arrays[e.data.data] - # TODO: Should we expect any integer type here and cast to int32 later?. Investigate further in the future. - # if dest.dtype.base_type != dace.dtypes.int32: - # raise ValueError("Destination must be an integer!") - # if tag.dtype.base_type != dace.dtypes.int32: - # raise ValueError("Tag must be an integer!") + # TODO: Should we expect any integer type for dst/tag and cast to int32 later?. + # TODO: Investigate further in the future. count_str = "XXX" for _, _, _, dst_conn, data in state.in_edges(self): diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 1bbeae627f..2237ed8ba4 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -1,4 +1,4 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace from dace.sdfg import utils import dace.dtypes as dtypes diff --git a/tests/library/mpi/mpi_alltoall_test.py b/tests/library/mpi/mpi_alltoall_test.py index e1eb4fe5f1..66199d9aa5 100644 --- a/tests/library/mpi/mpi_alltoall_test.py +++ b/tests/library/mpi/mpi_alltoall_test.py @@ -1,4 +1,4 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
import dace from dace.memlet import Memlet import dace.libraries.mpi as mpi From 01f82fac5486ca4faca9e0408a7b0cecdb8bc121 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 20:34:13 +0200 Subject: [PATCH 325/392] YAPF --- dace/frontend/common/distr.py | 16 +++--- dace/frontend/python/newast.py | 6 ++- tests/library/mpi/mpi4py_test.py | 50 ++++++++++++------- tests/library/mpi/mpi_allgather_test.py | 5 +- tests/library/mpi/mpi_alltoall_test.py | 4 +- tests/library/mpi/mpi_isend_irecv_test.py | 4 +- tests/library/mpi/mpi_send_recv_test.py | 1 + .../mpi4py/comm_comparison_test.py | 48 +++++++++--------- 8 files changed, 75 insertions(+), 59 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index b6868d3289..d6f22da358 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -122,21 +122,23 @@ def _pgrid_sub(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, parent_grid: @oprepo.replaces_operator(left_cls, 'Eq', otherclass=right_cls) def _eq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): return op1 == op2 - + @oprepo.replaces_operator(left_cls, 'NotEq', otherclass=right_cls) def _noteq_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): return op1 != op2 - + @oprepo.replaces_operator(left_cls, 'Is', otherclass=right_cls) def _is_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): return op1 is op2 - + @oprepo.replaces_operator(left_cls, 'IsNot', otherclass=right_cls) def _isnot_comm(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: 'Comm', op2: 'Comm'): return op1 is not op2 -for cls_a, cls_b, op in itertools.product(['ProcessGrid'], ['Comm', 'Cartcomm', 'Intracomm'], ['Eq', 'NotEq', 'Is', 'IsNot']): +for cls_a, cls_b, op in itertools.product(['ProcessGrid'], ['Comm', 'Cartcomm', 'Intracomm'], + ['Eq', 'NotEq', 'Is', 'IsNot']): + @oprepo.replaces_operator(cls_a, op, otherclass=cls_b) @oprepo.replaces_operator(cls_b, op, otherclass=cls_a) def _op_pgrid(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: Union[str, 'Comm'], op2: Union[str, 'Comm']): @@ -469,7 +471,7 @@ def _send(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Send') def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, - dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.end(buffer, dst, tag)`. """ from mpi4py import MPI @@ -481,7 +483,7 @@ def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: ' @oprepo.replaces_method('ProcessGrid', 'Send') def _pgrid_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, pgrid: str, buffer: str, - dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Send(buffer, dst, tag, grid=pgrid)`. """ raise NotImplementedError('ProcessGrid.Send is not supported yet.') @@ -689,7 +691,7 @@ def _recv(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Recv') def _intracomm_Recv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, - src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): + src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Recv(buffer, src, tagq)`. 
""" from mpi4py import MPI diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 853316e097..c9d92b7860 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1304,7 +1304,9 @@ def defined(self): result.update(self.sdfg.arrays) # MPI-related stuff - result.update({k: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids}) + result.update( + {k: self.sdfg.process_grids[v] + for k, v in self.variables.items() if v in self.sdfg.process_grids}) try: from mpi4py import MPI result.update({k: v for k, v in self.globals.items() if isinstance(v, MPI.Comm)}) @@ -5001,7 +5003,7 @@ def visit_Subscript(self, node: ast.Subscript, inference: bool = False): rng = expr.subset rng.offset(rng, True) return self.sdfg.arrays[array].dtype, rng.size() - + if is_read: return self._add_read_slice(array, node, expr) else: diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 2237ed8ba4..52b5deb7a8 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -20,7 +20,7 @@ def comm_world_bcast(A: dace.int32[10]): if size < 2: raise ValueError("Please run this test with at least two processes.") - + sdfg = None if rank == 0: sdfg = comm_world_bcast.to_sdfg() @@ -36,7 +36,7 @@ def comm_world_bcast(A: dace.int32[10]): func(A=A) comm_world_bcast.f(A_ref) - assert(np.array_equal(A, A_ref)) + assert (np.array_equal(A, A_ref)) @pytest.mark.mpi @@ -55,7 +55,7 @@ def external_comm_bcast(A: dace.int32[10]): if size < 2: raise ValueError("Please run this test with at least two processes.") - + sdfg = None if rank == 0: sdfg = external_comm_bcast.to_sdfg() @@ -74,7 +74,7 @@ def external_comm_bcast(A: dace.int32[10]): func(A=A, new_comm=new_comm.py2f()) external_comm_bcast.f(A_ref) - assert(np.array_equal(A, A_ref)) + assert (np.array_equal(A, A_ref)) @pytest.mark.mpi @@ -109,7 +109,7 @@ def pgrid_bcast(A: dace.int32[10]): func(A=A) pgrid_bcast.f(A_ref) - assert(np.array_equal(A, A_ref)) + assert (np.array_equal(A, A_ref)) @pytest.mark.mpi @@ -149,12 +149,24 @@ def subgrid_bcast(A: dace.int32[10], rank: dace.int32): func(A=A, rank=rank) subgrid_bcast.f(A_ref, rank) - assert(np.array_equal(A, A_ref)) - - -def initialize_3mm(b_NI: int, b_NJ: int, b_NK: int, b_NL: int, b_NM: int, - ts_NI: int, ts_NJ: int, ts_NK, ts_NL: int, ts_NM: int, - NI: int, NJ: int, NK: int, NL: int, NM: int, + assert (np.array_equal(A, A_ref)) + + +def initialize_3mm(b_NI: int, + b_NJ: int, + b_NK: int, + b_NL: int, + b_NM: int, + ts_NI: int, + ts_NJ: int, + ts_NK, + ts_NL: int, + ts_NM: int, + NI: int, + NJ: int, + NK: int, + NL: int, + NM: int, datatype: type = np.float64): A = np.fromfunction(lambda i, k: b_NK + k + 1, (ts_NI, ts_NK), dtype=datatype) @@ -206,16 +218,16 @@ def k3mm(A, B, C, D): return E N = 128 - assert(size <= 128) - - NI, NJ, NK, NL, NM = (N,) * 5 + assert (size <= 128) + + NI, NJ, NK, NL, NM = (N, ) * 5 PNI, PNJ, PNK, PNL, PNM = 1, 2, 1, 1, 1 cart_comm = commworld.Create_cart([1, size, 1]) cart_rank = cart_comm.Get_rank() cart_size = cart_comm.Get_size() cart_coords = cart_comm.Get_coords(cart_rank) - + ts_NI = int(np.ceil(NI / PNI)) ts_NJ = int(np.ceil(NJ / PNJ)) ts_NK = int(np.ceil(NJ / PNK)) @@ -240,7 +252,7 @@ def k3mm(A, B, C, D): commworld.Barrier() if E_ref is not None: - assert(np.array_equal(E, E_ref)) + assert (np.array_equal(E, E_ref)) @pytest.mark.mpi @@ -255,7 +267,7 @@ def mpi4py_isend_irecv(rank: dace.int32, size: dace.int32): src = (rank - 1) % size dst = (rank + 
1) % size req = np.empty((2, ), dtype=MPI.Request) - sbuf = np.full((1,), rank, dtype=np.int32) + sbuf = np.full((1, ), rank, dtype=np.int32) req[0] = commworld.Isend(sbuf, dst, tag=0) rbuf = np.empty((1, ), dtype=np.int32) req[1] = commworld.Irecv(rbuf, src, tag=0) @@ -284,7 +296,7 @@ def test_send_recv(): def mpi4py_send_recv(rank: dace.int32, size: dace.int32): src = np.full([1], (rank - 1) % size, dtype=np.int32) dst = np.full([1], (rank + 1) % size, dtype=np.int32) - sbuf = np.full((1,), rank, dtype=np.int32) + sbuf = np.full((1, ), rank, dtype=np.int32) commworld.Send(sbuf, dst, tag=0) rbuf = np.empty((1, ), dtype=np.int32) commworld.Recv(rbuf, src, tag=0) @@ -310,7 +322,7 @@ def test_alltoall(): @dace.program def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): - sbuf = np.full((size,), rank, dtype=np.int32) + sbuf = np.full((size, ), rank, dtype=np.int32) rbuf = np.zeros((size, ), dtype=np.int32) commworld.Alltoall(sbuf, rbuf) return rbuf diff --git a/tests/library/mpi/mpi_allgather_test.py b/tests/library/mpi/mpi_allgather_test.py index 1eebcd5676..1f0a30a4d1 100644 --- a/tests/library/mpi/mpi_allgather_test.py +++ b/tests/library/mpi/mpi_allgather_test.py @@ -22,10 +22,7 @@ def make_sdfg(dtype): outA = state.add_access("outA") allgather_node = mpi.nodes.allgather.Allgather("allgather") - state.add_memlet_path(inA, - allgather_node, - dst_conn="_inbuffer", - memlet=Memlet.simple(inA, "0:n", num_accesses=n)) + state.add_memlet_path(inA, allgather_node, dst_conn="_inbuffer", memlet=Memlet.simple(inA, "0:n", num_accesses=n)) state.add_memlet_path(allgather_node, outA, src_conn="_outbuffer", diff --git a/tests/library/mpi/mpi_alltoall_test.py b/tests/library/mpi/mpi_alltoall_test.py index 66199d9aa5..b51289ddd0 100644 --- a/tests/library/mpi/mpi_alltoall_test.py +++ b/tests/library/mpi/mpi_alltoall_test.py @@ -56,14 +56,14 @@ def test_mpi(implementation, dtype): comm.Barrier() size = 128 - size_per_proc = int(size/commsize) + size_per_proc = int(size / commsize) A = np.arange(0, size, dtype=np_dtype) B = np.full(size, 0, dtype=np_dtype) mpi_sdfg(inbuf=A, outbuf=B, n=size) # now B should be an array of size, # containing (size / size_per_proc) repeated chunked_data - chunked_data = A[rank * size_per_proc: (rank + 1) * size_per_proc] + chunked_data = A[rank * size_per_proc:(rank + 1) * size_per_proc] correct_data = np.tile(chunked_data, int(size / size_per_proc)) if (not np.allclose(B, correct_data)): raise (ValueError("The received values are not what I expected on root.")) diff --git a/tests/library/mpi/mpi_isend_irecv_test.py b/tests/library/mpi/mpi_isend_irecv_test.py index 9fab8c0158..ed21ec3fa4 100644 --- a/tests/library/mpi/mpi_isend_irecv_test.py +++ b/tests/library/mpi/mpi_isend_irecv_test.py @@ -109,8 +109,10 @@ def _test_mpi(info, sdfg, dtype): def test_mpi(): _test_mpi("MPI Isend/Irecv", make_sdfg(np.float64), np.float64) + ############################################################################### + @pytest.mark.mpi def test_isend_irecv(): from mpi4py import MPI @@ -123,7 +125,7 @@ def mpi4py_isend_irecv(rank: dace.int32, size: dace.int32): src = (rank - 1) % size dst = (rank + 1) % size req = np.empty((2, ), dtype=MPI.Request) - sbuf = np.full((1,), rank, dtype=np.int32) + sbuf = np.full((1, ), rank, dtype=np.int32) req[0] = commworld.Isend(sbuf, dst, tag=0) rbuf = np.empty((1, ), dtype=np.int32) req[1] = commworld.Irecv(rbuf, src, tag=0) diff --git a/tests/library/mpi/mpi_send_recv_test.py b/tests/library/mpi/mpi_send_recv_test.py index 
bf39c955d3..9c8d78c042 100644 --- a/tests/library/mpi/mpi_send_recv_test.py +++ b/tests/library/mpi/mpi_send_recv_test.py @@ -76,6 +76,7 @@ def test_mpi(): ############################################################################### + @dace.program def dace_send_recv(rank: dace.int32, size: dace.int32): src = np.full([1], (rank - 1) % size, dtype=np.int32) diff --git a/tests/python_frontend/mpi4py/comm_comparison_test.py b/tests/python_frontend/mpi4py/comm_comparison_test.py index e7d74e5981..45bda19876 100644 --- a/tests/python_frontend/mpi4py/comm_comparison_test.py +++ b/tests/python_frontend/mpi4py/comm_comparison_test.py @@ -14,8 +14,8 @@ def test_eq_commworld_0(): @dace.program def eq_commworld_0(out: dace.bool[1]): out[0] = comm == MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) eq_commworld_0(res) assert res[0] == (comm == MPI.COMM_WORLD) @@ -30,8 +30,8 @@ def test_eq_commworld_1(): @dace.program def eq_commworld_1(out: dace.bool[1]): out[0] = comm2 == MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) eq_commworld_1(res) assert res[0] == (comm2 == MPI.COMM_WORLD) @@ -44,8 +44,8 @@ def test_eq_commworld_2(): @dace.program def eq_commworld_2(out: dace.bool[1]): out[0] = MPI.COMM_NULL == MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) eq_commworld_2(res) assert res[0] == (MPI.COMM_NULL == MPI.COMM_WORLD) @@ -59,8 +59,8 @@ def test_noteq_commworld_0(): @dace.program def noteq_commworld_0(out: dace.bool[1]): out[0] = comm != MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) noteq_commworld_0(res) assert res[0] == (comm != MPI.COMM_WORLD) @@ -75,8 +75,8 @@ def test_noteq_commworld_1(): @dace.program def noteq_commworld_1(out: dace.bool[1]): out[0] = comm2 != MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) noteq_commworld_1(res) assert res[0] == (comm2 != MPI.COMM_WORLD) @@ -89,8 +89,8 @@ def test_noteq_commworld_2(): @dace.program def noteq_commworld_2(out: dace.bool[1]): out[0] = MPI.COMM_NULL != MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) noteq_commworld_2(res) assert res[0] == (MPI.COMM_NULL != MPI.COMM_WORLD) @@ -104,8 +104,8 @@ def test_is_commworld_0(): @dace.program def is_commworld_0(out: dace.bool[1]): out[0] = comm is MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) is_commworld_0(res) assert res[0] == (comm is MPI.COMM_WORLD) @@ -120,8 +120,8 @@ def test_is_commworld_1(): @dace.program def is_commworld_1(out: dace.bool[1]): out[0] = comm2 is MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) is_commworld_1(res) assert res[0] == (comm2 is MPI.COMM_WORLD) @@ -134,8 +134,8 @@ def test_is_commworld_2(): @dace.program def is_commworld_2(out: dace.bool[1]): out[0] = MPI.COMM_NULL is MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) is_commworld_2(res) assert res[0] == (MPI.COMM_NULL is MPI.COMM_WORLD) @@ -149,8 +149,8 @@ def test_isnot_commworld_0(): @dace.program def isnot_commworld_0(out: dace.bool[1]): out[0] = comm is MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) isnot_commworld_0(res) assert res[0] == (comm is MPI.COMM_WORLD) @@ -165,8 +165,8 @@ def test_isnot_commworld_1(): @dace.program 
def isnot_commworld_1(out: dace.bool[1]): out[0] = comm2 is not MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) isnot_commworld_1(res) assert res[0] == (comm2 is not MPI.COMM_WORLD) @@ -179,8 +179,8 @@ def test_isnot_commworld_2(): @dace.program def isnot_commworld_2(out: dace.bool[1]): out[0] = MPI.COMM_NULL is not MPI.COMM_WORLD - - res = np.zeros((1,), dtype=np.bool_) + + res = np.zeros((1, ), dtype=np.bool_) isnot_commworld_2(res) assert res[0] == (MPI.COMM_NULL is not MPI.COMM_WORLD) From 7bfd96036c379aa2bfaf7c0a0be21e3eb054b983 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 26 Jul 2023 21:07:11 +0200 Subject: [PATCH 326/392] Added extra exception to catch. --- dace/frontend/python/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index 6a4ea89394..10a1ab120e 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -1603,7 +1603,7 @@ def preprocess_dace_program(f: Callable[..., Any], try: src_ast = MPIResolver(global_vars).visit(src_ast) - except ModuleNotFoundError: + except (ImportError, ModuleNotFoundError): pass src_ast = ModuloConverter().visit(src_ast) From a98fce07b7e78b0bf1a0bc53d17e37e38c22b3dc Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 27 Jul 2023 20:58:42 +0200 Subject: [PATCH 327/392] Serialize Structure members and struct data/length as list of tuples. --- dace/data.py | 5 +++-- dace/dtypes.py | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dace/data.py b/dace/data.py index 9d3b6b86f3..fd7cdaf8e3 100644 --- a/dace/data.py +++ b/dace/data.py @@ -344,13 +344,14 @@ def add(X: dace.float32[10, 10] @ dace.StorageType.GPU_Global): def _arrays_to_json(arrays): if arrays is None: return None - return {k: serialize.to_json(v) for k, v in arrays.items()} + sorted_keys = sorted(arrays.keys()) + return [(k, serialize.to_json(arrays[k])) for k in sorted_keys] def _arrays_from_json(obj, context=None): if obj is None: return {} - return {k: serialize.from_json(v, context) for k, v in obj.items()} + return {k: serialize.from_json(v, context) for k, v in obj} @make_properties diff --git a/dace/dtypes.py b/dace/dtypes.py index d01209469f..9c483d5df1 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -768,13 +768,12 @@ def fields(self): return self._data def to_json(self): + sorted_keys = sorted(self._data.keys()) return { 'type': 'struct', 'name': self.name, - 'data': {k: v.to_json() - for k, v in self._data.items()}, - 'length': {k: v - for k, v in self._length.items()}, + 'data': [(k, self._data[k].to_json()) for k in sorted_keys], + 'length': [(k, self._length[k]) for k in sorted_keys if k in self._length], 'bytes': self.bytes } @@ -786,8 +785,8 @@ def from_json(json_obj, context=None): import dace.serialize # Avoid import loop ret = struct(json_obj['name']) - ret._data = {k: json_to_typeclass(v, context) for k, v in json_obj['data'].items()} - ret._length = {k: v for k, v in json_obj['length'].items()} + ret._data = {k: json_to_typeclass(v, context) for k, v in json_obj['data']} + ret._length = {k: v for k, v in json_obj['length']} ret.bytes = json_obj['bytes'] return ret From f431a8df0c99890d5dbeef48674157aa196d6a3e Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Jul 2023 10:29:11 +0200 Subject: [PATCH 328/392] Switched Structures and structs to OrderedDicts. 
--- dace/data.py | 40 ++++++++++++++++++++----------- dace/dtypes.py | 26 ++++++++++---------- tests/sdfg/data/structure_test.py | 8 +++++++ 3 files changed, 48 insertions(+), 26 deletions(-) diff --git a/dace/data.py b/dace/data.py index fd7cdaf8e3..b20f9f7db5 100644 --- a/dace/data.py +++ b/dace/data.py @@ -3,8 +3,9 @@ import ctypes import functools +from collections import OrderedDict from numbers import Number -from typing import Any, Dict, Optional, Sequence, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple import numpy import sympy as sp @@ -344,40 +345,47 @@ def add(X: dace.float32[10, 10] @ dace.StorageType.GPU_Global): def _arrays_to_json(arrays): if arrays is None: return None - sorted_keys = sorted(arrays.keys()) - return [(k, serialize.to_json(arrays[k])) for k in sorted_keys] + return [(k, serialize.to_json(v)) for k, v in arrays.items()] def _arrays_from_json(obj, context=None): if obj is None: return {} - return {k: serialize.from_json(v, context) for k, v in obj} + return OrderedDict((k, serialize.from_json(v, context)) for k, v in obj) @make_properties class Structure(Data): """ Base class for structures. """ - members = Property(dtype=dict, + members = Property(dtype=OrderedDict, desc="Dictionary of structure members", from_json=_arrays_from_json, to_json=_arrays_to_json) + order = ListProperty(element_type=str, desc="Order of structure members") name = Property(dtype=str, desc="Structure name") def __init__(self, members: Dict[str, Data], + order: List[str] = None, name: str = 'Structure', transient: bool = False, storage: dtypes.StorageType = dtypes.StorageType.Default, location: Dict[str, str] = None, lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, debuginfo: dtypes.DebugInfo = None): + + self.order = order or list(members.keys()) + if set(members.keys()) != set(self.order): + raise ValueError('Order must contain all members of the structure.') + # TODO: Should we make a deep-copy here? - self.members = members or {} + self.members = OrderedDict((k, members[k]) for k in self.order) + for k, v in self.members.items(): v.transient = transient self.name = name - fields_and_types = dict() + fields_and_types = OrderedDict() symbols = set() for k, v in members.items(): if isinstance(v, Structure): @@ -396,13 +404,17 @@ def __init__(self, fields_and_types[k] = dtypes.typeclass(type(v)) else: raise TypeError(f"Attribute {k}'s value {v} has unsupported type: {type(v)}") - for s in symbols: - if str(s) in fields_and_types: - continue - if hasattr(s, "dtype"): - fields_and_types[str(s)] = s.dtype - else: - fields_and_types[str(s)] = dtypes.int32 + + # NOTE: We will not store symbols in the dtype for now, but leaving it as a comment to investigate later. + # NOTE: See discussion about data/object symbols. 
+ # for s in symbols: + # if str(s) in fields_and_types: + # continue + # if hasattr(s, "dtype"): + # fields_and_types[str(s)] = s.dtype + # else: + # fields_and_types[str(s)] = dtypes.int32 + dtype = dtypes.pointer(dtypes.struct(name, **fields_and_types)) shape = (1,) super(Structure, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) diff --git a/dace/dtypes.py b/dace/dtypes.py index 9c483d5df1..678f2f59b0 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -7,6 +7,7 @@ import itertools import numpy import re +from collections import OrderedDict from functools import wraps from typing import Any from dace.config import Config @@ -768,12 +769,11 @@ def fields(self): return self._data def to_json(self): - sorted_keys = sorted(self._data.keys()) return { 'type': 'struct', 'name': self.name, - 'data': [(k, self._data[k].to_json()) for k in sorted_keys], - 'length': [(k, self._length[k]) for k in sorted_keys if k in self._length], + 'data': [(k, v.to_json()) for k, v in self._data.items()], + 'length': [(k, v) for k, v in self._length.items()], 'bytes': self.bytes } @@ -792,19 +792,21 @@ def from_json(json_obj, context=None): return ret def _parse_field_and_types(self, **fields_and_types): - from dace.symbolic import pystr_to_symbolic - self._data = dict() - self._length = dict() + # from dace.symbolic import pystr_to_symbolic + self._data = OrderedDict() + self._length = OrderedDict() self.bytes = 0 for k, v in fields_and_types.items(): if isinstance(v, tuple): t, l = v if not isinstance(t, pointer): raise TypeError("Only pointer types may have a length.") - sym_tokens = pystr_to_symbolic(l).free_symbols - for sym in sym_tokens: - if str(sym) not in fields_and_types.keys(): - raise ValueError(f"Symbol {sym} in {k}'s length {l} is not a field of struct {self.name}") + # TODO: Do we need the free symbols of the length in the struct? + # NOTE: It is needed for the old use of dtype.struct. Are we deprecating that? + # sym_tokens = pystr_to_symbolic(l).free_symbols + # for sym in sym_tokens: + # if str(sym) not in fields_and_types.keys(): + # raise ValueError(f"Symbol {sym} in {k}'s length {l} is not a field of struct {self.name}") self._data[k] = t self._length[k] = l self.bytes += t.bytes @@ -830,7 +832,7 @@ def as_ctypes(self): fields.append((k, v.as_ctypes())) else: fields.append((k, _FFI_CTYPES[v.type])) - fields = sorted(fields, key=lambda f: f[0]) + # fields = sorted(fields, key=lambda f: f[0]) # Create new struct class. 
struct_class = type("NewStructClass", (ctypes.Structure, ), {"_fields_": fields}) _FFI_CTYPES[self] = struct_class @@ -844,7 +846,7 @@ def emit_definition(self): {typ} }};""".format( name=self.name, - typ='\n'.join([" %s %s;" % (t.ctype, tname) for tname, t in sorted(self._data.items())]), + typ='\n'.join([" %s %s;" % (t.ctype, tname) for tname, t in self._data.items()]), ) diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 02b8f0c174..995aacb2fd 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -12,6 +12,7 @@ def test_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense') @@ -68,6 +69,7 @@ def test_write_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('dense_to_csr') @@ -145,8 +147,10 @@ def test_local_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') tmp_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix', transient=True) @@ -254,6 +258,7 @@ def test_local_structure(): def test_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -315,6 +320,7 @@ def test_write_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -396,6 +402,7 @@ def test_direct_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense_direct') @@ -446,6 +453,7 @@ def test_direct_read_structure(): def test_direct_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') From 86d9cf2180c0b599b0a025447f1a36b7f9a05ecf Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Jul 2023 10:31:32 +0200 Subject: [PATCH 329/392] Removed order from properties. 
--- dace/data.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dace/data.py b/dace/data.py index b20f9f7db5..d8f2d52998 100644 --- a/dace/data.py +++ b/dace/data.py @@ -362,7 +362,6 @@ class Structure(Data): desc="Dictionary of structure members", from_json=_arrays_from_json, to_json=_arrays_to_json) - order = ListProperty(element_type=str, desc="Order of structure members") name = Property(dtype=str, desc="Structure name") def __init__(self, @@ -375,12 +374,12 @@ def __init__(self, lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, debuginfo: dtypes.DebugInfo = None): - self.order = order or list(members.keys()) - if set(members.keys()) != set(self.order): + order = order or list(members.keys()) + if set(members.keys()) != set(order): raise ValueError('Order must contain all members of the structure.') # TODO: Should we make a deep-copy here? - self.members = OrderedDict((k, members[k]) for k in self.order) + self.members = OrderedDict((k, members[k]) for k in order) for k, v in self.members.items(): v.transient = transient From 76d6266cead9f7b3de58e8fc879a7d978ddbe757 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Fri, 28 Jul 2023 12:05:50 +0200 Subject: [PATCH 330/392] `_argminmax` now creates a struct with the members ordered as accessed in the related tasklets. --- dace/frontend/python/replacements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 9eac240a87..b325a2ea7e 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -975,7 +975,7 @@ def _argminmax(pv: ProgramVisitor, reduced_shape = list(copy.deepcopy(a_arr.shape)) reduced_shape.pop(axis) - val_and_idx = dace.struct('_val_and_idx', val=a_arr.dtype, idx=result_type) + val_and_idx = dace.struct('_val_and_idx', idx=result_type, val=a_arr.dtype) # HACK: since identity cannot be specified for structs, we have to init the output array reduced_structs, reduced_struct_arr = sdfg.add_temp_transient(reduced_shape, val_and_idx) From 60b404515f732d81c96dcaed21b0d0c5d7632a18 Mon Sep 17 00:00:00 2001 From: Phillip Allen Lane Date: Sat, 29 Jul 2023 16:34:59 -0500 Subject: [PATCH 331/392] Fix some underlying issues with tensor core sample (#1336) Co-authored-by: Phillip Allen Lane --- samples/codegen/tensor_cores.py | 87 ++++++++++++++------------------- 1 file changed, 36 insertions(+), 51 deletions(-) diff --git a/samples/codegen/tensor_cores.py b/samples/codegen/tensor_cores.py index 52d906254b..92ea28eacf 100644 --- a/samples/codegen/tensor_cores.py +++ b/samples/codegen/tensor_cores.py @@ -27,6 +27,7 @@ from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import StateSubgraphView from dace.codegen.prettycode import CodeIOStream +from dace.codegen.dispatcher import DefinedType from typing import Any, List # Other imports @@ -76,6 +77,9 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Array, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + # Make sure the codegen includes the appropriate header files + _include_mma(sdfg) + name = node.data # Based on the hardware, the total size must be 16^2 @@ -85,14 +89,16 @@ def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, # Write a fragment based on the storage 
type if nodedesc.storage == dace.StorageType.TensorCore_Accumulator: - declaration_stream.write('wmma::fragment {};'.format(name), sdfg, state_id, node) + ctype = 'wmma::fragment' + declaration_stream.write(f'{ctype} {name};', sdfg, state_id, node) else: - declaration_stream.write( - 'wmma::fragment ' - '{name};'.format(mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj, name=name), sdfg, - state_id, node) + ctype = 'wmma::fragment'.format( + mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj) + declaration_stream.write(f'{ctype} {name};', sdfg, state_id, node) + + # Add the ctype to defined_vars so that the codegen can properly pass + # fragments to functions as an object reference. + self._dispatcher.defined_vars.add(name, DefinedType.Stream, ctype) def deallocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Array, function_stream: CodeIOStream, callsite_stream: CodeIOStream): @@ -187,50 +193,29 @@ def _include_mma(sdfg: dace.SDFG): sdfg.append_global_code(global_code, 'cuda') -@replaces('frag_fill') -def frag_fill(pv: ProgramVisitor, sdfg: dace.SDFG, state: dace.SDFGState, frag: str, fill: Any) -> List[str]: - # Replacement functions receive the SDFG and the current state as the first - # two arguments, followed by all the other arguments. Here we treat them as - # two strings representing the array name to fill and what to fill it with. - - # NOTE: If a slice is used in the `frag` argument, the Python frontend - # automatically creates a new array for it, and uses the correct string as - # the argument. - wnode = state.add_write(frag) - tasklet = state.add_tasklet('fill', - set(), {'out'}, - ''' - wmma::fill_fragment(out, %s);''' % fill, - language=dace.Language.CPP) - - state.add_edge(tasklet, 'out', wnode, None, dace.Memlet.from_array(frag, wnode.desc(sdfg))) - - _include_mma(sdfg) - - # Function has no return value - return [] - - -@replaces('wmma') -def wmma(pv: ProgramVisitor, sdfg: dace.SDFG, state: dace.SDFGState, a_frag: str, b_frag: str, - c_frag: str) -> List[str]: - # Implemented similarly to `frag_fill`, but with inputs and outputs. - anode = state.add_read(a_frag) - bnode = state.add_read(b_frag) - cnode = state.add_write(c_frag) - tasklet = state.add_tasklet('wmma', {'afrag', 'bfrag'}, {'cfrag'}, - ''' - wmma::mma_sync(cfrag, afrag, bfrag, cfrag);''', - language=dace.Language.CPP) - - state.add_edge(anode, None, tasklet, 'afrag', dace.Memlet.from_array(a_frag, anode.desc(sdfg))) - state.add_edge(bnode, None, tasklet, 'bfrag', dace.Memlet.from_array(b_frag, bnode.desc(sdfg))) - state.add_edge(tasklet, 'cfrag', cnode, None, dace.Memlet.from_array(c_frag, cnode.desc(sdfg))) - - _include_mma(sdfg) - - # Function has no return value - return [] +def frag_fill(frag, fill): + # Define a tasklet with the appropriate input and output connectors. + # Then we can directly emit CUDA for the tasklet. + with dace.tasklet(dace.Language.CPP): + val << fill + out >> frag + """ + wmma::fill_fragment(out, val); + """ + +def wmma(a_frag, b_frag, c_frag): + # We do the same here as we did with frag_fill. Since c_frag is used + # as both an input and an output, we specify two separate variables + # to be passed to mma_sync and declare c_frag as an input to one and + # an output to the other. This ensures proper dataflow. 
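+    # In DaCe's explicit-tasklet syntax used below, `x << arr` declares a
+    # tasklet input connector reading from `arr` and `x >> arr` declares an
+    # output connector writing to it; the string literal becomes the tasklet
+    # body in the requested language (CUDA C++ in this sample).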
+ with dace.tasklet(dace.Language.CPP): + afrag << a_frag + bfrag << b_frag + cfrag << c_frag + dfrag >> c_frag + """ + wmma::mma_sync(dfrag, afrag, bfrag, cfrag); + """ ############################################################################ From b97443e2782a161cf8fa6afc03f707c6e8bc54c0 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Sun, 30 Jul 2023 15:15:40 -0700 Subject: [PATCH 332/392] Add CPU_Persistent map schedule (#1330) --- dace/cli/daceprof.py | 3 +- dace/codegen/instrumentation/likwid.py | 2 +- dace/codegen/instrumentation/papi.py | 6 +- dace/codegen/targets/cpu.py | 140 +++++++++++------- dace/dtypes.py | 13 +- dace/sdfg/nodes.py | 6 +- .../transformation/interstate/sdfg_nesting.py | 3 +- tests/openmp_test.py | 104 +++++++++++++ 8 files changed, 206 insertions(+), 71 deletions(-) diff --git a/dace/cli/daceprof.py b/dace/cli/daceprof.py index 8a2f894910..b201d40661 100644 --- a/dace/cli/daceprof.py +++ b/dace/cli/daceprof.py @@ -227,7 +227,8 @@ def make_sequential(sdfg: dace.SDFG): for n, _ in sdfg.all_nodes_recursive(): if isinstance(n, dace.nodes.EntryNode): sched = getattr(n, 'schedule', False) - if sched == dace.ScheduleType.CPU_Multicore or sched == dace.ScheduleType.Default: + if sched in (dace.ScheduleType.CPU_Multicore, dace.ScheduleType.CPU_Persistent, + dace.ScheduleType.Default): n.schedule = dace.ScheduleType.Sequential registered.append(dace.hooks.register_sdfg_call_hook(before_hook=make_sequential)) diff --git a/dace/codegen/instrumentation/likwid.py b/dace/codegen/instrumentation/likwid.py index b14a8166af..e4f9c3154e 100644 --- a/dace/codegen/instrumentation/likwid.py +++ b/dace/codegen/instrumentation/likwid.py @@ -69,7 +69,7 @@ class LIKWIDInstrumentationCPU(InstrumentationProvider): the Likwid tool. """ - perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential] + perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential] def __init__(self): self._likwid_used = False diff --git a/dace/codegen/instrumentation/papi.py b/dace/codegen/instrumentation/papi.py index ee7f17308a..bc7163ea9b 100644 --- a/dace/codegen/instrumentation/papi.py +++ b/dace/codegen/instrumentation/papi.py @@ -43,7 +43,7 @@ class PAPIInstrumentation(InstrumentationProvider): _counters: Optional[Set[str]] = None - perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential] + perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential] def __init__(self): self._papi_used = False @@ -350,7 +350,7 @@ def on_consume_entry(self, sdfg, state, node, outer_stream, inner_stream): @staticmethod def perf_get_supersection_start_string(node, dfg, unified_id): - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): # Nested SuperSections are not supported. Therefore, we mark the # outermost section and disallow internal scopes from creating it. 
if not hasattr(node.map, '_can_be_supersection_start'): @@ -360,7 +360,7 @@ def perf_get_supersection_start_string(node, dfg, unified_id): for x in children: if not hasattr(x.map, '_can_be_supersection_start'): x.map._can_be_supersection_start = True - if x.map.schedule == dtypes.ScheduleType.CPU_Multicore: + if x.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): x.map._can_be_supersection_start = False elif x.map.schedule == dtypes.ScheduleType.Sequential: diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index eb7d232966..3b7b592775 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -18,7 +18,7 @@ from dace.sdfg import nodes, utils as sdutils from dace.sdfg import (ScopeSubgraphView, SDFG, scope_contains_scope, is_array_stream_view, NodeNotExpandedError, dynamic_map_inputs, local_transients) -from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga +from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga, is_in_scope from typing import Union from dace.codegen.targets import fpga @@ -79,7 +79,9 @@ def __init__(self, frame_codegen, sdfg): # Register dispatchers dispatcher.register_node_dispatcher(self) - dispatcher.register_map_dispatcher([dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential], self) + dispatcher.register_map_dispatcher( + [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential], + self) cpu_storage = [dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.Register] dispatcher.register_array_dispatcher(cpu_storage, self) @@ -222,7 +224,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if # `nodedesc` is a View and `dfg` is None. 
if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): - raise NotImplementedError("The declare_array method should only be used for variables " + raise NotImplementedError("The declare_array method should only be used for variables " "that must have their declaration and allocation separate.") name = node.data @@ -1714,66 +1716,87 @@ def _generate_MapEntry( # TODO: Refactor to generate_scope_preamble once a general code # generator (that CPU inherits from) is implemented - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: - map_header += "#pragma omp parallel for" - if node.map.omp_schedule != dtypes.OMPScheduleType.Default: - schedule = " schedule(" - if node.map.omp_schedule == dtypes.OMPScheduleType.Static: - schedule += "static" - elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic: - schedule += "dynamic" - elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided: - schedule += "guided" + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): + # OpenMP header + in_persistent = False + if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: + in_persistent = is_in_scope(sdfg, state_dfg, node, [dtypes.ScheduleType.CPU_Persistent]) + if in_persistent: + # If already in a #pragma omp parallel, no need to use it twice + map_header += "#pragma omp for" + # TODO(later): barriers and map_header += " nowait" else: - raise ValueError("Unknown OpenMP schedule type") - if node.map.omp_chunk_size > 0: - schedule += f", {node.map.omp_chunk_size}" - schedule += ")" - map_header += schedule - if node.map.omp_num_threads > 0: - map_header += f" num_threads({node.map.omp_num_threads})" - if node.map.collapse > 1: + map_header += "#pragma omp parallel for" + + elif node.map.schedule == dtypes.ScheduleType.CPU_Persistent: + map_header += "#pragma omp parallel" + + # OpenMP schedule properties + if not in_persistent: + if node.map.omp_schedule != dtypes.OMPScheduleType.Default: + schedule = " schedule(" + if node.map.omp_schedule == dtypes.OMPScheduleType.Static: + schedule += "static" + elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic: + schedule += "dynamic" + elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided: + schedule += "guided" + else: + raise ValueError("Unknown OpenMP schedule type") + if node.map.omp_chunk_size > 0: + schedule += f", {node.map.omp_chunk_size}" + schedule += ")" + map_header += schedule + + if node.map.omp_num_threads > 0: + map_header += f" num_threads({node.map.omp_num_threads})" + + # OpenMP nested loop properties + if node.map.schedule == dtypes.ScheduleType.CPU_Multicore and node.map.collapse > 1: map_header += ' collapse(%d)' % node.map.collapse - # Loop over outputs, add OpenMP reduction clauses to detected cases - # TODO: set up register outside loop - # exit_node = dfg.exit_node(node) - reduction_stmts = [] - # for outedge in dfg.in_edges(exit_node): - # if (isinstance(outedge.src, nodes.CodeNode) - # and outedge.data.wcr is not None): - # redt = operations.detect_reduction_type(outedge.data.wcr) - # if redt != dtypes.ReductionType.Custom: - # reduction_stmts.append('reduction({typ}:{var})'.format( - # typ=_REDUCTION_TYPE_TO_OPENMP[redt], - # var=outedge.src_conn)) - # reduced_variables.append(outedge) - - map_header += " %s\n" % ", ".join(reduction_stmts) - - # TODO: Explicit map unroller - if node.map.unroll: - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: - raise ValueError("A Multicore CPU map cannot be unrolled (" + node.map.label + ")") - 
constsize = all([not symbolic.issymbolic(v, sdfg.constants) for r in node.map.range for v in r]) + if node.map.unroll: + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): + raise ValueError("An OpenMP map cannot be unrolled (" + node.map.label + ")") - # Nested loops result.write(map_header, sdfg, state_id, node) - for i, r in enumerate(node.map.range): - # var = '__DACEMAP_%s_%d' % (node.map.label, i) - var = map_params[i] - begin, end, skip = r - if node.map.unroll: - result.write("#pragma unroll", sdfg, state_id, node) + if node.map.schedule == dtypes.ScheduleType.CPU_Persistent: + result.write('{\n', sdfg, state_id, node) + + # Find if bounds are used within the scope + scope = state_dfg.scope_subgraph(node, False, False) + fsyms = scope.free_symbols + # Include external edges + for n in scope.nodes(): + for e in state_dfg.all_edges(n): + fsyms |= e.data.free_symbols + fsyms = set(map(str, fsyms)) + + ntid_is_used = '__omp_num_threads' in fsyms + tid_is_used = node.map.params[0] in fsyms + if tid_is_used or ntid_is_used: + function_stream.write('#include ', sdfg, state_id, node) + if tid_is_used: + result.write(f'auto {node.map.params[0]} = omp_get_thread_num();', sdfg, state_id, node) + if ntid_is_used: + result.write(f'auto __omp_num_threads = omp_get_num_threads();', sdfg, state_id, node) + else: + # Emit nested loops + for i, r in enumerate(node.map.range): + var = map_params[i] + begin, end, skip = r - result.write( - "for (auto %s = %s; %s < %s; %s += %s) {\n" % - (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)), - sdfg, - state_id, - node, - ) + if node.map.unroll: + result.write("#pragma unroll", sdfg, state_id, node) + + result.write( + "for (auto %s = %s; %s < %s; %s += %s) {\n" % + (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)), + sdfg, + state_id, + node, + ) callsite_stream.write(inner_stream.getvalue()) @@ -1803,8 +1826,11 @@ def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite self.generate_scope_postamble(sdfg, dfg, state_id, function_stream, outer_stream, callsite_stream) - for _ in map_node.map.range: + if map_node.map.schedule == dtypes.ScheduleType.CPU_Persistent: result.write("}", sdfg, state_id, node) + else: + for _ in map_node.map.range: + result.write("}", sdfg, state_id, node) result.write(outer_stream.getvalue()) diff --git a/dace/dtypes.py b/dace/dtypes.py index dee2283f25..88ce583d08 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -61,7 +61,8 @@ class ScheduleType(aenum.AutoNumberEnum): Default = () #: Scope-default parallel schedule Sequential = () #: Sequential code (single-thread) MPI = () #: MPI processes - CPU_Multicore = () #: OpenMP + CPU_Multicore = () #: OpenMP parallel for loop + CPU_Persistent = () #: OpenMP parallel region Unrolled = () #: Unrolled code SVE_Map = () #: Arm SVE @@ -188,6 +189,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Sequential: StorageType.Register, ScheduleType.MPI: StorageType.CPU_Heap, ScheduleType.CPU_Multicore: StorageType.Register, + ScheduleType.CPU_Persistent: StorageType.CPU_Heap, ScheduleType.GPU_Default: StorageType.GPU_Global, ScheduleType.GPU_Persistent: StorageType.GPU_Global, ScheduleType.GPU_Device: StorageType.GPU_Shared, @@ -205,6 +207,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Sequential: ScheduleType.Sequential, ScheduleType.MPI: ScheduleType.CPU_Multicore, ScheduleType.CPU_Multicore: ScheduleType.Sequential, + ScheduleType.CPU_Persistent: 
ScheduleType.CPU_Multicore, ScheduleType.Unrolled: ScheduleType.CPU_Multicore, ScheduleType.GPU_Default: ScheduleType.GPU_Device, ScheduleType.GPU_Persistent: ScheduleType.GPU_Device, @@ -1432,7 +1435,7 @@ def can_access(schedule: ScheduleType, storage: StorageType): ScheduleType.GPU_Default, ]: return storage in [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.CPU_Pinned] - elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore]: + elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent]: return storage in [ StorageType.Default, StorageType.CPU_Heap, StorageType.CPU_Pinned, StorageType.CPU_ThreadLocal ] @@ -1460,19 +1463,19 @@ def can_allocate(storage: StorageType, schedule: ScheduleType): # Host-only allocation if storage in [StorageType.CPU_Heap, StorageType.CPU_Pinned, StorageType.CPU_ThreadLocal]: return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default ] # GPU-global memory if storage is StorageType.GPU_Global: return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default ] # FPGA-global memory if storage is StorageType.FPGA_Global: return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.FPGA_Device, + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.FPGA_Device, ScheduleType.GPU_Default ] diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 5c270153e1..d82cd5607d 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -833,17 +833,17 @@ class Map(object): default=0, desc="Number of OpenMP threads executing the Map", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_schedule = EnumProperty(dtype=dtypes.OMPScheduleType, default=dtypes.OMPScheduleType.Default, desc="OpenMP schedule {static, dynamic, guided}", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_chunk_size = Property(dtype=int, default=0, desc="OpenMP schedule chunk size", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) gpu_block_size = ListProperty(element_type=int, default=None, diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index 71d9e22aca..fc3ebfbdca 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -814,7 +814,8 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi # Not every schedule is supported if not permissive: if nsdfg.schedule not in (None, dtypes.ScheduleType.Default, dtypes.ScheduleType.Sequential, - dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.GPU_Device): + dtypes.ScheduleType.CPU_Multicore, 
dtypes.ScheduleType.CPU_Persistent, + dtypes.ScheduleType.GPU_Device): return False candidates = InlineTransients._candidates(sdfg, graph, nsdfg) diff --git a/tests/openmp_test.py b/tests/openmp_test.py index 9f4535dfe4..d842b407fb 100644 --- a/tests/openmp_test.py +++ b/tests/openmp_test.py @@ -2,6 +2,7 @@ import dace from dace import dtypes, nodes from typing import Any, Dict, List, Union +import numpy as np N = dace.symbol("N") @@ -73,6 +74,109 @@ def test_omp_props(): assert ("#pragma omp parallel for schedule(guided, 5) num_threads(10)" in code) +def test_omp_parallel(): + + @dace.program + def tester(A: dace.float64[1]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[0] += 1 + + sdfg = tester.to_sdfg() + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 2 + + code = sdfg.generate_code()[0].clean_code + assert ("#pragma omp parallel num_threads(2)" in code) + + a = np.random.rand(1) + ref = a + 2 + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_parallel_for_in_parallel(): + """ + Tests that an OpenMP map inside a parallel section ends up without an + extra (semantically-incorrect) ``parallel`` statement. + """ + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + for i in dace.map[0:20] @ dace.ScheduleType.CPU_Multicore: + A[i] += 1 + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "#pragma omp parallel" in code + assert "#pragma omp for" in code + + a = np.random.rand(20) + ref = a + 1 + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_get_tid(): + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[t] += 1 + + sdfg = tester.to_sdfg() + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 2 + + code = sdfg.generate_code()[0].clean_code + assert "#pragma omp parallel num_threads(2)" in code + assert "omp_get_thread_num()" in code + + a = np.random.rand(20) + ref = np.copy(a) + ref[:2] += 1 + + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_get_tid_elision(): + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[0] += 1 + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "omp_get_thread_num()" not in code + + +def test_omp_get_ntid(): + __omp_num_threads = dace.symbol('__omp_num_threads') + + @dace.program + def tester(A: dace.int64[1]): + for _ in dace.map[0:__omp_num_threads] @ dace.ScheduleType.CPU_Persistent: + A[0] = __omp_num_threads + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "omp_get_num_threads()" in code + + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 3 + + a = np.zeros([1], dtype=np.int64) + sdfg(a, __omp_num_threads=1) # Feed in some other value + assert np.allclose(a, 3) + + if __name__ == "__main__": test_lack_of_omp_props() test_omp_props() + test_omp_parallel() + test_omp_parallel_for_in_parallel() + test_omp_get_tid() + test_omp_get_tid_elision() + test_omp_get_ntid() From 7e9d197a5d13ac9b4ba411f3fe7c5a04a20d7327 Mon Sep 17 00:00:00 2001 From: Carl Johnsen Date: Tue, 1 Aug 2023 17:57:01 +0200 Subject: [PATCH 333/392] Updated hlslib to support Xilinx Vitis >2022.2 (#1340) --- dace/external/hlslib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
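Note: this commit only moves the hlslib submodule pointer. Existing checkouts have to re-sync the
submodule (e.g., with `git submodule update --init --recursive`) to pick up the new commit before
rebuilding FPGA programs.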
diff --git a/dace/external/hlslib b/dace/external/hlslib index 1403cd016c..1b5b3aee5d 160000 --- a/dace/external/hlslib +++ b/dace/external/hlslib @@ -1 +1 @@ -Subproject commit 1403cd016ce63a9961eeb3899bea70c873a929ce +Subproject commit 1b5b3aee5dab19adcc443fa9a7cd45244bd246b1 From f39762ff1397dc8eaa6f7db08acd025c264b55af Mon Sep 17 00:00:00 2001 From: Tiziano De Matteis <5871117+TizianoDeMatteis@users.noreply.github.com> Date: Wed, 2 Aug 2023 13:00:04 +0200 Subject: [PATCH 334/392] Docs: mention FPGA backend tested with Intel Quartus PRO (#1335) --- doc/setup/installation.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/setup/installation.rst b/doc/setup/installation.rst index 6eb266dc7c..893f4a1688 100644 --- a/doc/setup/installation.rst +++ b/doc/setup/installation.rst @@ -21,7 +21,9 @@ however, it requires two more runtime dependencies to be installed and available **GPU**: For NVIDIA GPUs, the CUDA toolkit is also required, and AMD GPUs require HIP. :ref:`See more information on how to configure DaCe to use AMD GPUs `. You may (optionally) want to install `CuPy `_ for easy integration of GPU arrays in Python. -**FPGA**: Xilinx FPGAs require the Vitis suite and Intel FPGAs require the Intel FPGA SDK to be installed. +**FPGA**: Xilinx FPGAs require the Vitis suite and Intel FPGAs require the Intel FPGA SDK to be installed. +DaCe has been tested with Intel FPGA SDK for OpenCL Pro edition v18.1 and v19.1, targeting Arria 10 and Stratix 10 devices, and Xilinx Vitis HLS v2020.x, v2021.x targeting u250 and u280 devices. + **Distributed Computing**: If using multiple nodes, MPI has to be installed and available. @@ -136,6 +138,12 @@ Common issues with the DaCe Python module * **Bug in DaCe**: If you suspect an issue happens within DaCe, see :ref:`debugging` for ways to pinpoint the source of the issue. + * **Intel FPGA libraries not found**: when targeting Intel FPGAs, the compilation process may fail due to missing OpenCL headers (CMake returns + a ``Could NOT find IntelFPGAOpenCL`` error). This is usually the case when Intel OpenCL compiler does not return the right path to OpenCL host headers. + DaCe relies on ``hlslib`` for compiling FPGA programs, which in turns relies on Intel's compiler to derive the right include path. Please verify that + the include path returned by the Intel compiler (using the ``aocl compile-config`` command) points to a directory that actually contains the OpenCL headers (namely ``cl.hpp`` and + ``cl2.hpp`` files). If this is not the case, please locate them under the Intel Quartus installation folder, and symlink (or copy) them in the ``aocl`` returned path. + .. 
_qa_vscode: Common issues with the Visual Studio Code extension From 30af8dabca952d9c4307738c98bd0bb0669f6af9 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Sun, 30 Jul 2023 15:15:40 -0700 Subject: [PATCH 335/392] Add CPU_Persistent map schedule (#1330) --- dace/cli/daceprof.py | 3 +- dace/codegen/instrumentation/likwid.py | 2 +- dace/codegen/instrumentation/papi.py | 6 +- dace/codegen/targets/cpu.py | 140 +++++++++++------- dace/dtypes.py | 13 +- dace/sdfg/nodes.py | 6 +- .../transformation/interstate/sdfg_nesting.py | 3 +- tests/openmp_test.py | 104 +++++++++++++ 8 files changed, 206 insertions(+), 71 deletions(-) diff --git a/dace/cli/daceprof.py b/dace/cli/daceprof.py index 8a2f894910..b201d40661 100644 --- a/dace/cli/daceprof.py +++ b/dace/cli/daceprof.py @@ -227,7 +227,8 @@ def make_sequential(sdfg: dace.SDFG): for n, _ in sdfg.all_nodes_recursive(): if isinstance(n, dace.nodes.EntryNode): sched = getattr(n, 'schedule', False) - if sched == dace.ScheduleType.CPU_Multicore or sched == dace.ScheduleType.Default: + if sched in (dace.ScheduleType.CPU_Multicore, dace.ScheduleType.CPU_Persistent, + dace.ScheduleType.Default): n.schedule = dace.ScheduleType.Sequential registered.append(dace.hooks.register_sdfg_call_hook(before_hook=make_sequential)) diff --git a/dace/codegen/instrumentation/likwid.py b/dace/codegen/instrumentation/likwid.py index b14a8166af..e4f9c3154e 100644 --- a/dace/codegen/instrumentation/likwid.py +++ b/dace/codegen/instrumentation/likwid.py @@ -69,7 +69,7 @@ class LIKWIDInstrumentationCPU(InstrumentationProvider): the Likwid tool. """ - perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential] + perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential] def __init__(self): self._likwid_used = False diff --git a/dace/codegen/instrumentation/papi.py b/dace/codegen/instrumentation/papi.py index ee7f17308a..bc7163ea9b 100644 --- a/dace/codegen/instrumentation/papi.py +++ b/dace/codegen/instrumentation/papi.py @@ -43,7 +43,7 @@ class PAPIInstrumentation(InstrumentationProvider): _counters: Optional[Set[str]] = None - perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential] + perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential] def __init__(self): self._papi_used = False @@ -350,7 +350,7 @@ def on_consume_entry(self, sdfg, state, node, outer_stream, inner_stream): @staticmethod def perf_get_supersection_start_string(node, dfg, unified_id): - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): # Nested SuperSections are not supported. Therefore, we mark the # outermost section and disallow internal scopes from creating it. 
if not hasattr(node.map, '_can_be_supersection_start'): @@ -360,7 +360,7 @@ def perf_get_supersection_start_string(node, dfg, unified_id): for x in children: if not hasattr(x.map, '_can_be_supersection_start'): x.map._can_be_supersection_start = True - if x.map.schedule == dtypes.ScheduleType.CPU_Multicore: + if x.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): x.map._can_be_supersection_start = False elif x.map.schedule == dtypes.ScheduleType.Sequential: diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index eb7d232966..3b7b592775 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -18,7 +18,7 @@ from dace.sdfg import nodes, utils as sdutils from dace.sdfg import (ScopeSubgraphView, SDFG, scope_contains_scope, is_array_stream_view, NodeNotExpandedError, dynamic_map_inputs, local_transients) -from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga +from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga, is_in_scope from typing import Union from dace.codegen.targets import fpga @@ -79,7 +79,9 @@ def __init__(self, frame_codegen, sdfg): # Register dispatchers dispatcher.register_node_dispatcher(self) - dispatcher.register_map_dispatcher([dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential], self) + dispatcher.register_map_dispatcher( + [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential], + self) cpu_storage = [dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.Register] dispatcher.register_array_dispatcher(cpu_storage, self) @@ -222,7 +224,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if # `nodedesc` is a View and `dfg` is None. 
if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): - raise NotImplementedError("The declare_array method should only be used for variables " + raise NotImplementedError("The declare_array method should only be used for variables " "that must have their declaration and allocation separate.") name = node.data @@ -1714,66 +1716,87 @@ def _generate_MapEntry( # TODO: Refactor to generate_scope_preamble once a general code # generator (that CPU inherits from) is implemented - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: - map_header += "#pragma omp parallel for" - if node.map.omp_schedule != dtypes.OMPScheduleType.Default: - schedule = " schedule(" - if node.map.omp_schedule == dtypes.OMPScheduleType.Static: - schedule += "static" - elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic: - schedule += "dynamic" - elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided: - schedule += "guided" + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): + # OpenMP header + in_persistent = False + if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: + in_persistent = is_in_scope(sdfg, state_dfg, node, [dtypes.ScheduleType.CPU_Persistent]) + if in_persistent: + # If already in a #pragma omp parallel, no need to use it twice + map_header += "#pragma omp for" + # TODO(later): barriers and map_header += " nowait" else: - raise ValueError("Unknown OpenMP schedule type") - if node.map.omp_chunk_size > 0: - schedule += f", {node.map.omp_chunk_size}" - schedule += ")" - map_header += schedule - if node.map.omp_num_threads > 0: - map_header += f" num_threads({node.map.omp_num_threads})" - if node.map.collapse > 1: + map_header += "#pragma omp parallel for" + + elif node.map.schedule == dtypes.ScheduleType.CPU_Persistent: + map_header += "#pragma omp parallel" + + # OpenMP schedule properties + if not in_persistent: + if node.map.omp_schedule != dtypes.OMPScheduleType.Default: + schedule = " schedule(" + if node.map.omp_schedule == dtypes.OMPScheduleType.Static: + schedule += "static" + elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic: + schedule += "dynamic" + elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided: + schedule += "guided" + else: + raise ValueError("Unknown OpenMP schedule type") + if node.map.omp_chunk_size > 0: + schedule += f", {node.map.omp_chunk_size}" + schedule += ")" + map_header += schedule + + if node.map.omp_num_threads > 0: + map_header += f" num_threads({node.map.omp_num_threads})" + + # OpenMP nested loop properties + if node.map.schedule == dtypes.ScheduleType.CPU_Multicore and node.map.collapse > 1: map_header += ' collapse(%d)' % node.map.collapse - # Loop over outputs, add OpenMP reduction clauses to detected cases - # TODO: set up register outside loop - # exit_node = dfg.exit_node(node) - reduction_stmts = [] - # for outedge in dfg.in_edges(exit_node): - # if (isinstance(outedge.src, nodes.CodeNode) - # and outedge.data.wcr is not None): - # redt = operations.detect_reduction_type(outedge.data.wcr) - # if redt != dtypes.ReductionType.Custom: - # reduction_stmts.append('reduction({typ}:{var})'.format( - # typ=_REDUCTION_TYPE_TO_OPENMP[redt], - # var=outedge.src_conn)) - # reduced_variables.append(outedge) - - map_header += " %s\n" % ", ".join(reduction_stmts) - - # TODO: Explicit map unroller - if node.map.unroll: - if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: - raise ValueError("A Multicore CPU map cannot be unrolled (" + node.map.label + ")") - 
constsize = all([not symbolic.issymbolic(v, sdfg.constants) for r in node.map.range for v in r]) + if node.map.unroll: + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): + raise ValueError("An OpenMP map cannot be unrolled (" + node.map.label + ")") - # Nested loops result.write(map_header, sdfg, state_id, node) - for i, r in enumerate(node.map.range): - # var = '__DACEMAP_%s_%d' % (node.map.label, i) - var = map_params[i] - begin, end, skip = r - if node.map.unroll: - result.write("#pragma unroll", sdfg, state_id, node) + if node.map.schedule == dtypes.ScheduleType.CPU_Persistent: + result.write('{\n', sdfg, state_id, node) + + # Find if bounds are used within the scope + scope = state_dfg.scope_subgraph(node, False, False) + fsyms = scope.free_symbols + # Include external edges + for n in scope.nodes(): + for e in state_dfg.all_edges(n): + fsyms |= e.data.free_symbols + fsyms = set(map(str, fsyms)) + + ntid_is_used = '__omp_num_threads' in fsyms + tid_is_used = node.map.params[0] in fsyms + if tid_is_used or ntid_is_used: + function_stream.write('#include ', sdfg, state_id, node) + if tid_is_used: + result.write(f'auto {node.map.params[0]} = omp_get_thread_num();', sdfg, state_id, node) + if ntid_is_used: + result.write(f'auto __omp_num_threads = omp_get_num_threads();', sdfg, state_id, node) + else: + # Emit nested loops + for i, r in enumerate(node.map.range): + var = map_params[i] + begin, end, skip = r - result.write( - "for (auto %s = %s; %s < %s; %s += %s) {\n" % - (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)), - sdfg, - state_id, - node, - ) + if node.map.unroll: + result.write("#pragma unroll", sdfg, state_id, node) + + result.write( + "for (auto %s = %s; %s < %s; %s += %s) {\n" % + (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)), + sdfg, + state_id, + node, + ) callsite_stream.write(inner_stream.getvalue()) @@ -1803,8 +1826,11 @@ def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite self.generate_scope_postamble(sdfg, dfg, state_id, function_stream, outer_stream, callsite_stream) - for _ in map_node.map.range: + if map_node.map.schedule == dtypes.ScheduleType.CPU_Persistent: result.write("}", sdfg, state_id, node) + else: + for _ in map_node.map.range: + result.write("}", sdfg, state_id, node) result.write(outer_stream.getvalue()) diff --git a/dace/dtypes.py b/dace/dtypes.py index dee2283f25..88ce583d08 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -61,7 +61,8 @@ class ScheduleType(aenum.AutoNumberEnum): Default = () #: Scope-default parallel schedule Sequential = () #: Sequential code (single-thread) MPI = () #: MPI processes - CPU_Multicore = () #: OpenMP + CPU_Multicore = () #: OpenMP parallel for loop + CPU_Persistent = () #: OpenMP parallel region Unrolled = () #: Unrolled code SVE_Map = () #: Arm SVE @@ -188,6 +189,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Sequential: StorageType.Register, ScheduleType.MPI: StorageType.CPU_Heap, ScheduleType.CPU_Multicore: StorageType.Register, + ScheduleType.CPU_Persistent: StorageType.CPU_Heap, ScheduleType.GPU_Default: StorageType.GPU_Global, ScheduleType.GPU_Persistent: StorageType.GPU_Global, ScheduleType.GPU_Device: StorageType.GPU_Shared, @@ -205,6 +207,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Sequential: ScheduleType.Sequential, ScheduleType.MPI: ScheduleType.CPU_Multicore, ScheduleType.CPU_Multicore: ScheduleType.Sequential, + ScheduleType.CPU_Persistent: 
ScheduleType.CPU_Multicore, ScheduleType.Unrolled: ScheduleType.CPU_Multicore, ScheduleType.GPU_Default: ScheduleType.GPU_Device, ScheduleType.GPU_Persistent: ScheduleType.GPU_Device, @@ -1432,7 +1435,7 @@ def can_access(schedule: ScheduleType, storage: StorageType): ScheduleType.GPU_Default, ]: return storage in [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.CPU_Pinned] - elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore]: + elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent]: return storage in [ StorageType.Default, StorageType.CPU_Heap, StorageType.CPU_Pinned, StorageType.CPU_ThreadLocal ] @@ -1460,19 +1463,19 @@ def can_allocate(storage: StorageType, schedule: ScheduleType): # Host-only allocation if storage in [StorageType.CPU_Heap, StorageType.CPU_Pinned, StorageType.CPU_ThreadLocal]: return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default ] # GPU-global memory if storage is StorageType.GPU_Global: return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.GPU_Default ] # FPGA-global memory if storage is StorageType.FPGA_Global: return schedule in [ - ScheduleType.CPU_Multicore, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.FPGA_Device, + ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent, ScheduleType.Sequential, ScheduleType.MPI, ScheduleType.FPGA_Device, ScheduleType.GPU_Default ] diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 6ba84d919e..bd384b6736 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -851,17 +851,17 @@ class Map(object): default=0, desc="Number of OpenMP threads executing the Map", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_schedule = EnumProperty(dtype=dtypes.OMPScheduleType, default=dtypes.OMPScheduleType.Default, desc="OpenMP schedule {static, dynamic, guided}", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_chunk_size = Property(dtype=int, default=0, desc="OpenMP schedule chunk size", optional=True, - optional_condition=lambda m: m.schedule == dtypes.ScheduleType.CPU_Multicore) + optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) gpu_block_size = ListProperty(element_type=int, default=None, diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index 71d9e22aca..fc3ebfbdca 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -814,7 +814,8 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi # Not every schedule is supported if not permissive: if nsdfg.schedule not in (None, dtypes.ScheduleType.Default, dtypes.ScheduleType.Sequential, - dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.GPU_Device): + dtypes.ScheduleType.CPU_Multicore, 
dtypes.ScheduleType.CPU_Persistent, + dtypes.ScheduleType.GPU_Device): return False candidates = InlineTransients._candidates(sdfg, graph, nsdfg) diff --git a/tests/openmp_test.py b/tests/openmp_test.py index 9f4535dfe4..d842b407fb 100644 --- a/tests/openmp_test.py +++ b/tests/openmp_test.py @@ -2,6 +2,7 @@ import dace from dace import dtypes, nodes from typing import Any, Dict, List, Union +import numpy as np N = dace.symbol("N") @@ -73,6 +74,109 @@ def test_omp_props(): assert ("#pragma omp parallel for schedule(guided, 5) num_threads(10)" in code) +def test_omp_parallel(): + + @dace.program + def tester(A: dace.float64[1]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[0] += 1 + + sdfg = tester.to_sdfg() + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 2 + + code = sdfg.generate_code()[0].clean_code + assert ("#pragma omp parallel num_threads(2)" in code) + + a = np.random.rand(1) + ref = a + 2 + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_parallel_for_in_parallel(): + """ + Tests that an OpenMP map inside a parallel section ends up without an + extra (semantically-incorrect) ``parallel`` statement. + """ + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + for i in dace.map[0:20] @ dace.ScheduleType.CPU_Multicore: + A[i] += 1 + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "#pragma omp parallel" in code + assert "#pragma omp for" in code + + a = np.random.rand(20) + ref = a + 1 + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_get_tid(): + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[t] += 1 + + sdfg = tester.to_sdfg() + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 2 + + code = sdfg.generate_code()[0].clean_code + assert "#pragma omp parallel num_threads(2)" in code + assert "omp_get_thread_num()" in code + + a = np.random.rand(20) + ref = np.copy(a) + ref[:2] += 1 + + sdfg(a) + assert np.allclose(a, ref) + + +def test_omp_get_tid_elision(): + + @dace.program + def tester(A: dace.float64[20]): + for t in dace.map[0:1] @ dace.ScheduleType.CPU_Persistent: + A[0] += 1 + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "omp_get_thread_num()" not in code + + +def test_omp_get_ntid(): + __omp_num_threads = dace.symbol('__omp_num_threads') + + @dace.program + def tester(A: dace.int64[1]): + for _ in dace.map[0:__omp_num_threads] @ dace.ScheduleType.CPU_Persistent: + A[0] = __omp_num_threads + + sdfg = tester.to_sdfg() + code = sdfg.generate_code()[0].clean_code + assert "omp_get_num_threads()" in code + + me = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + me.map.omp_num_threads = 3 + + a = np.zeros([1], dtype=np.int64) + sdfg(a, __omp_num_threads=1) # Feed in some other value + assert np.allclose(a, 3) + + if __name__ == "__main__": test_lack_of_omp_props() test_omp_props() + test_omp_parallel() + test_omp_parallel_for_in_parallel() + test_omp_get_tid() + test_omp_get_tid_elision() + test_omp_get_ntid() From 8ace3676e326afd3b0081a032d6483d3f07f0982 Mon Sep 17 00:00:00 2001 From: Carl Johnsen Date: Tue, 1 Aug 2023 17:57:01 +0200 Subject: [PATCH 336/392] Updated hlslib to support Xilinx Vitis >2022.2 (#1340) --- dace/external/hlslib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/dace/external/hlslib b/dace/external/hlslib index 1403cd016c..1b5b3aee5d 160000 --- a/dace/external/hlslib +++ b/dace/external/hlslib @@ -1 +1 @@ -Subproject commit 1403cd016ce63a9961eeb3899bea70c873a929ce +Subproject commit 1b5b3aee5dab19adcc443fa9a7cd45244bd246b1 From 2f5a00519e88fa6dc2705d1564dd85cbfea7d1ff Mon Sep 17 00:00:00 2001 From: Tiziano De Matteis <5871117+TizianoDeMatteis@users.noreply.github.com> Date: Wed, 2 Aug 2023 13:00:04 +0200 Subject: [PATCH 337/392] Docs: mention FPGA backend tested with Intel Quartus PRO (#1335) --- doc/setup/installation.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/setup/installation.rst b/doc/setup/installation.rst index 6eb266dc7c..893f4a1688 100644 --- a/doc/setup/installation.rst +++ b/doc/setup/installation.rst @@ -21,7 +21,9 @@ however, it requires two more runtime dependencies to be installed and available **GPU**: For NVIDIA GPUs, the CUDA toolkit is also required, and AMD GPUs require HIP. :ref:`See more information on how to configure DaCe to use AMD GPUs `. You may (optionally) want to install `CuPy `_ for easy integration of GPU arrays in Python. -**FPGA**: Xilinx FPGAs require the Vitis suite and Intel FPGAs require the Intel FPGA SDK to be installed. +**FPGA**: Xilinx FPGAs require the Vitis suite and Intel FPGAs require the Intel FPGA SDK to be installed. +DaCe has been tested with Intel FPGA SDK for OpenCL Pro edition v18.1 and v19.1, targeting Arria 10 and Stratix 10 devices, and Xilinx Vitis HLS v2020.x, v2021.x targeting u250 and u280 devices. + **Distributed Computing**: If using multiple nodes, MPI has to be installed and available. @@ -136,6 +138,12 @@ Common issues with the DaCe Python module * **Bug in DaCe**: If you suspect an issue happens within DaCe, see :ref:`debugging` for ways to pinpoint the source of the issue. + * **Intel FPGA libraries not found**: when targeting Intel FPGAs, the compilation process may fail due to missing OpenCL headers (CMake returns + a ``Could NOT find IntelFPGAOpenCL`` error). This is usually the case when Intel OpenCL compiler does not return the right path to OpenCL host headers. + DaCe relies on ``hlslib`` for compiling FPGA programs, which in turns relies on Intel's compiler to derive the right include path. Please verify that + the include path returned by the Intel compiler (using the ``aocl compile-config`` command) points to a directory that actually contains the OpenCL headers (namely ``cl.hpp`` and + ``cl2.hpp`` files). If this is not the case, please locate them under the Intel Quartus installation folder, and symlink (or copy) them in the ``aocl`` returned path. + .. _qa_vscode: Common issues with the Visual Studio Code extension From d68c4ffe78c182f082dd04c70967b6f8d8ba345f Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 2 Aug 2023 16:12:13 +0200 Subject: [PATCH 338/392] Added docstrings. --- dace/sdfg/utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 7eef600180..d08518b10c 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1800,6 +1800,14 @@ def get_thread_local_data(sdfg: SDFG) -> List[str]: def get_global_memlet_path_src(sdfg: SDFG, state: SDFGState, edge: MultiConnectorEdge) -> nd.Node: + """ + Finds the global source node of an edge/memlet path, crossing nested SDFG scopes. + + :param sdfg: The SDFG containing the edge. + :param state: The state containing the edge. 
+ :param edge: The edge to find the global source node for. + :return: The global source node of the edge. + """ src = state.memlet_path(edge)[0].src if isinstance(src, nd.AccessNode) and not sdfg.arrays[src.data].transient and sdfg.parent is not None: psdfg = sdfg.parent_sdfg @@ -1813,6 +1821,14 @@ def get_global_memlet_path_src(sdfg: SDFG, state: SDFGState, edge: MultiConnecto def get_global_memlet_path_dst(sdfg: SDFG, state: SDFGState, edge: MultiConnectorEdge) -> nd.Node: + """ + Finds the global destination node of an edge/memlet path, crossing nested SDFG scopes. + + :param sdfg: The SDFG containing the edge. + :param state: The state containing the edge. + :param edge: The edge to find the global destination node for. + :return: The global destination node of the edge. + """ dst = state.memlet_path(edge)[-1].dst if isinstance(dst, nd.AccessNode) and not sdfg.arrays[dst.data].transient and sdfg.parent is not None: psdfg = sdfg.parent_sdfg From 9ff109293443416b7591165e5b4dd29ca0e8befa Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 2 Aug 2023 16:28:46 +0200 Subject: [PATCH 339/392] Disabled skip for Scalars. --- dace/sdfg/nodes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index d82cd5607d..1c5cdcc0af 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -628,8 +628,8 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context for dname, desc in self.sdfg.arrays.items(): # TODO(later): Disallow scalars without access nodes (so that this # check passes for them too). - if isinstance(desc, data.Scalar): - continue + # if isinstance(desc, data.Scalar): + # continue if not desc.transient and dname not in connectors: raise NameError('Data descriptor "%s" not found in nested SDFG connectors' % dname) if dname in connectors and desc.transient: From 67b839f80849b0231a7abbcf00190c95eb9a3a48 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 2 Aug 2023 18:00:52 +0200 Subject: [PATCH 340/392] Fixed typo. --- tests/transformations/refine_nested_access_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/transformations/refine_nested_access_test.py b/tests/transformations/refine_nested_access_test.py index d6d0921da4..5343240df5 100644 --- a/tests/transformations/refine_nested_access_test.py +++ b/tests/transformations/refine_nested_access_test.py @@ -98,7 +98,7 @@ def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5], select: dace.bool[5, 5] assert np.allclose(B, lower.T + lower - diag) -def test_free_sybmols_only_by_indices(): +def test_free_symbols_only_by_indices(): i = dace.symbol('i') idx_a = dace.symbol('idx_a') idx_b = dace.symbol('idx_b') @@ -132,4 +132,4 @@ def inner_sdfg(A: dace.int32[5], B: dace.int32[5, 5], idx_a: int, idx_b: int): if __name__ == '__main__': test_refine_dataflow() test_refine_interstate() - test_free_sybmols_only_by_indices() + test_free_symbols_only_by_indices() From f350b46b35e70cc406c7bccd40706d286ca71714 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Wed, 2 Aug 2023 18:15:10 +0200 Subject: [PATCH 341/392] Fixed test. 
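Context for the fix below: the inner `dace.program` takes `idx_a` and `idx_b` as scalar arguments,
and with the Scalar special case in `NestedSDFG.validate` no longer skipped ("Disabled skip for
Scalars" above), non-transient scalars of a nested SDFG must now appear among its connectors. The
test is therefore updated to declare `idx_a`/`idx_b` as scalar data containers on the outer SDFG
and route them into the nested SDFG through memlet paths (instead of leaving them as free symbols),
and it now also checks the numerical result.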
--- .../refine_nested_access_test.py | 37 ++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/tests/transformations/refine_nested_access_test.py b/tests/transformations/refine_nested_access_test.py index 5343240df5..d9fb9a7392 100644 --- a/tests/transformations/refine_nested_access_test.py +++ b/tests/transformations/refine_nested_access_test.py @@ -100,11 +100,11 @@ def inner_sdfg(A: dace.int32[5, 5], B: dace.int32[5, 5], select: dace.bool[5, 5] def test_free_symbols_only_by_indices(): i = dace.symbol('i') - idx_a = dace.symbol('idx_a') - idx_b = dace.symbol('idx_b') sdfg = dace.SDFG('refine_free_symbols_only_by_indices') sdfg.add_array('A', [5], dace.int32) sdfg.add_array('B', [5, 5], dace.int32) + sdfg.add_scalar('idx_a', dace.int64) + sdfg.add_scalar('idx_b', dace.int64) @dace.program def inner_sdfg(A: dace.int32[5], B: dace.int32[5, 5], idx_a: int, idx_b: int): @@ -116,10 +116,22 @@ def inner_sdfg(A: dace.int32[5], B: dace.int32[5, 5], idx_a: int, idx_b: int): state = sdfg.add_state() A = state.add_access('A') B = state.add_access('B') + ia = state.add_access('idx_a') + ib = state.add_access('idx_b') map_entry, map_exit = state.add_map('map', dict(i='0:5')) - nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(simplify=False), sdfg, {'A'}, {'B'}, {'i': 'i'}) - state.add_memlet_path(A, map_entry, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) - state.add_memlet_path(nsdfg, map_exit, B, src_conn='B', memlet=dace.Memlet.from_array('B', sdfg.arrays['B'])) + nsdfg = state.add_nested_sdfg(inner_sdfg.to_sdfg(simplify=False), sdfg, {'A', 'idx_a', 'idx_b'}, {'B'}, {'i': 'i'}) + state.add_memlet_path(A, map_entry, nsdfg, dst_conn='A', memlet=dace.Memlet.from_array('A', sdfg.arrays['A'])) + state.add_memlet_path(nsdfg, map_exit, B, src_conn='B', memlet=dace.Memlet.from_array('B', sdfg.arrays['B'])) + state.add_memlet_path(ia, + map_entry, + nsdfg, + dst_conn='idx_a', + memlet=dace.Memlet.from_array('idx_a', sdfg.arrays['idx_a'])) + state.add_memlet_path(ib, + map_entry, + nsdfg, + dst_conn='idx_b', + memlet=dace.Memlet.from_array('idx_b', sdfg.arrays['idx_b'])) num = sdfg.apply_transformations_repeated(RefineNestedAccess) assert num == 1 @@ -128,6 +140,21 @@ def inner_sdfg(A: dace.int32[5], B: dace.int32[5, 5], idx_a: int, idx_b: int): edge = state.in_edges(map_exit)[0] assert edge.data.subset == dace.subsets.Range([(i, i, 1), (0, 4, 1)]) + A = np.array([0, 1, 0, 1, 0], dtype=np.int32) + ref = np.zeros((5, 5), dtype=np.int32) + val = np.zeros((5, 5), dtype=np.int32) + ia = 3 + ib = 2 + + for i in range(5): + if A[i] > 0.5: + ref[i, ia] = 1 + else: + ref[i, ib] = 0 + sdfg(A=A, B=val, idx_a=ia, idx_b=ib) + + assert np.allclose(ref, val) + if __name__ == '__main__': test_refine_dataflow() From 1d3db91f7104e51dd90ce41da3f84a0140ab69e4 Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 3 Aug 2023 08:38:15 +0200 Subject: [PATCH 342/392] Update dependency --- dace/external/hlslib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/external/hlslib b/dace/external/hlslib index 1b5b3aee5d..1403cd016c 160000 --- a/dace/external/hlslib +++ b/dace/external/hlslib @@ -1 +1 @@ -Subproject commit 1b5b3aee5dab19adcc443fa9a7cd45244bd246b1 +Subproject commit 1403cd016ce63a9961eeb3899bea70c873a929ce From 350cff4ef27bfdf9d859e1f4fbee46888dc34c61 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 3 Aug 2023 04:35:50 -0700 Subject: [PATCH 343/392] Remove unused global data descriptor shapes from arguments (#1338) --- 
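The idea of this change: data descriptors, memlets, nested SDFG nodes and interstate edges gain a
`used_symbols(all_symbols)` method, and `free_symbols` becomes `used_symbols(all_symbols=True)`.
The frame code generator queries `used_symbols(all_symbols=False)`, so symbols that appear only in
the shape or total size of non-transient (externally provided) data, or only in a memlet's volume,
no longer force an entry-point argument. A rough sketch of the intended effect (not the contents of
the new `symbol_arguments_test.py`, which is not shown here):

    import dace

    N = dace.symbol('N')

    @dace.program
    def tester(A: dace.float64[N]):
        A[0] = 1  # N appears only in the shape of the global array A

    sdfg = tester.to_sdfg()
    csdfg = sdfg.compile()
    # Expected after this patch: N is not required as a separate scalar
    # argument of the generated entry point, since it is only used as the
    # size of a non-transient data descriptor.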
dace/codegen/targets/framecode.py | 5 +- dace/data.py | 48 +- dace/memlet.py | 35 +- dace/sdfg/nodes.py | 35 +- dace/sdfg/sdfg.py | 60 +- dace/sdfg/state.py | 43 +- tests/codegen/symbol_arguments_test.py | 54 ++ tests/transformations/mapfission_test.py | 1023 +++++++++++----------- 8 files changed, 747 insertions(+), 556 deletions(-) create mode 100644 tests/codegen/symbol_arguments_test.py diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 6f302c11ba..56419b9701 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -82,7 +82,10 @@ def free_symbols(self, obj: Any): k = id(obj) if k in self.fsyms: return self.fsyms[k] - result = obj.free_symbols + if hasattr(obj, 'used_symbols'): + result = obj.used_symbols(all_symbols=False) + else: + result = obj.free_symbols self.fsyms[k] = result return result diff --git a/dace/data.py b/dace/data.py index 2fc5f334c6..d492d06258 100644 --- a/dace/data.py +++ b/dace/data.py @@ -243,14 +243,26 @@ def as_arg(self, with_types=True, for_call=False, name=None): """Returns a string for a C++ function signature (e.g., `int *A`). """ raise NotImplementedError + def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]: + """ + Returns a set of symbols that are used by this data descriptor. + + :param all_symbols: Include not-strictly-free symbols that are used by this data descriptor, + e.g., shape and size of a global array. + :return: A set of symbols that are used by this data descriptor. NOTE: The results are symbolic + rather than a set of strings. + """ + result = set() + if self.transient or all_symbols: + for s in self.shape: + if isinstance(s, sp.Basic): + result |= set(s.free_symbols) + return result + @property def free_symbols(self) -> Set[symbolic.SymbolicType]: """ Returns a set of undefined symbols in this data descriptor. 
""" - result = set() - for s in self.shape: - if isinstance(s, sp.Basic): - result |= set(s.free_symbols) - return result + return self.used_symbols(all_symbols=True) def __repr__(self): return 'Abstract Data Container, DO NOT USE' @@ -689,20 +701,23 @@ def as_arg(self, with_types=True, for_call=False, name=None): def sizes(self): return [d.name if isinstance(d, symbolic.symbol) else str(d) for d in self.shape] - @property - def free_symbols(self): - result = super().free_symbols + def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]: + result = super().used_symbols(all_symbols) for s in self.strides: if isinstance(s, sp.Expr): result |= set(s.free_symbols) - if isinstance(self.total_size, sp.Expr): - result |= set(self.total_size.free_symbols) for o in self.offset: if isinstance(o, sp.Expr): result |= set(o.free_symbols) - + if self.transient or all_symbols: + if isinstance(self.total_size, sp.Expr): + result |= set(self.total_size.free_symbols) return result + @property + def free_symbols(self): + return self.used_symbols(all_symbols=True) + def _set_shape_dependent_properties(self, shape, strides, total_size, offset): """ Used to set properties which depend on the shape of the array @@ -890,10 +905,9 @@ def covers_range(self, rng): return True - @property - def free_symbols(self): - result = super().free_symbols - if isinstance(self.buffer_size, sp.Expr): + def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]: + result = super().used_symbols(all_symbols) + if (self.transient or all_symbols) and isinstance(self.buffer_size, sp.Expr): result |= set(self.buffer_size.free_symbols) for o in self.offset: if isinstance(o, sp.Expr): @@ -901,6 +915,10 @@ def free_symbols(self): return result + @property + def free_symbols(self): + return self.used_symbols(all_symbols=True) + @make_properties class View(Array): diff --git a/dace/memlet.py b/dace/memlet.py index 35b689381d..74a1320a3b 100644 --- a/dace/memlet.py +++ b/dace/memlet.py @@ -17,6 +17,7 @@ if TYPE_CHECKING: import dace.sdfg.graph + @make_properties class Memlet(object): """ Data movement object. Represents the data, the subset moved, and the @@ -176,15 +177,16 @@ def to_json(self): @staticmethod def from_json(json_obj, context=None): ret = Memlet() - dace.serialize.set_properties_from_json(ret, - json_obj, - context=context, - ignore_properties={'src_subset', 'dst_subset', 'num_accesses', 'is_data_src'}) - + dace.serialize.set_properties_from_json( + ret, + json_obj, + context=context, + ignore_properties={'src_subset', 'dst_subset', 'num_accesses', 'is_data_src'}) + # Allow serialized memlet to override src/dst_subset to disambiguate self-copies if 'is_data_src' in json_obj['attributes']: ret._is_data_src = json_obj['attributes']['is_data_src'] - + if context: ret._sdfg = context['sdfg'] ret._state = context['sdfg_state'] @@ -510,18 +512,30 @@ def validate(self, sdfg, state): if self.data is not None and self.data not in sdfg.arrays: raise KeyError('Array "%s" not found in SDFG' % self.data) - @property - def free_symbols(self) -> Set[str]: - """ Returns a set of symbols used in this edge's properties. """ + def used_symbols(self, all_symbols: bool) -> Set[str]: + """ + Returns a set of symbols used in this edge's properties. + + :param all_symbols: If False, only returns the set of symbols that will be used + in the generated code and are needed as arguments. 
+ """ # Symbolic properties are in volume, and the two subsets result = set() - result |= set(map(str, self.volume.free_symbols)) + if all_symbols: + result |= set(map(str, self.volume.free_symbols)) if self.src_subset: result |= self.src_subset.free_symbols + if self.dst_subset: result |= self.dst_subset.free_symbols + return result + @property + def free_symbols(self) -> Set[str]: + """ Returns a set of symbols used in this edge's properties. """ + return self.used_symbols(all_symbols=True) + def get_free_symbols_by_indices(self, indices_src: List[int], indices_dst: List[int]) -> Set[str]: """ Returns set of free symbols used in this edges properties but only taking certain indices of the src and dst @@ -640,6 +654,7 @@ class MemletTree(object): all siblings of the same edge and their children, for instance if multiple inputs from the same access node are used. """ + def __init__(self, edge: 'dace.sdfg.graph.MultiConnectorEdge[Memlet]', downwards: bool = True, diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index bd384b6736..378ee7be3e 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -580,12 +580,22 @@ def from_json(json_obj, context=None): return ret + def used_symbols(self, all_symbols: bool) -> Set[str]: + free_syms = set().union(*(map(str, + pystr_to_symbolic(v).free_symbols) for v in self.symbol_mapping.values()), + *(map(str, + pystr_to_symbolic(v).free_symbols) for v in self.location.values())) + + # Filter out unused internal symbols from symbol mapping + if not all_symbols: + internally_used_symbols = self.sdfg.used_symbols(all_symbols=False) + free_syms &= internally_used_symbols + + return free_syms + @property def free_symbols(self) -> Set[str]: - return set().union(*(map(str, - pystr_to_symbolic(v).free_symbols) for v in self.symbol_mapping.values()), - *(map(str, - pystr_to_symbolic(v).free_symbols) for v in self.location.values())) + return self.used_symbols(all_symbols=True) def infer_connector_types(self, sdfg, state): # Avoid import loop @@ -673,6 +683,7 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context # Scope entry class class EntryNode(Node): """ A type of node that opens a scope (e.g., Map or Consume). """ + def validate(self, sdfg, state): self.map.validate(sdfg, state, self) @@ -683,6 +694,7 @@ def validate(self, sdfg, state): # Scope exit class class ExitNode(Node): """ A type of node that closes a scope (e.g., Map or Consume). 
""" + def validate(self, sdfg, state): self.map.validate(sdfg, state, self) @@ -696,6 +708,7 @@ class MapEntry(EntryNode): :see: Map """ + def __init__(self, map: 'Map', dynamic_inputs=None): super(MapEntry, self).__init__(dynamic_inputs or set()) if map is None: @@ -772,6 +785,7 @@ class MapExit(ExitNode): :see: Map """ + def __init__(self, map: 'Map'): super(MapExit, self).__init__() if map is None: @@ -851,17 +865,20 @@ class Map(object): default=0, desc="Number of OpenMP threads executing the Map", optional=True, - optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) + optional_condition=lambda m: m.schedule in + (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_schedule = EnumProperty(dtype=dtypes.OMPScheduleType, default=dtypes.OMPScheduleType.Default, desc="OpenMP schedule {static, dynamic, guided}", optional=True, - optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) + optional_condition=lambda m: m.schedule in + (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) omp_chunk_size = Property(dtype=int, default=0, desc="OpenMP schedule chunk size", optional=True, - optional_condition=lambda m: m.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) + optional_condition=lambda m: m.schedule in + (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent)) gpu_block_size = ListProperty(element_type=int, default=None, @@ -928,6 +945,7 @@ class ConsumeEntry(EntryNode): :see: Consume """ + def __init__(self, consume: 'Consume', dynamic_inputs=None): super(ConsumeEntry, self).__init__(dynamic_inputs or set()) if consume is None: @@ -1006,6 +1024,7 @@ class ConsumeExit(ExitNode): :see: Consume """ + def __init__(self, consume: 'Consume'): super(ConsumeExit, self).__init__() if consume is None: @@ -1117,6 +1136,7 @@ def get_param_num(self): @dace.serialize.serializable class PipelineEntry(MapEntry): + @staticmethod def map_type(): return PipelineScope @@ -1149,6 +1169,7 @@ def new_symbols(self, sdfg, state, symbols) -> Dict[str, dtypes.typeclass]: @dace.serialize.serializable class PipelineExit(MapExit): + @staticmethod def map_type(): return PipelineScope diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 18763e385a..f3a37ef08c 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -212,8 +212,7 @@ def read_symbols(self) -> Set[str]: return result - @property - def free_symbols(self) -> Set[str]: + def used_symbols(self, all_symbols: bool) -> Set[str]: """ Returns a set of symbols used in this edge's properties. """ # NOTE: The former algorithm for computing an edge's free symbols was: # `self.read_symbols() - set(self.assignments.keys())` @@ -241,6 +240,11 @@ def free_symbols(self) -> Set[str]: # Return the set of candidate free symbols minus the set of candidate defined symbols return (cond_symbols | rhs_symbols) - lhs_symbols + @property + def free_symbols(self) -> Set[str]: + """ Returns a set of symbols used in this edge's properties. """ + return self.used_symbols(all_symbols=True) + def replace_dict(self, repl: Dict[str, str], replace_keys=True) -> None: """ Replaces all given keys with their corresponding values. 
@@ -293,7 +297,7 @@ def new_symbols(self, sdfg, symbols) -> Dict[str, dtypes.typeclass]: alltypes = symbols inferred_lhs_symbols = {k: infer_expr_type(v, alltypes) for k, v in self.assignments.items()} - + # Symbols in assignment keys are candidate newly defined symbols lhs_symbols = set() # Symbols already defined @@ -303,7 +307,7 @@ def new_symbols(self, sdfg, symbols) -> Dict[str, dtypes.typeclass]: # Only add LHS to the set of candidate newly defined symbols if it has not been defined yet if lhs not in rhs_symbols: lhs_symbols.add(lhs) - + return {k: v for k, v in inferred_lhs_symbols.items() if k in lhs_symbols} def get_read_memlets(self, arrays: Dict[str, dt.Data]) -> List[mm.Memlet]: @@ -593,6 +597,7 @@ def hash_sdfg(self, jsondict: Optional[Dict[str, Any]] = None) -> str: :param jsondict: If not None, uses given JSON dictionary as input. :return: The hash (in SHA-256 format). """ + def keyword_remover(json_obj: Any, last_keyword=""): # Makes non-unique in SDFG hierarchy v2 # Recursively remove attributes from the SDFG which are not used in @@ -1277,27 +1282,36 @@ def arrays_recursive(self): if isinstance(node, nd.NestedSDFG): yield from node.sdfg.arrays_recursive() - @property - def free_symbols(self) -> Set[str]: + def used_symbols(self, all_symbols: bool) -> Set[str]: """ Returns a set of symbol names that are used by the SDFG, but not defined within it. This property is used to determine the symbolic - parameters of the SDFG and verify that ``SDFG.symbols`` is complete. + parameters of the SDFG. - :note: Assumes that the graph is valid (i.e., without undefined or - overlapping symbols). + :param all_symbols: If False, only returns the set of symbols that will be used + in the generated code and are needed as arguments. """ defined_syms = set() free_syms = set() - # Start with the set of SDFG free symbols - free_syms |= set(self.symbols.keys()) - - # Exclude data descriptor names and constants + # Exclude data descriptor names, constants, and shapes of global data descriptors + not_strictly_necessary_global_symbols = set() for name, desc in self.arrays.items(): defined_syms.add(name) + + if not all_symbols: + used_desc_symbols = desc.used_symbols(all_symbols) + not_strictly_necessary = (desc.used_symbols(all_symbols=True) - used_desc_symbols) + not_strictly_necessary_global_symbols |= set(map(str, not_strictly_necessary)) + defined_syms |= set(self.constants_prop.keys()) + # Start with the set of SDFG free symbols + if all_symbols: + free_syms |= set(self.symbols.keys()) + else: + free_syms |= set(s for s in self.symbols.keys() if s not in not_strictly_necessary_global_symbols) + # Add free state symbols used_before_assignment = set() @@ -1307,14 +1321,14 @@ def free_symbols(self) -> Set[str]: ordered_states = self.nodes() for state in ordered_states: - free_syms |= state.free_symbols + free_syms |= state.used_symbols(all_symbols) # Add free inter-state symbols for e in self.out_edges(state): # NOTE: First we get the true InterstateEdge free symbols, then we compute the newly defined symbols by # subracting the (true) free symbols from the edge's assignment keys. This way we can correctly # compute the symbols that are used before being assigned. 
- efsyms = e.data.free_symbols + efsyms = e.data.used_symbols(all_symbols) defined_syms |= set(e.data.assignments.keys()) - efsyms used_before_assignment.update(efsyms - defined_syms) free_syms |= efsyms @@ -1325,6 +1339,18 @@ def free_symbols(self) -> Set[str]: # Subtract symbols defined in inter-state edges and constants return free_syms - defined_syms + @property + def free_symbols(self) -> Set[str]: + """ + Returns a set of symbol names that are used by the SDFG, but not + defined within it. This property is used to determine the symbolic + parameters of the SDFG and verify that ``SDFG.symbols`` is complete. + + :note: Assumes that the graph is valid (i.e., without undefined or + overlapping symbols). + """ + return self.used_symbols(all_symbols=True) + def read_and_write_sets(self) -> Tuple[Set[AnyStr], Set[AnyStr]]: """ Determines what data containers are read and written in this SDFG. Does @@ -1373,8 +1399,8 @@ def arglist(self, scalars_only=False, free_symbols=None) -> Dict[str, dt.Data]: if not v.transient and isinstance(v, dt.Scalar) and not k.startswith('__dace') } - # Add global free symbols to scalar arguments - free_symbols = free_symbols if free_symbols is not None else self.free_symbols + # Add global free symbols used in the generated code to scalar arguments + free_symbols = free_symbols if free_symbols is not None else self.used_symbols(all_symbols=False) scalar_args.update({k: dt.Scalar(self.symbols[k]) for k in free_symbols if not k.startswith('__dace')}) # Fill up ordered dictionary diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index c354cd9d1f..a4a6648401 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -409,14 +409,12 @@ def scope_children(self, ################################################################### # Query, subgraph, and replacement methods - @property - def free_symbols(self) -> Set[str]: + def used_symbols(self, all_symbols: bool) -> Set[str]: """ - Returns a set of symbol names that are used, but not defined, in - this graph view (SDFG state or subgraph thereof). + Returns a set of symbol names that are used in the state. - :note: Assumes that the graph is valid (i.e., without undefined or - overlapping symbols). + :param all_symbols: If False, only returns the set of symbols that will be used + in the generated code and are needed as arguments. 
""" state = self.graph if isinstance(self, SubgraphView) else self sdfg = state.parent @@ -429,7 +427,7 @@ def free_symbols(self) -> Set[str]: new_symbols |= set(n.new_symbols(sdfg, self, {}).keys()) elif isinstance(n, nd.AccessNode): # Add data descriptor symbols - freesyms |= set(map(str, n.desc(sdfg).free_symbols)) + freesyms |= set(map(str, n.desc(sdfg).used_symbols(all_symbols))) elif (isinstance(n, nd.Tasklet) and n.language == dtypes.Language.Python): # Consider callbacks defined as symbols as free for stmt in n.code.code: @@ -438,14 +436,41 @@ def free_symbols(self) -> Set[str]: and astnode.func.id in sdfg.symbols): freesyms.add(astnode.func.id) - freesyms |= n.free_symbols + if hasattr(n, 'used_symbols'): + freesyms |= n.used_symbols(all_symbols) + else: + freesyms |= n.free_symbols + # Free symbols from memlets + def _is_leaf_memlet(e): + if isinstance(e.src, nd.ExitNode) and e.src_conn and e.src_conn.startswith('OUT_'): + return False + if isinstance(e.dst, nd.EntryNode) and e.dst_conn and e.dst_conn.startswith('IN_'): + return False + return True + for e in self.edges(): - freesyms |= e.data.free_symbols + # If used for code generation, only consider memlet tree leaves + if not all_symbols and not _is_leaf_memlet(e): + continue + + freesyms |= e.data.used_symbols(all_symbols) # Do not consider SDFG constants as symbols new_symbols.update(set(sdfg.constants.keys())) return freesyms - new_symbols + + @property + def free_symbols(self) -> Set[str]: + """ + Returns a set of symbol names that are used, but not defined, in + this graph view (SDFG state or subgraph thereof). + + :note: Assumes that the graph is valid (i.e., without undefined or + overlapping symbols). + """ + return self.used_symbols(all_symbols=True) + def defined_symbols(self) -> Dict[str, dt.Data]: """ diff --git a/tests/codegen/symbol_arguments_test.py b/tests/codegen/symbol_arguments_test.py new file mode 100644 index 0000000000..3ca89ddd06 --- /dev/null +++ b/tests/codegen/symbol_arguments_test.py @@ -0,0 +1,54 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+ +import dace +import numpy as np + +N = dace.symbol('N') + + +def test_global_sizes(): + + @dace.program + def tester(A: dace.float64[N]): + for i in dace.map[0:10]: + A[i] = 2 + + sdfg = tester.to_sdfg() + # Since N is not used anywhere, it should not be listed in the arguments + assert 'N' not in sdfg.arglist() + + a = np.random.rand(20) + sdfg(a, N=20) + assert np.allclose(a[:10], 2) + + +def test_global_sizes_used(): + + @dace.program + def tester(A: dace.float64[N]): + for i in dace.map[0:10]: + with dace.tasklet: + a >> A[i] + a = N + + sdfg = tester.to_sdfg() + # N is used in a tasklet + assert 'N' in sdfg.arglist() + + +def test_global_sizes_multidim(): + + @dace.program + def tester(A: dace.float64[N, N]): + for i, j in dace.map[0:10, 0:10]: + A[i, j] = 2 + + sdfg = tester.to_sdfg() + # Here N is implicitly used in the index expression, so it should be in the arguments + assert 'N' in sdfg.arglist() + + +if __name__ == '__main__': + test_global_sizes() + test_global_sizes_used() + test_global_sizes_multidim() diff --git a/tests/transformations/mapfission_test.py b/tests/transformations/mapfission_test.py index 72dbebb089..609c075c21 100644 --- a/tests/transformations/mapfission_test.py +++ b/tests/transformations/mapfission_test.py @@ -60,533 +60,562 @@ def config(): return A, expected -class MapFissionTest(unittest.TestCase): - - def test_subgraph(self): - A, expected = config() - B = np.random.rand(2) - - graph = mapfission_sdfg() - self.assertGreater(graph.apply_transformations(MapFission), 0) - graph(A=A, B=B) - - self.assertTrue(np.allclose(B, expected)) - - def test_nested_sdfg(self): - A, expected = config() - B = np.random.rand(2) - - # Nest the subgraph within the outer map, then apply transformation - graph = mapfission_sdfg() - state = graph.nodes()[0] - topmap = next(node for node in state.nodes() if isinstance(node, nodes.MapEntry) and node.label == 'outer') - subgraph = state.scope_subgraph(topmap, include_entry=False, include_exit=False) - nest_state_subgraph(graph, state, subgraph) - self.assertGreater(graph.apply_transformations(MapFission), 0) - graph(A=A, B=B) - self.assertTrue(np.allclose(B, expected)) - - def test_nested_transient(self): - """ Test nested SDFGs with transients. 
""" - - # Inner SDFG - nsdfg = dace.SDFG('nested') - nsdfg.add_array('a', [1], dace.float64) - nsdfg.add_array('b', [1], dace.float64) - nsdfg.add_transient('t', [1], dace.float64) - - # a->t state - nstate = nsdfg.add_state() - irnode = nstate.add_read('a') - task = nstate.add_tasklet('t1', {'inp'}, {'out'}, 'out = 2*inp') - iwnode = nstate.add_write('t') - nstate.add_edge(irnode, None, task, 'inp', dace.Memlet.simple('a', '0')) - nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('t', '0')) - - # t->a state - first_state = nstate - nstate = nsdfg.add_state() - irnode = nstate.add_read('t') - task = nstate.add_tasklet('t2', {'inp'}, {'out'}, 'out = 3*inp') - iwnode = nstate.add_write('b') - nstate.add_edge(irnode, None, task, 'inp', dace.Memlet.simple('t', '0')) - nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('b', '0')) - - nsdfg.add_edge(first_state, nstate, dace.InterstateEdge()) - - # Outer SDFG - sdfg = dace.SDFG('nested_transient_fission') - sdfg.add_array('A', [2], dace.float64) - state = sdfg.add_state() - rnode = state.add_read('A') - wnode = state.add_write('A') - me, mx = state.add_map('outer', dict(i='0:2')) - nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'a'}, {'b'}) - state.add_memlet_path(rnode, me, nsdfg_node, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) - state.add_memlet_path(nsdfg_node, mx, wnode, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) - - self.assertGreater(sdfg.apply_transformations_repeated(MapFission), 0) - - # Test - A = np.random.rand(2) - expected = A * 6 - sdfg(A=A) - self.assertTrue(np.allclose(A, expected)) - - def test_inputs_outputs(self): - """ - Test subgraphs where the computation modules that are in the middle - connect to the outside. - """ - - sdfg = dace.SDFG('inputs_outputs_fission') - sdfg.add_array('in1', [2], dace.float64) - sdfg.add_array('in2', [2], dace.float64) - sdfg.add_scalar('tmp', dace.float64, transient=True) - sdfg.add_array('out1', [2], dace.float64) - sdfg.add_array('out2', [2], dace.float64) - state = sdfg.add_state() - in1 = state.add_read('in1') - in2 = state.add_read('in2') - out1 = state.add_write('out1') - out2 = state.add_write('out2') - me, mx = state.add_map('outer', dict(i='0:2')) - t1 = state.add_tasklet('t1', {'i1'}, {'o1', 'o2'}, 'o1 = i1 * 2; o2 = i1 * 5') - t2 = state.add_tasklet('t2', {'i1', 'i2'}, {'o1'}, 'o1 = i1 * i2') - state.add_memlet_path(in1, me, t1, dst_conn='i1', memlet=dace.Memlet.simple('in1', 'i')) - state.add_memlet_path(in2, me, t2, dst_conn='i2', memlet=dace.Memlet.simple('in2', 'i')) - state.add_edge(t1, 'o1', t2, 'i1', dace.Memlet.simple('tmp', '0')) - state.add_memlet_path(t2, mx, out1, src_conn='o1', memlet=dace.Memlet.simple('out1', 'i')) - state.add_memlet_path(t1, mx, out2, src_conn='o2', memlet=dace.Memlet.simple('out2', 'i')) - - self.assertGreater(sdfg.apply_transformations(MapFission), 0) - - # Test - A, B, C, D = tuple(np.random.rand(2) for _ in range(4)) - expected_C = (A * 2) * B - expected_D = A * 5 - sdfg(in1=A, in2=B, out1=C, out2=D) - self.assertTrue(np.allclose(C, expected_C)) - self.assertTrue(np.allclose(D, expected_D)) - - def test_multidim(self): - sdfg = dace.SDFG('mapfission_multidim') - sdfg.add_array('A', [2, 3], dace.float64) - state = sdfg.add_state() - me, mx = state.add_map('outer', dict(i='0:2', j='0:3')) - - nsdfg = dace.SDFG('nested') - nsdfg.add_array('a', [1], dace.float64) - nstate = nsdfg.add_state() - t = nstate.add_tasklet('reset', {}, {'out'}, 'out = 0') - a = nstate.add_write('a') - nstate.add_edge(t, 'out', a, 
None, dace.Memlet.simple('a', '0')) - nsdfg_node = state.add_nested_sdfg(nsdfg, None, {}, {'a'}) - - state.add_edge(me, None, nsdfg_node, None, dace.Memlet()) - anode = state.add_write('A') - state.add_memlet_path(nsdfg_node, mx, anode, src_conn='a', memlet=dace.Memlet.simple('A', 'i,j')) - - self.assertGreater(sdfg.apply_transformations_repeated(MapFission), 0) - - # Test - A = np.random.rand(2, 3) - sdfg(A=A) - self.assertTrue(np.allclose(A, np.zeros_like(A))) - - def test_offsets(self): - sdfg = dace.SDFG('mapfission_offsets') - sdfg.add_array('A', [20], dace.float64) - sdfg.add_scalar('interim', dace.float64, transient=True) - state = sdfg.add_state() - me, mx = state.add_map('outer', dict(i='10:20')) - - t1 = state.add_tasklet('addone', {'a'}, {'b'}, 'b = a + 1') - t2 = state.add_tasklet('addtwo', {'a'}, {'b'}, 'b = a + 2') - - aread = state.add_read('A') - awrite = state.add_write('A') - state.add_memlet_path(aread, me, t1, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) - state.add_edge(t1, 'b', t2, 'a', dace.Memlet.simple('interim', '0')) - state.add_memlet_path(t2, mx, awrite, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) - - self.assertGreater(sdfg.apply_transformations(MapFission), 0) - - dace.propagate_memlets_sdfg(sdfg) - sdfg.validate() - - # Test - A = np.random.rand(20) - expected = A.copy() - expected[10:] += 3 - sdfg(A=A) - self.assertTrue(np.allclose(A, expected)) - - def test_offsets_array(self): - sdfg = dace.SDFG('mapfission_offsets2') - sdfg.add_array('A', [20], dace.float64) - sdfg.add_array('interim', [1], dace.float64, transient=True) - state = sdfg.add_state() - me, mx = state.add_map('outer', dict(i='10:20')) - - t1 = state.add_tasklet('addone', {'a'}, {'b'}, 'b = a + 1') - interim = state.add_access('interim') - t2 = state.add_tasklet('addtwo', {'a'}, {'b'}, 'b = a + 2') - - aread = state.add_read('A') - awrite = state.add_write('A') - state.add_memlet_path(aread, me, t1, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) - state.add_edge(t1, 'b', interim, None, dace.Memlet.simple('interim', '0')) - state.add_edge(interim, None, t2, 'a', dace.Memlet.simple('interim', '0')) - state.add_memlet_path(t2, mx, awrite, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) - - self.assertGreater(sdfg.apply_transformations(MapFission), 0) - - dace.propagate_memlets_sdfg(sdfg) - sdfg.validate() - - # Test - A = np.random.rand(20) - expected = A.copy() - expected[10:] += 3 - sdfg(A=A) - self.assertTrue(np.allclose(A, expected)) - - def test_mapfission_with_symbols(self): - ''' - Tests MapFission in the case of a Map containing a NestedSDFG that is using some symbol from the top-level SDFG - missing from the NestedSDFG's symbol mapping. Please note that this is an unusual case that is difficult to - reproduce and ultimately unrelated to MapFission. Consider solving the underlying issue and then deleting this - test and the corresponding (obsolete) code in MapFission. 
- ''' - - M, N = dace.symbol('M'), dace.symbol('N') - - sdfg = dace.SDFG('tasklet_code_with_symbols') - sdfg.add_array('A', (M, N), dace.int32) - sdfg.add_array('B', (M, N), dace.int32) - - state = sdfg.add_state('parent', is_start_state=True) - me, mx = state.add_map('parent_map', {'i': '0:N'}) - - nsdfg = dace.SDFG('nested_sdfg') - nsdfg.add_scalar('inner_A', dace.int32) - nsdfg.add_scalar('inner_B', dace.int32) - - nstate = nsdfg.add_state('child', is_start_state=True) - na = nstate.add_access('inner_A') - nb = nstate.add_access('inner_B') - ta = nstate.add_tasklet('tasklet_A', {}, {'__out'}, '__out = M') - tb = nstate.add_tasklet('tasklet_B', {}, {'__out'}, '__out = M') - nstate.add_edge(ta, '__out', na, None, dace.Memlet.from_array('inner_A', nsdfg.arrays['inner_A'])) - nstate.add_edge(tb, '__out', nb, None, dace.Memlet.from_array('inner_B', nsdfg.arrays['inner_B'])) - - a = state.add_access('A') - b = state.add_access('B') - t = nodes.NestedSDFG('child_sdfg', nsdfg, {}, {'inner_A', 'inner_B'}, {}) - nsdfg.parent = state - nsdfg.parent_sdfg = sdfg - nsdfg.parent_nsdfg_node = t - state.add_node(t) - state.add_nedge(me, t, dace.Memlet()) - state.add_memlet_path(t, mx, a, memlet=dace.Memlet('A[0, i]'), src_conn='inner_A') - state.add_memlet_path(t, mx, b, memlet=dace.Memlet('B[0, i]'), src_conn='inner_B') - - num = sdfg.apply_transformations_repeated(MapFission) - self.assertTrue(num == 1) - - A = np.ndarray((2, 10), dtype=np.int32) - B = np.ndarray((2, 10), dtype=np.int32) - sdfg(A=A, B=B, M=2, N=10) - - ref = np.full((10, ), fill_value=2, dtype=np.int32) - - self.assertTrue(np.array_equal(A[0], ref)) - self.assertTrue(np.array_equal(B[0], ref)) - - def test_two_edges_through_map(self): - ''' - Tests MapFission in the case of a Map with a component that has two inputs from a single data container. In such - cases, using `fill_scope_connectors` will lead to broken Map connectors. The tests confirms that new code in the - transformation manually adding the appropriate Map connectors works properly. 
- ''' - - N = dace.symbol('N') - - sdfg = dace.SDFG('two_edges_through_map') - sdfg.add_array('A', (N, ), dace.int32) - sdfg.add_array('B', (N, ), dace.int32) - - state = sdfg.add_state('parent', is_start_state=True) - me, mx = state.add_map('parent_map', {'i': '0:N'}) - - nsdfg = dace.SDFG('nested_sdfg') - nsdfg.add_array('inner_A', (N, ), dace.int32) - nsdfg.add_scalar('inner_B', dace.int32) - - nstate = nsdfg.add_state('child', is_start_state=True) - na = nstate.add_access('inner_A') - nb = nstate.add_access('inner_B') - t = nstate.add_tasklet('tasklet', {'__in1', '__in2'}, {'__out'}, '__out = __in1 + __in2') - nstate.add_edge(na, None, t, '__in1', dace.Memlet('inner_A[i]')) - nstate.add_edge(na, None, t, '__in2', dace.Memlet('inner_A[N-i-1]')) - nstate.add_edge(t, '__out', nb, None, dace.Memlet.from_array('inner_B', nsdfg.arrays['inner_B'])) - - a = state.add_access('A') - b = state.add_access('B') - t = state.add_nested_sdfg(nsdfg, None, {'inner_A'}, {'inner_B'}, {'N': 'N', 'i': 'i'}) - state.add_memlet_path(a, me, t, memlet=dace.Memlet.from_array('A', sdfg.arrays['A']), dst_conn='inner_A') - state.add_memlet_path(t, mx, b, memlet=dace.Memlet('B[i]'), src_conn='inner_B') - - num = sdfg.apply_transformations_repeated(MapFission) - self.assertTrue(num == 1) - - A = np.arange(10, dtype=np.int32) - B = np.ndarray((10, ), dtype=np.int32) - sdfg(A=A, B=B, N=10) - - ref = np.full((10, ), fill_value=9, dtype=np.int32) - - self.assertTrue(np.array_equal(B, ref)) - - def test_if_scope(self): - - @dace.program - def map_with_if(A: dace.int32[10]): - for i in dace.map[0:10]: - if i < 5: - A[i] = 0 - else: - A[i] = 1 - - ref = np.array([0] * 5 + [1] * 5, dtype=np.int32) - - sdfg = map_with_if.to_sdfg() - val0 = np.ndarray((10, ), dtype=np.int32) - sdfg(A=val0) - self.assertTrue(np.array_equal(val0, ref)) - - sdfg.apply_transformations_repeated(MapFission) - - val1 = np.ndarray((10, ), dtype=np.int32) - sdfg(A=val1) - self.assertTrue(np.array_equal(val1, ref)) - - def test_if_scope_2(self): - - @dace.program - def map_with_if_2(A: dace.int32[10]): - for i in dace.map[0:10]: - j = i < 5 - if j: - A[i] = 0 - else: - A[i] = 1 - - ref = np.array([0] * 5 + [1] * 5, dtype=np.int32) - - sdfg = map_with_if_2.to_sdfg() - val0 = np.ndarray((10, ), dtype=np.int32) - sdfg(A=val0) - self.assertTrue(np.array_equal(val0, ref)) - - sdfg.apply_transformations_repeated(MapFission) - - val1 = np.ndarray((10, ), dtype=np.int32) - sdfg(A=val1) - self.assertTrue(np.array_equal(val1, ref)) - - def test_array_copy_outside_scope(self): - - """ - This test checks for two issues occuring when MapFission applies on a NestedSDFG with a state-subgraph - containing copies among AccessNodes. In such cases, these copies may end up outside the scope of the generated - Maps (after MapFssion), potentially leading to the following errors: - 1. The memlet subset corresponding to a NestedSDFG connector (input/output) may have its dimensionality - erroneously increased. - 2. The memlet subset corresponding to a NestedSDFG connector (input/output) may not be propagated even if it uses - the Map's parameters. 
- """ - - sdfg = dace.SDFG('array_copy_outside_scope') - iname, _ = sdfg.add_array('inp', (10,), dtype=dace.int32) - oname, _ = sdfg.add_array('out', (10,), dtype=dace.int32) - - nsdfg = dace.SDFG('nested_sdfg') - niname, nidesc = nsdfg.add_array('ninp', (1,), dtype=dace.int32) - ntname, ntdesc = nsdfg.add_scalar('ntmp', dtype=dace.int32, transient=True) - noname, nodesc = nsdfg.add_array('nout', (1,), dtype=dace.int32) - - nstate = nsdfg.add_state('nmain') - ninode = nstate.add_access(niname) - ntnode = nstate.add_access(ntname) - nonode = nstate.add_access(noname) - tasklet = nstate.add_tasklet('tasklet', {'__inp'}, {'__out'}, '__out = __inp + 1') - nstate.add_edge(ninode, None, tasklet, '__inp', dace.Memlet.from_array(niname, nidesc)) - nstate.add_edge(tasklet, '__out', ntnode, None, dace.Memlet.from_array(ntname, ntdesc)) - nstate.add_nedge(ntnode, nonode, dace.Memlet.from_array(noname, nodesc)) - - state = sdfg.add_state('main') - inode = state.add_access(iname) - onode = state.add_access(oname) - me, mx = state.add_map('map', {'i': '0:10'}) - snode = state.add_nested_sdfg(nsdfg, None, {'ninp'}, {'nout'}) - state.add_memlet_path(inode, me, snode, memlet=dace.Memlet(data=iname, subset='i'), dst_conn='ninp') - state.add_memlet_path(snode, mx, onode, memlet=dace.Memlet(data=oname, subset='i'), src_conn='nout') - - # Issue no. 1 will be caught by validation after MapFission - sdfg.apply_transformations(MapFission) - - # Issue no. 2 will be caught by code-generation due to `i` existing in a memlet outside the Map's scope. - A = np.arange(10, dtype=np.int32) - B = np.empty((10,), dtype=np.int32) - sdfg(inp=A, out=B) - assert np.array_equal(A+1, B) - - def test_single_data_multiple_connectors(self): - - outer_sdfg = dace.SDFG('single_data_multiple_connectors') - outer_sdfg.add_array('A', (2, 10), dtype=dace.int32) - outer_sdfg.add_array('B', (2, 10), dtype=dace.int32) - - inner_sdfg = dace.SDFG('inner') - inner_sdfg.add_array('A0', (10,), dtype=dace.int32) - inner_sdfg.add_array('A1', (10,), dtype=dace.int32) - inner_sdfg.add_array('B0', (10,), dtype=dace.int32) - inner_sdfg.add_array('B1', (10,), dtype=dace.int32) - - inner_state = inner_sdfg.add_state('inner_state', is_start_state=True) - - inner_state.add_mapped_tasklet(name='plus', - map_ranges={'j': '0:10'}, - inputs={'__a0': dace.Memlet(data='A0', subset='j'), - '__a1': dace.Memlet(data='A1', subset='j')}, - outputs={'__b0': dace.Memlet(data='B0', subset='j')}, - code='__b0 = __a0 + __a1', - external_edges=True) - inner_state.add_mapped_tasklet(name='minus', - map_ranges={'j': '0:10'}, - inputs={'__a0': dace.Memlet(data='A0', subset='j'), - '__a1': dace.Memlet(data='A1', subset='j')}, - outputs={'__b1': dace.Memlet(data='B1', subset='j')}, - code='__b1 = __a0 - __a1', - external_edges=True) +def test_subgraph(): + A, expected = config() + B = np.random.rand(2) + + graph = mapfission_sdfg() + assert graph.apply_transformations(MapFission) > 0 + graph(A=A, B=B) + + assert np.allclose(B, expected) + + +def test_nested_sdfg(): + A, expected = config() + B = np.random.rand(2) + + # Nest the subgraph within the outer map, then apply transformation + graph = mapfission_sdfg() + state = graph.nodes()[0] + topmap = next(node for node in state.nodes() if isinstance(node, nodes.MapEntry) and node.label == 'outer') + subgraph = state.scope_subgraph(topmap, include_entry=False, include_exit=False) + nest_state_subgraph(graph, state, subgraph) + assert graph.apply_transformations(MapFission) > 0 + graph(A=A, B=B) + assert np.allclose(B, expected) + 
+ +def test_nested_transient(): + """ Test nested SDFGs with transients. """ + + # Inner SDFG + nsdfg = dace.SDFG('nested') + nsdfg.add_array('a', [1], dace.float64) + nsdfg.add_array('b', [1], dace.float64) + nsdfg.add_transient('t', [1], dace.float64) + + # a->t state + nstate = nsdfg.add_state() + irnode = nstate.add_read('a') + task = nstate.add_tasklet('t1', {'inp'}, {'out'}, 'out = 2*inp') + iwnode = nstate.add_write('t') + nstate.add_edge(irnode, None, task, 'inp', dace.Memlet.simple('a', '0')) + nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('t', '0')) + + # t->a state + first_state = nstate + nstate = nsdfg.add_state() + irnode = nstate.add_read('t') + task = nstate.add_tasklet('t2', {'inp'}, {'out'}, 'out = 3*inp') + iwnode = nstate.add_write('b') + nstate.add_edge(irnode, None, task, 'inp', dace.Memlet.simple('t', '0')) + nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('b', '0')) + + nsdfg.add_edge(first_state, nstate, dace.InterstateEdge()) + + # Outer SDFG + sdfg = dace.SDFG('nested_transient_fission') + sdfg.add_array('A', [2], dace.float64) + state = sdfg.add_state() + rnode = state.add_read('A') + wnode = state.add_write('A') + me, mx = state.add_map('outer', dict(i='0:2')) + nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'a'}, {'b'}) + state.add_memlet_path(rnode, me, nsdfg_node, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) + state.add_memlet_path(nsdfg_node, mx, wnode, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) + + assert sdfg.apply_transformations_repeated(MapFission) > 0 + + # Test + A = np.random.rand(2) + expected = A * 6 + sdfg(A=A) + assert np.allclose(A, expected) + + +def test_inputs_outputs(): + """ + Test subgraphs where the computation modules that are in the middle + connect to the outside. 
+ """ + + sdfg = dace.SDFG('inputs_outputs_fission') + sdfg.add_array('in1', [2], dace.float64) + sdfg.add_array('in2', [2], dace.float64) + sdfg.add_scalar('tmp', dace.float64, transient=True) + sdfg.add_array('out1', [2], dace.float64) + sdfg.add_array('out2', [2], dace.float64) + state = sdfg.add_state() + in1 = state.add_read('in1') + in2 = state.add_read('in2') + out1 = state.add_write('out1') + out2 = state.add_write('out2') + me, mx = state.add_map('outer', dict(i='0:2')) + t1 = state.add_tasklet('t1', {'i1'}, {'o1', 'o2'}, 'o1 = i1 * 2; o2 = i1 * 5') + t2 = state.add_tasklet('t2', {'i1', 'i2'}, {'o1'}, 'o1 = i1 * i2') + state.add_memlet_path(in1, me, t1, dst_conn='i1', memlet=dace.Memlet.simple('in1', 'i')) + state.add_memlet_path(in2, me, t2, dst_conn='i2', memlet=dace.Memlet.simple('in2', 'i')) + state.add_edge(t1, 'o1', t2, 'i1', dace.Memlet.simple('tmp', '0')) + state.add_memlet_path(t2, mx, out1, src_conn='o1', memlet=dace.Memlet.simple('out1', 'i')) + state.add_memlet_path(t1, mx, out2, src_conn='o2', memlet=dace.Memlet.simple('out2', 'i')) + + assert sdfg.apply_transformations(MapFission) > 0 + + # Test + A, B, C, D = tuple(np.random.rand(2) for _ in range(4)) + expected_C = (A * 2) * B + expected_D = A * 5 + sdfg(in1=A, in2=B, out1=C, out2=D) + assert np.allclose(C, expected_C) + assert np.allclose(D, expected_D) + + +def test_multidim(): + sdfg = dace.SDFG('mapfission_multidim') + sdfg.add_array('A', [2, 3], dace.float64) + state = sdfg.add_state() + me, mx = state.add_map('outer', dict(i='0:2', j='0:3')) + + nsdfg = dace.SDFG('nested') + nsdfg.add_array('a', [1], dace.float64) + nstate = nsdfg.add_state() + t = nstate.add_tasklet('reset', {}, {'out'}, 'out = 0') + a = nstate.add_write('a') + nstate.add_edge(t, 'out', a, None, dace.Memlet.simple('a', '0')) + nsdfg_node = state.add_nested_sdfg(nsdfg, None, {}, {'a'}) + + state.add_edge(me, None, nsdfg_node, None, dace.Memlet()) + anode = state.add_write('A') + state.add_memlet_path(nsdfg_node, mx, anode, src_conn='a', memlet=dace.Memlet.simple('A', 'i,j')) + + assert sdfg.apply_transformations_repeated(MapFission) > 0 + + # Test + A = np.random.rand(2, 3) + sdfg(A=A) + assert np.allclose(A, np.zeros_like(A)) + + +def test_offsets(): + sdfg = dace.SDFG('mapfission_offsets') + sdfg.add_array('A', [20], dace.float64) + sdfg.add_scalar('interim', dace.float64, transient=True) + state = sdfg.add_state() + me, mx = state.add_map('outer', dict(i='10:20')) + + t1 = state.add_tasklet('addone', {'a'}, {'b'}, 'b = a + 1') + t2 = state.add_tasklet('addtwo', {'a'}, {'b'}, 'b = a + 2') + + aread = state.add_read('A') + awrite = state.add_write('A') + state.add_memlet_path(aread, me, t1, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) + state.add_edge(t1, 'b', t2, 'a', dace.Memlet.simple('interim', '0')) + state.add_memlet_path(t2, mx, awrite, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) + + assert sdfg.apply_transformations(MapFission) > 0 + + dace.propagate_memlets_sdfg(sdfg) + sdfg.validate() + + # Test + A = np.random.rand(20) + expected = A.copy() + expected[10:] += 3 + sdfg(A=A) + assert np.allclose(A, expected) + + +def test_offsets_array(): + sdfg = dace.SDFG('mapfission_offsets2') + sdfg.add_array('A', [20], dace.float64) + sdfg.add_array('interim', [1], dace.float64, transient=True) + state = sdfg.add_state() + me, mx = state.add_map('outer', dict(i='10:20')) + + t1 = state.add_tasklet('addone', {'a'}, {'b'}, 'b = a + 1') + interim = state.add_access('interim') + t2 = state.add_tasklet('addtwo', {'a'}, {'b'}, 'b = a + 
2') + + aread = state.add_read('A') + awrite = state.add_write('A') + state.add_memlet_path(aread, me, t1, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) + state.add_edge(t1, 'b', interim, None, dace.Memlet.simple('interim', '0')) + state.add_edge(interim, None, t2, 'a', dace.Memlet.simple('interim', '0')) + state.add_memlet_path(t2, mx, awrite, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) + + assert sdfg.apply_transformations(MapFission) > 0 + + dace.propagate_memlets_sdfg(sdfg) + sdfg.validate() + + # Test + A = np.random.rand(20) + expected = A.copy() + expected[10:] += 3 + sdfg(A=A) + assert np.allclose(A, expected) + + +def test_mapfission_with_symbols(): + """ + Tests MapFission in the case of a Map containing a NestedSDFG that is using some symbol from the top-level SDFG + missing from the NestedSDFG's symbol mapping. Please note that this is an unusual case that is difficult to + reproduce and ultimately unrelated to MapFission. Consider solving the underlying issue and then deleting this + test and the corresponding (obsolete) code in MapFission. + """ + + M, N = dace.symbol('M'), dace.symbol('N') - outer_state = outer_sdfg.add_state('outer_state', is_start_state=True) + sdfg = dace.SDFG('tasklet_code_with_symbols') + sdfg.add_array('A', (M, N), dace.int32) + sdfg.add_array('B', (M, N), dace.int32) - a = outer_state.add_access('A') - b = outer_state.add_access('B') + state = sdfg.add_state('parent', is_start_state=True) + me, mx = state.add_map('parent_map', {'i': '0:N'}) - me, mx = outer_state.add_map('map', {'i': '0:2'}) - inner_sdfg_node = outer_state.add_nested_sdfg(inner_sdfg, None, {'A0', 'A1'}, {'B0', 'B1'}) + nsdfg = dace.SDFG('nested_sdfg') + nsdfg.add_scalar('inner_A', dace.int32) + nsdfg.add_scalar('inner_B', dace.int32) - outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='0, 0:10'), dst_conn='A0') - outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='1, 0:10'), dst_conn='A1') - outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='0, 0:10'), src_conn='B0') - outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='1, 0:10'), src_conn='B1') + nstate = nsdfg.add_state('child', is_start_state=True) + na = nstate.add_access('inner_A') + nb = nstate.add_access('inner_B') + ta = nstate.add_tasklet('tasklet_A', {}, {'__out'}, '__out = M') + tb = nstate.add_tasklet('tasklet_B', {}, {'__out'}, '__out = M') + nstate.add_edge(ta, '__out', na, None, dace.Memlet.from_array('inner_A', nsdfg.arrays['inner_A'])) + nstate.add_edge(tb, '__out', nb, None, dace.Memlet.from_array('inner_B', nsdfg.arrays['inner_B'])) - sdutils.consolidate_edges(outer_sdfg) - - A = np.arange(20, dtype=np.int32).reshape((2, 10)).copy() - ref = np.empty_like(A) - ref_sdfg = copy.deepcopy(outer_sdfg) - ref_sdfg.name = f"{ref_sdfg.name}_ref" - ref_sdfg(A=A, B=ref) + a = state.add_access('A') + b = state.add_access('B') + t = state.add_nested_sdfg(nsdfg, None, {}, {'inner_A', 'inner_B'}) + state.add_nedge(me, t, dace.Memlet()) + state.add_memlet_path(t, mx, a, memlet=dace.Memlet('A[0, i]'), src_conn='inner_A') + state.add_memlet_path(t, mx, b, memlet=dace.Memlet('B[0, i]'), src_conn='inner_B') - MapFission.apply_to(outer_sdfg, expr_index=1, map_entry=me, nested_sdfg=inner_sdfg_node) - val = np.empty_like(A) - outer_sdfg(A=A, B=val) + num = sdfg.apply_transformations_repeated(MapFission) + assert num == 1 - assert np.array_equal(val, ref) + A = np.ndarray((2, 10), 
dtype=np.int32) + B = np.ndarray((2, 10), dtype=np.int32) + sdfg(A=A, B=B, M=2, N=10) - def test_dependent_symbol(self): + ref = np.full((10, ), fill_value=2, dtype=np.int32) - outer_sdfg = dace.SDFG('map_fission_with_dependent_symbol') + assert np.array_equal(A[0], ref) + assert np.array_equal(B[0], ref) - outer_sdfg.add_symbol('fidx', dace.int32) - outer_sdfg.add_symbol('lidx', dace.int32) - outer_sdfg.add_array('A', (2, 10), dtype=dace.int32) - outer_sdfg.add_array('B', (2, 10), dtype=dace.int32) +def test_two_edges_through_map(): + """ + Tests MapFission in the case of a Map with a component that has two inputs from a single data container. In such + cases, using `fill_scope_connectors` will lead to broken Map connectors. The tests confirms that new code in the + transformation manually adding the appropriate Map connectors works properly. + """ - inner_sdfg = dace.SDFG('inner') + N = dace.symbol('N') - inner_sdfg.add_symbol('first', dace.int32) - inner_sdfg.add_symbol('last', dace.int32) + sdfg = dace.SDFG('two_edges_through_map') + sdfg.add_array('A', (N, ), dace.int32) + sdfg.add_array('B', (N, ), dace.int32) - inner_sdfg.add_array('A0', (10,), dtype=dace.int32) - inner_sdfg.add_array('A1', (10,), dtype=dace.int32) - inner_sdfg.add_array('B0', (10,), dtype=dace.int32) - inner_sdfg.add_array('B1', (10,), dtype=dace.int32) + state = sdfg.add_state('parent', is_start_state=True) + me, mx = state.add_map('parent_map', {'i': '0:N'}) - inner_state = inner_sdfg.add_state('inner_state', is_start_state=True) + nsdfg = dace.SDFG('nested_sdfg') + nsdfg.add_array('inner_A', (N, ), dace.int32) + nsdfg.add_scalar('inner_B', dace.int32) - inner_state.add_mapped_tasklet(name='plus', - map_ranges={'j': 'first:last'}, - inputs={'__a0': dace.Memlet(data='A0', subset='j'), - '__a1': dace.Memlet(data='A1', subset='j')}, - outputs={'__b0': dace.Memlet(data='B0', subset='j')}, - code='__b0 = __a0 + __a1', - external_edges=True) + nstate = nsdfg.add_state('child', is_start_state=True) + na = nstate.add_access('inner_A') + nb = nstate.add_access('inner_B') + t = nstate.add_tasklet('tasklet', {'__in1', '__in2'}, {'__out'}, '__out = __in1 + __in2') + nstate.add_edge(na, None, t, '__in1', dace.Memlet('inner_A[i]')) + nstate.add_edge(na, None, t, '__in2', dace.Memlet('inner_A[N-i-1]')) + nstate.add_edge(t, '__out', nb, None, dace.Memlet.from_array('inner_B', nsdfg.arrays['inner_B'])) + + a = state.add_access('A') + b = state.add_access('B') + t = state.add_nested_sdfg(nsdfg, None, {'inner_A'}, {'inner_B'}, {'N': 'N', 'i': 'i'}) + state.add_memlet_path(a, me, t, memlet=dace.Memlet.from_array('A', sdfg.arrays['A']), dst_conn='inner_A') + state.add_memlet_path(t, mx, b, memlet=dace.Memlet('B[i]'), src_conn='inner_B') + + num = sdfg.apply_transformations_repeated(MapFission) + assert num == 1 + + A = np.arange(10, dtype=np.int32) + B = np.ndarray((10, ), dtype=np.int32) + sdfg(A=A, B=B, N=10) + + ref = np.full((10, ), fill_value=9, dtype=np.int32) + + assert np.array_equal(B, ref) + + +def test_if_scope(): - inner_sdfg2 = dace.SDFG('inner2') + @dace.program + def map_with_if(A: dace.int32[10]): + for i in dace.map[0:10]: + if i < 5: + A[i] = 0 + else: + A[i] = 1 - inner_sdfg2.add_symbol('first', dace.int32) - inner_sdfg2.add_symbol('last', dace.int32) + ref = np.array([0] * 5 + [1] * 5, dtype=np.int32) - inner_sdfg2.add_array('A0', (10,), dtype=dace.int32) - inner_sdfg2.add_array('A1', (10,), dtype=dace.int32) - inner_sdfg2.add_array('B1', (10,), dtype=dace.int32) + sdfg = map_with_if.to_sdfg() + val0 = 
np.ndarray((10, ), dtype=np.int32) + sdfg(A=val0) + assert np.array_equal(val0, ref) + + sdfg.apply_transformations_repeated(MapFission) + + val1 = np.ndarray((10, ), dtype=np.int32) + sdfg(A=val1) + assert np.array_equal(val1, ref) + + +def test_if_scope_2(): + + @dace.program + def map_with_if_2(A: dace.int32[10]): + for i in dace.map[0:10]: + j = i < 5 + if j: + A[i] = 0 + else: + A[i] = 1 + + ref = np.array([0] * 5 + [1] * 5, dtype=np.int32) + + sdfg = map_with_if_2.to_sdfg() + val0 = np.ndarray((10, ), dtype=np.int32) + sdfg(A=val0) + assert np.array_equal(val0, ref) + + sdfg.apply_transformations_repeated(MapFission) + + val1 = np.ndarray((10, ), dtype=np.int32) + sdfg(A=val1) + assert np.array_equal(val1, ref) + + +def test_array_copy_outside_scope(): + """ + This test checks for two issues occuring when MapFission applies on a NestedSDFG with a state-subgraph + containing copies among AccessNodes. In such cases, these copies may end up outside the scope of the generated + Maps (after MapFssion), potentially leading to the following errors: + 1. The memlet subset corresponding to a NestedSDFG connector (input/output) may have its dimensionality + erroneously increased. + 2. The memlet subset corresponding to a NestedSDFG connector (input/output) may not be propagated even if it uses + the Map's parameters. + """ + + sdfg = dace.SDFG('array_copy_outside_scope') + iname, _ = sdfg.add_array('inp', (10, ), dtype=dace.int32) + oname, _ = sdfg.add_array('out', (10, ), dtype=dace.int32) + + nsdfg = dace.SDFG('nested_sdfg') + niname, nidesc = nsdfg.add_array('ninp', (1, ), dtype=dace.int32) + ntname, ntdesc = nsdfg.add_scalar('ntmp', dtype=dace.int32, transient=True) + noname, nodesc = nsdfg.add_array('nout', (1, ), dtype=dace.int32) - inner_state2 = inner_sdfg2.add_state('inner_state2', is_start_state=True) + nstate = nsdfg.add_state('nmain') + ninode = nstate.add_access(niname) + ntnode = nstate.add_access(ntname) + nonode = nstate.add_access(noname) + tasklet = nstate.add_tasklet('tasklet', {'__inp'}, {'__out'}, '__out = __inp + 1') + nstate.add_edge(ninode, None, tasklet, '__inp', dace.Memlet.from_array(niname, nidesc)) + nstate.add_edge(tasklet, '__out', ntnode, None, dace.Memlet.from_array(ntname, ntdesc)) + nstate.add_nedge(ntnode, nonode, dace.Memlet.from_array(noname, nodesc)) - inner_state2.add_mapped_tasklet(name='minus', - map_ranges={'j': 'first:last'}, - inputs={'__a0': dace.Memlet(data='A0', subset='j'), - '__a1': dace.Memlet(data='A1', subset='j')}, - outputs={'__b1': dace.Memlet(data='B1', subset='j')}, - code='__b1 = __a0 - __a1', - external_edges=True) - - nsdfg = inner_state.add_nested_sdfg(inner_sdfg2, None, {'A0', 'A1'}, {'B1'}) - a0 = inner_state.add_access('A0') - a1 = inner_state.add_access('A1') - b1 = inner_state.add_access('B1') + state = sdfg.add_state('main') + inode = state.add_access(iname) + onode = state.add_access(oname) + me, mx = state.add_map('map', {'i': '0:10'}) + snode = state.add_nested_sdfg(nsdfg, None, {'ninp'}, {'nout'}) + state.add_memlet_path(inode, me, snode, memlet=dace.Memlet(data=iname, subset='i'), dst_conn='ninp') + state.add_memlet_path(snode, mx, onode, memlet=dace.Memlet(data=oname, subset='i'), src_conn='nout') + + # Issue no. 1 will be caught by validation after MapFission + sdfg.apply_transformations(MapFission) + + # Issue no. 2 will be caught by code-generation due to `i` existing in a memlet outside the Map's scope. 
+ A = np.arange(10, dtype=np.int32) + B = np.empty((10, ), dtype=np.int32) + sdfg(inp=A, out=B) + assert np.array_equal(A + 1, B) + + +def test_single_data_multiple_connectors(): + + outer_sdfg = dace.SDFG('single_data_multiple_connectors') + outer_sdfg.add_array('A', (2, 10), dtype=dace.int32) + outer_sdfg.add_array('B', (2, 10), dtype=dace.int32) + + inner_sdfg = dace.SDFG('inner') + inner_sdfg.add_array('A0', (10, ), dtype=dace.int32) + inner_sdfg.add_array('A1', (10, ), dtype=dace.int32) + inner_sdfg.add_array('B0', (10, ), dtype=dace.int32) + inner_sdfg.add_array('B1', (10, ), dtype=dace.int32) + + inner_state = inner_sdfg.add_state('inner_state', is_start_state=True) + + inner_state.add_mapped_tasklet(name='plus', + map_ranges={'j': '0:10'}, + inputs={ + '__a0': dace.Memlet(data='A0', subset='j'), + '__a1': dace.Memlet(data='A1', subset='j') + }, + outputs={'__b0': dace.Memlet(data='B0', subset='j')}, + code='__b0 = __a0 + __a1', + external_edges=True) + inner_state.add_mapped_tasklet(name='minus', + map_ranges={'j': '0:10'}, + inputs={ + '__a0': dace.Memlet(data='A0', subset='j'), + '__a1': dace.Memlet(data='A1', subset='j') + }, + outputs={'__b1': dace.Memlet(data='B1', subset='j')}, + code='__b1 = __a0 - __a1', + external_edges=True) + + outer_state = outer_sdfg.add_state('outer_state', is_start_state=True) + + a = outer_state.add_access('A') + b = outer_state.add_access('B') + + me, mx = outer_state.add_map('map', {'i': '0:2'}) + inner_sdfg_node = outer_state.add_nested_sdfg(inner_sdfg, None, {'A0', 'A1'}, {'B0', 'B1'}) + + outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='0, 0:10'), dst_conn='A0') + outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='1, 0:10'), dst_conn='A1') + outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='0, 0:10'), src_conn='B0') + outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='1, 0:10'), src_conn='B1') + + sdutils.consolidate_edges(outer_sdfg) + + A = np.arange(20, dtype=np.int32).reshape((2, 10)).copy() + ref = np.empty_like(A) + ref_sdfg = copy.deepcopy(outer_sdfg) + ref_sdfg.name = f"{ref_sdfg.name}_ref" + ref_sdfg(A=A, B=ref) + + MapFission.apply_to(outer_sdfg, expr_index=1, map_entry=me, nested_sdfg=inner_sdfg_node) + val = np.empty_like(A) + outer_sdfg(A=A, B=val) + + assert np.array_equal(val, ref) + + +def test_dependent_symbol(): + + outer_sdfg = dace.SDFG('map_fission_with_dependent_symbol') + + outer_sdfg.add_symbol('fidx', dace.int32) + outer_sdfg.add_symbol('lidx', dace.int32) + + outer_sdfg.add_array('A', (2, 10), dtype=dace.int32) + outer_sdfg.add_array('B', (2, 10), dtype=dace.int32) + + inner_sdfg = dace.SDFG('inner') + + inner_sdfg.add_symbol('first', dace.int32) + inner_sdfg.add_symbol('last', dace.int32) + + inner_sdfg.add_array('A0', (10, ), dtype=dace.int32) + inner_sdfg.add_array('A1', (10, ), dtype=dace.int32) + inner_sdfg.add_array('B0', (10, ), dtype=dace.int32) + inner_sdfg.add_array('B1', (10, ), dtype=dace.int32) + + inner_state = inner_sdfg.add_state('inner_state', is_start_state=True) + + inner_state.add_mapped_tasklet(name='plus', + map_ranges={'j': 'first:last'}, + inputs={ + '__a0': dace.Memlet(data='A0', subset='j'), + '__a1': dace.Memlet(data='A1', subset='j') + }, + outputs={'__b0': dace.Memlet(data='B0', subset='j')}, + code='__b0 = __a0 + __a1', + external_edges=True) + + inner_sdfg2 = dace.SDFG('inner2') + + inner_sdfg2.add_symbol('first', dace.int32) + 
inner_sdfg2.add_symbol('last', dace.int32) + + inner_sdfg2.add_array('A0', (10, ), dtype=dace.int32) + inner_sdfg2.add_array('A1', (10, ), dtype=dace.int32) + inner_sdfg2.add_array('B1', (10, ), dtype=dace.int32) + + inner_state2 = inner_sdfg2.add_state('inner_state2', is_start_state=True) + + inner_state2.add_mapped_tasklet(name='minus', + map_ranges={'j': 'first:last'}, + inputs={ + '__a0': dace.Memlet(data='A0', subset='j'), + '__a1': dace.Memlet(data='A1', subset='j') + }, + outputs={'__b1': dace.Memlet(data='B1', subset='j')}, + code='__b1 = __a0 - __a1', + external_edges=True) - inner_state.add_edge(a0, None, nsdfg, 'A0', dace.Memlet(data='A0', subset='0:10')) - inner_state.add_edge(a1, None, nsdfg, 'A1', dace.Memlet(data='A1', subset='0:10')) - inner_state.add_edge(nsdfg, 'B1', b1, None, dace.Memlet(data='B1', subset='0:10')) + nsdfg = inner_state.add_nested_sdfg(inner_sdfg2, None, {'A0', 'A1'}, {'B1'}) + a0 = inner_state.add_access('A0') + a1 = inner_state.add_access('A1') + b1 = inner_state.add_access('B1') - outer_state = outer_sdfg.add_state('outer_state', is_start_state=True) + inner_state.add_edge(a0, None, nsdfg, 'A0', dace.Memlet(data='A0', subset='0:10')) + inner_state.add_edge(a1, None, nsdfg, 'A1', dace.Memlet(data='A1', subset='0:10')) + inner_state.add_edge(nsdfg, 'B1', b1, None, dace.Memlet(data='B1', subset='0:10')) - a = outer_state.add_access('A') - b = outer_state.add_access('B') + outer_state = outer_sdfg.add_state('outer_state', is_start_state=True) - me, mx = outer_state.add_map('map', {'i': '0:2'}) - inner_sdfg_node = outer_state.add_nested_sdfg(inner_sdfg, None, {'A0', 'A1'}, {'B0', 'B1'}, - symbol_mapping={'first': 'max(0, i - fidx)', - 'last': 'min(10, i + lidx)'}) + a = outer_state.add_access('A') + b = outer_state.add_access('B') - outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='0, 0:10'), dst_conn='A0') - outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='1, 0:10'), dst_conn='A1') - outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='0, 0:10'), src_conn='B0') - outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='1, 0:10'), src_conn='B1') + me, mx = outer_state.add_map('map', {'i': '0:2'}) + inner_sdfg_node = outer_state.add_nested_sdfg(inner_sdfg, + None, {'A0', 'A1'}, {'B0', 'B1'}, + symbol_mapping={ + 'first': 'max(0, i - fidx)', + 'last': 'min(10, i + lidx)' + }) - sdutils.consolidate_edges(outer_sdfg) - A = np.arange(20, dtype=np.int32).reshape((2, 10)).copy() - ref = np.zeros_like(A) - ref_sdfg = copy.deepcopy(outer_sdfg) - ref_sdfg.name = f"{ref_sdfg.name}_ref" - ref_sdfg(A=A, B=ref, fidx=1, lidx=5) + outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='0, 0:10'), dst_conn='A0') + outer_state.add_memlet_path(a, me, inner_sdfg_node, memlet=dace.Memlet(data='A', subset='1, 0:10'), dst_conn='A1') + outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='0, 0:10'), src_conn='B0') + outer_state.add_memlet_path(inner_sdfg_node, mx, b, memlet=dace.Memlet(data='B', subset='1, 0:10'), src_conn='B1') - MapFission.apply_to(outer_sdfg, expr_index=1, map_entry=me, nested_sdfg=inner_sdfg_node) - outer_sdfg.apply_transformations_repeated(InlineSDFG) - val = np.zeros_like(A) - outer_sdfg(A=A, B=val, fidx=1, lidx=5) + sdutils.consolidate_edges(outer_sdfg) + A = np.arange(20, dtype=np.int32).reshape((2, 10)).copy() + ref = np.zeros_like(A) + 
ref_sdfg = copy.deepcopy(outer_sdfg) + ref_sdfg.name = f"{ref_sdfg.name}_ref" + ref_sdfg(A=A, B=ref, fidx=1, lidx=5) - assert np.array_equal(val, ref) + MapFission.apply_to(outer_sdfg, expr_index=1, map_entry=me, nested_sdfg=inner_sdfg_node) + outer_sdfg.apply_transformations_repeated(InlineSDFG) + val = np.zeros_like(A) + outer_sdfg(A=A, B=val, fidx=1, lidx=5) + assert np.array_equal(val, ref) if __name__ == '__main__': - unittest.main() + test_subgraph() + test_nested_sdfg() + test_nested_transient() + test_inputs_outputs() + test_multidim() + test_offsets() + test_offsets_array() + test_mapfission_with_symbols() + test_two_edges_through_map() + test_if_scope() + test_if_scope_2() + test_array_copy_outside_scope() + test_single_data_multiple_connectors() + test_dependent_symbol() From 7ad61767fa8263af7b164e4bbc5a0d77772d5814 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 3 Aug 2023 13:43:24 +0200 Subject: [PATCH 344/392] Added test. --- dace/sdfg/nodes.py | 4 -- tests/sdfg/validation/nested_sdfg_test.py | 47 +++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 91284013f9..28431deeea 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -636,10 +636,6 @@ def validate(self, sdfg, state, references: Optional[Set[int]] = None, **context f'Connector "{conn}" was given but is not a registered data descriptor in the nested SDFG. ' 'Example: parameter passed to a function without a matching array within it.') for dname, desc in self.sdfg.arrays.items(): - # TODO(later): Disallow scalars without access nodes (so that this - # check passes for them too). - # if isinstance(desc, data.Scalar): - # continue if not desc.transient and dname not in connectors: raise NameError('Data descriptor "%s" not found in nested SDFG connectors' % dname) if dname in connectors and desc.transient: diff --git a/tests/sdfg/validation/nested_sdfg_test.py b/tests/sdfg/validation/nested_sdfg_test.py index 398a1635ef..100568507e 100644 --- a/tests/sdfg/validation/nested_sdfg_test.py +++ b/tests/sdfg/validation/nested_sdfg_test.py @@ -64,6 +64,53 @@ def test_inout_connector_validation_fail(): assert False, "SDFG should not validate" +def test_nested_sdfg_with_transient_connector(): + + sdfg = dace.SDFG('nested_main') + sdfg.add_array('A', [2], dace.float32) + + def mystate(state, src, dst): + src_node = state.add_read(src) + dst_node = state.add_write(dst) + tasklet = state.add_tasklet('aaa2', {'a'}, {'b'}, 'b = a + 1') + + # input path (src->tasklet[a]) + state.add_memlet_path(src_node, tasklet, dst_conn='a', memlet=dace.Memlet(data=src, subset='0')) + # output path (tasklet[b]->dst) + state.add_memlet_path(tasklet, dst_node, src_conn='b', memlet=dace.Memlet(data=dst, subset='0')) + + + sub_sdfg = dace.SDFG('nested_sub') + sub_sdfg.add_scalar('sA', dace.float32) + sub_sdfg.add_scalar('sB', dace.float32, transient=True) + sub_sdfg.add_scalar('sC', dace.float32, transient=True) + + state0 = sub_sdfg.add_state('subs0') + mystate(state0, 'sA', 'sB') + state1 = sub_sdfg.add_state('subs1') + mystate(state1, 'sB', 'sC') + + sub_sdfg.add_edge(state0, state1, dace.InterstateEdge()) + + + state = sdfg.add_state('s0') + me, mx = state.add_map('mymap', dict(k='0:2')) + nsdfg = state.add_nested_sdfg(sub_sdfg, sdfg, {'sA'}, {'sC'}) + Ain = state.add_read('A') + Aout = state.add_write('A') + + state.add_memlet_path(Ain, me, nsdfg, memlet=dace.Memlet(data='A', subset='k'), dst_conn='sA') + state.add_memlet_path(nsdfg, mx, Aout, 
memlet=dace.Memlet(data='A', subset='k'), src_conn='sC') + + try: + sdfg.validate() + except dace.sdfg.InvalidSDFGError: + return + + assert False, "SDFG should not validate" + + if __name__ == "__main__": test_inout_connector_validation_success() test_inout_connector_validation_fail() + test_nested_sdfg_with_transient_connector() From b47d82b72decce012b088602acc9b8290da04f8e Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 3 Aug 2023 13:55:34 +0200 Subject: [PATCH 345/392] Add fix plus testcase --- dace/frontend/fortran/fortran_parser.py | 1 + tests/fortran/array_test.py | 50 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index 6d1be7138a..d7112892fe 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -463,6 +463,7 @@ def subroutine2sdfg(self, node: ast_internal_classes.Subroutine_Subprogram_Node, if i.type == "ALL": shape.append(array.shape[indices]) mysize = mysize * array.shape[indices] + index_list.append(None) else: raise NotImplementedError("Index in ParDecl should be ALL") else: diff --git a/tests/fortran/array_test.py b/tests/fortran/array_test.py index 8685628012..a8ece680a6 100644 --- a/tests/fortran/array_test.py +++ b/tests/fortran/array_test.py @@ -11,6 +11,7 @@ from dace.frontend.fortran import fortran_parser from fparser.two.symbol_table import SymbolTable from dace.sdfg import utils as sdutil +from dace.sdfg.nodes import AccessNode import dace.frontend.fortran.ast_components as ast_components import dace.frontend.fortran.ast_transforms as ast_transforms @@ -167,6 +168,54 @@ def test_fortran_frontend_input_output_connector(): assert (a[1, 2] == 0) +def test_fortran_frontend_memlet_in_map_test(): + """ + Tests that no assumption is made where the iteration variable is inside a memlet subset + """ + test_string = """ + PROGRAM memlet_range_test + implicit None + REAL INP(100, 10) + REAL OUT(100, 10) + CALL memlet_range_test_routine(INP, OUT) + END PROGRAM + + SUBROUTINE memlet_range_test_routine(INP, OUT) + REAL INP(100, 10) + REAL OUT(100, 10) + DO I=1,100 + CALL inner_loops(INP(I, :), OUT(I, :)) + ENDDO + END SUBROUTINE memlet_range_test_routine + + SUBROUTINE inner_loops(INP, OUT) + REAL INP(10) + REAL OUT(10) + DO J=1,10 + OUT(J) = INP(J) + 1 + ENDDO + END SUBROUTINE inner_loops + + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "memlet_range_test") + sdfg.simplify() + # Expect that start is begin of for loop -> only one out edge to guard defining iterator variable + assert len(sdfg.out_edges(sdfg.start_state)) == 1 + iter_var = symbolic.symbol(list(sdfg.out_edges(sdfg.start_state)[0].data.assignments.keys())[0]) + + for state in sdfg.states(): + if len(state.nodes()) > 1: + for node in state.nodes(): + if isinstance(node, AccessNode) and node.data in ['INP', 'OUT']: + edges = [*state.in_edges(node), *state.out_edges(node)] + # There should be only one edge in/to the access node + assert len(edges) == 1 + memlet = edges[0].data + # Check that the correct memlet has the iteration variable + assert memlet.subset[0] == (iter_var, iter_var, 1) + assert memlet.subset[1] == (1, 10, 1) + + if __name__ == "__main__": test_fortran_frontend_array_3dmap() @@ -174,3 +223,4 @@ def test_fortran_frontend_input_output_connector(): test_fortran_frontend_input_output_connector() test_fortran_frontend_array_ranges() test_fortran_frontend_twoconnector() + test_fortran_frontend_memlet_in_map_test() From 
4c824a310a53c2aefd6d03113dda091f4c48bad8 Mon Sep 17 00:00:00 2001 From: Samuel Martin Date: Thu, 3 Aug 2023 13:59:20 +0200 Subject: [PATCH 346/392] Tried to undo wrong update of dependency --- dace/external/hlslib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/external/hlslib b/dace/external/hlslib index 1403cd016c..1b5b3aee5d 160000 --- a/dace/external/hlslib +++ b/dace/external/hlslib @@ -1 +1 @@ -Subproject commit 1403cd016ce63a9961eeb3899bea70c873a929ce +Subproject commit 1b5b3aee5dab19adcc443fa9a7cd45244bd246b1 From 7171ecc79c716137465a4e05e5dd204ab7bba2d8 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 3 Aug 2023 22:04:07 -0700 Subject: [PATCH 347/392] Fix for None set properties (#1345) --- dace/properties.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dace/properties.py b/dace/properties.py index 6e883f8549..951a0564cc 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -888,9 +888,13 @@ def from_string(s): return [eval(i) for i in re.sub(r"[\{\}\(\)\[\]]", "", s).split(",")] def to_json(self, l): + if l is None: + return None return list(sorted(l)) def from_json(self, l, sdfg=None): + if l is None: + return None return set(l) def __get__(self, obj, objtype=None): From 425fed6ba3941d4fa46499e69eec6c0702522137 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Fri, 4 Aug 2023 07:49:20 -0700 Subject: [PATCH 348/392] Add Object to defined types in code generation and some documentation (#1343) Co-authored-by: acalotoiu <61420859+acalotoiu@users.noreply.github.com> --- dace/codegen/dispatcher.py | 13 +++++++------ dace/codegen/targets/cpp.py | 6 +++--- dace/codegen/targets/cpu.py | 2 +- samples/codegen/tensor_cores.py | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 0b4f58d5ef..be032556a0 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -19,12 +19,13 @@ class DefinedType(aenum.AutoNumberEnum): :see: DefinedMemlets """ - Pointer = () - Scalar = () - Stream = () - StreamArray = () - FPGA_ShiftRegister = () - ArrayInterface = () + Pointer = () # Pointer + Scalar = () # A copyable scalar moved by value (e.g., POD) + Object = () # An object moved by reference + Stream = () # A stream object moved by reference and accessed via a push/pop API + StreamArray = () # An array of Streams + FPGA_ShiftRegister = () # A shift-register object used in FPGA code generation + ArrayInterface = () # An object representing an interface to an array, used mostly in FPGA class DefinedMemlets: diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index afbc6fca12..264311a45c 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -108,7 +108,7 @@ def copy_expr( elif def_type == DefinedType.FPGA_ShiftRegister: return expr - elif def_type in [DefinedType.Scalar, DefinedType.Stream]: + elif def_type in [DefinedType.Scalar, DefinedType.Stream, DefinedType.Object]: if add_offset: raise TypeError("Tried to offset address of scalar {}: {}".format(data_name, offset_cppstr)) @@ -327,7 +327,7 @@ def make_const(expr: str) -> str: ref = '&' if is_scalar else '' defined_type = DefinedType.Scalar if is_scalar else DefinedType.Pointer offset_expr = '' - elif defined_type == DefinedType.Stream: + elif defined_type in (DefinedType.Stream, DefinedType.Object): typedef = defined_ctype ref = '&' offset_expr = '' @@ -1232,7 +1232,7 @@ def visit_Name(self, node: ast.Name): defined_type = None if (self.allow_casts and isinstance(dtype, 
dtypes.pointer) and memlet.subset.num_elements() == 1): return ast.parse(f"{name}[0]").body[0].value - elif (self.allow_casts and (defined_type == DefinedType.Stream or defined_type == DefinedType.StreamArray) + elif (self.allow_casts and (defined_type in (DefinedType.Stream, DefinedType.StreamArray)) and memlet.dynamic): return ast.parse(f"{name}.pop()").body[0].value else: diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 3b7b592775..9bca137d51 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -1228,7 +1228,7 @@ def memlet_ctor(self, sdfg, memlet, dtype, is_output): ptrname = cpp.ptr(memlet.data, sdfg.arrays[memlet.data], sdfg, self._frame) def_type, _ = self._dispatcher.defined_vars.get(ptrname) - if def_type in [DefinedType.Stream, DefinedType.StreamArray]: + if def_type in [DefinedType.Stream, DefinedType.Object, DefinedType.StreamArray]: return self.memlet_stream_ctor(sdfg, memlet) elif def_type in [DefinedType.Pointer, DefinedType.Scalar]: diff --git a/samples/codegen/tensor_cores.py b/samples/codegen/tensor_cores.py index 92ea28eacf..eaad543e6c 100644 --- a/samples/codegen/tensor_cores.py +++ b/samples/codegen/tensor_cores.py @@ -98,7 +98,7 @@ def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, # Add the ctype to defined_vars so that the codegen can properly pass # fragments to functions as an object reference. - self._dispatcher.defined_vars.add(name, DefinedType.Stream, ctype) + self._dispatcher.defined_vars.add(name, DefinedType.Object, ctype) def deallocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Array, function_stream: CodeIOStream, callsite_stream: CodeIOStream): From 20240a8552108c5939ad088d3c62c47c39da39e7 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Fri, 4 Aug 2023 07:49:49 -0700 Subject: [PATCH 349/392] Fix symbolic parsing for ternary operators (#1346) Co-authored-by: acalotoiu <61420859+acalotoiu@users.noreply.github.com> --- dace/runtime/include/dace/pyinterop.h | 5 +++++ dace/symbolic.py | 21 +++++++++++++++++++- tests/passes/scalar_to_symbol_test.py | 28 +++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/dace/runtime/include/dace/pyinterop.h b/dace/runtime/include/dace/pyinterop.h index e8f255af70..f93cbab770 100644 --- a/dace/runtime/include/dace/pyinterop.h +++ b/dace/runtime/include/dace/pyinterop.h @@ -52,5 +52,10 @@ template static DACE_HDFI T Abs(T val) { return abs(val); } +template +DACE_CONSTEXPR DACE_HDFI typename std::common_type::type IfExpr(bool condition, const T& iftrue, const U& iffalse) +{ + return condition ? iftrue : iffalse; +} #endif // __DACE_INTEROP_H diff --git a/dace/symbolic.py b/dace/symbolic.py index 01440d465e..0ab6e3f6ff 100644 --- a/dace/symbolic.py +++ b/dace/symbolic.py @@ -658,6 +658,21 @@ def eval(cls, x, y): def _eval_is_boolean(self): return True +class IfExpr(sympy.Function): + + @classmethod + def eval(cls, x, y, z): + """ + Evaluates a ternary operator. + + :param x: Predicate. + :param y: If true return this. + :param z: If false return this. + :return: Return value (literal or symbolic). 
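+        :note: Illustrative behavior (mirroring the test added in this commit): parsing
+               ``pystr_to_symbolic('1 if N > M else 2')`` is expected to yield
+               ``IfExpr(N > M, 1, 2)``, whereas a predicate that is decidable at parse
+               time, e.g. ``'1 if N > N else 2'``, folds to the constant ``2``.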
+ """ + if x.is_Boolean: + return (y if x else z) + class BitwiseAnd(sympy.Function): pass @@ -968,6 +983,9 @@ def visit_Constant(self, node): def visit_NameConstant(self, node): return self.visit_Constant(node) + def visit_IfExp(self, node): + new_node = ast.Call(func=ast.Name(id='IfExpr', ctx=ast.Load), args=[node.test, node.body, node.orelse], keywords=[]) + return ast.copy_location(new_node, node) class BitwiseOpConverter(ast.NodeTransformer): """ @@ -1050,6 +1068,7 @@ def pystr_to_symbolic(expr, symbol_map=None, simplify=None) -> sympy.Basic: 'RightShift': RightShift, 'int_floor': int_floor, 'int_ceil': int_ceil, + 'IfExpr': IfExpr, 'Mod': sympy.Mod, } # _clash1 enables all one-letter variables like N as symbols @@ -1059,7 +1078,7 @@ def pystr_to_symbolic(expr, symbol_map=None, simplify=None) -> sympy.Basic: if isinstance(expr, str): # Sympy processes "not/and/or" as direct evaluation. Replace with # And/Or(x, y), Not(x) - if re.search(r'\bnot\b|\band\b|\bor\b|\bNone\b|==|!=|\bis\b', expr): + if re.search(r'\bnot\b|\band\b|\bor\b|\bNone\b|==|!=|\bis\b|\bif\b', expr): expr = unparse(SympyBooleanConverter().visit(ast.parse(expr).body[0])) # NOTE: If the expression contains bitwise operations, replace them with user-functions. diff --git a/tests/passes/scalar_to_symbol_test.py b/tests/passes/scalar_to_symbol_test.py index 9ec23e3886..02cc57a204 100644 --- a/tests/passes/scalar_to_symbol_test.py +++ b/tests/passes/scalar_to_symbol_test.py @@ -666,6 +666,32 @@ def prog(inp: dace.int32[4, 2], out: dace.float64[5, 5]): sdfg.compile() +@pytest.mark.parametrize('compile_time_evaluatable', (False, True)) +def test_ternary_expression(compile_time_evaluatable): + sdfg = dace.SDFG('tester') + sdfg.add_symbol('N', dace.int32) + sdfg.add_symbol('M', dace.int32) + sdfg.add_scalar('a', dace.int32, transient=True) + state = sdfg.add_state() + + if compile_time_evaluatable: + expr = '1 if N > N else 2' + else: + expr = '1 if N > M else 2' + + # Test that symbolic conversion works + symexpr = dace.symbolic.pystr_to_symbolic(expr) + if compile_time_evaluatable: + assert symexpr == 2 + + t = state.add_tasklet('doit', {}, {'out'}, f'out = {expr}') + state.add_edge(t, 'out', state.add_access('a'), None, dace.Memlet('a[0]')) + + promoted = scalar_to_symbol.ScalarToSymbolPromotion().apply_pass(sdfg, {}) + assert promoted == {'a'} + sdfg.compile() + + if __name__ == '__main__': test_find_promotable() test_promote_simple() @@ -687,3 +713,5 @@ def prog(inp: dace.int32[4, 2], out: dace.float64[5, 5]): test_multiple_boolop() test_multidim_cpp() test_dynamic_mapind() + test_ternary_expression(False) + test_ternary_expression(True) From 2420440daa48178f752fa353763b4921513a169a Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Mon, 24 Jul 2023 10:52:24 +0200 Subject: [PATCH 350/392] Support in Fortran frontend arrays with offset declaration --- dace/frontend/fortran/ast_components.py | 18 +++++- dace/frontend/fortran/ast_internal_classes.py | 1 + tests/fortran/index_offset_test.py | 60 +++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 tests/fortran/index_offset_test.py diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index a66ee5c0d6..97281ebd27 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -574,6 +574,7 @@ def type_declaration_stmt(self, node: FASTNode): alloc = False symbol = False + dimensions = None for i in attributes: if i.string.lower() == "allocatable": alloc = True @@ 
-591,16 +592,30 @@ def type_declaration_stmt(self, node: FASTNode): if len(array_sizes) == 1: array_sizes = array_sizes[0] size = [] + offset = [] for dim in array_sizes.children: #sanity check if isinstance(dim, f03.Explicit_Shape_Spec): dim_expr = [i for i in dim.children if i is not None] + # handle size definition if len(dim_expr) == 1: dim_expr = dim_expr[0] #now to add the dimension to the size list after processing it if necessary size.append(self.create_ast(dim_expr)) + offset.append(1) + elif len(dim_expr) == 2: + # extract offets + for expr in dim_expr: + if not isinstance(expr, f03.Int_Literal_Constant): + raise TypeError("Array offsets must be constant expressions!") + offset.append(int(dim_expr[0].tostr())) + + fortran_size = int(dim_expr[1].tostr()) - int(dim_expr[0].tostr()) + 1 + fortran_ast_size = f03.Int_Literal_Constant(str(fortran_size)) + + size.append(self.create_ast(fortran_ast_size)) else: - raise TypeError("Array dimension must be a single expression") + raise TypeError("Array dimension must be at most two expressions") #handle initializiation init = None @@ -637,6 +652,7 @@ def type_declaration_stmt(self, node: FASTNode): type=testtype, alloc=alloc, sizes=size, + offsets=offset, kind=kind, init=init, line_number=node.item.span)) diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py index 6bdfb61faf..daddfbe8ef 100644 --- a/dace/frontend/fortran/ast_internal_classes.py +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -199,6 +199,7 @@ class Symbol_Array_Decl_Node(Statement_Node): ) _fields = ( 'sizes', + 'offsets' 'typeref', 'init', ) diff --git a/tests/fortran/index_offset_test.py b/tests/fortran/index_offset_test.py new file mode 100644 index 0000000000..5e38a0adc6 --- /dev/null +++ b/tests/fortran/index_offset_test.py @@ -0,0 +1,60 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. + +from fparser.common.readfortran import FortranStringReader +from fparser.common.readfortran import FortranFileReader +from fparser.two.parser import ParserFactory +import sys, os +import numpy as np +import pytest + +import dace +from dace import SDFG, SDFGState, instrument, nodes, dtypes, data, subsets, symbolic +from dace.frontend.fortran import fortran_parser +from fparser.two.symbol_table import SymbolTable +from dace.sdfg import utils as sdutil + +import dace.frontend.fortran.ast_components as ast_components +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_utils as ast_utils +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + +def test_fortran_frontend_index_offset(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. 
+ """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision d(50:54) + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision d(50:54) + + do i=50,54 + d(i) = i * 2.0 + end do + + END SUBROUTINE index_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test") + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 1 + assert sdfg.data('d').shape[0] == 5 + assert len(sdfg.data('d').offset) == 1 + assert sdfg.data('d').offset[0] == -1 + + a = np.full([60], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(50,54): + # offset -1 is already added + assert a[i-1] == i * 2 + + +if __name__ == "__main__": + + #test_fortran_frontend_index_offset() + test_fortran_frontend_index_offset_dimensions() From 63b074b94050696957cedfdc532435dc1440842d Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Mon, 24 Jul 2023 20:22:27 +0200 Subject: [PATCH 351/392] Support shape attribute specification in the Fortran frontend --- dace/frontend/fortran/ast_components.py | 112 +++++++++++++----- dace/frontend/fortran/ast_internal_classes.py | 1 + tests/fortran/index_offset_test.py | 44 ++++++- 3 files changed, 125 insertions(+), 32 deletions(-) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index 97281ebd27..4b48f81367 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -1,5 +1,6 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. from fparser.two.Fortran2008 import Fortran2008 as f08 +from fparser.two import Fortran2008 from fparser.two import Fortran2003 as f03 from fparser.two import symbol_table @@ -523,6 +524,31 @@ def declaration_type_spec(self, node: FASTNode): def assumed_shape_spec_list(self, node: FASTNode): return node + def parse_shape_specification(self, dim: f03.Explicit_Shape_Spec, size: List[FASTNode], offset: List[int]): + + dim_expr = [i for i in dim.children if i is not None] + + # handle size definition + if len(dim_expr) == 1: + dim_expr = dim_expr[0] + #now to add the dimension to the size list after processing it if necessary + size.append(self.create_ast(dim_expr)) + offset.append(1) + # Here we support arrays that have size declaration - with initial offset. 
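+        # For example, a declaration such as 'double precision d(50:54)' reaches this
+        # branch with two constant bounds; it is recorded as offset 50 and
+        # size 54 - 50 + 1 = 5.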
+ elif len(dim_expr) == 2: + # extract offets + for expr in dim_expr: + if not isinstance(expr, f03.Int_Literal_Constant): + raise TypeError("Array offsets must be constant expressions!") + offset.append(int(dim_expr[0].tostr())) + + fortran_size = int(dim_expr[1].tostr()) - int(dim_expr[0].tostr()) + 1 + fortran_ast_size = f03.Int_Literal_Constant(str(fortran_size)) + + size.append(self.create_ast(fortran_ast_size)) + else: + raise TypeError("Array dimension must be at most two expressions") + def type_declaration_stmt(self, node: FASTNode): #decide if its a intrinsic variable type or a derived type @@ -574,18 +600,39 @@ def type_declaration_stmt(self, node: FASTNode): alloc = False symbol = False - dimensions = None + attr_size = None + attr_offset = None for i in attributes: if i.string.lower() == "allocatable": alloc = True if i.string.lower() == "parameter": symbol = True + if isinstance(i, Fortran2008.Attr_Spec_List): + + attr_size = [] + attr_offset = [] + sizes = get_child(get_child(i, ["Dimension_Attr_Spec"]), ["Explicit_Shape_Spec_List"]) + + for shape_spec in get_children(sizes, [f03.Explicit_Shape_Spec]): + print(shape_spec) + self.parse_shape_specification(shape_spec, attr_size, attr_offset) + print(sizes.children) + print(type(sizes)) + #print(sizes.children) + + #if len(i.children) > 0 and isinstance(i.children[0], f03.Dimension_Attr_Spec): + # print(i, dir(i), type(i.children[0]), dir(i.children[0])) + + #sizes = get_child(attributes, ["Attr_Spec_List"]) + #print(sizes) + vardecls = [] for var in names: #first handle dimensions size = None + offset = None var_components = self.create_children(var) array_sizes = get_children(var, "Explicit_Shape_Spec_List") actual_name = get_child(var_components, ast_internal_classes.Name_Node) @@ -596,26 +643,7 @@ def type_declaration_stmt(self, node: FASTNode): for dim in array_sizes.children: #sanity check if isinstance(dim, f03.Explicit_Shape_Spec): - dim_expr = [i for i in dim.children if i is not None] - # handle size definition - if len(dim_expr) == 1: - dim_expr = dim_expr[0] - #now to add the dimension to the size list after processing it if necessary - size.append(self.create_ast(dim_expr)) - offset.append(1) - elif len(dim_expr) == 2: - # extract offets - for expr in dim_expr: - if not isinstance(expr, f03.Int_Literal_Constant): - raise TypeError("Array offsets must be constant expressions!") - offset.append(int(dim_expr[0].tostr())) - - fortran_size = int(dim_expr[1].tostr()) - int(dim_expr[0].tostr()) + 1 - fortran_ast_size = f03.Int_Literal_Constant(str(fortran_size)) - - size.append(self.create_ast(fortran_ast_size)) - else: - raise TypeError("Array dimension must be at most two expressions") + self.parse_shape_specification(dim, size, offset) #handle initializiation init = None @@ -628,17 +656,30 @@ def type_declaration_stmt(self, node: FASTNode): raw_init = initialization.children[1] init = self.create_ast(raw_init) + print('t', symbol, size, attr_size) + print(offset, attr_offset) if symbol == False: - vardecls.append( - ast_internal_classes.Var_Decl_Node(name=actual_name.name, - type=testtype, - alloc=alloc, - sizes=size, - kind=kind, - line_number=node.item.span)) + if attr_size is None: + vardecls.append( + ast_internal_classes.Var_Decl_Node(name=actual_name.name, + type=testtype, + alloc=alloc, + sizes=size, + offsets=offset, + kind=kind, + line_number=node.item.span)) + else: + vardecls.append( + ast_internal_classes.Var_Decl_Node(name=actual_name.name, + type=testtype, + alloc=alloc, + sizes=attr_size, + 
offsets=attr_offset, + kind=kind, + line_number=node.item.span)) else: - if size is None: + if size is None and attr_size is None: self.symbols[actual_name.name] = init vardecls.append( ast_internal_classes.Symbol_Decl_Node(name=actual_name.name, @@ -646,6 +687,16 @@ def type_declaration_stmt(self, node: FASTNode): alloc=alloc, init=init, line_number=node.item.span)) + elif attr_size is not None: + vardecls.append( + ast_internal_classes.Symbol_Array_Decl_Node(name=actual_name.name, + type=testtype, + alloc=alloc, + sizes=attr_size, + offsets=attr_offset, + kind=kind, + init=init, + line_number=node.item.span)) else: vardecls.append( ast_internal_classes.Symbol_Array_Decl_Node(name=actual_name.name, @@ -656,7 +707,8 @@ def type_declaration_stmt(self, node: FASTNode): kind=kind, init=init, line_number=node.item.span)) - + #print(vardecls[0].sizes) + #print(vardecls[0].offsets) return ast_internal_classes.Decl_Stmt_Node(vardecl=vardecls, line_number=node.item.span) def entity_decl(self, node: FASTNode): diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py index daddfbe8ef..f9bf97ca08 100644 --- a/dace/frontend/fortran/ast_internal_classes.py +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -214,6 +214,7 @@ class Var_Decl_Node(Statement_Node): ) _fields = ( 'sizes', + 'offsets', 'typeref', 'init', ) diff --git a/tests/fortran/index_offset_test.py b/tests/fortran/index_offset_test.py index 5e38a0adc6..564df31634 100644 --- a/tests/fortran/index_offset_test.py +++ b/tests/fortran/index_offset_test.py @@ -18,6 +18,46 @@ import dace.frontend.fortran.ast_utils as ast_utils import dace.frontend.fortran.ast_internal_classes as ast_internal_classes +def test_fortran_frontend_index_offset_attributes(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. + """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(50:54) :: d + !double precision, dimension(5) :: d + !double precision d(50:54) + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + !double precision d(50:54) + !double precision d(5) + double precision, dimension(50:54) :: d + !double precision, intent(inout) :: d(50:54) + + do i=50,54 + d(i) = i * 2.0 + end do + + END SUBROUTINE index_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test") + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 1 + assert sdfg.data('d').shape[0] == 5 + assert len(sdfg.data('d').offset) == 1 + assert sdfg.data('d').offset[0] == -1 + + a = np.full([60], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(50,54): + # offset -1 is already added + assert a[i-1] == i * 2 + def test_fortran_frontend_index_offset(): """ Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. 
@@ -56,5 +96,5 @@ def test_fortran_frontend_index_offset(): if __name__ == "__main__": - #test_fortran_frontend_index_offset() - test_fortran_frontend_index_offset_dimensions() + test_fortran_frontend_index_offset() + test_fortran_frontend_index_offset_attributes() From e1b4399874d2021608a17e63a73c2e851c10854d Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Mon, 24 Jul 2023 20:50:31 +0200 Subject: [PATCH 352/392] Rename array attributes test --- dace/frontend/fortran/ast_components.py | 12 ---- ...ffset_test.py => array_attributes_test.py} | 56 +++++++++++++------ 2 files changed, 39 insertions(+), 29 deletions(-) rename tests/fortran/{index_offset_test.py => array_attributes_test.py} (65%) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index 4b48f81367..b11c970973 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -615,17 +615,7 @@ def type_declaration_stmt(self, node: FASTNode): sizes = get_child(get_child(i, ["Dimension_Attr_Spec"]), ["Explicit_Shape_Spec_List"]) for shape_spec in get_children(sizes, [f03.Explicit_Shape_Spec]): - print(shape_spec) self.parse_shape_specification(shape_spec, attr_size, attr_offset) - print(sizes.children) - print(type(sizes)) - #print(sizes.children) - - #if len(i.children) > 0 and isinstance(i.children[0], f03.Dimension_Attr_Spec): - # print(i, dir(i), type(i.children[0]), dir(i.children[0])) - - #sizes = get_child(attributes, ["Attr_Spec_List"]) - #print(sizes) vardecls = [] @@ -656,8 +646,6 @@ def type_declaration_stmt(self, node: FASTNode): raw_init = initialization.children[1] init = self.create_ast(raw_init) - print('t', symbol, size, attr_size) - print(offset, attr_offset) if symbol == False: if attr_size is None: diff --git a/tests/fortran/index_offset_test.py b/tests/fortran/array_attributes_test.py similarity index 65% rename from tests/fortran/index_offset_test.py rename to tests/fortran/array_attributes_test.py index 564df31634..1ccb3c5f57 100644 --- a/tests/fortran/index_offset_test.py +++ b/tests/fortran/array_attributes_test.py @@ -1,24 +1,45 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -from fparser.common.readfortran import FortranStringReader -from fparser.common.readfortran import FortranFileReader -from fparser.two.parser import ParserFactory -import sys, os import numpy as np -import pytest -import dace -from dace import SDFG, SDFGState, instrument, nodes, dtypes, data, subsets, symbolic from dace.frontend.fortran import fortran_parser -from fparser.two.symbol_table import SymbolTable -from dace.sdfg import utils as sdutil -import dace.frontend.fortran.ast_components as ast_components -import dace.frontend.fortran.ast_transforms as ast_transforms -import dace.frontend.fortran.ast_utils as ast_utils -import dace.frontend.fortran.ast_internal_classes as ast_internal_classes +def test_fortran_frontend_array_attribute_no_offset(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. 
+ """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(5) :: d + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision, dimension(5) :: d + + do i=1,5 + d(i) = i * 2.0 + end do + + END SUBROUTINE index_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test") + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 1 + assert sdfg.data('d').shape[0] == 5 + assert len(sdfg.data('d').offset) == 1 + assert sdfg.data('d').offset[0] == -1 + + a = np.full([5], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(1,5): + # offset -1 is already added + assert a[i-1] == i * 2 -def test_fortran_frontend_index_offset_attributes(): +def test_fortran_frontend_array_attribute_offset(): """ Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. """ @@ -58,7 +79,7 @@ def test_fortran_frontend_index_offset_attributes(): # offset -1 is already added assert a[i-1] == i * 2 -def test_fortran_frontend_index_offset(): +def test_fortran_frontend_array_offset(): """ Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. """ @@ -96,5 +117,6 @@ def test_fortran_frontend_index_offset(): if __name__ == "__main__": - test_fortran_frontend_index_offset() - test_fortran_frontend_index_offset_attributes() + test_fortran_frontend_array_offset() + test_fortran_frontend_array_attribute_no_offset() + test_fortran_frontend_array_attribute_offset() From 37fa5800a0af6344a736d85983ce25ed2b82bcbb Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Mon, 14 Aug 2023 18:41:54 +0200 Subject: [PATCH 353/392] Remove old code --- dace/frontend/fortran/ast_components.py | 2 -- tests/fortran/array_attributes_test.py | 5 ----- 2 files changed, 7 deletions(-) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index b11c970973..492c819322 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -695,8 +695,6 @@ def type_declaration_stmt(self, node: FASTNode): kind=kind, init=init, line_number=node.item.span)) - #print(vardecls[0].sizes) - #print(vardecls[0].offsets) return ast_internal_classes.Decl_Stmt_Node(vardecl=vardecls, line_number=node.item.span) def entity_decl(self, node: FASTNode): diff --git a/tests/fortran/array_attributes_test.py b/tests/fortran/array_attributes_test.py index 1ccb3c5f57..af433905bc 100644 --- a/tests/fortran/array_attributes_test.py +++ b/tests/fortran/array_attributes_test.py @@ -47,16 +47,11 @@ def test_fortran_frontend_array_attribute_offset(): PROGRAM index_offset_test implicit none double precision, dimension(50:54) :: d - !double precision, dimension(5) :: d - !double precision d(50:54) CALL index_test_function(d) end SUBROUTINE index_test_function(d) - !double precision d(50:54) - !double precision d(5) double precision, dimension(50:54) :: d - !double precision, intent(inout) :: d(50:54) do i=50,54 d(i) = i * 2.0 From b9e9f6123dfe85e8595a0a8c670deb36d36cd5ac Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Mon, 14 Aug 2023 19:29:11 +0200 Subject: [PATCH 354/392] Fix handling of non-dimensional attributes in Fortran frontend --- dace/frontend/fortran/ast_components.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index 492c819322..1e5bfb4528 100644 --- 
a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -610,9 +610,13 @@ def type_declaration_stmt(self, node: FASTNode): if isinstance(i, Fortran2008.Attr_Spec_List): + dimension_spec = get_children(i, "Dimension_Attr_Spec") + if len(dimension_spec) == 0: + continue + attr_size = [] attr_offset = [] - sizes = get_child(get_child(i, ["Dimension_Attr_Spec"]), ["Explicit_Shape_Spec_List"]) + sizes = get_child(dimension_spec[0], ["Explicit_Shape_Spec_List"]) for shape_spec in get_children(sizes, [f03.Explicit_Shape_Spec]): self.parse_shape_specification(shape_spec, attr_size, attr_offset) From 22718af782d2e36ea7004aa00c79b8fce176fe03 Mon Sep 17 00:00:00 2001 From: Cliff Hodel <111381329+hodelcl@users.noreply.github.com> Date: Wed, 16 Aug 2023 14:15:03 +0200 Subject: [PATCH 355/392] Work Depth Analysis for SDFGs (#1327) * initial push of work_depth analysis script * adding tests to work_depth analysis * rename work depth analysis * todos added * code ready for PR * yapf for formatting * put tests into dace/tests/sdfg * fixed import after merge * merged propgatate_states_symbolically into propagate_states * fixed format issue in work_depth.py * small bugfix --------- Co-authored-by: Cliff Hodel Co-authored-by: Cliff Hodel Co-authored-by: Philipp Schaad --- dace/sdfg/propagation.py | 51 +- dace/sdfg/work_depth_analysis/helpers.py | 331 ++++++++++ dace/sdfg/work_depth_analysis/work_depth.py | 653 ++++++++++++++++++++ tests/sdfg/work_depth_tests.py | 201 ++++++ 4 files changed, 1224 insertions(+), 12 deletions(-) create mode 100644 dace/sdfg/work_depth_analysis/helpers.py create mode 100644 dace/sdfg/work_depth_analysis/work_depth.py create mode 100644 tests/sdfg/work_depth_tests.py diff --git a/dace/sdfg/propagation.py b/dace/sdfg/propagation.py index 89ba6928c7..0fec4812b7 100644 --- a/dace/sdfg/propagation.py +++ b/dace/sdfg/propagation.py @@ -10,7 +10,7 @@ import itertools import functools import sympy -from sympy import ceiling +from sympy import ceiling, Symbol from sympy.concrete.summations import Sum import warnings import networkx as nx @@ -564,8 +564,7 @@ def _annotate_loop_ranges(sdfg, unannotated_cycle_states): Annotate each valid for loop construct with its loop variable ranges. :param sdfg: The SDFG in which to look. - :param unannotated_cycle_states: List of states in cycles without valid - for loop ranges. + :param unannotated_cycle_states: List of lists. Each sub-list contains the states of one unannotated cycle. """ # We import here to avoid cyclic imports. @@ -652,7 +651,7 @@ def _annotate_loop_ranges(sdfg, unannotated_cycle_states): res = find_for_loop(sdfg, guard, begin, itervar=itvar) if res is None: # No range detected, mark as unbounded. - unannotated_cycle_states.extend(cycle) + unannotated_cycle_states.append(cycle) else: itervar, rng, _ = res @@ -674,10 +673,10 @@ def _annotate_loop_ranges(sdfg, unannotated_cycle_states): else: # There's no guard state, so this cycle marks all states in it as # dynamically unbounded. - unannotated_cycle_states.extend(cycle) + unannotated_cycle_states.append(cycle) -def propagate_states(sdfg) -> None: +def propagate_states(sdfg, concretize_dynamic_unbounded=False) -> None: """ Annotate the states of an SDFG with the number of executions. @@ -728,6 +727,9 @@ def propagate_states(sdfg) -> None: once. :param sdfg: The SDFG to annotate. + :param concretize_dynamic_unbounded: If True, we annotate dyncamic unbounded states with symbols of the + form "num_execs_{sdfg_id}_{loop_start_state_id}". 
Hence, for each + unbounded loop its states will have the same number of symbolic executions. :note: This operates on the SDFG in-place. """ @@ -759,6 +761,9 @@ def propagate_states(sdfg) -> None: # cycle should be marked as unannotated. unannotated_cycle_states = [] _annotate_loop_ranges(sdfg, unannotated_cycle_states) + if not concretize_dynamic_unbounded: + # Flatten the list. This keeps the old behavior of propagate_states. + unannotated_cycle_states = [state for cycle in unannotated_cycle_states for state in cycle] # Keep track of states that fully merge a previous conditional split. We do # this so we can remove the dynamic executions flag for those states. @@ -800,7 +805,7 @@ def propagate_states(sdfg) -> None: # The only exception to this rule: If the state is in an # unannotated loop, i.e. should be annotated as dynamic # unbounded instead, we do that. - if (state in unannotated_cycle_states): + if (not concretize_dynamic_unbounded) and state in unannotated_cycle_states: state.executions = 0 state.dynamic_executions = True else: @@ -872,17 +877,39 @@ def propagate_states(sdfg) -> None: else: # Conditional split or unannotated (dynamic unbounded) loop. unannotated_loop_edge = None - for oedge in out_edges: - if oedge.dst in unannotated_cycle_states: - # This is an unannotated loop down this branch. - unannotated_loop_edge = oedge + if concretize_dynamic_unbounded: + to_remove = [] + for oedge in out_edges: + for cycle in unannotated_cycle_states: + if oedge.dst in cycle: + # This is an unannotated loop down this branch. + unannotated_loop_edge = oedge + # remove cycle, since it is now annotated with symbol + to_remove.append(cycle) + + for c in to_remove: + unannotated_cycle_states.remove(c) + else: + for oedge in out_edges: + if oedge.dst in unannotated_cycle_states: + # This is an unannotated loop down this branch. + unannotated_loop_edge = oedge if unannotated_loop_edge is not None: # Traverse as an unbounded loop. out_edges.remove(unannotated_loop_edge) for oedge in out_edges: traversal_q.append((oedge.dst, state.executions, False, itvar_stack)) - traversal_q.append((unannotated_loop_edge.dst, 0, True, itvar_stack)) + if concretize_dynamic_unbounded: + # Here we introduce the num_exec symbol and propagate it down the loop. + # We can always assume these symbols to be non-negative. + traversal_q.append( + (unannotated_loop_edge.dst, + Symbol(f'num_execs_{sdfg.sdfg_id}_{sdfg.node_id(unannotated_loop_edge.dst)}', + nonnegative=True), False, itvar_stack)) + else: + # Propagate dynamic unbounded. + traversal_q.append((unannotated_loop_edge.dst, 0, True, itvar_stack)) else: # Traverse as a conditional split. proposed_executions = state.executions diff --git a/dace/sdfg/work_depth_analysis/helpers.py b/dace/sdfg/work_depth_analysis/helpers.py new file mode 100644 index 0000000000..a80e769f64 --- /dev/null +++ b/dace/sdfg/work_depth_analysis/helpers.py @@ -0,0 +1,331 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Helper functions used by the work depth analysis. 
""" + +from dace import SDFG, SDFGState, nodes +from collections import deque +from typing import List, Dict, Set, Tuple, Optional, Union +import networkx as nx + +NodeT = str +EdgeT = Tuple[NodeT, NodeT] + + +class NodeCycle: + + nodes: Set[NodeT] = [] + + def __init__(self, nodes: List[NodeT]) -> None: + self.nodes = set(nodes) + + @property + def length(self) -> int: + return len(self.nodes) + + +UUID_SEPARATOR = '/' + + +def ids_to_string(sdfg_id, state_id=-1, node_id=-1, edge_id=-1): + return (str(sdfg_id) + UUID_SEPARATOR + str(state_id) + UUID_SEPARATOR + str(node_id) + UUID_SEPARATOR + + str(edge_id)) + + +def get_uuid(element, state=None): + if isinstance(element, SDFG): + return ids_to_string(element.sdfg_id) + elif isinstance(element, SDFGState): + return ids_to_string(element.parent.sdfg_id, element.parent.node_id(element)) + elif isinstance(element, nodes.Node): + return ids_to_string(state.parent.sdfg_id, state.parent.node_id(state), state.node_id(element)) + else: + return ids_to_string(-1) + + +def get_domtree(graph: nx.DiGraph, start_node: str, idom: Dict[str, str] = None): + idom = idom or nx.immediate_dominators(graph, start_node) + + alldominated = {n: set() for n in graph.nodes} + domtree = nx.DiGraph() + + for node, dom in idom.items(): + if node is dom: + continue + domtree.add_edge(dom, node) + alldominated[dom].add(node) + + nextidom = idom[dom] + ndom = nextidom if nextidom != dom else None + + while ndom: + alldominated[ndom].add(node) + nextidom = idom[ndom] + ndom = nextidom if nextidom != ndom else None + + # 'Rank' the tree, i.e., annotate each node with the level it is on. + q = deque() + q.append((start_node, 0)) + while q: + node, level = q.popleft() + domtree.add_node(node, level=level) + for s in domtree.successors(node): + q.append((s, level + 1)) + + return alldominated, domtree + + +def get_backedges(graph: nx.DiGraph, + start: Optional[NodeT], + strict: bool = False) -> Union[Set[EdgeT], Tuple[Set[EdgeT], Set[EdgeT]]]: + '''Find all backedges in a directed graph. + + Note: + This algorithm has an algorithmic complexity of O((|V|+|E|)*C) for a + graph with vertices V, edges E, and C cycles. + + Args: + graph (nx.DiGraph): The graph for which to search backedges. + start (str): Start node of the graph. If no start is provided, a node + with no incoming edges is used as the start. If no such node can + be found, a `ValueError` is raised. + + Returns: + A set of backedges in the graph. + + Raises: + ValueError: If no `start` is provided and the graph contains no nodes + with no incoming edges. + ''' + backedges = set() + eclipsed_backedges = set() + + if start is None: + for node in graph.nodes(): + if graph.in_degree(node) == 0: + start = node + break + if start is None: + raise ValueError('No start node provided and no start node could ' + 'be determined automatically') + + # Gather all cycles in the graph. Cycles are represented as a sequence of + # nodes. + # O((|V|+|E|)*(C+1)), for C cycles. + all_cycles_nx: List[List[NodeT]] = nx.cycles.simple_cycles(graph) + #all_cycles_nx: List[List[NodeT]] = nx.simple_cycles(graph) + all_cycles: Set[NodeCycle] = set() + for cycle in all_cycles_nx: + all_cycles.add(NodeCycle(cycle)) + + # Construct a dictionary mapping a node to the cycles containing that node. 
+ # O(|V|*|C|) + cycle_map: Dict[NodeT, Set[NodeCycle]] = dict() + for cycle in all_cycles: + for node in cycle.nodes: + try: + cycle_map[node].add(cycle) + except KeyError: + cycle_map[node] = set([cycle]) + + # Do a BFS traversal of the graph to detect the back edges. + # For each node that is part of an (unhandled) cycle, find the longest + # still unhandled cycle and try to use it to find the back edge for it. + bfs_frontier = [start] + visited: Set[NodeT] = set([start]) + handled_cycles: Set[NodeCycle] = set() + unhandled_cycles = all_cycles + while bfs_frontier: + node = bfs_frontier.pop(0) + pred = [p for p in graph.predecessors(node) if p not in visited] + longest_cycles: Dict[NodeT, NodeCycle] = dict() + try: + cycles = cycle_map[node] + remove_cycles = set() + for cycle in cycles: + if cycle not in handled_cycles: + for p in pred: + if p in cycle.nodes: + if p not in longest_cycles: + longest_cycles[p] = cycle + else: + if cycle.length > longest_cycles[p].length: + longest_cycles[p] = cycle + else: + remove_cycles.add(cycle) + for cycle in remove_cycles: + cycles.remove(cycle) + except KeyError: + longest_cycles = dict() + + # For the current node, find the incoming edge which belongs to the + # cycle and has not been visited yet, which indicates a backedge. + node_backedge_candidates: Set[Tuple[EdgeT, NodeCycle]] = set() + for p, longest_cycle in longest_cycles.items(): + handled_cycles.add(longest_cycle) + unhandled_cycles.remove(longest_cycle) + cycle_map[node].remove(longest_cycle) + backedge_candidates = graph.in_edges(node) + for candidate in backedge_candidates: + src = candidate[0] + dst = candidate[0] + if src not in visited and src in longest_cycle.nodes: + node_backedge_candidates.add((candidate, longest_cycle)) + if not strict: + backedges.add(candidate) + + # Make sure that any cycle containing this back edge is + # not evaluated again, i.e., mark as handled. + remove_cycles = set() + for cycle in unhandled_cycles: + if src in cycle.nodes and dst in cycle.nodes: + handled_cycles.add(cycle) + remove_cycles.add(cycle) + for cycle in remove_cycles: + unhandled_cycles.remove(cycle) + + # If strict is set, we only report the longest cycle's back edges for + # any given node, and separately return any other backedges as + # 'eclipsed' backedges. In the case of a while-loop, for example, + # the loop edge is considered a backedge, while a continue inside the + # loop is considered an 'eclipsed' backedge. + if strict: + longest_candidate: Tuple[EdgeT, NodeCycle] = None + eclipsed_candidates = set() + for be_candidate in node_backedge_candidates: + if longest_candidate is None: + longest_candidate = be_candidate + elif longest_candidate[1].length < be_candidate[1].length: + eclipsed_candidates.add(longest_candidate[0]) + longest_candidate = be_candidate + else: + eclipsed_candidates.add(be_candidate[0]) + if longest_candidate is not None: + backedges.add(longest_candidate[0]) + if eclipsed_candidates: + eclipsed_backedges.update(eclipsed_candidates) + + # Continue BFS. + for neighbour in graph.successors(node): + if neighbour not in visited: + visited.add(neighbour) + bfs_frontier.append(neighbour) + + if strict: + return backedges, eclipsed_backedges + else: + return backedges + + +def find_loop_guards_tails_exits(sdfg_nx: nx.DiGraph): + """ + Detects loops in a SDFG. For each loop, it identifies (node, oNode, exit). + We know that there is a backedge from oNode to node that creates the loop and that exit is the exit state of the loop. 
+ + :param sdfg_nx: The networkx representation of a SDFG. + """ + + # preparation phase: compute dominators, backedges etc + for node in sdfg_nx.nodes(): + if sdfg_nx.in_degree(node) == 0: + start = node + break + if start is None: + raise ValueError('No start node could be determined') + + # sdfg can have multiple end nodes --> not good for postDomTree + # --> add a new end node + artificial_end_node = 'artificial_end_node' + sdfg_nx.add_node(artificial_end_node) + for node in sdfg_nx.nodes(): + if sdfg_nx.out_degree(node) == 0 and node != artificial_end_node: + # this is an end node of the sdfg + sdfg_nx.add_edge(node, artificial_end_node) + + # sanity check: + if sdfg_nx.in_degree(artificial_end_node) == 0: + raise ValueError('No end node could be determined in the SDFG') + + # compute dominators and backedges + iDoms = nx.immediate_dominators(sdfg_nx, start) + allDom, domTree = get_domtree(sdfg_nx, start, iDoms) + + reversed_sdfg_nx = sdfg_nx.reverse() + iPostDoms = nx.immediate_dominators(reversed_sdfg_nx, artificial_end_node) + allPostDoms, postDomTree = get_domtree(reversed_sdfg_nx, artificial_end_node, iPostDoms) + + backedges = get_backedges(sdfg_nx, start) + backedgesDstDict = {} + for be in backedges: + if be[1] in backedgesDstDict: + backedgesDstDict[be[1]].add(be) + else: + backedgesDstDict[be[1]] = set([be]) + + # This list will be filled with triples (node, oNode, exit), one triple for each loop construct in the SDFG. + # There will always be a backedge from oNode to node. Either node or oNode will be the corresponding loop guard, + # depending on whether it is a while-do or a do-while loop. exit will always be the exit state of the loop. + nodes_oNodes_exits = [] + + # iterate over all nodes + for node in sdfg_nx.nodes(): + # Check if any backedge ends in node. + if node in backedgesDstDict: + inc_backedges = backedgesDstDict[node] + + # gather all successors of node that are not reached by backedges + successors = [] + for edge in sdfg_nx.out_edges(node): + if not edge in backedges: + successors.append(edge[1]) + + # For each incoming backedge, we want to find oNode and exit. There can be multiple backedges, in case + # we have a continue statement in the original code. But we can handle these backedges normally. + for be in inc_backedges: + # since node has an incoming backedge, it is either a loop guard or loop tail + # oNode will exactly be the other thing + oNode = be[0] + exitCandidates = set() + # search for exit candidates: + # a state is a exit candidate if: + # - it is in successor and it does not dominate oNode (else it dominates + # the last loop state, and hence is inside the loop itself) + # - is is a successor of oNode (but not node) + # This handles both cases of while-do and do-while loops + for succ in successors: + if succ != oNode and oNode not in allDom[succ]: + exitCandidates.add(succ) + for succ in sdfg_nx.successors(oNode): + if succ != node: + exitCandidates.add(succ) + + if len(exitCandidates) == 0: + raise ValueError('failed to find any exit nodes') + elif len(exitCandidates) > 1: + # Find the exit candidate that sits highest up in the + # postdominator tree (i.e., has the lowest level). + # That must be the exit node (it must post-dominate) + # everything inside the loop. If there are multiple + # candidates on the lowest level (i.e., disjoint set of + # postdominated nodes), there are multiple exit paths, + # and they all share one level. 
+ cand = exitCandidates.pop() + minSet = set([cand]) + minLevel = nx.get_node_attributes(postDomTree, 'level')[cand] + for cand in exitCandidates: + curr_level = nx.get_node_attributes(postDomTree, 'level')[cand] + if curr_level < minLevel: + # new minimum found + minLevel = curr_level + minSet.clear() + minSet.add(cand) + elif curr_level == minLevel: + # add cand to curr set + minSet.add(cand) + + if len(minSet) > 0: + exitCandidates = minSet + else: + raise ValueError('failed to find exit minSet') + + # now we have a triple (node, oNode, exitCandidates) + nodes_oNodes_exits.append((node, oNode, exitCandidates)) + + return nodes_oNodes_exits diff --git a/dace/sdfg/work_depth_analysis/work_depth.py b/dace/sdfg/work_depth_analysis/work_depth.py new file mode 100644 index 0000000000..a05fe10266 --- /dev/null +++ b/dace/sdfg/work_depth_analysis/work_depth.py @@ -0,0 +1,653 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Work depth analysis for any input SDFG. Can be used with the DaCe VS Code extension or +from command line as a Python script. """ + +import argparse +from collections import deque +from dace.sdfg import nodes as nd, propagation, InterstateEdge +from dace import SDFG, SDFGState, dtypes +from dace.subsets import Range +from typing import Tuple, Dict +import os +import sympy as sp +from copy import deepcopy +from dace.libraries.blas import MatMul +from dace.libraries.standard import Reduce, Transpose +from dace.symbolic import pystr_to_symbolic +import ast +import astunparse +import warnings + +from dace.sdfg.work_depth_analysis.helpers import get_uuid, find_loop_guards_tails_exits + + +def get_array_size_symbols(sdfg): + """ + Returns all symbols that appear isolated in shapes of the SDFG's arrays. + These symbols can then be assumed to be positive. + + :note: This only works if a symbol appears in isolation, i.e. array A[N]. + If we have A[N+1], we cannot assume N to be positive. + :param sdfg: The SDFG in which it searches for symbols. + :return: A set containing symbols which we can assume to be positive. + """ + symbols = set() + for _, _, arr in sdfg.arrays_recursive(): + for s in arr.shape: + if isinstance(s, sp.Symbol): + symbols.add(s) + return symbols + + +def posify_certain_symbols(expr, syms_to_posify): + """ + Takes an expression and evaluates it while assuming that certain symbols are positive. + + :param expr: The expression to evaluate. + :param syms_to_posify: List of symbols we assume to be positive. + :note: This is adapted from the Sympy function posify. + """ + + expr = sp.sympify(expr) + + reps = {s: sp.Dummy(s.name, positive=True, **s.assumptions0) for s in syms_to_posify if s.is_positive is None} + expr = expr.subs(reps) + return expr.subs({r: s for s, r in reps.items()}) + + +def symeval(val, symbols): + """ + Takes a sympy expression and substitutes its symbols according to a dict { old_symbol: new_symbol}. + + :param val: The expression we are updating. + :param symbols: Dictionary of key value pairs { old_symbol: new_symbol}. 
+ """ + first_replacement = {pystr_to_symbolic(k): pystr_to_symbolic('__REPLSYM_' + k) for k in symbols.keys()} + second_replacement = {pystr_to_symbolic('__REPLSYM_' + k): v for k, v in symbols.items()} + return val.subs(first_replacement).subs(second_replacement) + + +def evaluate_symbols(base, new): + result = {} + for k, v in new.items(): + result[k] = symeval(v, base) + return result + + +def count_work_matmul(node, symbols, state): + A_memlet = next(e for e in state.in_edges(node) if e.dst_conn == '_a') + B_memlet = next(e for e in state.in_edges(node) if e.dst_conn == '_b') + C_memlet = next(e for e in state.out_edges(node) if e.src_conn == '_c') + result = 2 # Multiply, add + # Batch + if len(C_memlet.data.subset) == 3: + result *= symeval(C_memlet.data.subset.size()[0], symbols) + # M*N + result *= symeval(C_memlet.data.subset.size()[-2], symbols) + result *= symeval(C_memlet.data.subset.size()[-1], symbols) + # K + result *= symeval(A_memlet.data.subset.size()[-1], symbols) + return result + + +def count_work_reduce(node, symbols, state): + result = 0 + if node.wcr is not None: + result += count_arithmetic_ops_code(node.wcr) + in_memlet = None + in_edges = state.in_edges(node) + if in_edges is not None and len(in_edges) == 1: + in_memlet = in_edges[0] + if in_memlet is not None and in_memlet.data.volume is not None: + result *= in_memlet.data.volume + else: + result = 0 + return result + + +LIBNODES_TO_WORK = { + MatMul: count_work_matmul, + Transpose: lambda *args: 0, + Reduce: count_work_reduce, +} + + +def count_depth_matmul(node, symbols, state): + # For now we set it equal to work: see comments in count_depth_reduce just below + return count_work_matmul(node, symbols, state) + + +def count_depth_reduce(node, symbols, state): + # depth of reduction is log2 of the work + # TODO: Can we actually assume this? Or is it equal to the work? + # Another thing to consider is that we essetially do NOT count wcr edges as operations for now... + + # return sp.ceiling(sp.log(count_work_reduce(node, symbols, state), 2)) + # set it equal to work for now + return count_work_reduce(node, symbols, state) + + +LIBNODES_TO_DEPTH = { + MatMul: count_depth_matmul, + Transpose: lambda *args: 0, + Reduce: count_depth_reduce, +} + +bigo = sp.Function('bigo') +PYFUNC_TO_ARITHMETICS = { + 'float': 0, + 'dace.float64': 0, + 'dace.int64': 0, + 'math.exp': 1, + 'exp': 1, + 'math.tanh': 1, + 'sin': 1, + 'cos': 1, + 'tanh': 1, + 'math.sqrt': 1, + 'sqrt': 1, + 'atan2:': 1, + 'min': 0, + 'max': 0, + 'ceiling': 0, + 'floor': 0, + 'abs': 0 +} + + +class ArithmeticCounter(ast.NodeVisitor): + + def __init__(self): + self.count = 0 + + def visit_BinOp(self, node): + if isinstance(node.op, ast.MatMult): + raise NotImplementedError('MatMult op count requires shape ' + 'inference') + self.count += 1 + return self.generic_visit(node) + + def visit_UnaryOp(self, node): + self.count += 1 + return self.generic_visit(node) + + def visit_Call(self, node): + fname = astunparse.unparse(node.func)[:-1] + if fname not in PYFUNC_TO_ARITHMETICS: + print( + 'WARNING: Unrecognized python function "%s". If this is a type conversion, like "dace.float64", then this is fine.' 
+ % fname) + return self.generic_visit(node) + self.count += PYFUNC_TO_ARITHMETICS[fname] + return self.generic_visit(node) + + def visit_AugAssign(self, node): + return self.visit_BinOp(node) + + def visit_For(self, node): + raise NotImplementedError + + def visit_While(self, node): + raise NotImplementedError + + +def count_arithmetic_ops_code(code): + ctr = ArithmeticCounter() + if isinstance(code, (tuple, list)): + for stmt in code: + ctr.visit(stmt) + elif isinstance(code, str): + ctr.visit(ast.parse(code)) + else: + ctr.visit(code) + return ctr.count + + +class DepthCounter(ast.NodeVisitor): + # so far this is identical to the ArithmeticCounter above. + def __init__(self): + self.count = 0 + + def visit_BinOp(self, node): + if isinstance(node.op, ast.MatMult): + raise NotImplementedError('MatMult op count requires shape ' + 'inference') + self.count += 1 + return self.generic_visit(node) + + def visit_UnaryOp(self, node): + self.count += 1 + return self.generic_visit(node) + + def visit_Call(self, node): + fname = astunparse.unparse(node.func)[:-1] + if fname not in PYFUNC_TO_ARITHMETICS: + print( + 'WARNING: Unrecognized python function "%s". If this is a type conversion, like "dace.float64", then this is fine.' + % fname) + return self.generic_visit(node) + self.count += PYFUNC_TO_ARITHMETICS[fname] + return self.generic_visit(node) + + def visit_AugAssign(self, node): + return self.visit_BinOp(node) + + def visit_For(self, node): + raise NotImplementedError + + def visit_While(self, node): + raise NotImplementedError + + +def count_depth_code(code): + # so far this is the same as the work counter, since work = depth for each tasklet, as we can't assume any parallelism + ctr = ArithmeticCounter() + if isinstance(code, (tuple, list)): + for stmt in code: + ctr.visit(stmt) + elif isinstance(code, str): + ctr.visit(ast.parse(code)) + else: + ctr.visit(code) + return ctr.count + + +def tasklet_work(tasklet_node, state): + if tasklet_node.code.language == dtypes.Language.CPP: + for oedge in state.out_edges(tasklet_node): + return bigo(oedge.data.num_accesses) + + elif tasklet_node.code.language == dtypes.Language.Python: + return count_arithmetic_ops_code(tasklet_node.code.code) + else: + # other languages not implemented, count whole tasklet as work of 1 + warnings.warn('Work of tasklets only properly analyzed for Python or CPP. For all other ' + 'languages work = 1 will be counted for each tasklet.') + return 1 + + +def tasklet_depth(tasklet_node, state): + # TODO: how to get depth of CPP tasklets? + # For now we use depth == work: + if tasklet_node.code.language == dtypes.Language.CPP: + for oedge in state.out_edges(tasklet_node): + return bigo(oedge.data.num_accesses) + if tasklet_node.code.language == dtypes.Language.Python: + return count_depth_code(tasklet_node.code.code) + else: + # other languages not implemented, count whole tasklet as work of 1 + warnings.warn('Depth of tasklets only properly analyzed for Python code. 
For all other ' + 'languages depth = 1 will be counted for each tasklet.') + return 1 + + +def get_tasklet_work(node, state): + return tasklet_work(node, state), -1 + + +def get_tasklet_work_depth(node, state): + return tasklet_work(node, state), tasklet_depth(node, state) + + +def get_tasklet_avg_par(node, state): + return tasklet_work(node, state), tasklet_depth(node, state) + + +def sdfg_work_depth(sdfg: SDFG, w_d_map: Dict[str, Tuple[sp.Expr, sp.Expr]], analyze_tasklet, + symbols) -> Tuple[sp.Expr, sp.Expr]: + """ + Analyze the work and depth of a given SDFG. + First we determine the work and depth of each state. Then we break loops in the state machine, such that we get a DAG. + Lastly, we compute the path with most work and the path with the most depth in order to get the total work depth. + + :param sdfg: The SDFG to analyze. + :param w_d_map: Dictionary which will save the result. + :param analyze_tasklet: Function used to analyze tasklet nodes. + :param symbols: A dictionary mapping local nested SDFG symbols to global symbols. + :return: A tuple containing the work and depth of the SDFG. + """ + + # First determine the work and depth of each state individually. + # Keep track of the work and depth for each state in a dictionary, where work and depth are multiplied by the number + # of times the state will be executed. + state_depths: Dict[SDFGState, sp.Expr] = {} + state_works: Dict[SDFGState, sp.Expr] = {} + for state in sdfg.nodes(): + state_work, state_depth = state_work_depth(state, w_d_map, analyze_tasklet, symbols) + state_works[state] = sp.simplify(state_work * state.executions) + state_depths[state] = sp.simplify(state_depth * state.executions) + w_d_map[get_uuid(state)] = (state_works[state], state_depths[state]) + + # Prepare the SDFG for a depth analysis by breaking loops. This removes the edge between the last loop state and + # the guard, and instead places an edge between the last loop state and the exit state. + # This transforms the state machine into a DAG. Hence, we can find the "heaviest" and "deepest" paths in linear time. + # Additionally, construct a dummy exit state and connect every state that has no outgoing edges to it. + + # identify all loops in the SDFG + nodes_oNodes_exits = find_loop_guards_tails_exits(sdfg._nx) + + # Now we need to go over each triple (node, oNode, exits). For each triple, we + # - remove edge (oNode, node), i.e. the backward edge + # - for all exits e, add edge (oNode, e). This edge may already exist + for node, oNode, exits in nodes_oNodes_exits: + sdfg.remove_edge(sdfg.edges_between(oNode, node)[0]) + for e in exits: + if len(sdfg.edges_between(oNode, e)) == 0: + # no edge there yet + sdfg.add_edge(oNode, e, InterstateEdge()) + + # add a dummy exit to the SDFG, such that each path ends there. + dummy_exit = sdfg.add_state('dummy_exit') + for state in sdfg.nodes(): + if len(sdfg.out_edges(state)) == 0 and state != dummy_exit: + sdfg.add_edge(state, dummy_exit, InterstateEdge()) + + # These two dicts save the current length of the "heaviest", resp. "deepest", paths at each state. + work_map: Dict[SDFGState, sp.Expr] = {} + depth_map: Dict[SDFGState, sp.Expr] = {} + # The dummy state has 0 work and depth. + state_depths[dummy_exit] = sp.sympify(0) + state_works[dummy_exit] = sp.sympify(0) + + # Perform a BFS traversal of the state machine and calculate the maximum work / depth at each state. 
Only advance to
+    # the next state in the BFS if all incoming edges have been visited, to ensure the maximum work / depth expressions
+    # have been calculated.
+    traversal_q = deque()
+    traversal_q.append((sdfg.start_state, sp.sympify(0), sp.sympify(0), None))
+    visited = set()
+    while traversal_q:
+        state, depth, work, ie = traversal_q.popleft()
+
+        if ie is not None:
+            visited.add(ie)
+
+        n_depth = sp.simplify(depth + state_depths[state])
+        n_work = sp.simplify(work + state_works[state])
+
+        # If we are analyzing average parallelism, we don't search the "heaviest" and "deepest" paths separately; instead, we
+        # want the single path with the least average parallelism (out of all paths with non-zero depth).
+        if analyze_tasklet == get_tasklet_avg_par:
+            if state in depth_map:  # and hence also in work_map
+                # if current path has 0 depth, we don't do anything.
+                if n_depth != 0:
+                    # see if we need to update the work and depth of the current state
+                    # we update if avg parallelism of new incoming path is less than current avg parallelism
+                    old_avg_par = sp.simplify(work_map[state] / depth_map[state])
+                    new_avg_par = sp.simplify(n_work / n_depth)
+
+                    if depth_map[state] == 0 or new_avg_par < old_avg_par:
+                        # the old value was a division by zero, or the new path has an even lower avg par, so we keep the new values
+                        depth_map[state] = n_depth
+                        work_map[state] = n_work
+            else:
+                depth_map[state] = n_depth
+                work_map[state] = n_work
+        else:
+            # search heaviest and deepest path separately
+            if state in depth_map:  # and consequently also in work_map
+                depth_map[state] = sp.Max(depth_map[state], n_depth)
+                work_map[state] = sp.Max(work_map[state], n_work)
+            else:
+                depth_map[state] = n_depth
+                work_map[state] = n_work
+
+        out_edges = sdfg.out_edges(state)
+        # only advance after all incoming edges were visited (meaning that current work depth values of state are final).
+        if any(iedge not in visited for iedge in sdfg.in_edges(state)):
+            pass
+        else:
+            for oedge in out_edges:
+                traversal_q.append((oedge.dst, depth_map[state], work_map[state], oedge))
+
+    try:
+        max_depth = depth_map[dummy_exit]
+        max_work = work_map[dummy_exit]
+    except KeyError:
+        # If we get a KeyError above, this means that the traversal never reached the dummy_exit state.
+        # This happens if the loops were not properly detected and broken.
+        raise Exception(
+            'Analysis failed, since not all loops got detected. It may help to use more structured loop constructs.')
+
+    sdfg_result = (sp.simplify(max_work), sp.simplify(max_depth))
+    w_d_map[get_uuid(sdfg)] = sdfg_result
+    return sdfg_result
+
+
+def scope_work_depth(state: SDFGState,
+                     w_d_map: Dict[str, sp.Expr],
+                     analyze_tasklet,
+                     symbols,
+                     entry: nd.EntryNode = None) -> Tuple[sp.Expr, sp.Expr]:
+    """
+    Analyze the work and depth of a scope.
+    This works by traversing through the scope analyzing the work and depth of each encountered node.
+    Depending on what kind of node we encounter, we do the following:
+        - EntryNode: Recursively analyze work depth of scope.
+        - Tasklet: use analyze_tasklet to get work depth of tasklet node.
+        - NestedSDFG: After translating its local symbols to global symbols, we analyze the nested SDFG recursively.
+        - LibraryNode: Library nodes are analyzed with special functions depending on their type.
+    Work inside a state can simply be summed up, but for the depth we need to find the longest path. Since dataflow is a DAG,
+    this can be done in linear time by traversing the graph in topological order.
+
+    :param state: The state in which the scope to analyze is contained.
+    :param w_d_map: Dictionary which will save the result.
+    :param analyze_tasklet: Function used to analyze tasklet nodes.
+    :param symbols: A dictionary mapping local nested SDFG symbols to global symbols.
+    :param entry: The entry node of the scope to analyze. If None, the entire state is analyzed.
+    :return: A tuple containing the work and depth of the scope.
+    """
+
+    # find the work and depth of each node
+    # for maps and nested SDFG, we do it recursively
+    work = sp.sympify(0)
+    max_depth = sp.sympify(0)
+    scope_nodes = state.scope_children()[entry]
+    scope_exit = None if entry is None else state.exit_node(entry)
+    for node in scope_nodes:
+        # add node to map
+        w_d_map[get_uuid(node, state)] = (sp.sympify(0), sp.sympify(0))
+        if isinstance(node, nd.EntryNode):
+            # If the scope contains an entry node, we need to recursively analyze the sub-scope of the entry node first.
+            # The resulting work/depth are summarized into the entry node
+            s_work, s_depth = scope_work_depth(state, w_d_map, analyze_tasklet, symbols, node)
+            # add up work for whole state, but also save work for this sub-scope in w_d_map
+            work += s_work
+            w_d_map[get_uuid(node, state)] = (s_work, s_depth)
+        elif node == scope_exit:
+            # don't do anything for exit nodes, everything is handled already in the corresponding entry node.
+            pass
+        elif isinstance(node, nd.Tasklet):
+            # add up work for whole state, but also save work for this node in w_d_map
+            t_work, t_depth = analyze_tasklet(node, state)
+            work += t_work
+            w_d_map[get_uuid(node, state)] = (sp.sympify(t_work), sp.sympify(t_depth))
+        elif isinstance(node, nd.NestedSDFG):
+            # keep track of nested symbols: "symbols" maps local nested SDFG symbols to global symbols.
+            # We only want global symbols in our final work depth expressions.
+            nested_syms = {}
+            nested_syms.update(symbols)
+            nested_syms.update(evaluate_symbols(symbols, node.symbol_mapping))
+            # Nested SDFGs are recursively analyzed first.
+            nsdfg_work, nsdfg_depth = sdfg_work_depth(node.sdfg, w_d_map, analyze_tasklet, nested_syms)
+
+            # add up work for whole state, but also save work for this nested SDFG in w_d_map
+            work += nsdfg_work
+            w_d_map[get_uuid(node, state)] = (nsdfg_work, nsdfg_depth)
+        elif isinstance(node, nd.LibraryNode):
+            lib_node_work = LIBNODES_TO_WORK[type(node)](node, symbols, state)
+            work += lib_node_work
+            lib_node_depth = -1  # not analyzed
+            if analyze_tasklet != get_tasklet_work:
+                # we are analyzing depth
+                lib_node_depth = LIBNODES_TO_DEPTH[type(node)](node, symbols, state)
+            w_d_map[get_uuid(node, state)] = (lib_node_work, lib_node_depth)
+
+    if entry is not None:
+        # If the scope being analyzed is a map, multiply the work by the number of iterations of the map.
+        if isinstance(entry, nd.MapEntry):
+            nmap: nd.Map = entry.map
+            range: Range = nmap.range
+            n_exec = range.num_elements_exact()
+            work = work * sp.simplify(n_exec)
+        else:
+            print('WARNING: Only Map scopes are supported in work analysis for now. Assuming 1 iteration.')
+
+    # Work inside a state can simply be summed up. But now we need to find the depth of a state (i.e. longest path).
+    # Since the dataflow graph is a DAG, this can be done in linear time.
+    max_depth = sp.sympify(0)
+    # only do this if we are analyzing depth
+    if analyze_tasklet == get_tasklet_work_depth or analyze_tasklet == get_tasklet_avg_par:
+        # Calculate the maximum depth of the scope by finding the 'deepest' path from the source to the sink. This is done by
+        # a traversal in topological order, where each node propagates its current max depth for all incoming paths.
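+        # For example, in a diamond-shaped scope a -> {b, c} -> d with node depths 1, 5, 3 and 1, the propagation
+        # below yields a: 1, b: 1 + 5 = 6, c: 1 + 3 = 4 and d: Max(6 + 1, 4 + 1) = 7, i.e. the longest path.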
+ traversal_q = deque() + visited = set() + # find all starting nodes + if entry: + # the entry is the starting node + traversal_q.append((entry, sp.sympify(0), None)) + else: + for node in scope_nodes: + if len(state.in_edges(node)) == 0: + # This node is a start node of the traversal + traversal_q.append((node, sp.sympify(0), None)) + # this map keeps track of the length of the longest path ending at each state so far seen. + depth_map = {} + while traversal_q: + node, in_depth, in_edge = traversal_q.popleft() + + if in_edge is not None: + visited.add(in_edge) + + n_depth = sp.simplify(in_depth + w_d_map[get_uuid(node, state)][1]) + + if node in depth_map: + depth_map[node] = sp.Max(depth_map[node], n_depth) + else: + depth_map[node] = n_depth + + out_edges = state.out_edges(node) + # Only advance to next node, if all incoming edges have been visited or the current node is the entry (aka starting node). + # If the current node is the exit of the scope, we stop, such that we don't leave the scope. + if (all(iedge in visited for iedge in state.in_edges(node)) or node == entry) and node != scope_exit: + # If we encounter a nested map, we must not analyze its contents (as they have already been recursively analyzed). + # Hence, we continue from the outgoing edges of the corresponding exit. + if isinstance(node, nd.EntryNode) and node != entry: + exit_node = state.exit_node(node) + # replace out_edges with the out_edges of the scope exit node + out_edges = state.out_edges(exit_node) + for oedge in out_edges: + traversal_q.append((oedge.dst, depth_map[node], oedge)) + if len(out_edges) == 0 or node == scope_exit: + # We have reached an end node --> update max_depth + max_depth = sp.Max(max_depth, depth_map[node]) + + # summarise work / depth of the whole scope in the dictionary + scope_result = (sp.simplify(work), sp.simplify(max_depth)) + w_d_map[get_uuid(state)] = scope_result + return scope_result + + +def state_work_depth(state: SDFGState, w_d_map: Dict[str, sp.Expr], analyze_tasklet, + symbols) -> Tuple[sp.Expr, sp.Expr]: + """ + Analyze the work and depth of a state. + + :param state: The state to analyze. + :param w_d_map: The result will be saved to this map. + :param analyze_tasklet: Function used to analyze tasklet nodes. + :param symbols: A dictionary mapping local nested SDFG symbols to global symbols. + :return: A tuple containing the work and depth of the state. + """ + work, depth = scope_work_depth(state, w_d_map, analyze_tasklet, symbols, None) + return work, depth + + +def analyze_sdfg(sdfg: SDFG, w_d_map: Dict[str, sp.Expr], analyze_tasklet) -> None: + """ + Analyze a given SDFG. We can either analyze work, work and depth or average parallelism. + + :note: SDFGs should have split interstate edges. This means there should be no interstate edges containing both a + condition and an assignment. + :param sdfg: The SDFG to analyze. + :param w_d_map: Dictionary of SDFG elements to (work, depth) tuples. Result will be saved in here. + :param analyze_tasklet: The function used to analyze tasklet nodes. Analyzes either just work, work and depth or average parallelism. + """ + + # deepcopy such that original sdfg not changed + sdfg = deepcopy(sdfg) + + # Run state propagation for all SDFGs recursively. 
This is necessary to determine the number of times each state + # will be executed, or to determine upper bounds for that number (such as in the case of branching) + for sd in sdfg.all_sdfgs_recursive(): + propagation.propagate_states(sd, concretize_dynamic_unbounded=True) + + # Analyze the work and depth of the SDFG. + symbols = {} + sdfg_work_depth(sdfg, w_d_map, analyze_tasklet, symbols) + + # Note: This posify could be done more often to improve performance. + array_symbols = get_array_size_symbols(sdfg) + for k, (v_w, v_d) in w_d_map.items(): + # The symeval replaces nested SDFG symbols with their global counterparts. + v_w = posify_certain_symbols(symeval(v_w, symbols), array_symbols) + v_d = posify_certain_symbols(symeval(v_d, symbols), array_symbols) + w_d_map[k] = (v_w, v_d) + + +################################################################################ +# Utility functions for running the analysis from the command line ############# +################################################################################ + + +def main() -> None: + + parser = argparse.ArgumentParser('work_depth', + usage='python work_depth.py [-h] filename --analyze {work,workDepth,avgPar}', + description='Analyze the work/depth of an SDFG.') + + parser.add_argument('filename', type=str, help='The SDFG file to analyze.') + parser.add_argument('--analyze', + choices=['work', 'workDepth', 'avgPar'], + default='workDepth', + help='Choose what to analyze. Default: workDepth') + + args = parser.parse_args() + + if not os.path.exists(args.filename): + print(args.filename, 'does not exist.') + exit() + + if args.analyze == 'workDepth': + analyze_tasklet = get_tasklet_work_depth + elif args.analyze == 'avgPar': + analyze_tasklet = get_tasklet_avg_par + elif args.analyze == 'work': + analyze_tasklet = get_tasklet_work + + sdfg = SDFG.from_file(args.filename) + work_depth_map = {} + analyze_sdfg(sdfg, work_depth_map, analyze_tasklet) + + if args.analyze == 'workDepth': + for k, v, in work_depth_map.items(): + work_depth_map[k] = (str(sp.simplify(v[0])), str(sp.simplify(v[1]))) + elif args.analyze == 'work': + for k, v, in work_depth_map.items(): + work_depth_map[k] = str(sp.simplify(v[0])) + elif args.analyze == 'avgPar': + for k, v, in work_depth_map.items(): + work_depth_map[k] = str(sp.simplify(v[0] / v[1]) if str(v[1]) != '0' else 0) # work / depth = avg par + + result_whole_sdfg = work_depth_map[get_uuid(sdfg)] + + print(80 * '-') + if args.analyze == 'workDepth': + print("Work:\t", result_whole_sdfg[0]) + print("Depth:\t", result_whole_sdfg[1]) + elif args.analyze == 'work': + print("Work:\t", result_whole_sdfg) + elif args.analyze == 'avgPar': + print("Average Parallelism:\t", result_whole_sdfg) + print(80 * '-') + + +if __name__ == '__main__': + main() diff --git a/tests/sdfg/work_depth_tests.py b/tests/sdfg/work_depth_tests.py new file mode 100644 index 0000000000..133afe8ae4 --- /dev/null +++ b/tests/sdfg/work_depth_tests.py @@ -0,0 +1,201 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" Contains test cases for the work depth analysis. """ +import dace as dc +from dace.sdfg.work_depth_analysis.work_depth import analyze_sdfg, get_tasklet_work_depth +from dace.sdfg.work_depth_analysis.helpers import get_uuid +import sympy as sp + +from dace.transformation.interstate import NestSDFG +from dace.transformation.dataflow import MapExpansion + +# TODO: add tests for library nodes (e.g. 
reduce, matMul) + +N = dc.symbol('N') +M = dc.symbol('M') +K = dc.symbol('K') + + +@dc.program +def single_map(x: dc.float64[N], y: dc.float64[N], z: dc.float64[N]): + z[:] = x + y + + +@dc.program +def single_for_loop(x: dc.float64[N], y: dc.float64[N]): + for i in range(N): + x[i] += y[i] + + +@dc.program +def if_else(x: dc.int64[1000], y: dc.int64[1000], z: dc.int64[1000], sum: dc.int64[1]): + if x[10] > 50: + z[:] = x + y # 1000 work, 1 depth + else: + for i in range(100): # 100 work, 100 depth + sum += x[i] + + +@dc.program +def if_else_sym(x: dc.int64[N], y: dc.int64[N], z: dc.int64[N], sum: dc.int64[1]): + if x[10] > 50: + z[:] = x + y # N work, 1 depth + else: + for i in range(K): # K work, K depth + sum += x[i] + + +@dc.program +def nested_sdfg(x: dc.float64[N], y: dc.float64[N], z: dc.float64[N]): + single_map(x, y, z) + single_for_loop(x, y) + + +@dc.program +def nested_maps(x: dc.float64[N, M], y: dc.float64[N, M], z: dc.float64[N, M]): + z[:, :] = x + y + + +@dc.program +def nested_for_loops(x: dc.float64[N], y: dc.float64[K]): + for i in range(N): + for j in range(K): + x[i] += y[j] + + +@dc.program +def nested_if_else(x: dc.int64[N], y: dc.int64[N], z: dc.int64[N], sum: dc.int64[1]): + if x[10] > 50: + if x[9] > 50: + z[:] = x + y # N work, 1 depth + z[:] += 2 * x # 2*N work, 2 depth --> total outer if: 3*N work, 3 depth + else: + if y[9] > 50: + for i in range(K): + sum += x[i] # K work, K depth + else: + for j in range(M): + sum += x[j] # M work, M depth + z[:] = x + y # N work, depth 1 --> total inner else: M+N work, M+1 depth + # --> total outer else: Max(K, M+N) work, Max(K, M+1) depth + # --> total over both branches: Max(K, M+N, 3*N) work, Max(K, M+1, 3) depth + + +@dc.program +def max_of_positive_symbol(x: dc.float64[N]): + if x[0] > 0: + for i in range(2 * N): # work 2*N^2, depth 2*N + x += 1 + else: + for j in range(3 * N): # work 3*N^2, depth 3*N + x += 1 + # total is work 3*N^2, depth 3*N without any max + + +@dc.program +def multiple_array_sizes(x: dc.int64[N], y: dc.int64[N], z: dc.int64[N], x2: dc.int64[M], y2: dc.int64[M], + z2: dc.int64[M], x3: dc.int64[K], y3: dc.int64[K], z3: dc.int64[K]): + if x[0] > 0: + z[:] = 2 * x + y # work 2*N, depth 2 + elif x[1] > 0: + z2[:] = 2 * x2 + y2 # work 2*M + 3, depth 5 + z2[0] += 3 + z[1] + z[2] + elif x[2] > 0: + z3[:] = 2 * x3 + y3 # work 2*K, depth 2 + elif x[3] > 0: + z[:] = 3 * x + y + 1 # work 3*N, depth 3 + # --> work= Max(3*N, 2*M, 2*K) and depth = 5 + + +@dc.program +def unbounded_while_do(x: dc.float64[N]): + while x[0] < 100: + x += 1 + + +@dc.program +def unbounded_do_while(x: dc.float64[N]): + while True: + x += 1 + if x[0] >= 100: + break + + +@dc.program +def unbounded_nonnegify(x: dc.float64[N]): + while x[0] < 100: + if x[1] < 42: + x += 3 * x + else: + x += x + + +@dc.program +def continue_for_loop(x: dc.float64[N]): + for i in range(N): + if x[i] > 100: + continue + x += 1 + + +@dc.program +def break_for_loop(x: dc.float64[N]): + for i in range(N): + if x[i] > 100: + break + x += 1 + + +@dc.program +def break_while_loop(x: dc.float64[N]): + while x[0] > 10: + if x[1] > 100: + break + x += 1 + + +tests_cases = [ + (single_map, (N, 1)), + (single_for_loop, (N, N)), + (if_else, (1000, 100)), + (if_else_sym, (sp.Max(K, N), sp.Max(1, K))), + (nested_sdfg, (2 * N, N + 1)), + (nested_maps, (M * N, 1)), + (nested_for_loops, (K * N, K * N)), + (nested_if_else, (sp.Max(K, 3 * N, M + N), sp.Max(3, K, M + 1))), + (max_of_positive_symbol, (3 * N**2, 3 * N)), + (multiple_array_sizes, (sp.Max(2 * K, 3 * N, 2 * M + 
3), 5)), + (unbounded_while_do, (sp.Symbol('num_execs_0_2', nonnegative=True) * N, sp.Symbol('num_execs_0_2', + nonnegative=True))), + # We get this Max(1, num_execs), since it is a do-while loop, but the num_execs symbol does not capture this. + (unbounded_do_while, (sp.Max(1, sp.Symbol('num_execs_0_1', nonnegative=True)) * N, + sp.Max(1, sp.Symbol('num_execs_0_1', nonnegative=True)))), + (unbounded_nonnegify, (2 * sp.Symbol('num_execs_0_7', nonnegative=True) * N, + 2 * sp.Symbol('num_execs_0_7', nonnegative=True))), + (continue_for_loop, (sp.Symbol('num_execs_0_6', nonnegative=True) * N, sp.Symbol('num_execs_0_6', + nonnegative=True))), + (break_for_loop, (N**2, N)), + (break_while_loop, (sp.Symbol('num_execs_0_5', nonnegative=True) * N, sp.Symbol('num_execs_0_5', nonnegative=True))) +] + + +def test_work_depth(): + good = 0 + failed = 0 + exception = 0 + failed_tests = [] + for test, correct in tests_cases: + w_d_map = {} + sdfg = test.to_sdfg() + if 'nested_sdfg' in test.name: + sdfg.apply_transformations(NestSDFG) + if 'nested_maps' in test.name: + sdfg.apply_transformations(MapExpansion) + + analyze_sdfg(sdfg, w_d_map, get_tasklet_work_depth) + res = w_d_map[get_uuid(sdfg)] + # check result + assert correct == res + + +if __name__ == '__main__': + test_work_depth() From 1cb9f9fa459390df0267b1f9365bb62793563b95 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 17 Aug 2023 13:58:33 +0200 Subject: [PATCH 356/392] Added support for StructureViews. --- dace/codegen/compiled_sdfg.py | 2 +- dace/codegen/dispatcher.py | 4 ++-- dace/codegen/targets/cpu.py | 20 ++++++++++++++++---- dace/codegen/targets/framecode.py | 2 +- dace/data.py | 1 + dace/sdfg/utils.py | 2 +- 6 files changed, 22 insertions(+), 9 deletions(-) diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 863e804802..9ee0772eeb 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -473,7 +473,7 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: else: warnings.warn(f'Casting scalar argument "{a}" from {type(arg).__name__} to {atype.dtype.type}') arglist[i] = atype.dtype.type(arg) - elif (isinstance(atype, dt.Array) and isinstance(arg, np.ndarray) + elif (isinstance(atype, dt.Array) and isinstance(arg, np.ndarray) and not isinstance(atype, dt.StructArray) and atype.dtype.as_numpy_dtype() != arg.dtype): # Make exception for vector types if (isinstance(atype.dtype, dtypes.vector) and atype.dtype.vtype.as_numpy_dtype() == arg.dtype): diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 0b4f58d5ef..5972f5759d 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -504,11 +504,11 @@ def get_copy_dispatcher(self, src_node, dst_node, edge, sdfg, state): dst_is_data = True # Skip copies to/from views where edge matches - if src_is_data and isinstance(src_node.desc(sdfg), dt.View): + if src_is_data and isinstance(src_node.desc(sdfg), (dt.StructureView, dt.View)): e = sdutil.get_view_edge(state, src_node) if e is edge: return None - if dst_is_data and isinstance(dst_node.desc(sdfg), dt.View): + if dst_is_data and isinstance(dst_node.desc(sdfg), (dt.StructureView, dt.View)): e = sdutil.get_view_edge(state, dst_node) if e is edge: return None diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 3cd262e050..1fa4778806 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -215,9 +215,21 @@ def allocate_view(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: 
nodes.A ancestor=0, is_write=is_write) if not declared: - declaration_stream.write(f'{atype} {aname};', sdfg, state_id, node) ctypedef = dtypes.pointer(nodedesc.dtype).ctype self._dispatcher.declared_arrays.add(aname, DefinedType.Pointer, ctypedef) + if isinstance(nodedesc, data.StructureView): + for k, v in nodedesc.members.items(): + if isinstance(v, data.Data): + ctypedef = dtypes.pointer(v.dtype).ctype if isinstance(v, data.Array) else v.dtype.ctype + defined_type = DefinedType.Scalar if isinstance(v, data.Scalar) else DefinedType.Pointer + self._dispatcher.declared_arrays.add(f"{name}.{k}", defined_type, ctypedef) + self._dispatcher.defined_vars.add(f"{name}.{k}", defined_type, ctypedef) + # TODO: Find a better way to do this (the issue is with pointers of pointers) + if atype.endswith('*'): + atype = atype[:-1] + if value.startswith('&'): + value = value[1:] + declaration_stream.write(f'{atype} {aname};', sdfg, state_id, node) allocation_stream.write(f'{aname} = {value};', sdfg, state_id, node) def allocate_reference(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.AccessNode, @@ -311,7 +323,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d if not isinstance(nodedesc.dtype, dtypes.opaque): arrsize_bytes = arrsize * nodedesc.dtype.bytes - if isinstance(nodedesc, data.Structure): + if isinstance(nodedesc, data.Structure) and not isinstance(nodedesc, data.StructureView): declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type}();\n") define_var(name, DefinedType.Pointer, nodedesc.ctype) for k, v in nodedesc.members.items(): @@ -322,7 +334,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d self.allocate_array(sdfg, dfg, state_id, nodes.AccessNode(f"{name}.{k}"), v, function_stream, declaration_stream, allocation_stream) return - if isinstance(nodedesc, data.View): + if isinstance(nodedesc, (data.StructureView, data.View)): return self.allocate_view(sdfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) if isinstance(nodedesc, data.Reference): return self.allocate_reference(sdfg, dfg, state_id, node, function_stream, declaration_stream, @@ -487,7 +499,7 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, dtypes.AllocationLifetime.External) self._dispatcher.declared_arrays.remove(alloc_name, is_global=is_global) - if isinstance(nodedesc, (data.Scalar, data.View, data.Stream, data.Reference)): + if isinstance(nodedesc, (data.Scalar, data.StructureView, data.View, data.Stream, data.Reference)): return elif (nodedesc.storage == dtypes.StorageType.CPU_Heap or (nodedesc.storage == dtypes.StorageType.Register and symbolic.issymbolic(arrsize, sdfg.constants))): diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 52915f51b5..9ee5c2ef17 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -749,7 +749,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): instances = access_instances[sdfg.sdfg_id][name] # A view gets "allocated" everywhere it appears - if isinstance(desc, data.View): + if isinstance(desc, (data.StructureView, data.View)): for s, n in instances: self.to_allocate[s].append((sdfg, s, n, False, True, False)) self.to_allocate[s].append((sdfg, s, n, False, False, True)) diff --git a/dace/data.py b/dace/data.py index 99d7ffc774..bf771db1d4 100644 --- a/dace/data.py +++ b/dace/data.py @@ -510,6 +510,7 @@ def validate(self): if self.lifetime != 
dtypes.AllocationLifetime.Scope: raise ValueError('Only Scope allocation lifetime is supported for Views') + @make_properties class Scalar(Data): """ Data descriptor of a scalar value. """ diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index d08518b10c..3396335ece 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1396,7 +1396,7 @@ def is_nonfree_sym_dependent(node: nd.AccessNode, desc: dt.Data, state: SDFGStat :param state: the state that contains the node :param fsymbols: the free symbols to check against """ - if isinstance(desc, dt.View): + if isinstance(desc, (dt.StructureView, dt.View)): # Views can be non-free symbol dependent due to the adjacent edges. e = get_view_edge(state, node) if e.data: From 5a2c4602c2341f057a5159c3cbe2437f33ab24e8 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 17 Aug 2023 13:58:58 +0200 Subject: [PATCH 357/392] Added tests for StructArrays. --- tests/sdfg/data/struct_array_test.py | 184 +++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/sdfg/data/struct_array_test.py diff --git a/tests/sdfg/data/struct_array_test.py b/tests/sdfg/data/struct_array_test.py new file mode 100644 index 0000000000..9b40379e53 --- /dev/null +++ b/tests/sdfg/data/struct_array_test.py @@ -0,0 +1,184 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import ctypes +import dace +import numpy as np + +from scipy import sparse + + +def test_read_struct_array(): + + L, M, N, nnz = (dace.symbol(s) for s in ('L', 'M', 'N', 'nnz')) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], + name='CSRMatrix') + csr_obj_view = dace.data.StructureView(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], + name='CSRMatrix', + transient=True) + + sdfg = dace.SDFG('array_of_csr_to_dense') + + sdfg.add_datadesc('A', csr_obj[L]) + sdfg.add_array('B', [L, M, N], dace.float32) + + sdfg.add_datadesc('vcsr', csr_obj_view) + sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) + + state = sdfg.add_state() + + A = state.add_access('A') + B = state.add_access('B') + + bme, bmx = state.add_map('b', dict(b='0:L')) + bme.map.schedule = dace.ScheduleType.Sequential + + vcsr = state.add_access('vcsr') + indptr = state.add_access('vindptr') + indices = state.add_access('vindices') + data = state.add_access('vdata') + + state.add_memlet_path(A, bme, vcsr, dst_conn='views', memlet=dace.Memlet(data='A', subset='b')) + state.add_edge(vcsr, None, indptr, 'views', memlet=dace.Memlet.from_array('vcsr.indptr', csr_obj.members['indptr'])) + state.add_edge(vcsr, None, indices, 'views', memlet=dace.Memlet.from_array('vcsr.indices', csr_obj.members['indices'])) + state.add_edge(vcsr, None, data, 'views', memlet=dace.Memlet.from_array('vcsr.data', csr_obj.members['data'])) + + ime, imx = state.add_map('i', dict(i='0:M')) + jme, jmx = state.add_map('idx', dict(idx='start:stop')) + jme.add_in_connector('start') + jme.add_in_connector('stop') + t = state.add_tasklet('indirection', {'j', '__val'}, {'__out'}, '__out[i, j] = __val') + + state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i'), dst_conn='start') + 
state.add_memlet_path(indptr, ime, jme, memlet=dace.Memlet(data='vindptr', subset='i+1'), dst_conn='stop') + state.add_memlet_path(indices, ime, jme, t, memlet=dace.Memlet(data='vindices', subset='idx'), dst_conn='j') + state.add_memlet_path(data, ime, jme, t, memlet=dace.Memlet(data='vdata', subset='idx'), dst_conn='__val') + state.add_memlet_path(t, jmx, imx, bmx, B, memlet=dace.Memlet(data='B', subset='b, 0:M, 0:N', volume=1), src_conn='__out') + + func = sdfg.compile() + + rng = np.random.default_rng(42) + A = np.ndarray((10,), dtype=sparse.csr_matrix) + dace_A = np.ndarray((10,), dtype=ctypes.c_void_p) + B = np.zeros((10, 20, 20), dtype=np.float32) + + ctypes_A = [] + for b in range(10): + A[b] = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + ctypes_obj = csr_obj.dtype._typeclass.as_ctypes()(indptr=A[b].indptr.__array_interface__['data'][0], + indices=A[b].indices.__array_interface__['data'][0], + data=A[b].data.__array_interface__['data'][0]) + ctypes_A.append(ctypes_obj) # This is needed to keep the object alive ... + dace_A[b] = ctypes.addressof(ctypes_obj) + + func(A=dace_A, B=B, L=A.shape[0], M=A[0].shape[0], N=A[0].shape[1], nnz=A[0].nnz) + ref = np.ndarray((10, 20, 20), dtype=np.float32) + for b in range(10): + ref[b] = A[b].toarray() + + assert np.allclose(B, ref) + + +def test_write_struct_array(): + + L, M, N, nnz = (dace.symbol(s) for s in ('L', 'M', 'N', 'nnz')) + csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], + name='CSRMatrix') + csr_obj_view = dace.data.StructureView(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + order=['indptr', 'indices', 'data'], + name='CSRMatrix', + transient=True) + + sdfg = dace.SDFG('array_dense_to_csr') + + sdfg.add_array('A', [L, M, N], dace.float32) + sdfg.add_datadesc('B', csr_obj[L]) + + sdfg.add_datadesc('vcsr', csr_obj_view) + sdfg.add_view('vindptr', csr_obj.members['indptr'].shape, csr_obj.members['indptr'].dtype) + sdfg.add_view('vindices', csr_obj.members['indices'].shape, csr_obj.members['indices'].dtype) + sdfg.add_view('vdata', csr_obj.members['data'].shape, csr_obj.members['data'].dtype) + + # Make If + if_before = sdfg.add_state('if_before') + if_guard = sdfg.add_state('if_guard') + if_body = sdfg.add_state('if_body') + if_after = sdfg.add_state('if_after') + sdfg.add_edge(if_before, if_guard, dace.InterstateEdge()) + sdfg.add_edge(if_guard, if_body, dace.InterstateEdge(condition='A[k, i, j] != 0')) + sdfg.add_edge(if_body, if_after, dace.InterstateEdge(assignments={'idx': 'idx + 1'})) + sdfg.add_edge(if_guard, if_after, dace.InterstateEdge(condition='A[k, i, j] == 0')) + A = if_body.add_access('A') + vcsr = if_body.add_access('vcsr') + B = if_body.add_access('B') + indices = if_body.add_access('vindices') + data = if_body.add_access('vdata') + if_body.add_edge(A, None, data, None, dace.Memlet(data='A', subset='k, i, j', other_subset='idx')) + if_body.add_edge(data, 'views', vcsr, None, dace.Memlet(data='vcsr.data', subset='0:nnz')) + t = if_body.add_tasklet('set_indices', {}, {'__out'}, '__out = j') + if_body.add_edge(t, '__out', indices, None, dace.Memlet(data='vindices', subset='idx')) + if_body.add_edge(indices, 'views', vcsr, None, dace.Memlet(data='vcsr.indices', subset='0:nnz')) + if_body.add_edge(vcsr, 'views', B, None, dace.Memlet(data='B', subset='k')) + # Make For Loop for j + j_before, j_guard, j_after = sdfg.add_loop(None, + if_before, + 
None, + 'j', + '0', + 'j < N', + 'j + 1', + loop_end_state=if_after) + # Make For Loop for i + i_before, i_guard, i_after = sdfg.add_loop(None, j_before, None, 'i', '0', 'i < M', 'i + 1', loop_end_state=j_after) + sdfg.start_state = sdfg.node_id(i_before) + i_before_guard = sdfg.edges_between(i_before, i_guard)[0] + i_before_guard.data.assignments['idx'] = '0' + vcsr = i_guard.add_access('vcsr') + B = i_guard.add_access('B') + indptr = i_guard.add_access('vindptr') + t = i_guard.add_tasklet('set_indptr', {}, {'__out'}, '__out = idx') + i_guard.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='i')) + i_guard.add_edge(indptr, 'views', vcsr, None, dace.Memlet(data='vcsr.indptr', subset='0:M+1')) + i_guard.add_edge(vcsr, 'views', B, None, dace.Memlet(data='B', subset='k')) + vcsr = i_after.add_access('vcsr') + B = i_after.add_access('B') + indptr = i_after.add_access('vindptr') + t = i_after.add_tasklet('set_indptr', {}, {'__out'}, '__out = nnz') + i_after.add_edge(t, '__out', indptr, None, dace.Memlet(data='vindptr', subset='M')) + i_after.add_edge(indptr, 'views', vcsr, None, dace.Memlet(data='vcsr.indptr', subset='0:M+1')) + i_after.add_edge(vcsr, 'views', B, None, dace.Memlet(data='B', subset='k')) + + k_before, k_guard, k_after = sdfg.add_loop(None, i_before, None, 'k', '0', 'k < L', 'k + 1', loop_end_state=i_after) + + func = sdfg.compile() + + rng = np.random.default_rng(42) + B = np.ndarray((10,), dtype=sparse.csr_matrix) + dace_B = np.ndarray((10,), dtype=ctypes.c_void_p) + A = np.empty((10, 20, 20), dtype=np.float32) + + ctypes_B = [] + for b in range(10): + B[b] = sparse.random(20, 20, density=0.1, format='csr', dtype=np.float32, random_state=rng) + A[b] = B[b].toarray() + nnz = B[b].nnz + B[b].indptr[:] = -1 + B[b].indices[:] = -1 + B[b].data[:] = -1 + ctypes_obj = csr_obj.dtype._typeclass.as_ctypes()(indptr=B[b].indptr.__array_interface__['data'][0], + indices=B[b].indices.__array_interface__['data'][0], + data=B[b].data.__array_interface__['data'][0]) + ctypes_B.append(ctypes_obj) # This is needed to keep the object alive ... + dace_B[b] = ctypes.addressof(ctypes_obj) + + func(A=A, B=dace_B, L=B.shape[0], M=B[0].shape[0], N=B[0].shape[1], nnz=nnz) + for b in range(10): + assert np.allclose(A[b], B[b].toarray()) + + +if __name__ == '__main__': + test_read_struct_array() + test_write_struct_array() From f1b0c73dffee4468119cd1575edecc9f1fa7bdab Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Thu, 17 Aug 2023 15:15:24 +0200 Subject: [PATCH 358/392] Fixed serialization. 
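
The StructArray data descriptor (including the nested Structure type it
carries in its ``stype`` property) can now round-trip through JSON. A minimal
sketch of the intended use, assuming a CSR-like structure as in the tests of
this series:

    import dace

    M, nnz = dace.symbol('M'), dace.symbol('nnz')
    csr = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz],
                                   data=dace.float32[nnz]), name='CSRMatrix')
    arr = csr[10]  # a StructArray holding ten CSR matrices
    restored = dace.data.StructArray.from_json(arr.to_json())
    print(restored.stype.members.keys())  # indptr, indices, data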
--- dace/data.py | 22 +++++++++++++++++++++- dace/properties.py | 2 +- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/dace/data.py b/dace/data.py index bf771db1d4..37d532ac44 100644 --- a/dace/data.py +++ b/dace/data.py @@ -1102,9 +1102,29 @@ def __init__(self, pool=False): self.stype = stype - dtype = stype.dtype + if stype: + dtype = stype.dtype + else: + dtype = dtypes.int8 super(StructArray, self).__init__(dtype, shape, transient, allow_conflicts, storage, location, strides, offset, may_alias, lifetime, alignment, debuginfo, total_size, start_offset, optional, pool) + + @classmethod + def from_json(cls, json_obj, context=None): + # Create dummy object + ret = cls(None, ()) + serialize.set_properties_from_json(ret, json_obj, context=context) + + # Default shape-related properties + if not ret.offset: + ret.offset = [0] * len(ret.shape) + if not ret.strides: + # Default strides are C-ordered + ret.strides = [_prod(ret.shape[i + 1:]) for i in range(len(ret.shape))] + if ret.total_size == 0: + ret.total_size = _prod(ret.shape) + + return ret @make_properties diff --git a/dace/properties.py b/dace/properties.py index fb37ec7a7c..0bec65d0ec 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -1408,7 +1408,7 @@ def to_string(obj): def to_json(self, obj): if obj is None: return None - return obj.dtype.to_json() + return obj.to_json() @staticmethod def from_json(obj, context=None): From 82c2bb82315fdb94a2033b84295ed888859c5b62 Mon Sep 17 00:00:00 2001 From: Tiziano De Matteis <5871117+TizianoDeMatteis@users.noreply.github.com> Date: Mon, 21 Aug 2023 16:44:27 +0200 Subject: [PATCH 359/392] Have memory type as argument for fpga auto interleave (#1352) Co-authored-by: Tiziano De Matteis --- dace/transformation/auto/fpga.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dace/transformation/auto/fpga.py b/dace/transformation/auto/fpga.py index 4295699cdb..573341e1f6 100644 --- a/dace/transformation/auto/fpga.py +++ b/dace/transformation/auto/fpga.py @@ -44,24 +44,28 @@ def fpga_global_to_local(sdfg: SDFG, max_size: int = 1048576) -> None: print(f'Applied {len(converted)} Global-To-Local{": " if len(converted)>0 else "."} {", ".join(converted)}') -def fpga_rr_interleave_containers_to_banks(sdfg: SDFG, num_banks: int = 4): +def fpga_rr_interleave_containers_to_banks(sdfg: SDFG, num_banks: int = 4, memory_type: str = "DDR"): """ Allocates the (global) arrays to FPGA off-chip memory banks, interleaving them in a Round-Robin (RR) fashion. This applies to all the arrays in the SDFG hierarchy. :param sdfg: The SDFG to operate on. :param num_banks: number of off-chip memory banks to consider + :param memory_type: type of off-chip memory, either "DDR" or "HBM" (if the target FPGA supports it) :return: a list containing the number of (transient) arrays allocated to each bank :note: Operates in-place on the SDFG. 
""" + if memory_type.upper() not in {"DDR", "HBM"}: + raise ValueError("Memory type should be either \"DDR\" or \"HBM\"") + # keep track of memory allocated to each bank num_allocated = [0 for i in range(num_banks)] i = 0 for sd, aname, desc in sdfg.arrays_recursive(): if not isinstance(desc, dt.Stream) and desc.storage == dtypes.StorageType.FPGA_Global and desc.transient: - desc.location["memorytype"] = "ddr" + desc.location["memorytype"] = memory_type.upper() desc.location["bank"] = str(i % num_banks) num_allocated[i % num_banks] = num_allocated[i % num_banks] + 1 i = i + 1 From c5889a4e3092a89a5466f6b8c2fe29d3ea3ad1a1 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Mon, 21 Aug 2023 17:20:43 +0200 Subject: [PATCH 360/392] Addressed comments. --- dace/codegen/targets/cpp.py | 2 ++ dace/codegen/targets/cpu.py | 15 +++++++++------ dace/data.py | 6 +++--- dace/dtypes.py | 2 +- dace/properties.py | 8 +++++--- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 093a324d9a..d3d4f50ccd 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -370,6 +370,8 @@ def make_const(expr: str) -> str: # Register defined variable dispatcher.defined_vars.add(pointer_name, defined_type, typedef, allow_shadowing=True) + # NOTE: `expr` may only be a name or a sequence of names and dots. The latter indicates nested data and structures. + # NOTE: Since structures are implemented as pointers, we replace dots with arrows. expr = expr.replace('.', '->') return (typedef + ref, pointer_name, expr) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 20615a3136..0464672390 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -55,10 +55,13 @@ def __init__(self, frame_codegen, sdfg): # Keep track of generated NestedSDG, and the name of the assigned function self._generated_nested_sdfg = dict() + # NOTE: Multi-nesting with StructArrays must be further investigated. def _visit_structure(struct: data.Structure, args: dict, prefix: str = ''): for k, v in struct.members.items(): if isinstance(v, data.Structure): _visit_structure(v, args, f'{prefix}.{k}') + elif isinstance(v, data.StructArray): + _visit_structure(v.stype, args, f'{prefix}.{k}') elif isinstance(v, data.Data): args[f'{prefix}.{k}'] = v @@ -71,11 +74,7 @@ def _visit_structure(struct: data.Structure, args: dict, prefix: str = ''): elif isinstance(arg_type, data.StructArray): desc = sdfg.arrays[name] desc = desc.stype - for attr in dir(desc): - value = getattr(desc, attr) - if isinstance(value, data.Data): - assert attr in sdfg.arrays - arglist[attr] = value + _visit_structure(desc, arglist, name) for name, arg_type in arglist.items(): if isinstance(arg_type, (data.Scalar, data.Structure)): @@ -300,6 +299,8 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d name = node.data alloc_name = cpp.ptr(name, nodedesc, sdfg, self._frame) name = alloc_name + # NOTE: `expr` may only be a name or a sequence of names and dots. The latter indicates nested data and + # NOTE: structures. Since structures are implemented as pointers, we replace dots with arrows. 
alloc_name = alloc_name.replace('.', '->') if nodedesc.transient is False: @@ -324,7 +325,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d arrsize_bytes = arrsize * nodedesc.dtype.bytes if isinstance(nodedesc, data.Structure) and not isinstance(nodedesc, data.StructureView): - declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type}();\n") + declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type};\n") define_var(name, DefinedType.Pointer, nodedesc.ctype) for k, v in nodedesc.members.items(): if isinstance(v, data.Data): @@ -1183,6 +1184,8 @@ def memlet_definition(self, if not types: types = self._dispatcher.defined_vars.get(ptr, is_global=True) var_type, ctypedef = types + # NOTE: `expr` may only be a name or a sequence of names and dots. The latter indicates nested data and + # NOTE: structures. Since structures are implemented as pointers, we replace dots with arrows. ptr = ptr.replace('.', '->') if fpga.is_fpga_array(desc): diff --git a/dace/data.py b/dace/data.py index 37d532ac44..5f05cbfcc8 100644 --- a/dace/data.py +++ b/dace/data.py @@ -374,7 +374,7 @@ class Structure(Data): desc="Dictionary of structure members", from_json=_arrays_from_json, to_json=_arrays_to_json) - name = Property(dtype=str, desc="Structure name") + name = Property(dtype=str, desc="Structure type name") def __init__(self, members: Dict[str, Data], @@ -478,7 +478,7 @@ def as_arg(self, with_types=True, for_call=False, name=None): def __getitem__(self, s): """ This is syntactic sugar that allows us to define an array type with the following syntax: ``Structure[N,M]`` - :return: A ``data.Array`` data descriptor. + :return: A ``data.StructArray`` data descriptor. """ if isinstance(s, list) or isinstance(s, tuple): return StructArray(self, tuple(s)) @@ -1084,7 +1084,7 @@ class StructArray(Array): stype = NestedDataClassProperty(allow_none=True, default=None) def __init__(self, - stype, + stype: Structure, shape, transient=False, allow_conflicts=False, diff --git a/dace/dtypes.py b/dace/dtypes.py index 888f74f6b9..f0bac23958 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -835,9 +835,9 @@ def as_ctypes(self): fields.append((k, v.as_ctypes())) else: fields.append((k, _FFI_CTYPES[v.type])) - # fields = sorted(fields, key=lambda f: f[0]) # Create new struct class. struct_class = type("NewStructClass", (ctypes.Structure, ), {"_fields_": fields}) + # NOTE: Each call to `type` returns a different class, so we need to cache it to ensure uniqueness. _FFI_CTYPES[self] = struct_class return struct_class diff --git a/dace/properties.py b/dace/properties.py index 0bec65d0ec..0adcfe3e97 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -1392,12 +1392,14 @@ def __get__(self, obj, objtype=None) -> 'Data': @property def dtype(self): - return pydoc.locate("dace.data.Data") + from dace import data as dt + return dt.Data @staticmethod def from_string(s): - dtype = pydoc.locate("dace.data.{}".format(s)) - if dtype is None or not isinstance(dtype, pydoc.locate("dace.data.Data")): + from dace import data as dt + dtype = getattr(dt, s, None) + if dtype is None or not isinstance(dtype, dt.Data): raise ValueError("Not a valid data type: {}".format(s)) return dtype From eabbd1d6cd451556813ffea93cfa771767ef8561 Mon Sep 17 00:00:00 2001 From: Alexandros Nikolaos Ziogas Date: Tue, 22 Aug 2023 15:52:45 +0200 Subject: [PATCH 361/392] Addressed comments. 
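
Structure members can now be passed either as a dict or as a list of
(name, descriptor) tuples; the separate ``order`` argument is gone and the
insertion order of the members is used instead. A rough usage sketch with
illustrative symbol names:

    import dace

    M, nnz, L = (dace.symbol(s) for s in ('M', 'nnz', 'L'))
    # Both construction forms yield the same member order:
    csr = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz],
                                   data=dace.float32[nnz]), name='CSRMatrix')
    csr2 = dace.data.Structure([('indptr', dace.int32[M + 1]), ('indices', dace.int32[nnz]),
                                ('data', dace.float32[nnz])], name='CSRMatrix')
    arr = csr[L]  # subscripting a Structure yields a data.StructArray descriptor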
--- dace/data.py | 27 +++++++++++---------------- dace/properties.py | 4 ++++ tests/sdfg/data/struct_array_test.py | 23 +++++++++++------------ tests/sdfg/data/structure_test.py | 8 -------- 4 files changed, 26 insertions(+), 36 deletions(-) diff --git a/dace/data.py b/dace/data.py index 5f05cbfcc8..3b571e6537 100644 --- a/dace/data.py +++ b/dace/data.py @@ -5,7 +5,7 @@ from collections import OrderedDict from numbers import Number -from typing import Any, Dict, List, Optional, Sequence, Set, Tuple +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union import numpy import sympy as sp @@ -19,7 +19,8 @@ from dace import serialize, symbolic from dace.codegen import cppunparse from dace.properties import (DebugInfoProperty, DictProperty, EnumProperty, ListProperty, NestedDataClassProperty, - Property, ShapeProperty, SymbolicProperty, TypeClassProperty, make_properties) + OrderedDictProperty, Property, ShapeProperty, SymbolicProperty, TypeClassProperty, + make_properties) def create_datadescriptor(obj, no_custom_desc=False): @@ -370,15 +371,14 @@ def _arrays_from_json(obj, context=None): class Structure(Data): """ Base class for structures. """ - members = Property(dtype=OrderedDict, - desc="Dictionary of structure members", - from_json=_arrays_from_json, - to_json=_arrays_to_json) + members = OrderedDictProperty(default=OrderedDict(), + desc="Dictionary of structure members", + from_json=_arrays_from_json, + to_json=_arrays_to_json) name = Property(dtype=str, desc="Structure type name") def __init__(self, - members: Dict[str, Data], - order: List[str] = None, + members: Union[Dict[str, Data], List[Tuple[str, Data]]], name: str = 'Structure', transient: bool = False, storage: dtypes.StorageType = dtypes.StorageType.Default, @@ -386,19 +386,14 @@ def __init__(self, lifetime: dtypes.AllocationLifetime = dtypes.AllocationLifetime.Scope, debuginfo: dtypes.DebugInfo = None): - order = order or list(members.keys()) - if set(members.keys()) != set(order): - raise ValueError('Order must contain all members of the structure.') - - # TODO: Should we make a deep-copy here? 
- self.members = OrderedDict((k, members[k]) for k in order) - + self.members = OrderedDict(members) for k, v in self.members.items(): v.transient = transient + self.name = name fields_and_types = OrderedDict() symbols = set() - for k, v in members.items(): + for k, v in self.members.items(): if isinstance(v, Structure): symbols |= v.free_symbols fields_and_types[k] = (v.dtype, str(v.total_size)) diff --git a/dace/properties.py b/dace/properties.py index 0adcfe3e97..61e569341f 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -145,11 +145,15 @@ def fs(obj, *args, **kwargs): self._from_json = lambda *args, **kwargs: dace.serialize.from_json(*args, known_type=dtype, **kwargs) else: self._from_json = from_json + if self.from_json != from_json: + self.from_json = from_json if to_json is None: self._to_json = dace.serialize.to_json else: self._to_json = to_json + if self.to_json != to_json: + self.to_json = to_json if meta_to_json is None: diff --git a/tests/sdfg/data/struct_array_test.py b/tests/sdfg/data/struct_array_test.py index 9b40379e53..8e0f2f4739 100644 --- a/tests/sdfg/data/struct_array_test.py +++ b/tests/sdfg/data/struct_array_test.py @@ -10,12 +10,11 @@ def test_read_struct_array(): L, M, N, nnz = (dace.symbol(s) for s in ('L', 'M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') - csr_obj_view = dace.data.StructureView(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], - name='CSRMatrix', - transient=True) + csr_obj_view = dace.data.StructureView( + [('indptr', dace.int32[M + 1]), ('indices', dace.int32[nnz]), ('data', dace.float32[nnz])], + name='CSRMatrix', + transient=True) sdfg = dace.SDFG('array_of_csr_to_dense') @@ -84,13 +83,13 @@ def test_read_struct_array(): def test_write_struct_array(): L, M, N, nnz = (dace.symbol(s) for s in ('L', 'M', 'N', 'nnz')) - csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], - name='CSRMatrix') - csr_obj_view = dace.data.StructureView(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], - name='CSRMatrix', - transient=True) + csr_obj = dace.data.Structure( + [('indptr', dace.int32[M + 1]), ('indices', dace.int32[nnz]), ('data', dace.float32[nnz])], + name='CSRMatrix') + csr_obj_view = dace.data.StructureView( + dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), + name='CSRMatrix', + transient=True) sdfg = dace.SDFG('array_dense_to_csr') diff --git a/tests/sdfg/data/structure_test.py b/tests/sdfg/data/structure_test.py index 995aacb2fd..02b8f0c174 100644 --- a/tests/sdfg/data/structure_test.py +++ b/tests/sdfg/data/structure_test.py @@ -12,7 +12,6 @@ def test_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense') @@ -69,7 +68,6 @@ def test_write_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('dense_to_csr') @@ -147,10 +145,8 
@@ def test_local_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') tmp_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix', transient=True) @@ -258,7 +254,6 @@ def test_local_structure(): def test_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -320,7 +315,6 @@ def test_write_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') @@ -402,7 +396,6 @@ def test_direct_read_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') sdfg = dace.SDFG('csr_to_dense_direct') @@ -453,7 +446,6 @@ def test_direct_read_structure(): def test_direct_read_nested_structure(): M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr_obj = dace.data.Structure(dict(indptr=dace.int32[M + 1], indices=dace.int32[nnz], data=dace.float32[nnz]), - order=['indptr', 'indices', 'data'], name='CSRMatrix') wrapper_obj = dace.data.Structure(dict(csr=csr_obj), name='Wrapper') From 602220eb7fab11fbf9190c7db4568a3371ff1ab7 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 29 Aug 2023 20:06:19 -0700 Subject: [PATCH 362/392] Codegen: Make thread/block index type configurable --- dace/codegen/targets/cuda.py | 15 +++++++++++---- dace/config_schema.yml | 11 +++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index ee49f04d03..a465d2bbc0 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1939,6 +1939,13 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_ kernel_params: list, function_stream: CodeIOStream, kernel_stream: CodeIOStream): node = dfg_scope.source_nodes()[0] + # Get the thread/block index type + ttype = Config.get('compiler', 'cuda', 'thread_id_type') + tidtype = getattr(dtypes, ttype, False) + if not isinstance(tidtype, dtypes.typeclass): + raise ValueError(f'Configured type "{ttype}" for ``thread_id_type`` does not match any DaCe data type. 
' + 'See ``dace.dtypes`` for available types (for example ``int32``).') + # allocating shared memory for dynamic threadblock maps if has_dtbmap: kernel_stream.write( @@ -1990,8 +1997,8 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_ expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr) - kernel_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, node) - self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') + kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node) + self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype) # Delinearize beyond the third dimension if len(krange) > 3: @@ -2010,8 +2017,8 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_ ) expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr) - kernel_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, node) - self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') + kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node) + self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype) # Dispatch internal code assert CUDACodeGen._in_device_code is False diff --git a/dace/config_schema.yml b/dace/config_schema.yml index e378b6c1f2..08a427aa52 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -413,6 +413,17 @@ required: a specified larger block size in the third dimension. Default value is derived from hardware limits on common GPUs. + thread_id_type: + type: str + title: Thread/block index data type + default: int32 + description: > + Defines the data type for a thread and block index in the generated code. + The type is based on the type-classes in ``dace.dtypes``. For example, + ``uint64`` is equivalent to ``dace.uint64``. Change this setting when large + index types are needed to address memory offsets that are beyond the 32-bit + range, or to reduce memory usage. 
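+
+          For example, setting ``thread_id_type: int64`` under ``compiler.cuda`` in ``.dace.conf``,
+          exporting ``DACE_compiler_cuda_thread_id_type=int64``, or calling
+          ``dace.Config.set('compiler', 'cuda', 'thread_id_type', value='int64')`` makes the
+          generated kernels declare their thread and block indices as 64-bit integers.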
+ ############################################# # General FPGA flags From 5f6e371f2905b835da8f594db94bb7b44b0305da Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 29 Aug 2023 20:06:46 -0700 Subject: [PATCH 363/392] Rename alpha/beta in library node to avoid clashes --- dace/libraries/blas/nodes/gemm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dace/libraries/blas/nodes/gemm.py b/dace/libraries/blas/nodes/gemm.py index 2db2055ae5..83be99d78b 100644 --- a/dace/libraries/blas/nodes/gemm.py +++ b/dace/libraries/blas/nodes/gemm.py @@ -184,11 +184,11 @@ def expansion(node, state, sdfg): code = '' if dtype in (dace.complex64, dace.complex128): code = f''' - {dtype.ctype} alpha = {alpha}; - {dtype.ctype} beta = {beta}; + {dtype.ctype} __alpha = {alpha}; + {dtype.ctype} __beta = {beta}; ''' - opt['alpha'] = '&alpha' - opt['beta'] = '&beta' + opt['alpha'] = '&__alpha' + opt['beta'] = '&__beta' code += ("cblas_{func}(CblasColMajor, {ta}, {tb}, " "{M}, {N}, {K}, {alpha}, {x}, {lda}, {y}, {ldb}, {beta}, " @@ -287,12 +287,12 @@ def expansion(cls, node, state, sdfg): # Set pointer mode to host call_prefix += f'''{cls.set_pointer_mode}(__dace_{cls.backend}blas_handle, {cls.pointer_host}); - {dtype.ctype} alpha = {alpha}; - {dtype.ctype} beta = {beta}; + {dtype.ctype} __alpha = {alpha}; + {dtype.ctype} __beta = {beta}; ''' call_suffix += f'''{cls.set_pointer_mode}(__dace_{cls.backend}blas_handle, {cls.pointer_device});''' - alpha = f'({cdtype} *)&alpha' - beta = f'({cdtype} *)&beta' + alpha = f'({cdtype} *)&__alpha' + beta = f'({cdtype} *)&__beta' else: alpha = constants[node.alpha] beta = constants[node.beta] From acd58851e66ee561e3a60bef79719a9ca9f7ffaf Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 29 Aug 2023 20:56:08 -0700 Subject: [PATCH 364/392] Respect return type of get_external_memory_size --- dace/codegen/compiled_sdfg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 9ee0772eeb..22f95d01d7 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -287,6 +287,7 @@ def get_workspace_sizes(self) -> Dict[dtypes.StorageType, int]: result: Dict[dtypes.StorageType, int] = {} for storage in self.external_memory_types: func = self._lib.get_symbol(f'__dace_get_external_memory_size_{storage.name}') + func.restype = ctypes.c_size_t result[storage] = func(self._libhandle, *self._lastargs[1]) return result From 30fdcf7916f419bbb4484d8eac4342a302592705 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 29 Aug 2023 20:56:36 -0700 Subject: [PATCH 365/392] Handle large integer values in C code generation --- dace/codegen/cppunparse.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/dace/codegen/cppunparse.py b/dace/codegen/cppunparse.py index eae0ed229e..31dae08f79 100644 --- a/dace/codegen/cppunparse.py +++ b/dace/codegen/cppunparse.py @@ -78,6 +78,7 @@ import numpy as np import os import tokenize +import warnings import sympy import dace @@ -733,6 +734,21 @@ def _Num(self, t): if isinstance(t.n, complex): dtype = dtypes.DTYPE_TO_TYPECLASS[complex] + # Handle large integer values + if isinstance(t.n, int): + bits = t.n.bit_length() + if bits == 32: # Integer, potentially unsigned + if t.n >= 0: # unsigned + repr_n += 'U' + else: # signed, 64-bit + repr_n += 'LL' + elif 32 < bits <= 63: + repr_n += 'LL' + elif bits == 64 and t.n >= 0: + repr_n += 'ULL' + elif bits >= 64: + warnings.warn(f'Value wider than 64 bits encountered in expression 
({t.n}), emitting as-is') + if repr_n.endswith("j"): self.write("%s(0, %s)" % (dtype, repr_n.replace("inf", INFSTR)[:-1])) else: From c5ca99ad37e7ceef6da71026c3c8bb579f64117f Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 29 Aug 2023 23:05:10 -0700 Subject: [PATCH 366/392] Eliminate extraneous branch-end gotos in code generation (#1355) --- dace/codegen/control_flow.py | 77 +++++++++++++++----- dace/codegen/targets/framecode.py | 2 +- tests/codegen/control_flow_detection_test.py | 29 ++++++++ 3 files changed, 88 insertions(+), 20 deletions(-) diff --git a/dace/codegen/control_flow.py b/dace/codegen/control_flow.py index 182604c892..1b97241e47 100644 --- a/dace/codegen/control_flow.py +++ b/dace/codegen/control_flow.py @@ -82,6 +82,9 @@ class ControlFlow: # a string with its generated code. dispatch_state: Callable[[SDFGState], str] + # The parent control flow block of this one, used to avoid generating extraneous ``goto``s + parent: Optional['ControlFlow'] + @property def first_state(self) -> SDFGState: """ @@ -222,11 +225,18 @@ def as_cpp(self, codegen, symbols) -> str: out_edges = sdfg.out_edges(elem.state) for j, e in enumerate(out_edges): if e not in self.gotos_to_ignore: - # If this is the last generated edge and it leads - # to the next state, skip emitting goto + # Skip gotos to immediate successors successor = None - if (j == (len(out_edges) - 1) and (i + 1) < len(self.elements)): - successor = self.elements[i + 1].first_state + # If this is the last generated edge + if j == (len(out_edges) - 1): + if (i + 1) < len(self.elements): + # If last edge leads to next state in block + successor = self.elements[i + 1].first_state + elif i == len(self.elements) - 1: + # If last edge leads to first state in next block + next_block = _find_next_block(self) + if next_block is not None: + successor = next_block.first_state expr += elem.generate_transition(sdfg, e, successor) else: @@ -478,13 +488,14 @@ def children(self) -> List[ControlFlow]: def _loop_from_structure(sdfg: SDFG, guard: SDFGState, enter_edge: Edge[InterstateEdge], leave_edge: Edge[InterstateEdge], back_edges: List[Edge[InterstateEdge]], - dispatch_state: Callable[[SDFGState], str]) -> Union[ForScope, WhileScope]: + dispatch_state: Callable[[SDFGState], + str], parent_block: GeneralBlock) -> Union[ForScope, WhileScope]: """ Helper method that constructs the correct structured loop construct from a set of states. Can construct for or while loops. """ - body = GeneralBlock(dispatch_state, [], [], [], [], [], True) + body = GeneralBlock(dispatch_state, parent_block, [], [], [], [], [], True) guard_inedges = sdfg.in_edges(guard) increment_edges = [e for e in guard_inedges if e in back_edges] @@ -535,10 +546,10 @@ def _loop_from_structure(sdfg: SDFG, guard: SDFGState, enter_edge: Edge[Intersta # Also ignore assignments in increment edge (handled in for stmt) body.assignments_to_ignore.append(increment_edge) - return ForScope(dispatch_state, itvar, guard, init, condition, update, body, init_edges) + return ForScope(dispatch_state, parent_block, itvar, guard, init, condition, update, body, init_edges) # Otherwise, it is a while loop - return WhileScope(dispatch_state, guard, condition, body) + return WhileScope(dispatch_state, parent_block, guard, condition, body) def _cases_from_branches( @@ -617,6 +628,31 @@ def _child_of(node: SDFGState, parent: SDFGState, ptree: Dict[SDFGState, SDFGSta return False +def _find_next_block(block: ControlFlow) -> Optional[ControlFlow]: + """ + Returns the immediate successor control flow block. 
+ """ + # Find block in parent + parent = block.parent + if parent is None: + return None + ind = next(i for i, b in enumerate(parent.children) if b is block) + if ind == len(parent.children) - 1 or isinstance(parent, (IfScope, IfElseChain, SwitchCaseScope)): + # If last block, or other children are not reachable from current node (branches), + # recursively continue upwards + return _find_next_block(parent) + return parent.children[ind + 1] + + +def _reset_block_parents(block: ControlFlow): + """ + Fixes block parents after processing. + """ + for child in block.children: + child.parent = block + _reset_block_parents(child) + + def _structured_control_flow_traversal(sdfg: SDFG, start: SDFGState, ptree: Dict[SDFGState, SDFGState], @@ -645,7 +681,7 @@ def _structured_control_flow_traversal(sdfg: SDFG, """ def make_empty_block(): - return GeneralBlock(dispatch_state, [], [], [], [], [], True) + return GeneralBlock(dispatch_state, parent_block, [], [], [], [], [], True) # Traverse states in custom order visited = set() if visited is None else visited @@ -657,7 +693,7 @@ def make_empty_block(): if node in visited or node is stop: continue visited.add(node) - stateblock = SingleState(dispatch_state, node) + stateblock = SingleState(dispatch_state, parent_block, node) oe = sdfg.out_edges(node) if len(oe) == 0: # End state @@ -708,12 +744,14 @@ def make_empty_block(): if (len(oe) == 2 and oe[0].data.condition_sympy() == sp.Not(oe[1].data.condition_sympy())): # If without else if oe[0].dst is mergestate: - branch_block = IfScope(dispatch_state, sdfg, node, oe[1].data.condition, cblocks[oe[1]]) + branch_block = IfScope(dispatch_state, parent_block, sdfg, node, oe[1].data.condition, + cblocks[oe[1]]) elif oe[1].dst is mergestate: - branch_block = IfScope(dispatch_state, sdfg, node, oe[0].data.condition, cblocks[oe[0]]) + branch_block = IfScope(dispatch_state, parent_block, sdfg, node, oe[0].data.condition, + cblocks[oe[0]]) else: - branch_block = IfScope(dispatch_state, sdfg, node, oe[0].data.condition, cblocks[oe[0]], - cblocks[oe[1]]) + branch_block = IfScope(dispatch_state, parent_block, sdfg, node, oe[0].data.condition, + cblocks[oe[0]], cblocks[oe[1]]) else: # If there are 2 or more edges (one is not the negation of the # other): @@ -721,10 +759,10 @@ def make_empty_block(): if switch: # If all edges are of form "x == y" for a single x and # integer y, it is a switch/case - branch_block = SwitchCaseScope(dispatch_state, sdfg, node, switch[0], switch[1]) + branch_block = SwitchCaseScope(dispatch_state, parent_block, sdfg, node, switch[0], switch[1]) else: # Otherwise, create if/else if/.../else goto exit chain - branch_block = IfElseChain(dispatch_state, sdfg, node, + branch_block = IfElseChain(dispatch_state, parent_block, sdfg, node, [(e.data.condition, cblocks[e] if e in cblocks else make_empty_block()) for e in oe]) # End of branch classification @@ -739,11 +777,11 @@ def make_empty_block(): loop_exit = None scope = None if ptree[oe[0].dst] == node and ptree[oe[1].dst] != node: - scope = _loop_from_structure(sdfg, node, oe[0], oe[1], back_edges, dispatch_state) + scope = _loop_from_structure(sdfg, node, oe[0], oe[1], back_edges, dispatch_state, parent_block) body_start = oe[0].dst loop_exit = oe[1].dst elif ptree[oe[1].dst] == node and ptree[oe[0].dst] != node: - scope = _loop_from_structure(sdfg, node, oe[1], oe[0], back_edges, dispatch_state) + scope = _loop_from_structure(sdfg, node, oe[1], oe[0], back_edges, dispatch_state, parent_block) body_start = oe[1].dst loop_exit = oe[0].dst @@ 
-836,7 +874,8 @@ def structured_control_flow_tree(sdfg: SDFG, dispatch_state: Callable[[SDFGState if len(common_frontier) == 1: branch_merges[state] = next(iter(common_frontier)) - root_block = GeneralBlock(dispatch_state, [], [], [], [], [], True) + root_block = GeneralBlock(dispatch_state, None, [], [], [], [], [], True) _structured_control_flow_traversal(sdfg, sdfg.start_state, ptree, branch_merges, back_edges, dispatch_state, root_block) + _reset_block_parents(root_block) return root_block diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 9ee5c2ef17..dfdbbb392b 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -471,7 +471,7 @@ def dispatch_state(state: SDFGState) -> str: # If disabled, generate entire graph as general control flow block states_topological = list(sdfg.topological_sort(sdfg.start_state)) last = states_topological[-1] - cft = cflow.GeneralBlock(dispatch_state, + cft = cflow.GeneralBlock(dispatch_state, None, [cflow.SingleState(dispatch_state, s, s is last) for s in states_topological], [], [], [], [], False) diff --git a/tests/codegen/control_flow_detection_test.py b/tests/codegen/control_flow_detection_test.py index 99d6a39b29..982140f7ed 100644 --- a/tests/codegen/control_flow_detection_test.py +++ b/tests/codegen/control_flow_detection_test.py @@ -120,6 +120,33 @@ def test_single_outedge_branch(): assert np.allclose(res, 2) +def test_extraneous_goto(): + + @dace.program + def tester(a: dace.float64[20]): + if a[0] < 0: + a[1] = 1 + a[2] = 1 + + sdfg = tester.to_sdfg(simplify=True) + assert 'goto' not in sdfg.generate_code()[0].code + + +def test_extraneous_goto_nested(): + + @dace.program + def tester(a: dace.float64[20]): + if a[0] < 0: + if a[0] < 1: + a[1] = 1 + else: + a[1] = 2 + a[2] = 1 + + sdfg = tester.to_sdfg(simplify=True) + assert 'goto' not in sdfg.generate_code()[0].code + + if __name__ == '__main__': test_for_loop_detection() test_invalid_for_loop_detection() @@ -128,3 +155,5 @@ def test_single_outedge_branch(): test_edge_sympy_function('TrueFalse') test_edge_sympy_function('SwitchCase') test_single_outedge_branch() + test_extraneous_goto() + test_extraneous_goto_nested() From 8a8744e1b55f3f3ddae1c162f645eed6f839ac4d Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Wed, 30 Aug 2023 11:28:46 -0700 Subject: [PATCH 367/392] Environments: Work well with external libraries that set their own GPU device --- dace/libraries/blas/environments/cublas.py | 2 +- dace/libraries/blas/environments/rocblas.py | 2 +- dace/libraries/blas/include/dace_cublas.h | 12 ++-- dace/libraries/blas/include/dace_rocblas.h | 60 ++++++++++--------- .../lapack/environments/cusolverdn.py | 2 +- .../lapack/include/dace_cusolverdn.h | 6 +- .../libraries/linalg/environments/cutensor.py | 2 +- dace/libraries/linalg/include/dace_cutensor.h | 6 +- .../libraries/sparse/environments/cusparse.py | 2 +- dace/libraries/sparse/include/dace_cusparse.h | 6 +- 10 files changed, 57 insertions(+), 43 deletions(-) diff --git a/dace/libraries/blas/environments/cublas.py b/dace/libraries/blas/environments/cublas.py index d4ab879e61..ef73b511c0 100644 --- a/dace/libraries/blas/environments/cublas.py +++ b/dace/libraries/blas/environments/cublas.py @@ -25,7 +25,7 @@ class cuBLAS: def handle_setup_code(node): location = node.location if not location or "gpu" not in node.location: - location = 0 + location = -1 # -1 means current device else: try: location = int(location["gpu"]) diff --git 
a/dace/libraries/blas/environments/rocblas.py b/dace/libraries/blas/environments/rocblas.py index 5d752ed690..47e16531ff 100644 --- a/dace/libraries/blas/environments/rocblas.py +++ b/dace/libraries/blas/environments/rocblas.py @@ -25,7 +25,7 @@ class rocBLAS: def handle_setup_code(node): location = node.location if not location or "gpu" not in node.location: - location = 0 + location = -1 # -1 means current device else: try: location = int(location["gpu"]) diff --git a/dace/libraries/blas/include/dace_cublas.h b/dace/libraries/blas/include/dace_cublas.h index 8ec03c2b37..3547a009d2 100644 --- a/dace/libraries/blas/include/dace_cublas.h +++ b/dace/libraries/blas/include/dace_cublas.h @@ -21,8 +21,10 @@ static void CheckCublasError(cublasStatus_t const& status) { } static cublasHandle_t CreateCublasHandle(int device) { - if (cudaSetDevice(device) != cudaSuccess) { - throw std::runtime_error("Failed to set CUDA device."); + if (device >= 0) { + if (cudaSetDevice(device) != cudaSuccess) { + throw std::runtime_error("Failed to set CUDA device."); + } } cublasHandle_t handle; CheckCublasError(cublasCreate(&handle)); @@ -65,8 +67,10 @@ class _CublasConstants { } _CublasConstants(int device) { - if (cudaSetDevice(device) != cudaSuccess) { - throw std::runtime_error("Failed to set CUDA device."); + if (device >= 0) { + if (cudaSetDevice(device) != cudaSuccess) { + throw std::runtime_error("Failed to set CUDA device."); + } } // Allocate constant zero with the largest used size cudaMalloc(&zero_, sizeof(cuDoubleComplex) * 1); diff --git a/dace/libraries/blas/include/dace_rocblas.h b/dace/libraries/blas/include/dace_rocblas.h index 7a7e4a75ee..00469136a3 100644 --- a/dace/libraries/blas/include/dace_rocblas.h +++ b/dace/libraries/blas/include/dace_rocblas.h @@ -24,8 +24,10 @@ static void CheckRocblasError(rocblas_status const& status) { } static rocblas_handle CreateRocblasHandle(int device) { - if (hipSetDevice(device) != hipSuccess) { - throw std::runtime_error("Failed to set HIP device."); + if (device >= 0) { + if (hipSetDevice(device) != hipSuccess) { + throw std::runtime_error("Failed to set HIP device."); + } } rocblas_handle handle; CheckRocblasError(rocblas_create_handle(&handle)); @@ -68,53 +70,55 @@ class _RocblasConstants { } _RocblasConstants(int device) { - if (hipSetDevice(device) != hipSuccess) { - throw std::runtime_error("Failed to set HIP device."); + if (device >= 0) { + if (hipSetDevice(device) != hipSuccess) { + throw std::runtime_error("Failed to set HIP device."); + } } // Allocate constant zero with the largest used size - hipMalloc(&zero_, sizeof(hipDoubleComplex) * 1); - hipMemset(zero_, 0, sizeof(hipDoubleComplex) * 1); + (void)hipMalloc(&zero_, sizeof(hipDoubleComplex) * 1); + (void)hipMemset(zero_, 0, sizeof(hipDoubleComplex) * 1); // Allocate constant one - hipMalloc(&half_pone_, sizeof(__half) * 1); + (void)hipMalloc(&half_pone_, sizeof(__half) * 1); __half half_pone = __float2half(1.0f); - hipMemcpy(half_pone_, &half_pone, sizeof(__half) * 1, + (void)hipMemcpy(half_pone_, &half_pone, sizeof(__half) * 1, hipMemcpyHostToDevice); - hipMalloc(&float_pone_, sizeof(float) * 1); + (void)hipMalloc(&float_pone_, sizeof(float) * 1); float float_pone = 1.0f; - hipMemcpy(float_pone_, &float_pone, sizeof(float) * 1, + (void)hipMemcpy(float_pone_, &float_pone, sizeof(float) * 1, hipMemcpyHostToDevice); - hipMalloc(&double_pone_, sizeof(double) * 1); + (void)hipMalloc(&double_pone_, sizeof(double) * 1); double double_pone = 1.0; - hipMemcpy(double_pone_, &double_pone, 
sizeof(double) * 1, + (void)hipMemcpy(double_pone_, &double_pone, sizeof(double) * 1, hipMemcpyHostToDevice); - hipMalloc(&complex64_pone_, sizeof(hipComplex) * 1); + (void)hipMalloc(&complex64_pone_, sizeof(hipComplex) * 1); hipComplex complex64_pone = make_hipFloatComplex(1.0f, 0.0f); - hipMemcpy(complex64_pone_, &complex64_pone, sizeof(hipComplex) * 1, + (void)hipMemcpy(complex64_pone_, &complex64_pone, sizeof(hipComplex) * 1, hipMemcpyHostToDevice); - hipMalloc(&complex128_pone_, sizeof(hipDoubleComplex) * 1); + (void)hipMalloc(&complex128_pone_, sizeof(hipDoubleComplex) * 1); hipDoubleComplex complex128_pone = make_hipDoubleComplex(1.0, 0.0); - hipMemcpy(complex128_pone_, &complex128_pone, sizeof(hipDoubleComplex) * 1, + (void)hipMemcpy(complex128_pone_, &complex128_pone, sizeof(hipDoubleComplex) * 1, hipMemcpyHostToDevice); // Allocate custom factors and default to zero - hipMalloc(&custom_alpha_, sizeof(hipDoubleComplex) * 1); - hipMemset(custom_alpha_, 0, sizeof(hipDoubleComplex) * 1); - hipMalloc(&custom_beta_, sizeof(hipDoubleComplex) * 1); - hipMemset(custom_beta_, 0, sizeof(hipDoubleComplex) * 1); + (void)hipMalloc(&custom_alpha_, sizeof(hipDoubleComplex) * 1); + (void)hipMemset(custom_alpha_, 0, sizeof(hipDoubleComplex) * 1); + (void)hipMalloc(&custom_beta_, sizeof(hipDoubleComplex) * 1); + (void)hipMemset(custom_beta_, 0, sizeof(hipDoubleComplex) * 1); } _RocblasConstants(_RocblasConstants const&) = delete; ~_RocblasConstants() { - hipFree(zero_); - hipFree(half_pone_); - hipFree(float_pone_); - hipFree(double_pone_); - hipFree(complex64_pone_); - hipFree(complex128_pone_); - hipFree(custom_alpha_); - hipFree(custom_beta_); + (void)hipFree(zero_); + (void)hipFree(half_pone_); + (void)hipFree(float_pone_); + (void)hipFree(double_pone_); + (void)hipFree(complex64_pone_); + (void)hipFree(complex128_pone_); + (void)hipFree(custom_alpha_); + (void)hipFree(custom_beta_); } _RocblasConstants& operator=(_RocblasConstants const&) = delete; diff --git a/dace/libraries/lapack/environments/cusolverdn.py b/dace/libraries/lapack/environments/cusolverdn.py index c92c8bf3e7..4daad8062e 100644 --- a/dace/libraries/lapack/environments/cusolverdn.py +++ b/dace/libraries/lapack/environments/cusolverdn.py @@ -24,7 +24,7 @@ class cuSolverDn: def handle_setup_code(node): location = node.location if not location or "gpu" not in node.location: - location = 0 + location = -1 # -1 means current device else: try: location = int(location["gpu"]) diff --git a/dace/libraries/lapack/include/dace_cusolverdn.h b/dace/libraries/lapack/include/dace_cusolverdn.h index 2da65ffa2f..f262541f0b 100644 --- a/dace/libraries/lapack/include/dace_cusolverdn.h +++ b/dace/libraries/lapack/include/dace_cusolverdn.h @@ -21,8 +21,10 @@ static void CheckCusolverDnError(cusolverStatus_t const& status) { } static cusolverDnHandle_t CreateCusolverDnHandle(int device) { - if (cudaSetDevice(device) != cudaSuccess) { - throw std::runtime_error("Failed to set CUDA device."); + if (device >= 0) { + if (cudaSetDevice(device) != cudaSuccess) { + throw std::runtime_error("Failed to set CUDA device."); + } } cusolverDnHandle_t handle; CheckCusolverDnError(cusolverDnCreate(&handle)); diff --git a/dace/libraries/linalg/environments/cutensor.py b/dace/libraries/linalg/environments/cutensor.py index e3572a0673..0022ec1f57 100644 --- a/dace/libraries/linalg/environments/cutensor.py +++ b/dace/libraries/linalg/environments/cutensor.py @@ -24,7 +24,7 @@ class cuTensor: def handle_setup_code(node): location = node.location if not location or 
"gpu" not in node.location: - location = 0 + location = -1 # -1 means current device else: try: location = int(location["gpu"]) diff --git a/dace/libraries/linalg/include/dace_cutensor.h b/dace/libraries/linalg/include/dace_cutensor.h index 8079892285..ddad2feaa3 100644 --- a/dace/libraries/linalg/include/dace_cutensor.h +++ b/dace/libraries/linalg/include/dace_cutensor.h @@ -20,8 +20,10 @@ static void CheckCuTensorError(cutensorStatus_t const& status) { } static cutensorHandle_t CreateCuTensorHandle(int device) { - if (cudaSetDevice(device) != cudaSuccess) { - throw std::runtime_error("Failed to set CUDA device."); + if (device >= 0) { + if (cudaSetDevice(device) != cudaSuccess) { + throw std::runtime_error("Failed to set CUDA device."); + } } cutensorHandle_t handle; CheckCuTensorError(cutensorInit(&handle)); diff --git a/dace/libraries/sparse/environments/cusparse.py b/dace/libraries/sparse/environments/cusparse.py index 0970557944..a731f75bf7 100644 --- a/dace/libraries/sparse/environments/cusparse.py +++ b/dace/libraries/sparse/environments/cusparse.py @@ -24,7 +24,7 @@ class cuSPARSE: def handle_setup_code(node): location = node.location if not location or "gpu" not in node.location: - location = 0 + location = -1 # -1 means current device else: try: location = int(location["gpu"]) diff --git a/dace/libraries/sparse/include/dace_cusparse.h b/dace/libraries/sparse/include/dace_cusparse.h index 82470089e0..9d28bb4748 100644 --- a/dace/libraries/sparse/include/dace_cusparse.h +++ b/dace/libraries/sparse/include/dace_cusparse.h @@ -20,8 +20,10 @@ static void CheckCusparseError(cusparseStatus_t const& status) { } static cusparseHandle_t CreateCusparseHandle(int device) { - if (cudaSetDevice(device) != cudaSuccess) { - throw std::runtime_error("Failed to set CUDA device."); + if (device >= 0) { + if (cudaSetDevice(device) != cudaSuccess) { + throw std::runtime_error("Failed to set CUDA device."); + } } cusparseHandle_t handle; CheckCusparseError(cusparseCreate(&handle)); From c34de8e3336343b0f11bddd0b61099ab1f22eb47 Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Sat, 2 Sep 2023 15:34:08 +0200 Subject: [PATCH 368/392] TaskletFusion: Fix additional edges in case of none-connectors --- .../transformation/dataflow/tasklet_fusion.py | 3 ++ tests/transformations/tasklet_fusion_test.py | 44 +++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/dace/transformation/dataflow/tasklet_fusion.py b/dace/transformation/dataflow/tasklet_fusion.py index 99f8f625be..d6b4a3039b 100644 --- a/dace/transformation/dataflow/tasklet_fusion.py +++ b/dace/transformation/dataflow/tasklet_fusion.py @@ -249,6 +249,9 @@ def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG): t1.language) for in_edge in graph.in_edges(t1): + if in_edge.src_conn is None and isinstance(in_edge.src, dace.nodes.EntryNode): + if len(new_tasklet.in_connectors) > 0: + continue graph.add_edge(in_edge.src, in_edge.src_conn, new_tasklet, in_edge.dst_conn, in_edge.data) for in_edge in graph.in_edges(t2): diff --git a/tests/transformations/tasklet_fusion_test.py b/tests/transformations/tasklet_fusion_test.py index c7fd6802d5..743010e8c9 100644 --- a/tests/transformations/tasklet_fusion_test.py +++ b/tests/transformations/tasklet_fusion_test.py @@ -213,6 +213,49 @@ def test_map_with_tasklets(language: str, with_data: bool): ref = map_with_tasklets.f(A, B) assert (np.allclose(C, ref)) +def test_none_connector(): + @dace.program + def sdfg_none_connector(A: dace.float32[32], B: dace.float32[32]): + tmp = dace.define_local([32], 
dace.float32) + for i in dace.map[0:32]: + with dace.tasklet: + a >> tmp[i] + a = 0 + + tmp2 = dace.define_local([32], dace.float32) + for i in dace.map[0:32]: + with dace.tasklet: + a << A[i] + b >> tmp2[i] + b = a + 1 + + + for i in dace.map[0:32]: + with dace.tasklet: + a << tmp[i] + b << tmp2[i] + c >> B[i] + c = a + b + + sdfg = sdfg_none_connector.to_sdfg() + sdfg.simplify() + applied = sdfg.apply_transformations_repeated(MapFusion) + assert applied == 2 + + map_entry = None + for node in sdfg.start_state.nodes(): + if isinstance(node, dace.nodes.MapEntry): + map_entry = node + break + + assert map_entry is not None + assert len([edge.src_conn for edge in sdfg.start_state.out_edges(map_entry) if edge.src_conn is None]) == 1 + + applied = sdfg.apply_transformations_repeated(TaskletFusion) + assert applied == 2 + + assert sdfg.start_state.out_degree(map_entry) == 1 + assert len([edge.src_conn for edge in sdfg.start_state.out_edges(map_entry) if edge.src_conn is None]) == 0 if __name__ == '__main__': test_basic() @@ -224,3 +267,4 @@ def test_map_with_tasklets(language: str, with_data: bool): test_map_with_tasklets(language='Python', with_data=True) test_map_with_tasklets(language='CPP', with_data=False) test_map_with_tasklets(language='CPP', with_data=True) + test_none_connector() From f95f8162a4e77d7a386ccd20c9e4ef71a3ad9787 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 4 Sep 2023 23:58:33 -0700 Subject: [PATCH 369/392] Fix dynamic memlet propagation condition (#1364) --- dace/sdfg/propagation.py | 4 ++-- tests/python_frontend/argument_test.py | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/propagation.py b/dace/sdfg/propagation.py index 0fec4812b7..0554775dcd 100644 --- a/dace/sdfg/propagation.py +++ b/dace/sdfg/propagation.py @@ -1477,8 +1477,8 @@ def propagate_subset(memlets: List[Memlet], new_memlet.volume = simplify(sum(m.volume for m in memlets) * functools.reduce(lambda a, b: a * b, rng.size(), 1)) if any(m.dynamic for m in memlets): new_memlet.dynamic = True - elif symbolic.issymbolic(new_memlet.volume) and any(s not in defined_variables - for s in new_memlet.volume.free_symbols): + if symbolic.issymbolic(new_memlet.volume) and any(s not in defined_variables + for s in new_memlet.volume.free_symbols): new_memlet.dynamic = True new_memlet.volume = 0 diff --git a/tests/python_frontend/argument_test.py b/tests/python_frontend/argument_test.py index 1f43337eb8..cb47188029 100644 --- a/tests/python_frontend/argument_test.py +++ b/tests/python_frontend/argument_test.py @@ -2,6 +2,7 @@ import dace import pytest +import numpy as np N = dace.symbol('N') @@ -16,5 +17,29 @@ def test_extra_args(): imgcpy([[1, 2], [3, 4]], [[4, 3], [2, 1]], 0.0, 1.0) +def test_missing_arguments_regression(): + + def nester(a, b, T): + for i, j in dace.map[0:20, 0:20]: + start = 0 + end = min(T, 6) + + elem: dace.float64 = 0 + for ii in range(start, end): + if ii % 2 == 0: + elem += b[ii] + + a[j, i] = elem + + @dace.program + def tester(x: dace.float64[20, 20]): + gdx = np.ones((10, ), dace.float64) + for T in range(2): + nester(x, gdx, T) + + tester.to_sdfg().compile() + + if __name__ == '__main__': test_extra_args() + test_missing_arguments_regression() From 3e9390937f2823f96eb4a960930b0babe4cf3224 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 7 Sep 2023 14:02:46 -0700 Subject: [PATCH 370/392] cppunparse: Dispatch constants after applying the operation --- dace/codegen/cppunparse.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff 
--git a/dace/codegen/cppunparse.py b/dace/codegen/cppunparse.py index 31dae08f79..1121aa9f42 100644 --- a/dace/codegen/cppunparse.py +++ b/dace/codegen/cppunparse.py @@ -847,8 +847,16 @@ def _Tuple( self.write(")") unop = {"Invert": "~", "Not": "!", "UAdd": "+", "USub": "-"} + unop_lambda = {'Invert': (lambda x: ~x), 'Not': (lambda x: not x), 'UAdd': (lambda x: +x), 'USub': (lambda x: -x)} def _UnaryOp(self, t): + # Dispatch constants after applying the operation + if t.operand.__class__.__name__ in ('Constant', 'Num'): + newval = self.unop_lambda[t.op.__class__.__name__](t.operand.n) + newnode = ast.Constant(value=newval) + self.dispatch(newnode) + return + self.write("(") self.write(self.unop[t.op.__class__.__name__]) self.write(" ") From e4322d2eeeb8561f2ef99cc305c44737337af183 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 7 Sep 2023 14:13:16 -0700 Subject: [PATCH 371/392] Fix for Python version compatibility --- dace/codegen/cppunparse.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/dace/codegen/cppunparse.py b/dace/codegen/cppunparse.py index 1121aa9f42..77dd34d478 100644 --- a/dace/codegen/cppunparse.py +++ b/dace/codegen/cppunparse.py @@ -851,11 +851,18 @@ def _Tuple( def _UnaryOp(self, t): # Dispatch constants after applying the operation - if t.operand.__class__.__name__ in ('Constant', 'Num'): - newval = self.unop_lambda[t.op.__class__.__name__](t.operand.n) - newnode = ast.Constant(value=newval) - self.dispatch(newnode) - return + if sys.version_info[:2] < (3, 8): + if isinstance(t.operand, ast.Num): + newval = self.unop_lambda[t.op.__class__.__name__](t.operand.n) + newnode = ast.Num(n=newval) + self.dispatch(newnode) + return + else: + if isinstance(t.operand, ast.Constant): + newval = self.unop_lambda[t.op.__class__.__name__](t.operand.value) + newnode = ast.Constant(value=newval) + self.dispatch(newnode) + return self.write("(") self.write(self.unop[t.op.__class__.__name__]) From 427f467f01decf089b48b4929905ff81c006d2f7 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 14:43:19 +0200 Subject: [PATCH 372/392] Add Fortran AST transformation assigning to each node its parent scope --- dace/frontend/fortran/ast_internal_classes.py | 3 +- dace/frontend/fortran/ast_transforms.py | 35 +++++++++++- tests/fortran/parent_test.py | 54 +++++++++++++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 tests/fortran/parent_test.py diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py index 6bdfb61faf..9bf841ecfe 100644 --- a/dace/frontend/fortran/ast_internal_classes.py +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -1,5 +1,5 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -from typing import Any, List, Tuple, Type, TypeVar, Union, overload +from typing import Any, List, Optional, Tuple, Type, TypeVar, Union, overload # The node class is the base class for all nodes in the AST. It provides attributes including the line number and fields. # Attributes are not used when walking the tree, but are useful for debugging and for code generation. 
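# A short usage sketch of the parent-scope pass introduced by this patch (API names
# are taken from this patch series; the Fortran source is only illustrative):
from dace.frontend.fortran import fortran_parser
import dace.frontend.fortran.ast_transforms as ast_transforms

src = """
PROGRAM example
  implicit none
  double precision d(4)
  d(1) = 0
end
"""
ast, functions = fortran_parser.create_ast_from_string(src, "example")
ast_transforms.ParentScopeAssigner().visit(ast)
# Every node now carries a ``parent`` pointer to its enclosing program unit
# (main program, subroutine, or function); top-level units keep parent == None.
assert all(stmt.parent is ast.main_program for stmt in ast.main_program.execution_part.execution)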
@@ -11,6 +11,7 @@ def __init__(self, *args, **kwargs): # real signature unknown self.integrity_exceptions = [] self.read_vars = [] self.written_vars = [] + self.parent: Optional["FNode"] = None for k, v in kwargs.items(): setattr(self, k, v) diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index 7e5cd3bf00..b0196506ee 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -1,7 +1,7 @@ # Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. from dace.frontend.fortran import ast_components, ast_internal_classes -from typing import List, Tuple, Set +from typing import List, Optional, Tuple, Set import copy @@ -310,6 +310,39 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No return ast_internal_classes.Execution_Part_Node(execution=newbody) +class ParentScopeAssigner(NodeVisitor): + """ + For each node, it assigns its parent scope - program, subroutine, function. + + If the parent node is one of the "parent" types, we assign it as the parent. + Otherwise, we look for the parent of my parent to cover nested AST nodes within + a single scope. + """ + def __init__(self): + pass + + def visit(self, node: ast_internal_classes.FNode, parent_node: Optional[ast_internal_classes.FNode] = None): + + parent_node_types = [ + ast_internal_classes.Subroutine_Subprogram_Node, + ast_internal_classes.Function_Subprogram_Node, + ast_internal_classes.Main_Program_Node, + ast_internal_classes.Program_Node + ] + + if parent_node is not None and type(parent_node) in parent_node_types: + node.parent = parent_node + elif parent_node is not None: + node.parent = parent_node.parent + + # Copied from `generic_visit` to recursively parse all leafs + for field, value in iter_fields(node): + if isinstance(value, list): + for item in value: + if isinstance(item, ast_internal_classes.FNode): + self.visit(item, node) + elif isinstance(value, ast_internal_classes.FNode): + self.visit(value, node) class IndexExtractorNodeLister(NodeVisitor): """ diff --git a/tests/fortran/parent_test.py b/tests/fortran/parent_test.py new file mode 100644 index 0000000000..c3f0ce71b5 --- /dev/null +++ b/tests/fortran/parent_test.py @@ -0,0 +1,54 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. + +from dace.frontend.fortran import fortran_parser + +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + + +def test_fortran_frontend_parent(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. 
+ """ + test_string = """ + PROGRAM access_test + implicit none + double precision d(4) + d(1)=0 + CALL array_access_test_function(d) + end + + SUBROUTINE array_access_test_function(d) + double precision d(4) + + d(2)=5.5 + + END SUBROUTINE array_access_test_function + """ + ast, functions = fortran_parser.create_ast_from_string(test_string, "array_access_test") + ast_transforms.ParentScopeAssigner().visit(ast) + + assert ast.parent is None + assert ast.main_program.parent == ast + + main_program = ast.main_program + # Both executed lines + for execution in main_program.execution_part.execution: + assert execution.parent == main_program + # call to the function + call_node = main_program.execution_part.execution[1] + assert isinstance(call_node, ast_internal_classes.Call_Expr_Node) + for arg in call_node.args: + assert arg.parent == main_program + + for subroutine in ast.subroutine_definitions: + + assert subroutine.parent == ast + assert subroutine.execution_part.parent == subroutine + for execution in subroutine.execution_part.execution: + assert execution.parent == subroutine + + +if __name__ == "__main__": + + test_fortran_frontend_parent() From 0d19df257526a4a279b9cb278ae8ffcb21d34e54 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 14:44:56 +0200 Subject: [PATCH 373/392] Add new Fortran parser function to export pure AST, not SDFG --- dace/frontend/fortran/fortran_parser.py | 38 +++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index d7112892fe..b1041ac4eb 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -1015,6 +1015,40 @@ def vardecl2sdfg(self, node: ast_internal_classes.Var_Decl_Node, sdfg: SDFG): if node.name not in self.contexts[sdfg.name].containers: self.contexts[sdfg.name].containers.append(node.name) +def create_ast_from_string( + source_string: str, + sdfg_name: str, + transform: bool = False +): + """ + Creates an AST from a Fortran file in a string + :param source_string: The fortran file as a string + :param sdfg_name: The name to be given to the resulting SDFG + :return: The resulting AST + + """ + parser = pf().create(std="f2008") + reader = fsr(source_string) + ast = parser(reader) + tables = SymbolTable + own_ast = ast_components.InternalFortranAst(ast, tables) + program = own_ast.create_ast(ast) + + functions_and_subroutines_builder = ast_transforms.FindFunctionAndSubroutines() + functions_and_subroutines_builder.visit(program) + functions_and_subroutines = functions_and_subroutines_builder.nodes + + if transform: + program = ast_transforms.functionStatementEliminator(program) + program = ast_transforms.CallToArray(functions_and_subroutines_builder.nodes).visit(program) + program = ast_transforms.CallExtractor().visit(program) + program = ast_transforms.SignToIf().visit(program) + program = ast_transforms.ArrayToLoop().visit(program) + program = ast_transforms.SumToLoop().visit(program) + program = ast_transforms.ForDeclarer().visit(program) + program = ast_transforms.IndexExtractor().visit(program) + + return (program, functions_and_subroutines) def create_sdfg_from_string( source_string: str, @@ -1032,7 +1066,7 @@ def create_sdfg_from_string( ast = parser(reader) tables = SymbolTable own_ast = ast_components.InternalFortranAst(ast, tables) - program = own_ast.create_ast(ast) + program = own_ast.create_ast(ast, None) functions_and_subroutines_builder = 
ast_transforms.FindFunctionAndSubroutines() functions_and_subroutines_builder.visit(program) own_ast.functions_and_subroutines = functions_and_subroutines_builder.nodes @@ -1074,7 +1108,7 @@ def create_sdfg_from_fortran_file(source_string: str): ast = parser(reader) tables = SymbolTable own_ast = ast_components.InternalFortranAst(ast, tables) - program = own_ast.create_ast(ast) + program = own_ast.create_ast(ast, None) functions_and_subroutines_builder = ast_transforms.FindFunctionAndSubroutines() functions_and_subroutines_builder.visit(program) own_ast.functions_and_subroutines = functions_and_subroutines_builder.nodes From db11e939a4cc0ee0a7cbfa861a558dbdeca86555 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Mon, 24 Jul 2023 10:52:24 +0200 Subject: [PATCH 374/392] Support in Fortran frontend arrays with offset declaration --- dace/frontend/fortran/ast_components.py | 18 +++++- dace/frontend/fortran/ast_internal_classes.py | 1 + tests/fortran/index_offset_test.py | 60 +++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 tests/fortran/index_offset_test.py diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index a66ee5c0d6..97281ebd27 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -574,6 +574,7 @@ def type_declaration_stmt(self, node: FASTNode): alloc = False symbol = False + dimensions = None for i in attributes: if i.string.lower() == "allocatable": alloc = True @@ -591,16 +592,30 @@ def type_declaration_stmt(self, node: FASTNode): if len(array_sizes) == 1: array_sizes = array_sizes[0] size = [] + offset = [] for dim in array_sizes.children: #sanity check if isinstance(dim, f03.Explicit_Shape_Spec): dim_expr = [i for i in dim.children if i is not None] + # handle size definition if len(dim_expr) == 1: dim_expr = dim_expr[0] #now to add the dimension to the size list after processing it if necessary size.append(self.create_ast(dim_expr)) + offset.append(1) + elif len(dim_expr) == 2: + # extract offets + for expr in dim_expr: + if not isinstance(expr, f03.Int_Literal_Constant): + raise TypeError("Array offsets must be constant expressions!") + offset.append(int(dim_expr[0].tostr())) + + fortran_size = int(dim_expr[1].tostr()) - int(dim_expr[0].tostr()) + 1 + fortran_ast_size = f03.Int_Literal_Constant(str(fortran_size)) + + size.append(self.create_ast(fortran_ast_size)) else: - raise TypeError("Array dimension must be a single expression") + raise TypeError("Array dimension must be at most two expressions") #handle initializiation init = None @@ -637,6 +652,7 @@ def type_declaration_stmt(self, node: FASTNode): type=testtype, alloc=alloc, sizes=size, + offsets=offset, kind=kind, init=init, line_number=node.item.span)) diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py index 6bdfb61faf..daddfbe8ef 100644 --- a/dace/frontend/fortran/ast_internal_classes.py +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -199,6 +199,7 @@ class Symbol_Array_Decl_Node(Statement_Node): ) _fields = ( 'sizes', + 'offsets' 'typeref', 'init', ) diff --git a/tests/fortran/index_offset_test.py b/tests/fortran/index_offset_test.py new file mode 100644 index 0000000000..5e38a0adc6 --- /dev/null +++ b/tests/fortran/index_offset_test.py @@ -0,0 +1,60 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
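# Quick reference for the declaration forms exercised by this and the following patch
# (per the assertions in the tests below, both are expected to map to a 5-element
# DaCe array whose descriptor carries offset -1):
#
#     double precision d(50:54)                  ! size/offset on the entity declaration
#     double precision, dimension(50:54) :: d    ! size/offset via the dimension attribute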
+ +from fparser.common.readfortran import FortranStringReader +from fparser.common.readfortran import FortranFileReader +from fparser.two.parser import ParserFactory +import sys, os +import numpy as np +import pytest + +import dace +from dace import SDFG, SDFGState, instrument, nodes, dtypes, data, subsets, symbolic +from dace.frontend.fortran import fortran_parser +from fparser.two.symbol_table import SymbolTable +from dace.sdfg import utils as sdutil + +import dace.frontend.fortran.ast_components as ast_components +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_utils as ast_utils +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + +def test_fortran_frontend_index_offset(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. + """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision d(50:54) + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision d(50:54) + + do i=50,54 + d(i) = i * 2.0 + end do + + END SUBROUTINE index_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test") + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 1 + assert sdfg.data('d').shape[0] == 5 + assert len(sdfg.data('d').offset) == 1 + assert sdfg.data('d').offset[0] == -1 + + a = np.full([60], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(50,54): + # offset -1 is already added + assert a[i-1] == i * 2 + + +if __name__ == "__main__": + + #test_fortran_frontend_index_offset() + test_fortran_frontend_index_offset_dimensions() From fdd5a27997680a3f4385fde90470b57a604bbb72 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Mon, 24 Jul 2023 20:22:27 +0200 Subject: [PATCH 375/392] Support shape attribute specification in the Fortran frontend --- dace/frontend/fortran/ast_components.py | 112 +++++++++++++----- dace/frontend/fortran/ast_internal_classes.py | 1 + tests/fortran/index_offset_test.py | 44 ++++++- 3 files changed, 125 insertions(+), 32 deletions(-) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index 97281ebd27..4b48f81367 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -1,5 +1,6 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. from fparser.two.Fortran2008 import Fortran2008 as f08 +from fparser.two import Fortran2008 from fparser.two import Fortran2003 as f03 from fparser.two import symbol_table @@ -523,6 +524,31 @@ def declaration_type_spec(self, node: FASTNode): def assumed_shape_spec_list(self, node: FASTNode): return node + def parse_shape_specification(self, dim: f03.Explicit_Shape_Spec, size: List[FASTNode], offset: List[int]): + + dim_expr = [i for i in dim.children if i is not None] + + # handle size definition + if len(dim_expr) == 1: + dim_expr = dim_expr[0] + #now to add the dimension to the size list after processing it if necessary + size.append(self.create_ast(dim_expr)) + offset.append(1) + # Here we support arrays that have size declaration - with initial offset. 
+ elif len(dim_expr) == 2: + # extract offets + for expr in dim_expr: + if not isinstance(expr, f03.Int_Literal_Constant): + raise TypeError("Array offsets must be constant expressions!") + offset.append(int(dim_expr[0].tostr())) + + fortran_size = int(dim_expr[1].tostr()) - int(dim_expr[0].tostr()) + 1 + fortran_ast_size = f03.Int_Literal_Constant(str(fortran_size)) + + size.append(self.create_ast(fortran_ast_size)) + else: + raise TypeError("Array dimension must be at most two expressions") + def type_declaration_stmt(self, node: FASTNode): #decide if its a intrinsic variable type or a derived type @@ -574,18 +600,39 @@ def type_declaration_stmt(self, node: FASTNode): alloc = False symbol = False - dimensions = None + attr_size = None + attr_offset = None for i in attributes: if i.string.lower() == "allocatable": alloc = True if i.string.lower() == "parameter": symbol = True + if isinstance(i, Fortran2008.Attr_Spec_List): + + attr_size = [] + attr_offset = [] + sizes = get_child(get_child(i, ["Dimension_Attr_Spec"]), ["Explicit_Shape_Spec_List"]) + + for shape_spec in get_children(sizes, [f03.Explicit_Shape_Spec]): + print(shape_spec) + self.parse_shape_specification(shape_spec, attr_size, attr_offset) + print(sizes.children) + print(type(sizes)) + #print(sizes.children) + + #if len(i.children) > 0 and isinstance(i.children[0], f03.Dimension_Attr_Spec): + # print(i, dir(i), type(i.children[0]), dir(i.children[0])) + + #sizes = get_child(attributes, ["Attr_Spec_List"]) + #print(sizes) + vardecls = [] for var in names: #first handle dimensions size = None + offset = None var_components = self.create_children(var) array_sizes = get_children(var, "Explicit_Shape_Spec_List") actual_name = get_child(var_components, ast_internal_classes.Name_Node) @@ -596,26 +643,7 @@ def type_declaration_stmt(self, node: FASTNode): for dim in array_sizes.children: #sanity check if isinstance(dim, f03.Explicit_Shape_Spec): - dim_expr = [i for i in dim.children if i is not None] - # handle size definition - if len(dim_expr) == 1: - dim_expr = dim_expr[0] - #now to add the dimension to the size list after processing it if necessary - size.append(self.create_ast(dim_expr)) - offset.append(1) - elif len(dim_expr) == 2: - # extract offets - for expr in dim_expr: - if not isinstance(expr, f03.Int_Literal_Constant): - raise TypeError("Array offsets must be constant expressions!") - offset.append(int(dim_expr[0].tostr())) - - fortran_size = int(dim_expr[1].tostr()) - int(dim_expr[0].tostr()) + 1 - fortran_ast_size = f03.Int_Literal_Constant(str(fortran_size)) - - size.append(self.create_ast(fortran_ast_size)) - else: - raise TypeError("Array dimension must be at most two expressions") + self.parse_shape_specification(dim, size, offset) #handle initializiation init = None @@ -628,17 +656,30 @@ def type_declaration_stmt(self, node: FASTNode): raw_init = initialization.children[1] init = self.create_ast(raw_init) + print('t', symbol, size, attr_size) + print(offset, attr_offset) if symbol == False: - vardecls.append( - ast_internal_classes.Var_Decl_Node(name=actual_name.name, - type=testtype, - alloc=alloc, - sizes=size, - kind=kind, - line_number=node.item.span)) + if attr_size is None: + vardecls.append( + ast_internal_classes.Var_Decl_Node(name=actual_name.name, + type=testtype, + alloc=alloc, + sizes=size, + offsets=offset, + kind=kind, + line_number=node.item.span)) + else: + vardecls.append( + ast_internal_classes.Var_Decl_Node(name=actual_name.name, + type=testtype, + alloc=alloc, + sizes=attr_size, + 
offsets=attr_offset, + kind=kind, + line_number=node.item.span)) else: - if size is None: + if size is None and attr_size is None: self.symbols[actual_name.name] = init vardecls.append( ast_internal_classes.Symbol_Decl_Node(name=actual_name.name, @@ -646,6 +687,16 @@ def type_declaration_stmt(self, node: FASTNode): alloc=alloc, init=init, line_number=node.item.span)) + elif attr_size is not None: + vardecls.append( + ast_internal_classes.Symbol_Array_Decl_Node(name=actual_name.name, + type=testtype, + alloc=alloc, + sizes=attr_size, + offsets=attr_offset, + kind=kind, + init=init, + line_number=node.item.span)) else: vardecls.append( ast_internal_classes.Symbol_Array_Decl_Node(name=actual_name.name, @@ -656,7 +707,8 @@ def type_declaration_stmt(self, node: FASTNode): kind=kind, init=init, line_number=node.item.span)) - + #print(vardecls[0].sizes) + #print(vardecls[0].offsets) return ast_internal_classes.Decl_Stmt_Node(vardecl=vardecls, line_number=node.item.span) def entity_decl(self, node: FASTNode): diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py index daddfbe8ef..f9bf97ca08 100644 --- a/dace/frontend/fortran/ast_internal_classes.py +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -214,6 +214,7 @@ class Var_Decl_Node(Statement_Node): ) _fields = ( 'sizes', + 'offsets', 'typeref', 'init', ) diff --git a/tests/fortran/index_offset_test.py b/tests/fortran/index_offset_test.py index 5e38a0adc6..564df31634 100644 --- a/tests/fortran/index_offset_test.py +++ b/tests/fortran/index_offset_test.py @@ -18,6 +18,46 @@ import dace.frontend.fortran.ast_utils as ast_utils import dace.frontend.fortran.ast_internal_classes as ast_internal_classes +def test_fortran_frontend_index_offset_attributes(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. + """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(50:54) :: d + !double precision, dimension(5) :: d + !double precision d(50:54) + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + !double precision d(50:54) + !double precision d(5) + double precision, dimension(50:54) :: d + !double precision, intent(inout) :: d(50:54) + + do i=50,54 + d(i) = i * 2.0 + end do + + END SUBROUTINE index_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test") + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 1 + assert sdfg.data('d').shape[0] == 5 + assert len(sdfg.data('d').offset) == 1 + assert sdfg.data('d').offset[0] == -1 + + a = np.full([60], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(50,54): + # offset -1 is already added + assert a[i-1] == i * 2 + def test_fortran_frontend_index_offset(): """ Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. 
@@ -56,5 +96,5 @@ def test_fortran_frontend_index_offset(): if __name__ == "__main__": - #test_fortran_frontend_index_offset() - test_fortran_frontend_index_offset_dimensions() + test_fortran_frontend_index_offset() + test_fortran_frontend_index_offset_attributes() From da8f1d767e2f02a7c6082636625d695087d8c268 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Mon, 24 Jul 2023 20:50:31 +0200 Subject: [PATCH 376/392] Rename array attributes test --- dace/frontend/fortran/ast_components.py | 12 ---- ...ffset_test.py => array_attributes_test.py} | 56 +++++++++++++------ 2 files changed, 39 insertions(+), 29 deletions(-) rename tests/fortran/{index_offset_test.py => array_attributes_test.py} (65%) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index 4b48f81367..b11c970973 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -615,17 +615,7 @@ def type_declaration_stmt(self, node: FASTNode): sizes = get_child(get_child(i, ["Dimension_Attr_Spec"]), ["Explicit_Shape_Spec_List"]) for shape_spec in get_children(sizes, [f03.Explicit_Shape_Spec]): - print(shape_spec) self.parse_shape_specification(shape_spec, attr_size, attr_offset) - print(sizes.children) - print(type(sizes)) - #print(sizes.children) - - #if len(i.children) > 0 and isinstance(i.children[0], f03.Dimension_Attr_Spec): - # print(i, dir(i), type(i.children[0]), dir(i.children[0])) - - #sizes = get_child(attributes, ["Attr_Spec_List"]) - #print(sizes) vardecls = [] @@ -656,8 +646,6 @@ def type_declaration_stmt(self, node: FASTNode): raw_init = initialization.children[1] init = self.create_ast(raw_init) - print('t', symbol, size, attr_size) - print(offset, attr_offset) if symbol == False: if attr_size is None: diff --git a/tests/fortran/index_offset_test.py b/tests/fortran/array_attributes_test.py similarity index 65% rename from tests/fortran/index_offset_test.py rename to tests/fortran/array_attributes_test.py index 564df31634..1ccb3c5f57 100644 --- a/tests/fortran/index_offset_test.py +++ b/tests/fortran/array_attributes_test.py @@ -1,24 +1,45 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -from fparser.common.readfortran import FortranStringReader -from fparser.common.readfortran import FortranFileReader -from fparser.two.parser import ParserFactory -import sys, os import numpy as np -import pytest -import dace -from dace import SDFG, SDFGState, instrument, nodes, dtypes, data, subsets, symbolic from dace.frontend.fortran import fortran_parser -from fparser.two.symbol_table import SymbolTable -from dace.sdfg import utils as sdutil -import dace.frontend.fortran.ast_components as ast_components -import dace.frontend.fortran.ast_transforms as ast_transforms -import dace.frontend.fortran.ast_utils as ast_utils -import dace.frontend.fortran.ast_internal_classes as ast_internal_classes +def test_fortran_frontend_array_attribute_no_offset(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. 
+ """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(5) :: d + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision, dimension(5) :: d + + do i=1,5 + d(i) = i * 2.0 + end do + + END SUBROUTINE index_test_function + """ + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test") + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 1 + assert sdfg.data('d').shape[0] == 5 + assert len(sdfg.data('d').offset) == 1 + assert sdfg.data('d').offset[0] == -1 + + a = np.full([5], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(1,5): + # offset -1 is already added + assert a[i-1] == i * 2 -def test_fortran_frontend_index_offset_attributes(): +def test_fortran_frontend_array_attribute_offset(): """ Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. """ @@ -58,7 +79,7 @@ def test_fortran_frontend_index_offset_attributes(): # offset -1 is already added assert a[i-1] == i * 2 -def test_fortran_frontend_index_offset(): +def test_fortran_frontend_array_offset(): """ Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. """ @@ -96,5 +117,6 @@ def test_fortran_frontend_index_offset(): if __name__ == "__main__": - test_fortran_frontend_index_offset() - test_fortran_frontend_index_offset_attributes() + test_fortran_frontend_array_offset() + test_fortran_frontend_array_attribute_no_offset() + test_fortran_frontend_array_attribute_offset() From a32346855c15f58a48eee625fcb6852f1926edee Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Mon, 14 Aug 2023 18:41:54 +0200 Subject: [PATCH 377/392] Remove old code --- dace/frontend/fortran/ast_components.py | 2 -- tests/fortran/array_attributes_test.py | 5 ----- 2 files changed, 7 deletions(-) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index b11c970973..492c819322 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -695,8 +695,6 @@ def type_declaration_stmt(self, node: FASTNode): kind=kind, init=init, line_number=node.item.span)) - #print(vardecls[0].sizes) - #print(vardecls[0].offsets) return ast_internal_classes.Decl_Stmt_Node(vardecl=vardecls, line_number=node.item.span) def entity_decl(self, node: FASTNode): diff --git a/tests/fortran/array_attributes_test.py b/tests/fortran/array_attributes_test.py index 1ccb3c5f57..af433905bc 100644 --- a/tests/fortran/array_attributes_test.py +++ b/tests/fortran/array_attributes_test.py @@ -47,16 +47,11 @@ def test_fortran_frontend_array_attribute_offset(): PROGRAM index_offset_test implicit none double precision, dimension(50:54) :: d - !double precision, dimension(5) :: d - !double precision d(50:54) CALL index_test_function(d) end SUBROUTINE index_test_function(d) - !double precision d(50:54) - !double precision d(5) double precision, dimension(50:54) :: d - !double precision, intent(inout) :: d(50:54) do i=50,54 d(i) = i * 2.0 From 1a148fe354fe722a17776dada474d28cd2529e6e Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Mon, 14 Aug 2023 19:29:11 +0200 Subject: [PATCH 378/392] Fix handling of non-dimensional attributes in Fortran frontend --- dace/frontend/fortran/ast_components.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index 492c819322..1e5bfb4528 100644 --- 
a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -610,9 +610,13 @@ def type_declaration_stmt(self, node: FASTNode): if isinstance(i, Fortran2008.Attr_Spec_List): + dimension_spec = get_children(i, "Dimension_Attr_Spec") + if len(dimension_spec) == 0: + continue + attr_size = [] attr_offset = [] - sizes = get_child(get_child(i, ["Dimension_Attr_Spec"]), ["Explicit_Shape_Spec_List"]) + sizes = get_child(dimension_spec[0], ["Explicit_Shape_Spec_List"]) for shape_spec in get_children(sizes, [f03.Explicit_Shape_Spec]): self.parse_shape_specification(shape_spec, attr_size, attr_offset) From 5cfbed3292080545b5340184d6feefd425ad20ea Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 15:27:36 +0200 Subject: [PATCH 379/392] Add Fortran AST pass to gather all variable declarations inside a scope --- dace/frontend/fortran/ast_internal_classes.py | 8 +++- dace/frontend/fortran/ast_transforms.py | 27 +++++++++-- tests/fortran/parent_test.py | 4 +- tests/fortran/scope_arrays.py | 47 +++++++++++++++++++ 4 files changed, 80 insertions(+), 6 deletions(-) create mode 100644 tests/fortran/scope_arrays.py diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py index ffa3cd2d76..171b941858 100644 --- a/dace/frontend/fortran/ast_internal_classes.py +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -11,7 +11,13 @@ def __init__(self, *args, **kwargs): # real signature unknown self.integrity_exceptions = [] self.read_vars = [] self.written_vars = [] - self.parent: Optional["FNode"] = None + self.parent: Optional[ + Union[ + Subroutine_Subprogram_Node, + Function_Subprogram_Node, + Main_Program_Node + ] + ] = None for k, v in kwargs.items(): setattr(self, k, v) diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index b0196506ee..efeac3a430 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -1,7 +1,7 @@ # Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. from dace.frontend.fortran import ast_components, ast_internal_classes -from typing import List, Optional, Tuple, Set +from typing import Dict, List, Optional, Tuple, Set import copy @@ -326,8 +326,7 @@ def visit(self, node: ast_internal_classes.FNode, parent_node: Optional[ast_inte parent_node_types = [ ast_internal_classes.Subroutine_Subprogram_Node, ast_internal_classes.Function_Subprogram_Node, - ast_internal_classes.Main_Program_Node, - ast_internal_classes.Program_Node + ast_internal_classes.Main_Program_Node ] if parent_node is not None and type(parent_node) in parent_node_types: @@ -344,6 +343,28 @@ def visit(self, node: ast_internal_classes.FNode, parent_node: Optional[ast_inte elif isinstance(value, ast_internal_classes.FNode): self.visit(value, node) +class ScopeVarsDeclarations(NodeVisitor): + """ + Creates a mapping (scope name, variable name) -> variable declaration. + + The visitor is used to access information on variable dimension, sizes, and offsets. 
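+
+    A rough usage sketch (here `ast` stands for an already parsed internal AST, and the
+    scope and variable names are placeholders):
+
+        ParentScopeAssigner().visit(ast)
+        declarations = ScopeVarsDeclarations()
+        declarations.visit(ast)
+        decl = declarations.scope_vars[('some_subroutine', 'd')]  # Var_Decl_Node of `d`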
+ """ + + def __init__(self): + + self.scope_vars: Dict[Tuple[str, str], ast_internal_classes.FNode] = {} + + def visit_Var_Decl_Node(self, node: ast_internal_classes.Var_Decl_Node): + + if isinstance(node.parent, ast_internal_classes.Main_Program_Node): + parent_name = node.parent.name.name.name + else: + parent_name = node.parent.name.name + var_name = node.name + + self.scope_vars[(parent_name, var_name)] = node + + class IndexExtractorNodeLister(NodeVisitor): """ Finds all array subscript expressions in the AST node and its children that have to be extracted into independent expressions diff --git a/tests/fortran/parent_test.py b/tests/fortran/parent_test.py index c3f0ce71b5..e68f03db8c 100644 --- a/tests/fortran/parent_test.py +++ b/tests/fortran/parent_test.py @@ -29,7 +29,7 @@ def test_fortran_frontend_parent(): ast_transforms.ParentScopeAssigner().visit(ast) assert ast.parent is None - assert ast.main_program.parent == ast + assert ast.main_program.parent == None main_program = ast.main_program # Both executed lines @@ -43,7 +43,7 @@ def test_fortran_frontend_parent(): for subroutine in ast.subroutine_definitions: - assert subroutine.parent == ast + assert subroutine.parent == None assert subroutine.execution_part.parent == subroutine for execution in subroutine.execution_part.execution: assert execution.parent == subroutine diff --git a/tests/fortran/scope_arrays.py b/tests/fortran/scope_arrays.py new file mode 100644 index 0000000000..0eb0cf44b2 --- /dev/null +++ b/tests/fortran/scope_arrays.py @@ -0,0 +1,47 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. + +from dace.frontend.fortran import fortran_parser + +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + + +def test_fortran_frontend_parent(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. 
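+    More precisely, it checks that ScopeVarsDeclarations records every declaration under the
+    name of the program or subroutine that declares it.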
+ """ + test_string = """ + PROGRAM scope_test + implicit none + double precision d(4) + double precision, dimension(5) :: arr + double precision, dimension(50:54) :: arr3 + CALL scope_test_function(d) + end + + SUBROUTINE scope_test_function(d) + double precision d(4) + double precision, dimension(50:54) :: arr4 + + d(2)=5.5 + + END SUBROUTINE scope_test_function + """ + + ast, functions = fortran_parser.create_ast_from_string(test_string, "array_access_test") + ast_transforms.ParentScopeAssigner().visit(ast) + visitor = ast_transforms.ScopeVarsDeclarations() + visitor.visit(ast) + + for var in ['d', 'arr', 'arr3']: + assert ('scope_test', var) in visitor.scope_vars + assert isinstance(visitor.scope_vars[('scope_test', var)], ast_internal_classes.Var_Decl_Node) + assert visitor.scope_vars[('scope_test', var)].name == var + + for var in ['d', 'arr4']: + assert ('scope_test_function', var) in visitor.scope_vars + assert visitor.scope_vars[('scope_test_function', var)].name == var + +if __name__ == "__main__": + + test_fortran_frontend_parent() From 22965569ca72888b6d5032e1d330add1a3888bec Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 17:36:47 +0200 Subject: [PATCH 380/392] First implementation of the offset normalization pass --- dace/frontend/fortran/ast_transforms.py | 107 +++++++++++++++++++---- dace/frontend/fortran/fortran_parser.py | 37 ++++++-- tests/fortran/offset_normalizer.py | 109 ++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 25 deletions(-) create mode 100644 tests/fortran/offset_normalizer.py diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index efeac3a430..750bf2571b 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -1,5 +1,6 @@ # Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. 
+from sympy.matrices.expressions.slice import normalize from dace.frontend.fortran import ast_components, ast_internal_classes from typing import Dict, List, Optional, Tuple, Set import copy @@ -354,16 +355,56 @@ def __init__(self): self.scope_vars: Dict[Tuple[str, str], ast_internal_classes.FNode] = {} + def get_var(self, scope: ast_internal_classes.FNode, variable_name: str) -> ast_internal_classes.FNode: + return self.scope_vars[(self._scope_name(scope), variable_name)] + def visit_Var_Decl_Node(self, node: ast_internal_classes.Var_Decl_Node): - if isinstance(node.parent, ast_internal_classes.Main_Program_Node): - parent_name = node.parent.name.name.name - else: - parent_name = node.parent.name.name + parent_name = self._scope_name(node.parent) var_name = node.name - self.scope_vars[(parent_name, var_name)] = node + def _scope_name(self, scope: ast_internal_classes.FNode) -> str: + if isinstance(scope, ast_internal_classes.Main_Program_Node): + return scope.name.name.name + else: + return scope.name.name + + +class ArrayOffsetNormalizer(NodeTransformer): + """ + """ + def __init__(self, ast: ast_internal_classes.FNode): + + ParentScopeAssigner().visit(ast) + self.scope_vars = ScopeVarsDeclarations() + self.scope_vars.visit(ast) + + #def visit(self, node: ast_internal_classes.FNode): + # #print(node) + # return self.generic_visit(node) + + #def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + # print(node.name.name) + # return node + #if node.name.name in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh"]: + # return self.generic_visit(node) + #else: + # return node + + def visit_Array_Subscript_Node(self, node: ast_internal_classes.Array_Subscript_Node): + #print(node.name.name) + return node + # tmp = self.count + # new_indices = [] + # for i in node.indices: + # if isinstance(i, ast_internal_classes.ParDecl_Node): + # new_indices.append(i) + # else: + # new_indices.append(ast_internal_classes.Name_Node(name="tmp_index_" + str(tmp))) + # tmp = tmp + 1 + # self.count = tmp + # return ast_internal_classes.Array_Subscript_Node(name=node.name, indices=new_indices) class IndexExtractorNodeLister(NodeVisitor): """ @@ -390,9 +431,22 @@ class IndexExtractor(NodeTransformer): Uses the IndexExtractorNodeLister to find all array subscript expressions in the AST node and its children that have to be extracted into independent expressions It then creates a new temporary variable for each of them and replaces the index expression with the variable. + + Before parsing the AST, the transformation first runs: + - ParentScopeAssigner to ensure that each node knows its scope assigner. + - ScopeVarsDeclarations to aggregate all variable declarations for each function. 
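+
+    A minimal invocation sketch (mirroring the call added to fortran_parser.py below):
+
+        program = IndexExtractor(program, normalize_offsets=True).visit(program)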
""" - def __init__(self, count=0): + def __init__(self, ast: ast_internal_classes.FNode, normalize_offsets: bool = False, count=0): + self.count = count + self.normalize_offsets = normalize_offsets + + #self.variable_indices: Dict[] + + if normalize_offsets: + ParentScopeAssigner().visit(ast) + self.scope_vars = ScopeVarsDeclarations() + self.scope_vars.visit(ast) def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): if node.name.name in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh"]: @@ -421,9 +475,11 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No lister.visit(child) res = lister.nodes temp = self.count + + if res is not None: for j in res: - for i in j.indices: + for idx, i in enumerate(j.indices): if isinstance(i, ast_internal_classes.ParDecl_Node): continue else: @@ -437,16 +493,33 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No line_number=child.line_number) ], line_number=child.line_number)) - newbody.append( - ast_internal_classes.BinOp_Node( - op="=", - lval=ast_internal_classes.Name_Node(name=tmp_name), - rval=ast_internal_classes.BinOp_Node( - op="-", - lval=i, - rval=ast_internal_classes.Int_Literal_Node(value="1"), - line_number=child.line_number), - line_number=child.line_number)) + if self.normalize_offsets: + + var_name = child.lval.name.name + variable = self.scope_vars.get_var(child.parent, var_name) + offset = variable.offsets[idx] + + newbody.append( + ast_internal_classes.BinOp_Node( + op="=", + lval=ast_internal_classes.Name_Node(name=tmp_name), + rval=ast_internal_classes.BinOp_Node( + op="-", + lval=i, + rval=ast_internal_classes.Int_Literal_Node(value=str(offset)), + line_number=child.line_number), + line_number=child.line_number)) + else: + newbody.append( + ast_internal_classes.BinOp_Node( + op="=", + lval=ast_internal_classes.Name_Node(name=tmp_name), + rval=ast_internal_classes.BinOp_Node( + op="-", + lval=i, + rval=ast_internal_classes.Int_Literal_Node(value="1"), + line_number=child.line_number), + line_number=child.line_number)) newbody.append(self.visit(child)) return ast_internal_classes.Execution_Part_Node(execution=newbody) diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index b1041ac4eb..7f092a5f02 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -133,7 +133,7 @@ def translate(self, node: ast_internal_classes.FNode, sdfg: SDFG): for i in node: self.translate(i, sdfg) else: - warnings.warn("WARNING:", node.__class__.__name__) + warnings.warn(f"WARNING: {node.__class__.__name__}") def ast2sdfg(self, node: ast_internal_classes.Program_Node, sdfg: SDFG): """ @@ -1018,7 +1018,8 @@ def vardecl2sdfg(self, node: ast_internal_classes.Var_Decl_Node, sdfg: SDFG): def create_ast_from_string( source_string: str, sdfg_name: str, - transform: bool = False + transform: bool = False, + normalize_offsets: bool = False ): """ Creates an AST from a Fortran file in a string @@ -1046,13 +1047,33 @@ def create_ast_from_string( program = ast_transforms.ArrayToLoop().visit(program) program = ast_transforms.SumToLoop().visit(program) program = ast_transforms.ForDeclarer().visit(program) - program = ast_transforms.IndexExtractor().visit(program) + program = ast_transforms.IndexExtractor(program, normalize_offsets).visit(program) - return (program, functions_and_subroutines) + return (program, own_ast) + +def ast2sdfg(program, own_ast, sdfg_name: str): + + ast2sdfg = 
AST_translator(own_ast, __file__) + sdfg = SDFG(sdfg_name) + ast2sdfg.top_level = program + ast2sdfg.globalsdfg = sdfg + ast2sdfg.translate(program, sdfg) + + for node, parent in sdfg.all_nodes_recursive(): + if isinstance(node, nodes.NestedSDFG): + if 'test_function' in node.sdfg.name: + sdfg = node.sdfg + break + sdfg.parent = None + sdfg.parent_sdfg = None + sdfg.parent_nsdfg_node = None + sdfg.reset_sdfg_list() + return sdfg def create_sdfg_from_string( source_string: str, sdfg_name: str, + normalize_offsets: bool = False ): """ Creates an SDFG from a fortran file in a string @@ -1066,7 +1087,7 @@ def create_sdfg_from_string( ast = parser(reader) tables = SymbolTable own_ast = ast_components.InternalFortranAst(ast, tables) - program = own_ast.create_ast(ast, None) + program = own_ast.create_ast(ast) functions_and_subroutines_builder = ast_transforms.FindFunctionAndSubroutines() functions_and_subroutines_builder.visit(program) own_ast.functions_and_subroutines = functions_and_subroutines_builder.nodes @@ -1077,7 +1098,7 @@ def create_sdfg_from_string( program = ast_transforms.ArrayToLoop().visit(program) program = ast_transforms.SumToLoop().visit(program) program = ast_transforms.ForDeclarer().visit(program) - program = ast_transforms.IndexExtractor().visit(program) + program = ast_transforms.IndexExtractor(program, normalize_offsets).visit(program) ast2sdfg = AST_translator(own_ast, __file__) sdfg = SDFG(sdfg_name) ast2sdfg.top_level = program @@ -1108,7 +1129,7 @@ def create_sdfg_from_fortran_file(source_string: str): ast = parser(reader) tables = SymbolTable own_ast = ast_components.InternalFortranAst(ast, tables) - program = own_ast.create_ast(ast, None) + program = own_ast.create_ast(ast) functions_and_subroutines_builder = ast_transforms.FindFunctionAndSubroutines() functions_and_subroutines_builder.visit(program) own_ast.functions_and_subroutines = functions_and_subroutines_builder.nodes @@ -1119,7 +1140,7 @@ def create_sdfg_from_fortran_file(source_string: str): program = ast_transforms.ArrayToLoop().visit(program) program = ast_transforms.SumToLoop().visit(program) program = ast_transforms.ForDeclarer().visit(program) - program = ast_transforms.IndexExtractor().visit(program) + program = ast_transforms.IndexExtractor(program).visit(program) ast2sdfg = AST_translator(own_ast, __file__) sdfg = SDFG(source_string) ast2sdfg.top_level = program diff --git a/tests/fortran/offset_normalizer.py b/tests/fortran/offset_normalizer.py new file mode 100644 index 0000000000..101a47e59b --- /dev/null +++ b/tests/fortran/offset_normalizer.py @@ -0,0 +1,109 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. + +import numpy as np + +from dace.frontend.fortran import ast_transforms, fortran_parser + +def test_fortran_frontend_offset_normalizer_1d(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. + """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(50:54) :: d + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision, dimension(50:54) :: d + + do i=50,54 + d(i) = i * 2.0 + end do + !do i=50,54 + ! do j=10,15 + ! d(i, j) = i * 2.0 + ! !d(i, :) = i * 2.0 + ! 
end do + !end do + + END SUBROUTINE index_test_function + """ + + # Test to verify that offset is normalized correctly + ast, own_ast = fortran_parser.create_ast_from_string(test_string, "index_offset_test", True, True) + + for subroutine in ast.subroutine_definitions: + + loop = subroutine.execution_part.execution[1] + idx_assignment = loop.body.execution[1] + assert idx_assignment.rval.rval.value == "50" + + # Now test to verify it executes correctly + + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test", True) + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 1 + assert sdfg.data('d').shape[0] == 5 + + a = np.full([5], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(0,5): + assert a[i] == (50+i)* 2 + +def test_fortran_frontend_offset_normalizer_2d(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. + """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(50:54,7:9) :: d + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision, dimension(50:54,7:9) :: d + + do i=50,54 + do j=7,9 + d(i, j) = i * 2.0 + 3 * j + !d(i, :) = i * 2.0 + end do + end do + + END SUBROUTINE index_test_function + """ + + # Test to verify that offset is normalized correctly + ast, own_ast = fortran_parser.create_ast_from_string(test_string, "index_offset_test", True, True) + + #for subroutine in ast.subroutine_definitions: + + # loop = subroutine.execution_part.execution[1] + # idx_assignment = loop.body.execution[1] + # assert idx_assignment.rval.rval.value == "50" + + # Now test to verify it executes correctly + + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test", True) + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 2 + assert sdfg.data('d').shape[0] == 5 + assert sdfg.data('d').shape[1] == 3 + + a = np.full([5,3], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(0,5): + for j in range(0,3): + assert a[i, j] == (50+i) * 2 + 3 * (7 + j) + +if __name__ == "__main__": + + #test_fortran_frontend_offset_normalizer_1d() + test_fortran_frontend_offset_normalizer_2d() From 3f769829d44f2957067f214658b5afb41ef4dac8 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 14:43:19 +0200 Subject: [PATCH 381/392] Add Fortran AST transformation assigning to each node its parent scope --- dace/frontend/fortran/ast_internal_classes.py | 3 +- dace/frontend/fortran/ast_transforms.py | 35 +++++++++++- tests/fortran/parent_test.py | 54 +++++++++++++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 tests/fortran/parent_test.py diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py index f9bf97ca08..ffa3cd2d76 100644 --- a/dace/frontend/fortran/ast_internal_classes.py +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -1,5 +1,5 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. -from typing import Any, List, Tuple, Type, TypeVar, Union, overload +from typing import Any, List, Optional, Tuple, Type, TypeVar, Union, overload # The node class is the base class for all nodes in the AST. It provides attributes including the line number and fields. # Attributes are not used when walking the tree, but are useful for debugging and for code generation. 
@@ -11,6 +11,7 @@ def __init__(self, *args, **kwargs): # real signature unknown self.integrity_exceptions = [] self.read_vars = [] self.written_vars = [] + self.parent: Optional["FNode"] = None for k, v in kwargs.items(): setattr(self, k, v) diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index 7e5cd3bf00..b0196506ee 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -1,7 +1,7 @@ # Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. from dace.frontend.fortran import ast_components, ast_internal_classes -from typing import List, Tuple, Set +from typing import List, Optional, Tuple, Set import copy @@ -310,6 +310,39 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No return ast_internal_classes.Execution_Part_Node(execution=newbody) +class ParentScopeAssigner(NodeVisitor): + """ + For each node, it assigns its parent scope - program, subroutine, function. + + If the parent node is one of the "parent" types, we assign it as the parent. + Otherwise, we look for the parent of my parent to cover nested AST nodes within + a single scope. + """ + def __init__(self): + pass + + def visit(self, node: ast_internal_classes.FNode, parent_node: Optional[ast_internal_classes.FNode] = None): + + parent_node_types = [ + ast_internal_classes.Subroutine_Subprogram_Node, + ast_internal_classes.Function_Subprogram_Node, + ast_internal_classes.Main_Program_Node, + ast_internal_classes.Program_Node + ] + + if parent_node is not None and type(parent_node) in parent_node_types: + node.parent = parent_node + elif parent_node is not None: + node.parent = parent_node.parent + + # Copied from `generic_visit` to recursively parse all leafs + for field, value in iter_fields(node): + if isinstance(value, list): + for item in value: + if isinstance(item, ast_internal_classes.FNode): + self.visit(item, node) + elif isinstance(value, ast_internal_classes.FNode): + self.visit(value, node) class IndexExtractorNodeLister(NodeVisitor): """ diff --git a/tests/fortran/parent_test.py b/tests/fortran/parent_test.py new file mode 100644 index 0000000000..c3f0ce71b5 --- /dev/null +++ b/tests/fortran/parent_test.py @@ -0,0 +1,54 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. + +from dace.frontend.fortran import fortran_parser + +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + + +def test_fortran_frontend_parent(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. 
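+    In particular, it verifies that ParentScopeAssigner sets the enclosing program or
+    subroutine as the parent of every node in its execution part.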
+ """ + test_string = """ + PROGRAM access_test + implicit none + double precision d(4) + d(1)=0 + CALL array_access_test_function(d) + end + + SUBROUTINE array_access_test_function(d) + double precision d(4) + + d(2)=5.5 + + END SUBROUTINE array_access_test_function + """ + ast, functions = fortran_parser.create_ast_from_string(test_string, "array_access_test") + ast_transforms.ParentScopeAssigner().visit(ast) + + assert ast.parent is None + assert ast.main_program.parent == ast + + main_program = ast.main_program + # Both executed lines + for execution in main_program.execution_part.execution: + assert execution.parent == main_program + # call to the function + call_node = main_program.execution_part.execution[1] + assert isinstance(call_node, ast_internal_classes.Call_Expr_Node) + for arg in call_node.args: + assert arg.parent == main_program + + for subroutine in ast.subroutine_definitions: + + assert subroutine.parent == ast + assert subroutine.execution_part.parent == subroutine + for execution in subroutine.execution_part.execution: + assert execution.parent == subroutine + + +if __name__ == "__main__": + + test_fortran_frontend_parent() From 60e954764839db80f667d27f79f05a6239d113fa Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 14:44:56 +0200 Subject: [PATCH 382/392] Add new Fortran parser function to export pure AST, not SDFG --- dace/frontend/fortran/fortran_parser.py | 38 +++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index d7112892fe..b1041ac4eb 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -1015,6 +1015,40 @@ def vardecl2sdfg(self, node: ast_internal_classes.Var_Decl_Node, sdfg: SDFG): if node.name not in self.contexts[sdfg.name].containers: self.contexts[sdfg.name].containers.append(node.name) +def create_ast_from_string( + source_string: str, + sdfg_name: str, + transform: bool = False +): + """ + Creates an AST from a Fortran file in a string + :param source_string: The fortran file as a string + :param sdfg_name: The name to be given to the resulting SDFG + :return: The resulting AST + + """ + parser = pf().create(std="f2008") + reader = fsr(source_string) + ast = parser(reader) + tables = SymbolTable + own_ast = ast_components.InternalFortranAst(ast, tables) + program = own_ast.create_ast(ast) + + functions_and_subroutines_builder = ast_transforms.FindFunctionAndSubroutines() + functions_and_subroutines_builder.visit(program) + functions_and_subroutines = functions_and_subroutines_builder.nodes + + if transform: + program = ast_transforms.functionStatementEliminator(program) + program = ast_transforms.CallToArray(functions_and_subroutines_builder.nodes).visit(program) + program = ast_transforms.CallExtractor().visit(program) + program = ast_transforms.SignToIf().visit(program) + program = ast_transforms.ArrayToLoop().visit(program) + program = ast_transforms.SumToLoop().visit(program) + program = ast_transforms.ForDeclarer().visit(program) + program = ast_transforms.IndexExtractor().visit(program) + + return (program, functions_and_subroutines) def create_sdfg_from_string( source_string: str, @@ -1032,7 +1066,7 @@ def create_sdfg_from_string( ast = parser(reader) tables = SymbolTable own_ast = ast_components.InternalFortranAst(ast, tables) - program = own_ast.create_ast(ast) + program = own_ast.create_ast(ast, None) functions_and_subroutines_builder = 
ast_transforms.FindFunctionAndSubroutines() functions_and_subroutines_builder.visit(program) own_ast.functions_and_subroutines = functions_and_subroutines_builder.nodes @@ -1074,7 +1108,7 @@ def create_sdfg_from_fortran_file(source_string: str): ast = parser(reader) tables = SymbolTable own_ast = ast_components.InternalFortranAst(ast, tables) - program = own_ast.create_ast(ast) + program = own_ast.create_ast(ast, None) functions_and_subroutines_builder = ast_transforms.FindFunctionAndSubroutines() functions_and_subroutines_builder.visit(program) own_ast.functions_and_subroutines = functions_and_subroutines_builder.nodes From 17eaf5a27c70c373b39009fd79b59c7744a943ab Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 15:27:36 +0200 Subject: [PATCH 383/392] Add Fortran AST pass to gather all variable declarations inside a scope --- dace/frontend/fortran/ast_internal_classes.py | 8 +++- dace/frontend/fortran/ast_transforms.py | 27 +++++++++-- tests/fortran/parent_test.py | 4 +- tests/fortran/scope_arrays.py | 47 +++++++++++++++++++ 4 files changed, 80 insertions(+), 6 deletions(-) create mode 100644 tests/fortran/scope_arrays.py diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py index ffa3cd2d76..171b941858 100644 --- a/dace/frontend/fortran/ast_internal_classes.py +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -11,7 +11,13 @@ def __init__(self, *args, **kwargs): # real signature unknown self.integrity_exceptions = [] self.read_vars = [] self.written_vars = [] - self.parent: Optional["FNode"] = None + self.parent: Optional[ + Union[ + Subroutine_Subprogram_Node, + Function_Subprogram_Node, + Main_Program_Node + ] + ] = None for k, v in kwargs.items(): setattr(self, k, v) diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index b0196506ee..efeac3a430 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -1,7 +1,7 @@ # Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. from dace.frontend.fortran import ast_components, ast_internal_classes -from typing import List, Optional, Tuple, Set +from typing import Dict, List, Optional, Tuple, Set import copy @@ -326,8 +326,7 @@ def visit(self, node: ast_internal_classes.FNode, parent_node: Optional[ast_inte parent_node_types = [ ast_internal_classes.Subroutine_Subprogram_Node, ast_internal_classes.Function_Subprogram_Node, - ast_internal_classes.Main_Program_Node, - ast_internal_classes.Program_Node + ast_internal_classes.Main_Program_Node ] if parent_node is not None and type(parent_node) in parent_node_types: @@ -344,6 +343,28 @@ def visit(self, node: ast_internal_classes.FNode, parent_node: Optional[ast_inte elif isinstance(value, ast_internal_classes.FNode): self.visit(value, node) +class ScopeVarsDeclarations(NodeVisitor): + """ + Creates a mapping (scope name, variable name) -> variable declaration. + + The visitor is used to access information on variable dimension, sizes, and offsets. 
+ """ + + def __init__(self): + + self.scope_vars: Dict[Tuple[str, str], ast_internal_classes.FNode] = {} + + def visit_Var_Decl_Node(self, node: ast_internal_classes.Var_Decl_Node): + + if isinstance(node.parent, ast_internal_classes.Main_Program_Node): + parent_name = node.parent.name.name.name + else: + parent_name = node.parent.name.name + var_name = node.name + + self.scope_vars[(parent_name, var_name)] = node + + class IndexExtractorNodeLister(NodeVisitor): """ Finds all array subscript expressions in the AST node and its children that have to be extracted into independent expressions diff --git a/tests/fortran/parent_test.py b/tests/fortran/parent_test.py index c3f0ce71b5..e68f03db8c 100644 --- a/tests/fortran/parent_test.py +++ b/tests/fortran/parent_test.py @@ -29,7 +29,7 @@ def test_fortran_frontend_parent(): ast_transforms.ParentScopeAssigner().visit(ast) assert ast.parent is None - assert ast.main_program.parent == ast + assert ast.main_program.parent == None main_program = ast.main_program # Both executed lines @@ -43,7 +43,7 @@ def test_fortran_frontend_parent(): for subroutine in ast.subroutine_definitions: - assert subroutine.parent == ast + assert subroutine.parent == None assert subroutine.execution_part.parent == subroutine for execution in subroutine.execution_part.execution: assert execution.parent == subroutine diff --git a/tests/fortran/scope_arrays.py b/tests/fortran/scope_arrays.py new file mode 100644 index 0000000000..0eb0cf44b2 --- /dev/null +++ b/tests/fortran/scope_arrays.py @@ -0,0 +1,47 @@ +# Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. + +from dace.frontend.fortran import fortran_parser + +import dace.frontend.fortran.ast_transforms as ast_transforms +import dace.frontend.fortran.ast_internal_classes as ast_internal_classes + + +def test_fortran_frontend_parent(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. 
+ """ + test_string = """ + PROGRAM scope_test + implicit none + double precision d(4) + double precision, dimension(5) :: arr + double precision, dimension(50:54) :: arr3 + CALL scope_test_function(d) + end + + SUBROUTINE scope_test_function(d) + double precision d(4) + double precision, dimension(50:54) :: arr4 + + d(2)=5.5 + + END SUBROUTINE scope_test_function + """ + + ast, functions = fortran_parser.create_ast_from_string(test_string, "array_access_test") + ast_transforms.ParentScopeAssigner().visit(ast) + visitor = ast_transforms.ScopeVarsDeclarations() + visitor.visit(ast) + + for var in ['d', 'arr', 'arr3']: + assert ('scope_test', var) in visitor.scope_vars + assert isinstance(visitor.scope_vars[('scope_test', var)], ast_internal_classes.Var_Decl_Node) + assert visitor.scope_vars[('scope_test', var)].name == var + + for var in ['d', 'arr4']: + assert ('scope_test_function', var) in visitor.scope_vars + assert visitor.scope_vars[('scope_test_function', var)].name == var + +if __name__ == "__main__": + + test_fortran_frontend_parent() From 1be4754dfa16f3fb816643cd523de1804a8505d0 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 17:36:47 +0200 Subject: [PATCH 384/392] First implementation of the offset normalization pass --- dace/frontend/fortran/ast_transforms.py | 107 +++++++++++++++++++---- dace/frontend/fortran/fortran_parser.py | 37 ++++++-- tests/fortran/offset_normalizer.py | 109 ++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 25 deletions(-) create mode 100644 tests/fortran/offset_normalizer.py diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index efeac3a430..750bf2571b 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -1,5 +1,6 @@ # Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. 
+from sympy.matrices.expressions.slice import normalize from dace.frontend.fortran import ast_components, ast_internal_classes from typing import Dict, List, Optional, Tuple, Set import copy @@ -354,16 +355,56 @@ def __init__(self): self.scope_vars: Dict[Tuple[str, str], ast_internal_classes.FNode] = {} + def get_var(self, scope: ast_internal_classes.FNode, variable_name: str) -> ast_internal_classes.FNode: + return self.scope_vars[(self._scope_name(scope), variable_name)] + def visit_Var_Decl_Node(self, node: ast_internal_classes.Var_Decl_Node): - if isinstance(node.parent, ast_internal_classes.Main_Program_Node): - parent_name = node.parent.name.name.name - else: - parent_name = node.parent.name.name + parent_name = self._scope_name(node.parent) var_name = node.name - self.scope_vars[(parent_name, var_name)] = node + def _scope_name(self, scope: ast_internal_classes.FNode) -> str: + if isinstance(scope, ast_internal_classes.Main_Program_Node): + return scope.name.name.name + else: + return scope.name.name + + +class ArrayOffsetNormalizer(NodeTransformer): + """ + """ + def __init__(self, ast: ast_internal_classes.FNode): + + ParentScopeAssigner().visit(ast) + self.scope_vars = ScopeVarsDeclarations() + self.scope_vars.visit(ast) + + #def visit(self, node: ast_internal_classes.FNode): + # #print(node) + # return self.generic_visit(node) + + #def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + # print(node.name.name) + # return node + #if node.name.name in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh"]: + # return self.generic_visit(node) + #else: + # return node + + def visit_Array_Subscript_Node(self, node: ast_internal_classes.Array_Subscript_Node): + #print(node.name.name) + return node + # tmp = self.count + # new_indices = [] + # for i in node.indices: + # if isinstance(i, ast_internal_classes.ParDecl_Node): + # new_indices.append(i) + # else: + # new_indices.append(ast_internal_classes.Name_Node(name="tmp_index_" + str(tmp))) + # tmp = tmp + 1 + # self.count = tmp + # return ast_internal_classes.Array_Subscript_Node(name=node.name, indices=new_indices) class IndexExtractorNodeLister(NodeVisitor): """ @@ -390,9 +431,22 @@ class IndexExtractor(NodeTransformer): Uses the IndexExtractorNodeLister to find all array subscript expressions in the AST node and its children that have to be extracted into independent expressions It then creates a new temporary variable for each of them and replaces the index expression with the variable. + + Before parsing the AST, the transformation first runs: + - ParentScopeAssigner to ensure that each node knows its scope assigner. + - ScopeVarsDeclarations to aggregate all variable declarations for each function. 
""" - def __init__(self, count=0): + def __init__(self, ast: ast_internal_classes.FNode, normalize_offsets: bool = False, count=0): + self.count = count + self.normalize_offsets = normalize_offsets + + #self.variable_indices: Dict[] + + if normalize_offsets: + ParentScopeAssigner().visit(ast) + self.scope_vars = ScopeVarsDeclarations() + self.scope_vars.visit(ast) def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): if node.name.name in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh"]: @@ -421,9 +475,11 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No lister.visit(child) res = lister.nodes temp = self.count + + if res is not None: for j in res: - for i in j.indices: + for idx, i in enumerate(j.indices): if isinstance(i, ast_internal_classes.ParDecl_Node): continue else: @@ -437,16 +493,33 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No line_number=child.line_number) ], line_number=child.line_number)) - newbody.append( - ast_internal_classes.BinOp_Node( - op="=", - lval=ast_internal_classes.Name_Node(name=tmp_name), - rval=ast_internal_classes.BinOp_Node( - op="-", - lval=i, - rval=ast_internal_classes.Int_Literal_Node(value="1"), - line_number=child.line_number), - line_number=child.line_number)) + if self.normalize_offsets: + + var_name = child.lval.name.name + variable = self.scope_vars.get_var(child.parent, var_name) + offset = variable.offsets[idx] + + newbody.append( + ast_internal_classes.BinOp_Node( + op="=", + lval=ast_internal_classes.Name_Node(name=tmp_name), + rval=ast_internal_classes.BinOp_Node( + op="-", + lval=i, + rval=ast_internal_classes.Int_Literal_Node(value=str(offset)), + line_number=child.line_number), + line_number=child.line_number)) + else: + newbody.append( + ast_internal_classes.BinOp_Node( + op="=", + lval=ast_internal_classes.Name_Node(name=tmp_name), + rval=ast_internal_classes.BinOp_Node( + op="-", + lval=i, + rval=ast_internal_classes.Int_Literal_Node(value="1"), + line_number=child.line_number), + line_number=child.line_number)) newbody.append(self.visit(child)) return ast_internal_classes.Execution_Part_Node(execution=newbody) diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index b1041ac4eb..7f092a5f02 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -133,7 +133,7 @@ def translate(self, node: ast_internal_classes.FNode, sdfg: SDFG): for i in node: self.translate(i, sdfg) else: - warnings.warn("WARNING:", node.__class__.__name__) + warnings.warn(f"WARNING: {node.__class__.__name__}") def ast2sdfg(self, node: ast_internal_classes.Program_Node, sdfg: SDFG): """ @@ -1018,7 +1018,8 @@ def vardecl2sdfg(self, node: ast_internal_classes.Var_Decl_Node, sdfg: SDFG): def create_ast_from_string( source_string: str, sdfg_name: str, - transform: bool = False + transform: bool = False, + normalize_offsets: bool = False ): """ Creates an AST from a Fortran file in a string @@ -1046,13 +1047,33 @@ def create_ast_from_string( program = ast_transforms.ArrayToLoop().visit(program) program = ast_transforms.SumToLoop().visit(program) program = ast_transforms.ForDeclarer().visit(program) - program = ast_transforms.IndexExtractor().visit(program) + program = ast_transforms.IndexExtractor(program, normalize_offsets).visit(program) - return (program, functions_and_subroutines) + return (program, own_ast) + +def ast2sdfg(program, own_ast, sdfg_name: str): + + ast2sdfg = 
AST_translator(own_ast, __file__) + sdfg = SDFG(sdfg_name) + ast2sdfg.top_level = program + ast2sdfg.globalsdfg = sdfg + ast2sdfg.translate(program, sdfg) + + for node, parent in sdfg.all_nodes_recursive(): + if isinstance(node, nodes.NestedSDFG): + if 'test_function' in node.sdfg.name: + sdfg = node.sdfg + break + sdfg.parent = None + sdfg.parent_sdfg = None + sdfg.parent_nsdfg_node = None + sdfg.reset_sdfg_list() + return sdfg def create_sdfg_from_string( source_string: str, sdfg_name: str, + normalize_offsets: bool = False ): """ Creates an SDFG from a fortran file in a string @@ -1066,7 +1087,7 @@ def create_sdfg_from_string( ast = parser(reader) tables = SymbolTable own_ast = ast_components.InternalFortranAst(ast, tables) - program = own_ast.create_ast(ast, None) + program = own_ast.create_ast(ast) functions_and_subroutines_builder = ast_transforms.FindFunctionAndSubroutines() functions_and_subroutines_builder.visit(program) own_ast.functions_and_subroutines = functions_and_subroutines_builder.nodes @@ -1077,7 +1098,7 @@ def create_sdfg_from_string( program = ast_transforms.ArrayToLoop().visit(program) program = ast_transforms.SumToLoop().visit(program) program = ast_transforms.ForDeclarer().visit(program) - program = ast_transforms.IndexExtractor().visit(program) + program = ast_transforms.IndexExtractor(program, normalize_offsets).visit(program) ast2sdfg = AST_translator(own_ast, __file__) sdfg = SDFG(sdfg_name) ast2sdfg.top_level = program @@ -1108,7 +1129,7 @@ def create_sdfg_from_fortran_file(source_string: str): ast = parser(reader) tables = SymbolTable own_ast = ast_components.InternalFortranAst(ast, tables) - program = own_ast.create_ast(ast, None) + program = own_ast.create_ast(ast) functions_and_subroutines_builder = ast_transforms.FindFunctionAndSubroutines() functions_and_subroutines_builder.visit(program) own_ast.functions_and_subroutines = functions_and_subroutines_builder.nodes @@ -1119,7 +1140,7 @@ def create_sdfg_from_fortran_file(source_string: str): program = ast_transforms.ArrayToLoop().visit(program) program = ast_transforms.SumToLoop().visit(program) program = ast_transforms.ForDeclarer().visit(program) - program = ast_transforms.IndexExtractor().visit(program) + program = ast_transforms.IndexExtractor(program).visit(program) ast2sdfg = AST_translator(own_ast, __file__) sdfg = SDFG(source_string) ast2sdfg.top_level = program diff --git a/tests/fortran/offset_normalizer.py b/tests/fortran/offset_normalizer.py new file mode 100644 index 0000000000..101a47e59b --- /dev/null +++ b/tests/fortran/offset_normalizer.py @@ -0,0 +1,109 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. + +import numpy as np + +from dace.frontend.fortran import ast_transforms, fortran_parser + +def test_fortran_frontend_offset_normalizer_1d(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. + """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(50:54) :: d + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision, dimension(50:54) :: d + + do i=50,54 + d(i) = i * 2.0 + end do + !do i=50,54 + ! do j=10,15 + ! d(i, j) = i * 2.0 + ! !d(i, :) = i * 2.0 + ! 
end do + !end do + + END SUBROUTINE index_test_function + """ + + # Test to verify that offset is normalized correctly + ast, own_ast = fortran_parser.create_ast_from_string(test_string, "index_offset_test", True, True) + + for subroutine in ast.subroutine_definitions: + + loop = subroutine.execution_part.execution[1] + idx_assignment = loop.body.execution[1] + assert idx_assignment.rval.rval.value == "50" + + # Now test to verify it executes correctly + + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test", True) + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 1 + assert sdfg.data('d').shape[0] == 5 + + a = np.full([5], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(0,5): + assert a[i] == (50+i)* 2 + +def test_fortran_frontend_offset_normalizer_2d(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. + """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(50:54,7:9) :: d + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision, dimension(50:54,7:9) :: d + + do i=50,54 + do j=7,9 + d(i, j) = i * 2.0 + 3 * j + !d(i, :) = i * 2.0 + end do + end do + + END SUBROUTINE index_test_function + """ + + # Test to verify that offset is normalized correctly + ast, own_ast = fortran_parser.create_ast_from_string(test_string, "index_offset_test", True, True) + + #for subroutine in ast.subroutine_definitions: + + # loop = subroutine.execution_part.execution[1] + # idx_assignment = loop.body.execution[1] + # assert idx_assignment.rval.rval.value == "50" + + # Now test to verify it executes correctly + + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test", True) + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 2 + assert sdfg.data('d').shape[0] == 5 + assert sdfg.data('d').shape[1] == 3 + + a = np.full([5,3], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(0,5): + for j in range(0,3): + assert a[i, j] == (50+i) * 2 + 3 * (7 + j) + +if __name__ == "__main__": + + #test_fortran_frontend_offset_normalizer_1d() + test_fortran_frontend_offset_normalizer_2d() From 027f1e28f361a754cdd2e1666664f4637a31fe22 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 17:46:46 +0200 Subject: [PATCH 385/392] Remove dead and old code --- dace/frontend/fortran/ast_transforms.py | 40 +------------------------ dace/frontend/fortran/fortran_parser.py | 19 ------------ 2 files changed, 1 insertion(+), 58 deletions(-) diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index 750bf2571b..822024ffa7 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -1,6 +1,5 @@ # Copyright 2023 ETH Zurich and the DaCe authors. All rights reserved. 
-from sympy.matrices.expressions.slice import normalize from dace.frontend.fortran import ast_components, ast_internal_classes from typing import Dict, List, Optional, Tuple, Set import copy @@ -370,42 +369,6 @@ def _scope_name(self, scope: ast_internal_classes.FNode) -> str: else: return scope.name.name - -class ArrayOffsetNormalizer(NodeTransformer): - """ - """ - def __init__(self, ast: ast_internal_classes.FNode): - - ParentScopeAssigner().visit(ast) - self.scope_vars = ScopeVarsDeclarations() - self.scope_vars.visit(ast) - - #def visit(self, node: ast_internal_classes.FNode): - # #print(node) - # return self.generic_visit(node) - - #def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): - # print(node.name.name) - # return node - #if node.name.name in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh"]: - # return self.generic_visit(node) - #else: - # return node - - def visit_Array_Subscript_Node(self, node: ast_internal_classes.Array_Subscript_Node): - #print(node.name.name) - return node - # tmp = self.count - # new_indices = [] - # for i in node.indices: - # if isinstance(i, ast_internal_classes.ParDecl_Node): - # new_indices.append(i) - # else: - # new_indices.append(ast_internal_classes.Name_Node(name="tmp_index_" + str(tmp))) - # tmp = tmp + 1 - # self.count = tmp - # return ast_internal_classes.Array_Subscript_Node(name=node.name, indices=new_indices) - class IndexExtractorNodeLister(NodeVisitor): """ Finds all array subscript expressions in the AST node and its children that have to be extracted into independent expressions @@ -441,8 +404,6 @@ def __init__(self, ast: ast_internal_classes.FNode, normalize_offsets: bool = Fa self.count = count self.normalize_offsets = normalize_offsets - #self.variable_indices: Dict[] - if normalize_offsets: ParentScopeAssigner().visit(ast) self.scope_vars = ScopeVarsDeclarations() @@ -495,6 +456,7 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No line_number=child.line_number)) if self.normalize_offsets: + # Find the offset of a variable to which we are assigning var_name = child.lval.name.name variable = self.scope_vars.get_var(child.parent, var_name) offset = variable.offsets[idx] diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index 7f092a5f02..7253ec78e6 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -1051,25 +1051,6 @@ def create_ast_from_string( return (program, own_ast) -def ast2sdfg(program, own_ast, sdfg_name: str): - - ast2sdfg = AST_translator(own_ast, __file__) - sdfg = SDFG(sdfg_name) - ast2sdfg.top_level = program - ast2sdfg.globalsdfg = sdfg - ast2sdfg.translate(program, sdfg) - - for node, parent in sdfg.all_nodes_recursive(): - if isinstance(node, nodes.NestedSDFG): - if 'test_function' in node.sdfg.name: - sdfg = node.sdfg - break - sdfg.parent = None - sdfg.parent_sdfg = None - sdfg.parent_nsdfg_node = None - sdfg.reset_sdfg_list() - return sdfg - def create_sdfg_from_string( source_string: str, sdfg_name: str, From b6d9320fc4c1800ac4852f87e3815091429ea40d Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 17:47:01 +0200 Subject: [PATCH 386/392] Update the 2D offset normalizer tests to verify offsets on the AST level --- tests/fortran/offset_normalizer.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/fortran/offset_normalizer.py b/tests/fortran/offset_normalizer.py index 101a47e59b..1044deed79 100644 --- 
a/tests/fortran/offset_normalizer.py +++ b/tests/fortran/offset_normalizer.py @@ -81,11 +81,18 @@ def test_fortran_frontend_offset_normalizer_2d(): # Test to verify that offset is normalized correctly ast, own_ast = fortran_parser.create_ast_from_string(test_string, "index_offset_test", True, True) - #for subroutine in ast.subroutine_definitions: + for subroutine in ast.subroutine_definitions: + + loop = subroutine.execution_part.execution[1] + nested_loop = loop.body.execution[1] + + idx = nested_loop.body.execution[1] + assert idx.lval.name == 'tmp_index_0' + assert idx.rval.rval.value == "50" - # loop = subroutine.execution_part.execution[1] - # idx_assignment = loop.body.execution[1] - # assert idx_assignment.rval.rval.value == "50" + idx2 = nested_loop.body.execution[3] + assert idx2.lval.name == 'tmp_index_1' + assert idx2.rval.rval.value == "7" # Now test to verify it executes correctly @@ -105,5 +112,5 @@ def test_fortran_frontend_offset_normalizer_2d(): if __name__ == "__main__": - #test_fortran_frontend_offset_normalizer_1d() + test_fortran_frontend_offset_normalizer_1d() test_fortran_frontend_offset_normalizer_2d() From 379dadaec19d949c35d029aa6a2a3fe116633094 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 19:38:41 +0200 Subject: [PATCH 387/392] Fix handling of ArrayToLoop when involved arrays have offsets --- dace/frontend/fortran/ast_transforms.py | 70 +++++++++++++++++---- dace/frontend/fortran/fortran_parser.py | 12 ++-- tests/fortran/array_to_loop_offset.py | 84 +++++++++++++++++++++++++ 3 files changed, 147 insertions(+), 19 deletions(-) create mode 100644 tests/fortran/array_to_loop_offset.py diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index 822024ffa7..9ee11aa54d 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -735,6 +735,7 @@ def par_Decl_Range_Finder(node: ast_internal_classes.Array_Subscript_Node, rangepos: list, count: int, newbody: list, + scope_vars: ScopeVarsDeclarations, declaration=True, is_sum_to_loop=False): """ @@ -749,18 +750,54 @@ def par_Decl_Range_Finder(node: ast_internal_classes.Array_Subscript_Node, :return: Ranges, rangepos, newbody """ + def add_offset(original, offset: int): + + if offset != 0: + return ast_internal_classes.BinOp_Node( + lval=original, + op="+", + rval=ast_internal_classes.Int_Literal_Node(value=str(offset)) + ) + else: + return original + currentindex = 0 indices = [] - for i in node.indices: + offsets = scope_vars.get_var(node.parent, node.name.name).offsets + + for idx, i in enumerate(node.indices): if isinstance(i, ast_internal_classes.ParDecl_Node): + if i.type == "ALL": - ranges.append([ - ast_internal_classes.Int_Literal_Node(value="1"), - ast_internal_classes.Name_Range_Node(name="f2dace_MAX", - type="INTEGER", - arrname=node.name, - pos=currentindex) - ]) + + lower_boundary = None + if offsets[idx] != 0: + lower_boundary = ast_internal_classes.Int_Literal_Node(value=str(offsets[idx])) + else: + lower_boundary = ast_internal_classes.Int_Literal_Node(value="1"), + + upper_boundary = None + upper_boundary = ast_internal_classes.Name_Range_Node(name="f2dace_MAX", + type="INTEGER", + arrname=node.name, + pos=currentindex) + """ + When there's an offset, we add MAX_RANGE + offset. + But since the generated loop has `<=` condition, we need to subtract 1. 
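+                E.g., for an array declared as dimension(7:9), the stored offset is 7 and
+                the generated upper loop bound becomes f2dace_MAX + 7 - 1.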
+ """ + if offsets[idx] != 0: + upper_boundary = ast_internal_classes.BinOp_Node( + lval=upper_boundary, + op="+", + rval=ast_internal_classes.Int_Literal_Node(value=str(offsets[idx])) + ) + upper_boundary = ast_internal_classes.BinOp_Node( + lval=upper_boundary, + op="-", + rval=ast_internal_classes.Int_Literal_Node(value="1") + ) + ranges.append([lower_boundary, upper_boundary]) + else: ranges.append([i.range[0], i.range[1]]) rangepos.append(currentindex) @@ -782,9 +819,13 @@ class ArrayToLoop(NodeTransformer): """ Transforms the AST by removing array expressions and replacing them with loops """ - def __init__(self): + def __init__(self, ast): self.count = 0 + ParentScopeAssigner().visit(ast) + self.scope_vars = ScopeVarsDeclarations() + self.scope_vars.visit(ast) + def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): newbody = [] for child in node.execution: @@ -798,7 +839,7 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No val = child.rval ranges = [] rangepos = [] - par_Decl_Range_Finder(current, ranges, rangepos, self.count, newbody, True) + par_Decl_Range_Finder(current, ranges, rangepos, self.count, newbody, self.scope_vars, True) if res_range is not None and len(res_range) > 0: rvals = [i for i in mywalk(val) if isinstance(i, ast_internal_classes.Array_Subscript_Node)] @@ -806,7 +847,7 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No rangeposrval = [] rangesrval = [] - par_Decl_Range_Finder(i, rangesrval, rangeposrval, self.count, newbody, False) + par_Decl_Range_Finder(i, rangesrval, rangeposrval, self.count, newbody, self.scope_vars, False) for i, j in zip(ranges, rangesrval): if i != j: @@ -880,8 +921,11 @@ class SumToLoop(NodeTransformer): """ Transforms the AST by removing array sums and replacing them with loops """ - def __init__(self): + def __init__(self, ast): self.count = 0 + ParentScopeAssigner().visit(ast) + self.scope_vars = ScopeVarsDeclarations() + self.scope_vars.visit(ast) def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): newbody = [] @@ -900,7 +944,7 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No rangeposrval = [] rangesrval = [] - par_Decl_Range_Finder(val, rangesrval, rangeposrval, self.count, newbody, False, True) + par_Decl_Range_Finder(val, rangesrval, rangeposrval, self.count, newbody, self.scope_vars, False, True) range_index = 0 body = ast_internal_classes.BinOp_Node(lval=current, diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index 7253ec78e6..b15435f4ff 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -1044,8 +1044,8 @@ def create_ast_from_string( program = ast_transforms.CallToArray(functions_and_subroutines_builder.nodes).visit(program) program = ast_transforms.CallExtractor().visit(program) program = ast_transforms.SignToIf().visit(program) - program = ast_transforms.ArrayToLoop().visit(program) - program = ast_transforms.SumToLoop().visit(program) + program = ast_transforms.ArrayToLoop(program).visit(program) + program = ast_transforms.SumToLoop(program).visit(program) program = ast_transforms.ForDeclarer().visit(program) program = ast_transforms.IndexExtractor(program, normalize_offsets).visit(program) @@ -1076,8 +1076,8 @@ def create_sdfg_from_string( program = ast_transforms.CallToArray(functions_and_subroutines_builder.nodes).visit(program) program = 
ast_transforms.CallExtractor().visit(program) program = ast_transforms.SignToIf().visit(program) - program = ast_transforms.ArrayToLoop().visit(program) - program = ast_transforms.SumToLoop().visit(program) + program = ast_transforms.ArrayToLoop(program).visit(program) + program = ast_transforms.SumToLoop(program).visit(program) program = ast_transforms.ForDeclarer().visit(program) program = ast_transforms.IndexExtractor(program, normalize_offsets).visit(program) ast2sdfg = AST_translator(own_ast, __file__) @@ -1118,8 +1118,8 @@ def create_sdfg_from_fortran_file(source_string: str): program = ast_transforms.CallToArray(functions_and_subroutines_builder.nodes).visit(program) program = ast_transforms.CallExtractor().visit(program) program = ast_transforms.SignToIf().visit(program) - program = ast_transforms.ArrayToLoop().visit(program) - program = ast_transforms.SumToLoop().visit(program) + program = ast_transforms.ArrayToLoop(program).visit(program) + program = ast_transforms.SumToLoop(program).visit(program) program = ast_transforms.ForDeclarer().visit(program) program = ast_transforms.IndexExtractor(program).visit(program) ast2sdfg = AST_translator(own_ast, __file__) diff --git a/tests/fortran/array_to_loop_offset.py b/tests/fortran/array_to_loop_offset.py new file mode 100644 index 0000000000..aa50d5606a --- /dev/null +++ b/tests/fortran/array_to_loop_offset.py @@ -0,0 +1,84 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. + +import numpy as np + +from dace.frontend.fortran import ast_transforms, fortran_parser + +def test_fortran_frontend_arr2loop_without_offset(): + """ + Tests that the generated array map correctly handles offsets. + """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(5,3) :: d + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision, dimension(5,3) :: d + + do i=1,5 + d(i, :) = i * 2.0 + end do + + END SUBROUTINE index_test_function + """ + + # Now test to verify it executes correctly with no offset normalization + + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test", False) + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 2 + assert sdfg.data('d').shape[0] == 5 + assert sdfg.data('d').shape[1] == 3 + + a = np.full([5,9], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(1,6): + for j in range(1,4): + assert a[i-1, j-1] == i * 2 + +def test_fortran_frontend_arr2loop_with_offset(): + """ + Tests that the generated array map correctly handles offsets. 
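+    The second dimension is declared as 7:9, so the implicit loop generated for d(i, :) is
+    expected to run over indices 7 through 9 rather than starting at 1.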
+ """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(5,7:9) :: d + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision, dimension(5,7:9) :: d + + do i=1,5 + d(i, :) = i * 2.0 + end do + + END SUBROUTINE index_test_function + """ + + # Now test to verify it executes correctly with no offset normalization + + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test", False) + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 2 + assert sdfg.data('d').shape[0] == 5 + assert sdfg.data('d').shape[1] == 3 + + a = np.full([5,9], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(1,6): + for j in range(7,10): + assert a[i-1, j-1] == i * 2 + +if __name__ == "__main__": + + test_fortran_frontend_arr2loop_with_offset() + test_fortran_frontend_arr2loop_without_offset() From c5ce575c0daad5350b51e5df3096febfa3a73975 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 19:52:05 +0200 Subject: [PATCH 388/392] Add test verifying a 1D ArrayToLoop transform with offsets --- tests/fortran/array_to_loop_offset.py | 39 +++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/tests/fortran/array_to_loop_offset.py b/tests/fortran/array_to_loop_offset.py index aa50d5606a..43d01d9b6b 100644 --- a/tests/fortran/array_to_loop_offset.py +++ b/tests/fortran/array_to_loop_offset.py @@ -41,7 +41,41 @@ def test_fortran_frontend_arr2loop_without_offset(): for j in range(1,4): assert a[i-1, j-1] == i * 2 -def test_fortran_frontend_arr2loop_with_offset(): +def test_fortran_frontend_arr2loop_1d_offset(): + """ + Tests that the generated array map correctly handles offsets. + """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(2:6) :: d + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision, dimension(2:6) :: d + + d(:) = 5 + + END SUBROUTINE index_test_function + """ + + # Now test to verify it executes correctly with no offset normalization + + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test", False) + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 1 + assert sdfg.data('d').shape[0] == 5 + + a = np.full([6], 42, order="F", dtype=np.float64) + sdfg(d=a) + assert a[0] == 42 + for i in range(2,7): + assert a[i-1] == 5 + +def test_fortran_frontend_arr2loop_2d_offset(): """ Tests that the generated array map correctly handles offsets. 
""" @@ -80,5 +114,6 @@ def test_fortran_frontend_arr2loop_with_offset(): if __name__ == "__main__": - test_fortran_frontend_arr2loop_with_offset() + test_fortran_frontend_arr2loop_1d_offset() + test_fortran_frontend_arr2loop_2d_offset() test_fortran_frontend_arr2loop_without_offset() From 243605144d6f90c78b8e60962f3749fd95b06a3c Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 19:58:16 +0200 Subject: [PATCH 389/392] Add test verifying that Fortran offset normalizer works for 1D and 2D arrays --- tests/fortran/offset_normalizer.py | 55 ++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/fortran/offset_normalizer.py b/tests/fortran/offset_normalizer.py index 1044deed79..26f29b9954 100644 --- a/tests/fortran/offset_normalizer.py +++ b/tests/fortran/offset_normalizer.py @@ -110,7 +110,62 @@ def test_fortran_frontend_offset_normalizer_2d(): for j in range(0,3): assert a[i, j] == (50+i) * 2 + 3 * (7 + j) +def test_fortran_frontend_offset_normalizer_2d_arr2loop(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. + """ + test_string = """ + PROGRAM index_offset_test + implicit none + double precision, dimension(50:54,7:9) :: d + CALL index_test_function(d) + end + + SUBROUTINE index_test_function(d) + double precision, dimension(50:54,7:9) :: d + + do i=50,54 + d(i, :) = i * 2.0 + end do + + END SUBROUTINE index_test_function + """ + + # Test to verify that offset is normalized correctly + ast, own_ast = fortran_parser.create_ast_from_string(test_string, "index_offset_test", True, True) + + for subroutine in ast.subroutine_definitions: + + loop = subroutine.execution_part.execution[1] + nested_loop = loop.body.execution[1] + + idx = nested_loop.body.execution[1] + assert idx.lval.name == 'tmp_index_0' + assert idx.rval.rval.value == "50" + + idx2 = nested_loop.body.execution[3] + assert idx2.lval.name == 'tmp_index_1' + assert idx2.rval.rval.value == "7" + + # Now test to verify it executes correctly with no normalization + + sdfg = fortran_parser.create_sdfg_from_string(test_string, "index_offset_test", True) + sdfg.save('test.sdfg') + sdfg.simplify(verbose=True) + sdfg.compile() + + assert len(sdfg.data('d').shape) == 2 + assert sdfg.data('d').shape[0] == 5 + assert sdfg.data('d').shape[1] == 3 + + a = np.full([5,3], 42, order="F", dtype=np.float64) + sdfg(d=a) + for i in range(0,5): + for j in range(0,3): + assert a[i, j] == (50 + i) * 2 + if __name__ == "__main__": test_fortran_frontend_offset_normalizer_1d() test_fortran_frontend_offset_normalizer_2d() + test_fortran_frontend_offset_normalizer_2d_arr2loop() From ec77693e25895ef0d0015ed5f78be2eb19eaa21c Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 20:09:00 +0200 Subject: [PATCH 390/392] Adjust offsets in Array2Loop only when it has offset different than default's 1 --- dace/frontend/fortran/ast_transforms.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index 9ee11aa54d..6feab88bb4 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -771,12 +771,11 @@ def add_offset(original, offset: int): if i.type == "ALL": lower_boundary = None - if offsets[idx] != 0: + if offsets[idx] != 1: lower_boundary = ast_internal_classes.Int_Literal_Node(value=str(offsets[idx])) else: - lower_boundary = ast_internal_classes.Int_Literal_Node(value="1"), + lower_boundary = 
ast_internal_classes.Int_Literal_Node(value="1") - upper_boundary = None upper_boundary = ast_internal_classes.Name_Range_Node(name="f2dace_MAX", type="INTEGER", arrname=node.name, @@ -785,7 +784,7 @@ def add_offset(original, offset: int): When there's an offset, we add MAX_RANGE + offset. But since the generated loop has `<=` condition, we need to subtract 1. """ - if offsets[idx] != 0: + if offsets[idx] != 1: upper_boundary = ast_internal_classes.BinOp_Node( lval=upper_boundary, op="+", From b37c1f505bae3deea4abc21b569e29fa3f36a3a2 Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 20:16:11 +0200 Subject: [PATCH 391/392] Remove dead code --- dace/frontend/fortran/ast_transforms.py | 11 ----------- tests/fortran/offset_normalizer.py | 7 ------- 2 files changed, 18 deletions(-) diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index 6feab88bb4..24ac6edeca 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -750,17 +750,6 @@ def par_Decl_Range_Finder(node: ast_internal_classes.Array_Subscript_Node, :return: Ranges, rangepos, newbody """ - def add_offset(original, offset: int): - - if offset != 0: - return ast_internal_classes.BinOp_Node( - lval=original, - op="+", - rval=ast_internal_classes.Int_Literal_Node(value=str(offset)) - ) - else: - return original - currentindex = 0 indices = [] offsets = scope_vars.get_var(node.parent, node.name.name).offsets diff --git a/tests/fortran/offset_normalizer.py b/tests/fortran/offset_normalizer.py index 26f29b9954..b4138c1cac 100644 --- a/tests/fortran/offset_normalizer.py +++ b/tests/fortran/offset_normalizer.py @@ -21,12 +21,6 @@ def test_fortran_frontend_offset_normalizer_1d(): do i=50,54 d(i) = i * 2.0 end do - !do i=50,54 - ! do j=10,15 - ! d(i, j) = i * 2.0 - ! !d(i, :) = i * 2.0 - ! 
end do - !end do END SUBROUTINE index_test_function """ @@ -71,7 +65,6 @@ def test_fortran_frontend_offset_normalizer_2d(): do i=50,54 do j=7,9 d(i, j) = i * 2.0 + 3 * j - !d(i, :) = i * 2.0 end do end do From 70c33dd913376cd87b46887da51da2b5d939f10f Mon Sep 17 00:00:00 2001 From: Marcin Copik Date: Fri, 8 Sep 2023 23:04:41 +0200 Subject: [PATCH 392/392] Add support for Fortran modules in scope parent assignment pass --- dace/frontend/fortran/ast_internal_classes.py | 3 +- dace/frontend/fortran/ast_transforms.py | 3 +- tests/fortran/parent_test.py | 37 +++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/dace/frontend/fortran/ast_internal_classes.py b/dace/frontend/fortran/ast_internal_classes.py index 171b941858..70a43e21b8 100644 --- a/dace/frontend/fortran/ast_internal_classes.py +++ b/dace/frontend/fortran/ast_internal_classes.py @@ -15,7 +15,8 @@ def __init__(self, *args, **kwargs): # real signature unknown Union[ Subroutine_Subprogram_Node, Function_Subprogram_Node, - Main_Program_Node + Main_Program_Node, + Module_Node ] ] = None for k, v in kwargs.items(): diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index 24ac6edeca..e2a7246aed 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -326,7 +326,8 @@ def visit(self, node: ast_internal_classes.FNode, parent_node: Optional[ast_inte parent_node_types = [ ast_internal_classes.Subroutine_Subprogram_Node, ast_internal_classes.Function_Subprogram_Node, - ast_internal_classes.Main_Program_Node + ast_internal_classes.Main_Program_Node, + ast_internal_classes.Module_Node ] if parent_node is not None and type(parent_node) in parent_node_types: diff --git a/tests/fortran/parent_test.py b/tests/fortran/parent_test.py index e68f03db8c..b1d08eaf37 100644 --- a/tests/fortran/parent_test.py +++ b/tests/fortran/parent_test.py @@ -48,7 +48,44 @@ def test_fortran_frontend_parent(): for execution in subroutine.execution_part.execution: assert execution.parent == subroutine +def test_fortran_frontend_module(): + """ + Tests that the Fortran frontend can parse array accesses and that the accessed indices are correct. + """ + test_string = """ + module test_module + implicit none + ! good enough approximation + integer, parameter :: pi = 4 + end module test_module + + PROGRAM access_test + implicit none + double precision d(4) + d(1)=0 + CALL array_access_test_function(d) + end + + SUBROUTINE array_access_test_function(d) + double precision d(4) + + d(2)=5.5 + + END SUBROUTINE array_access_test_function + """ + ast, functions = fortran_parser.create_ast_from_string(test_string, "array_access_test") + ast_transforms.ParentScopeAssigner().visit(ast) + + assert ast.parent is None + assert ast.main_program.parent == None + + module = ast.modules[0] + assert module.parent == None + specification = module.specification_part.specifications[0] + assert specification.parent == module + if __name__ == "__main__": test_fortran_frontend_parent() + test_fortran_frontend_module()
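
A note on the pass touched by the final commit above: it extends DaCe's scope-parent assignment so that Fortran modules, not only programs, subroutines, and functions, act as scope parents. As a rough illustration of that pattern, here is a self-contained Python sketch; the Module, Subroutine, and Assignment classes are illustrative stand-ins rather than DaCe's ast_internal_classes, and only the traversal logic mirrors the pass exercised by parent_test.py.

from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Node:
    parent: Optional["Node"] = None
    children: List["Node"] = field(default_factory=list)


@dataclass
class Module(Node):
    name: str = ""


@dataclass
class Subroutine(Node):
    name: str = ""


@dataclass
class Assignment(Node):
    text: str = ""


# Only these node types open a new scope; patch 392 adds the module type to
# the corresponding parent_node_types list in ParentScopeAssigner.
SCOPE_TYPES = (Module, Subroutine)


def assign_parents(node: Node, enclosing_scope: Optional[Node] = None) -> None:
    """Record the nearest enclosing scope node on every node of the tree."""
    node.parent = enclosing_scope
    # A scope node becomes the enclosing scope for everything nested inside it;
    # all other nodes simply propagate the scope they received.
    next_scope = node if isinstance(node, SCOPE_TYPES) else enclosing_scope
    for child in node.children:
        assign_parents(child, next_scope)


if __name__ == "__main__":
    stmt = Assignment(text="d(2) = 5.5")
    sub = Subroutine(name="array_access_test_function", children=[stmt])
    mod = Module(name="test_module", children=[sub])

    assign_parents(mod)

    assert mod.parent is None    # a top-level module has no enclosing scope
    assert sub.parent is mod     # the subroutine's scope is the module
    assert stmt.parent is sub    # the statement's scope is the subroutine
    print("scope parents assigned correctly")

Keeping this as a single pre-order walk means later transforms such as ArrayToLoop and SumToLoop can look up a variable's declaration through node.parent without re-traversing the tree, which is why their constructors now run ParentScopeAssigner and ScopeVarsDeclarations before visiting the program.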