From 2408ad0816b2e2612902f45d399f7fa5b2a0b4d8 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz <budanaz.yakup@gmail.com>
Date: Tue, 10 Dec 2024 17:24:58 +0100
Subject: [PATCH] Improve validation and type checks and fix bugs

---
 dace/codegen/targets/cpu.py  |  2 +-
 dace/codegen/targets/cuda.py |  7 ++--
 dace/data.py                 |  2 +-
 dace/sdfg/sdfg.py            |  2 +-
 dace/sdfg/validation.py      |  4 ++
 tests/deferred_alloc_test.py | 72 +++++++++++++++++++++++++++++++-----
 6 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
index 85c51cb8b1..2172bbc0da 100644
--- a/dace/codegen/targets/cpu.py
+++ b/dace/codegen/targets/cpu.py
@@ -344,7 +344,7 @@ def declare_array(self,
                 size_desc_name = sdfg.arrays[name].size_desc_name
                 if size_desc_name is not None:
                     size_desc = sdfg.arrays[size_desc_name]
-                    size_ctypedef = dtypes.pointer(size_desc.dtype).ctype
+                    size_ctypedef = size_desc.dtype.ctype
                     self._dispatcher.declared_arrays.add(size_desc_name, DefinedType.Pointer, size_ctypedef)
             return
         elif nodedesc.storage is dtypes.StorageType.CPU_ThreadLocal:
diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index eb205d30c8..8bc51d0418 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1503,7 +1503,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
         # make dynamic map inputs constant
         # TODO move this into _get_const_params(dfg_scope)
         # Do not add src as const if the size is being red (src_conn is _read_size)
-        const_params |= set((str(e.src)) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry) if not e.src_conn.endswith("size"))
+        const_params |= set(str(e.src) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry) if e.src_conn != "_read_size")
 
         # Store init/exit code streams
         old_entry_stream = self.scope_entry_stream
@@ -1626,8 +1626,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
             for i in range(size_arr.shape[0]):
                 if f"__{arr_name}_dim{i}_size" not in dyn_args:
                     dyn_args.append(f"__{arr_name}_dim{i}_size")
-                    dyn_args_typed.append(f"const {arg.dtype.ctype} __{arr_name}_dim{i}_size")
-                    needed_size_scalars_declaration.append(f"const {arg.dtype.ctype} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];")
+                    size_desc = sdfg.arrays[size_desc_name]
+                    dyn_args_typed.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size")
+                    needed_size_scalars_declaration.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];")
 
         self._localcode.write(
             '__global__ void %s %s(%s) {\n' %
diff --git a/dace/data.py b/dace/data.py
index a3b008f150..1678721062 100644
--- a/dace/data.py
+++ b/dace/data.py
@@ -1442,7 +1442,7 @@ def __init__(self,
         else:
             self.offset = [0] * len(shape)
 
-        self.is_deferred_array = any(["__dace_defer" in str(dim) for dim in self.shape])
+        self.is_deferred_array = any([str(dim).startswith("__dace_defer") for dim in self.shape])
 
         self.validate()
 
diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py
index a08f572782..6e80270ea8 100644
--- a/dace/sdfg/sdfg.py
+++ b/dace/sdfg/sdfg.py
@@ -1795,7 +1795,7 @@ def add_array(self,
         # convert strings to int if possible, unless it is not the reserved symbol for deferred allocation
         newshape = []
         for i, s in enumerate(shape):
-            if isinstance(s, str) and s == "__dace_defer":
+            if isinstance(s, str) and s.startswith("__dace_defer"):
                 newshape.append(dace.symbolic.pystr_to_symbolic(f"{s}_dim{i}"))
             else:
                 try:
diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py
index c4173dd181..55ed7570db 100644
--- a/dace/sdfg/validation.py
+++ b/dace/sdfg/validation.py
@@ -573,11 +573,15 @@ def validate_state(state: 'dace.sdfg.SDFGState',
             write_size_edges = list(state.edges_by_connector(node, insize))
 
             # Reading-Writing the size is valid only if the array is transient and has the storage type CPU_Heap or GPU_Global
+            has_writes = len(write_size_edges) > 0
             has_writes_or_reads = len(read_size_edges) + len(write_size_edges) > 0
             size_access_allowed = arr.transient and (arr.storage == dtypes.StorageType.CPU_Heap or arr.storage == dtypes.StorageType.GPU_Global)
             if has_writes_or_reads and not size_access_allowed:
                 raise InvalidSDFGNodeError('Reading the size of an array, or changing (writing to) the size of an array '
                                            'is only valid if the array is transient and the storage is CPU_Heap or GPU_Global', sdfg, state_id, nid)
+            if has_writes and scope[node] is not None:
+                raise InvalidSDFGNodeError('Resizing array is not allowed within a scope (e.g. not inside maps)', sdfg, state_id, nid)
+
 
             if len(write_size_edges) > 1:
                 raise InvalidSDFGNodeError('One node can have at maximum one edge writing to its size descriptior', sdfg, state_id, nid)
diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py
index adc5427a9a..9aa8d86c14 100644
--- a/tests/deferred_alloc_test.py
+++ b/tests/deferred_alloc_test.py
@@ -4,6 +4,7 @@
 import numpy
 import pytest
 
+
 @pytest.fixture(params=[dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global])
 def storage_type(request):
     return request.param
@@ -73,7 +74,7 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool,
     arrn = state.add_access(arr_name)
 
     if storage_type == dace.dtypes.StorageType.CPU_Heap:
-        assert (schedule_type == dace.dtypes.ScheduleType.Sequential)
+        assert (schedule_type == dace.dtypes.ScheduleType.Sequential or schedule_type == dace.dtypes.ScheduleType.CPU_Multicore)
     elif storage_type == dace.dtypes.StorageType.GPU_Global:
         assert (schedule_type == dace.dtypes.ScheduleType.GPU_Device)
 
@@ -188,9 +189,55 @@ def test_trivial_realloc_cpu(transient: bool):
     _test_trivial_realloc(dace.dtypes.StorageType.CPU_Heap, transient)
 
 
-def test_realloc_inside_map():
+def _add_realloc_inside_map(sdfg: dace.SDFG, schedule_type: dace.dtypes.ScheduleType):
+    """Append a state to ``sdfg`` with a map in which array ``A`` is resized through its ``_write_size`` connector."""
+    pre_state = sdfg.states()[0]
+    state = sdfg.add_state("s2")
+    sdfg.add_edge(pre_state, state, dace.InterstateEdge(None, None))
+
+    map_entry, map_exit = state.add_map(name="map2", ndrange={"i": dace.subsets.Range([(0, 4, 1)])},
+                                        schedule=schedule_type)
+    an_2 = state.add_access('A')
+    an_2.add_in_connector("_write_size")
+
+    t1 = state.add_tasklet(name="assign", inputs={}, outputs={"__out"}, code="__out=8")
+
+    _, _ = sdfg.add_array("tmp0", shape=(2, ), dtype=numpy.uint64, transient=True)
+    sca = state.add_access("tmp0")
+
+    state.add_edge(map_entry, None, t1, None, dace.Memlet(None))
+    state.add_edge(t1, "__out", sca, None, dace.Memlet("tmp0[0]"))
+    state.add_edge(sca, None, an_2, "_write_size", dace.Memlet("tmp0"))
+    state.add_edge(an_2, None, map_exit, None, dace.Memlet(None))
+
+
+def test_realloc_inside_map_gpu():
+    sdfg = _get_assign_map_sdfg(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device)
+    _add_realloc_inside_map(sdfg, dace.dtypes.ScheduleType.GPU_Device)
+    try:
+        sdfg.validate()
+    except Exception:
+        return
+
+    pytest.fail("Resizing an array inside a GPU_Device map did not fail validation when it was expected to.")
+
+def test_realloc_inside_map_cpu():
+    sdfg = _get_assign_map_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore)
+    _add_realloc_inside_map(sdfg, dace.dtypes.ScheduleType.CPU_Multicore)
+    try:
+        sdfg.validate()
+    except Exception:
+        return
+
+    pytest.fail("Resizing an array inside a CPU_Multicore map did not fail validation when it was expected to.")
+
+def test_conditional_alloc_gpu():
+    pass
+
+def test_conditional_alloc_cpu():
     pass
 
+
 def test_incomplete_write_dimensions_1():
     sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2")
     try:
@@ -211,23 +258,28 @@ def test_incomplete_write_dimensions_2():
 
 
 if __name__ == "__main__":
+    print(f"Trivial Realloc within map {dace.dtypes.ScheduleType.CPU_Multicore}")
+    test_realloc_inside_map_cpu()
+    print(f"Trivial Realloc within map {dace.dtypes.ScheduleType.GPU_Device}")
+    test_realloc_inside_map_gpu()
+
     print(f"Trivial Realloc with storage {dace.dtypes.StorageType.CPU_Heap}")
-    test_trivial_realloc_cpu(dace.dtypes.StorageType.CPU_Heap, True)
+    test_trivial_realloc_cpu(True)
     print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.CPU_Heap}")
-    test_realloc_use_cpu(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.Sequential)
+    test_realloc_use_cpu(True)
     print(f"Trivial Realloc with storage {dace.dtypes.StorageType.GPU_Global}")
-    test_trivial_realloc_gpu(dace.dtypes.StorageType.GPU_Global, True)
+    test_trivial_realloc_gpu(True)
     print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.GPU_Global}")
-    test_realloc_use_gpu(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device)
+    test_realloc_use_gpu(True)
 
     print(f"Trivial Realloc with storage {dace.dtypes.StorageType.CPU_Heap}  on non-transient data")
-    test_trivial_realloc_cpu(dace.dtypes.StorageType.CPU_Heap, False)
+    test_trivial_realloc_cpu(False)
     print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.CPU_Heap}  on non-transient data")
-    test_realloc_use_cpu(dace.dtypes.StorageType.CPU_Heap, False, dace.dtypes.ScheduleType.Sequential)
+    test_realloc_use_cpu(False)
     print(f"Trivial Realloc with storage {dace.dtypes.StorageType.GPU_Global}  on non-transient data")
-    test_trivial_realloc_gpu(dace.dtypes.StorageType.GPU_Global, False)
+    test_trivial_realloc_gpu(False)
     print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.GPU_Global}  on non-transient data")
-    test_realloc_use_gpu(dace.dtypes.StorageType.GPU_Global, False, dace.dtypes.ScheduleType.GPU_Device)
+    test_realloc_use_gpu(False)
 
     print(f"Realloc with incomplete write 1")
     test_incomplete_write_dimensions_1()