From b956142c75868a570f036a47e6292a1b02b7030c Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Wed, 20 Nov 2024 12:48:04 +0100
Subject: [PATCH] Add more test cases and fix some bugs

---
 dace/codegen/targets/cpp.py  |  8 +++++
 dace/codegen/targets/cpu.py  | 69 ++++++++++++++++++++++--------------
 dace/subsets.py              | 30 ++++++++--------
 tests/deferred_alloc_test.py | 59 ++++++++++++++++++++----------
 4 files changed, 106 insertions(+), 60 deletions(-)

diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py
index 3ed64d994b..494890089b 100644
--- a/dace/codegen/targets/cpp.py
+++ b/dace/codegen/targets/cpp.py
@@ -232,6 +232,14 @@ def memlet_copy_to_absolute_strides(dispatcher: 'TargetDispatcher',
     elif memlet.data == dst_node.data:
         copy_shape, src_strides = reshape_strides(dst_subset, dst_strides, src_strides, copy_shape)
 
+    def replace_dace_defer_dim(string, arrname):
+        pattern = r"__dace_defer_dim(\d+)"
+        return re.sub(pattern, rf"{arrname}_size[\1]", string)
+
+    # TODO: do this better?
+    dst_expr = replace_dace_defer_dim(dst_expr, dst_node.data) if dst_expr is not None else None
+    src_expr = replace_dace_defer_dim(src_expr, src_node.data) if src_expr is not None else None
+
     return copy_shape, src_strides, dst_strides, src_expr, dst_expr
 
 
diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
index 5eab35d384..1f7c8debaa 100644
--- a/dace/codegen/targets/cpu.py
+++ b/dace/codegen/targets/cpu.py
@@ -756,27 +756,34 @@ def _emit_copy(
         if isinstance(dst_node, nodes.Tasklet):
             # Copy into tasklet
+            desc = sdfg.arrays[memlet.data]
+            deferred_size_names = self._get_deferred_size_names(desc, memlet)
             stream.write(
-                " " + self.memlet_definition(sdfg, memlet, False, vconn, dst_node.in_connectors[vconn]),
+                " " + self.memlet_definition(sdfg, memlet, False, vconn, dst_node.in_connectors[vconn], deferred_size_names=deferred_size_names),
                 cfg,
                 state_id,
                 [src_node, dst_node],
             )
-            stream.write(
-                "//u1"
-            )
+            if deferred_size_names is not None:
+                stream.write(
+                    "// Size uses deferred allocation"
+                )
+            return
         elif isinstance(src_node, nodes.Tasklet):
             # Copy out of tasklet
+            desc = sdfg.arrays[memlet.data]
+            deferred_size_names = self._get_deferred_size_names(desc, memlet)
             stream.write(
-                " " + self.memlet_definition(sdfg, memlet, True, uconn, src_node.out_connectors[uconn]),
+                " " + self.memlet_definition(sdfg, memlet, True, uconn, src_node.out_connectors[uconn], deferred_size_names=deferred_size_names),
                 cfg,
                 state_id,
                 [src_node, dst_node],
             )
-            stream.write(
-                "//u2"
-            )
+            if deferred_size_names is not None:
+                stream.write(
+                    "// Size uses deferred allocation"
+                )
             return
         else:  # Copy array-to-array
             src_nodedesc = src_node.desc(sdfg)
@@ -875,6 +882,7 @@ def _emit_copy(
 
         state_dfg: SDFGState = cfg.nodes()[state_id]
 
+
         copy_shape, src_strides, dst_strides, src_expr, dst_expr = cpp.memlet_copy_to_absolute_strides(
             self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._packed_types)
 
@@ -1043,6 +1051,27 @@ def write_and_resolve_expr(self, sdfg: SDFG, memlet: mmlt.Memlet, nc: bool, outn
             custom_reduction = cpp.unparse_cr(sdfg, memlet.wcr, dtype)
             return (f'dace::wcr_custom<{dtype.ctype}>:: template {func}({custom_reduction}, {ptr}, {inname})')
 
+    def _get_deferred_size_names(self, desc, memlet):
+        if (desc.storage != dtypes.StorageType.GPU_Global and
+                desc.storage != dtypes.StorageType.CPU_Heap and
+                not desc.transient):
+            return None
+        def check_dace_defer(elements):
+            for elem in elements:
+                if isinstance(elem, symbolic.symbol) and str(elem).startswith("__dace_defer"):
+                    return True
+            return False
+        deferred_size_names = None
+        if check_dace_defer(desc.shape):
+            if desc.storage == dtypes.StorageType.GPU_Global or desc.storage == dtypes.StorageType.CPU_Heap:
+                deferred_size_names = []
+                for i, elem in enumerate(desc.shape):
+                    if str(elem).startswith("__dace_defer"):
+                        deferred_size_names.append(f"__{memlet.data}_dim{i}_size" if desc.storage == dtypes.StorageType.GPU_Global else f"{desc.size_desc_name}[{i}]")
+                    else:
+                        deferred_size_names.append(elem)
+        return deferred_size_names if deferred_size_names else None
+
     def process_out_memlets(self,
                             sdfg: SDFG,
                             cfg: ControlFlowRegion,
@@ -1179,22 +1208,7 @@ def process_out_memlets(self,
 
                     # If the storage type is CPU_Heap or GPU_Global then it might require deferred allocation
                     # We can check if the array requires special access using A_size[0] (CPU) or __A_dim0_size (GPU)
                     # by going through the shape and checking for symbols starting with __dace_defer
-                    def check_dace_defer(elements):
-                        for elem in elements:
-                            if isinstance(elem, symbolic.symbol) and str(elem).startswith("__dace_defer"):
-                                return True
-                        return False
-                    deferred_size_names = None
-                    if check_dace_defer(desc.shape):
-                        if desc.storage == dtypes.StorageType.GPU_Global or desc.storage == dtypes.StorageType.CPU_Heap:
-                            deferred_size_names = []
-                            for i, elem in enumerate(desc.shape):
-                                if str(elem).startswith("__dace_defer"):
-                                    deferred_size_names.append(f"__{memlet.data}_dim{i}_size" if desc.storage == dtypes.StorageType.GPU_Global else f"{desc.size_desc_name}[{i}]")
-                                else:
-                                    deferred_size_names.append(elem)
-                        else:
-                            raise Exception("Deferred Allocation only supported on array storages of type GPU_Global or CPU_Heap")
+                    deferred_size_names = self._get_deferred_size_names(desc, memlet)
                     expr = cpp.cpp_array_expr(sdfg, memlet, codegen=self._frame, deferred_size_names=deferred_size_names)
                     write_expr = codegen.make_ptr_assignment(in_local_name, conntype, expr, desc_dtype)
@@ -1332,7 +1346,8 @@ def memlet_definition(self,
                           local_name: str,
                           conntype: Union[data.Data, dtypes.typeclass] = None,
                           allow_shadowing: bool = False,
-                          codegen: 'CPUCodeGen' = None):
+                          codegen: 'CPUCodeGen' = None,
+                          deferred_size_names=None):
         # TODO: Robust rule set
         if conntype is None:
             raise ValueError('Cannot define memlet for "%s" without connector type' % local_name)
@@ -1381,7 +1396,7 @@ def memlet_definition(self,
                                        decouple_array_interfaces=decouple_array_interfaces)
 
         result = ''
-        expr = (cpp.cpp_array_expr(sdfg, memlet, with_brackets=False, codegen=self._frame)
+        expr = (cpp.cpp_array_expr(sdfg, memlet, with_brackets=False, codegen=self._frame, deferred_size_names=deferred_size_names)
                 if var_type in [DefinedType.Pointer, DefinedType.StreamArray, DefinedType.ArrayInterface] else ptr)
 
         if expr != ptr:
@@ -1425,7 +1440,7 @@ def memlet_definition(self,
         if not memlet.dynamic and memlet.num_accesses == 1:
             if not output:
                 if isinstance(desc, data.Stream) and desc.is_stream_array():
-                    index = cpp.cpp_offset_expr(desc, memlet.subset)
+                    index = cpp.cpp_offset_expr(desc, memlet.subset, deferred_size_names=deferred_size_names)
                     expr = f"{memlet.data}[{index}]"
                     result += f'{memlet_type} {local_name} = ({expr}).pop();'
                     defined = DefinedType.Scalar
diff --git a/dace/subsets.py b/dace/subsets.py
index 0fdc36c22e..e6d69e1a67 100644
--- a/dace/subsets.py
+++ b/dace/subsets.py
@@ -99,7 +99,7 @@ def covers(self, other):
                 return False
 
         return True
-    
+
     def covers_precise(self, other):
         """ Returns True if self contains all the elements in other.
""" @@ -734,7 +734,7 @@ def compose(self, other): def squeeze(self, ignore_indices: Optional[List[int]] = None, offset: bool = True) -> List[int]: """ Removes size-1 ranges from the subset and returns a list of dimensions that remain. - + For example, ``[i:i+10, j]`` will change the range to ``[i:i+10]`` and return ``[0]``. If ``offset`` is True, the subset will become ``[0:10]``. @@ -770,7 +770,7 @@ def squeeze(self, ignore_indices: Optional[List[int]] = None, offset: bool = Tru def unsqueeze(self, axes: Sequence[int]) -> List[int]: """ Adds 0:1 ranges to the subset, in the indices contained in axes. - + The method is mostly used to restore subsets that had their length-1 ranges removed (i.e., squeezed subsets). Hence, the method is called 'unsqueeze'. @@ -1046,7 +1046,7 @@ def squeeze(self, ignore_indices=None): def unsqueeze(self, axes: Sequence[int]) -> List[int]: """ Adds zeroes to the subset, in the indices contained in axes. - + The method is mostly used to restore subsets that had their zero-indices removed (i.e., squeezed subsets). Hence, the method is called 'unsqueeze'. @@ -1112,7 +1112,7 @@ def __init__(self, subset): self.subset_list = [subset] def covers(self, other): - """ + """ Returns True if this SubsetUnion covers another subset (using a bounding box). If other is another SubsetUnion then self and other will only return true if self is other. If other is a different type of subset @@ -1128,13 +1128,13 @@ def covers(self, other): return False else: return any(s.covers(other) for s in self.subset_list) - + def covers_precise(self, other): - """ + """ Returns True if this SubsetUnion covers another subset. If other is another SubsetUnion then self and other will only return true if self is other. If other is a different type of subset - true is returned when one of the subsets in self is equal to other + true is returned when one of the subsets in self is equal to other """ if isinstance(other, SubsetUnion): @@ -1154,7 +1154,7 @@ def __str__(self): string += " " string += subset.__str__() return string - + def dims(self): if not self.subset_list: return 0 @@ -1178,7 +1178,7 @@ def free_symbols(self) -> Set[str]: for subset in self.subset_list: result |= subset.free_symbols return result - + def replace(self, repl_dict): for subset in self.subset_list: subset.replace(repl_dict) @@ -1192,15 +1192,15 @@ def num_elements(self): min = subset.num_elements() except: continue - + return min def _union_special_cases(arb: symbolic.SymbolicType, brb: symbolic.SymbolicType, are: symbolic.SymbolicType, bre: symbolic.SymbolicType): - """ - Special cases of subset unions. If case found, returns pair of + """ + Special cases of subset unions. If case found, returns pair of (min,max), otherwise returns None. """ if are + 1 == brb: @@ -1267,7 +1267,7 @@ def union(subset_a: Subset, subset_b: Subset) -> Subset: """ Compute the union of two Subset objects. If the subsets are not of the same type, degenerates to bounding-box union. - + :param subset_a: The first subset. :param subset_b: The second subset. :return: A Subset object whose size is at least the union of the two @@ -1303,7 +1303,7 @@ def union(subset_a: Subset, subset_b: Subset) -> Subset: def list_union(subset_a: Subset, subset_b: Subset) -> Subset: - """ + """ Returns the union of two Subset lists. :param subset_a: The first subset. 
diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py
index 5eed3d9173..d2cf87168f 100644
--- a/tests/deferred_alloc_test.py
+++ b/tests/deferred_alloc_test.py
@@ -1,5 +1,6 @@
 import dace
 import numpy
+import cupy
 
 def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, write_size="0:2"):
     sdfg = dace.sdfg.SDFG(name="deferred_alloc_test")
@@ -11,17 +12,15 @@ def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bo
     an_1 = state.add_access('A')
     an_1.add_in_connector('_write_size')
 
-    an_2 = state.add_array(name="user_size", shape=(2,), dtype=numpy.uint64)
+    an_2 = state.add_array(name="user_size", shape=(2,), dtype=dace.uint64)
     state.add_edge(an_2, None, an_1, '_write_size',
                    dace.Memlet(expr=f"user_size[{write_size}]") )
 
-    sdfg.save("def_alloc_1.sdfg")
-
     return sdfg
 
-def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool):
+def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType):
     sdfg = dace.sdfg.SDFG(name="deferred_alloc_test_4")
 
@@ -33,12 +32,13 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool)
     an_1.add_in_connector('_write_size')
     an_1.add_out_connector('_read_size')
 
-    an_2 = state.add_array(name="user_size", shape=(2,), dtype=numpy.uint64)
+    an_2 = state.add_array(name="user_size", shape=(2,), dtype=dace.uint64)
     state.add_edge(an_2, None, an_1, '_write_size',
                    dace.Memlet(expr="user_size[0:2]") )
 
-    map_entry, map_exit = state.add_map(name="map",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)]) })
+    map_entry, map_exit = state.add_map(name="map",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)]) },
+                                        schedule=schedule_type)
     state.add_edge(an_1, '_read_size', map_entry, "__A_dim1_size", dace.Memlet(expr="A_size[1]"))
     map_entry.add_in_connector("__A_dim1_size")
     map_exit.add_in_connector("IN_A")
@@ -51,8 +51,17 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool)
     an_3 = state.add_access('A')
     state.add_edge(map_exit, "OUT_A", an_3, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,15-1, 1), (0,"__A_dim1_size-1", 1)])))
 
+    arr_name, arr = sdfg.add_array(name="example_array", dtype=dace.float32, shape=(1,), transient=False, storage=storage_type)
+    arrn = state.add_access(arr_name)
+
+    if storage_type == dace.dtypes.StorageType.CPU_Heap:
+        assert schedule_type == dace.dtypes.ScheduleType.Sequential
+    elif storage_type == dace.dtypes.StorageType.GPU_Global:
+        assert schedule_type == dace.dtypes.ScheduleType.GPU_Device
+
     an_3.add_out_connector('_read_size')
-    map_entry2, map_exit2 = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)]) })
+    map_entry2, map_exit2 = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)])},
+                                          schedule=schedule_type)
     state.add_edge(an_3, '_read_size', map_entry2, "__A_dim1_size", dace.Memlet(expr="A_size[1]"))
     state.add_edge(an_3, None, map_entry2, "IN_A", dace.Memlet(expr="A[0:15, 0:__A_dim1_size]"))
     map_entry2.add_in_connector("__A_dim1_size")
@@ -61,14 +70,14 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool)
     map_entry2.add_in_connector("IN_A")
     map_exit2.add_in_connector("IN_A")
     map_exit2.add_out_connector("OUT_A")
 
-    t2 = state.add_tasklet(name="check", inputs={"_in"}, outputs={"_out"}, code='if (_in != 5.0){ throw std::runtime_error("fail"); } \n _out=_in;', language=dace.dtypes.Language.CPP)
+    t2 = state.add_tasklet(name="check", inputs={"_in"}, outputs={"_out"}, code='_out = _in', language=dace.dtypes.Language.Python)
     state.add_edge(map_entry2, "OUT_A", t2, "_in", dace.Memlet(expr="A[i, j]"))
     state.add_edge(t2, "_out", map_exit2, "IN_A", dace.Memlet(expr="A[i, j]"))
 
     an_5 = state.add_access('A')
     state.add_edge(map_exit2, "OUT_A", an_5, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,15-1, 1), (0,"__A_dim1_size-1", 1)])))
 
-    sdfg.save("def_alloc_4.sdfg")
+    state.add_edge(an_5, None, arrn, None, dace.memlet.Memlet("A[7, 7]"))
 
     return sdfg
 
@@ -91,8 +100,8 @@ def test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool)
     sdfg.compile()
 
 
-def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool):
-    sdfg = _get_assign_map_sdfg(storage_type, transient)
+def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType):
+    sdfg = _get_assign_map_sdfg(storage_type, transient, schedule_type)
     try:
         sdfg.validate()
     except Exception:
@@ -104,7 +113,18 @@ def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool):
     if not _valid_to_reallocate(transient, storage_type, None):
         raise AssertionError("Realloc-use with non-transient data did not fail when it was expected to.")
 
-    sdfg.compile()
+    compiled_sdfg = sdfg.compile()
+    if storage_type == dace.dtypes.StorageType.CPU_Heap:
+        arr = numpy.array([-1.0]).astype(numpy.float32)
+        user_size = numpy.array([10, 10]).astype(numpy.uint64)
+        compiled_sdfg(user_size=user_size, example_array=arr)
+        assert arr[0] == 3.0
+    if storage_type == dace.dtypes.StorageType.GPU_Global:
+        arr = cupy.array([-1.0]).astype(cupy.float32)
+        user_size = numpy.array([10, 10]).astype(numpy.uint64)
+        compiled_sdfg(user_size=user_size, example_array=arr)
+        assert arr.get()[0] == 3.0
+
 
 def test_incomplete_write_dimensions_1():
     sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2")
@@ -128,25 +148,28 @@ def test_realloc_inside_map():
     pass
 
 if __name__ == "__main__":
-    for storage_type in [dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global]:
+    for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential),
+                                        (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]:
         print(f"Trivial Realloc with storage {storage_type}")
         test_trivial_realloc(storage_type, True)
         print(f"Trivial Realloc-Use with storage {storage_type}")
-        test_realloc_use(storage_type, True)
+        test_realloc_use(storage_type, True, schedule_type)
 
-    for storage_type in [dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global]:
+    for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential),
+                                        (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]:
         print(f"Trivial Realloc with storage {storage_type} on non-transient data")
         test_trivial_realloc(storage_type, False)
         print(f"Trivial Realloc-Use with storage {storage_type} on non-transient data")
-        test_realloc_use(storage_type, False)
+        test_realloc_use(storage_type, False, schedule_type)
 
     # Try some other combinations
     for transient in [True, False]:
-        for storage_type in [dace.dtypes.StorageType.Default, dace.dtypes.StorageType.Register]:
+        for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential),
+                                            (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]:
             print(f"Trivial Realloc with storage {storage_type} on transient:{transient} data")
             test_trivial_realloc(storage_type, transient)
             print(f"Trivial Realloc-Use with storage {storage_type} on transient:{transient} data")
-            test_realloc_use(storage_type, transient)
+            test_realloc_use(storage_type, transient, schedule_type)
 
     print(f"Realloc with incomplete write 1")
     test_incomplete_write_dimensions_1()
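
For reference, a minimal sketch of driving the updated test entry points outside the __main__ block above. The module path tests.deferred_alloc_test and the CPU-only pairing are assumptions; note that cupy must still be importable, because the test module imports it unconditionally.

# Hypothetical driver for the patched tests (assumes the patch is applied,
# DaCe and cupy are installed, and the repository root is on sys.path).
import dace
from tests.deferred_alloc_test import test_trivial_realloc, test_realloc_use

storage = dace.dtypes.StorageType.CPU_Heap
schedule = dace.dtypes.ScheduleType.Sequential

# Transient data with CPU_Heap storage: deferred reallocation is expected to
# validate, compile, and run.
test_trivial_realloc(storage, transient=True)
test_realloc_use(storage, transient=True, schedule_type=schedule)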