From 9ea89e73553e1e120c6dfb0588f9eca321c64c9e Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Thu, 18 Jul 2024 16:38:38 -0600 Subject: [PATCH 01/77] cpu, gpu basic tests --- cpu.py | 15 +++++++++++++++ gpu.py | 25 +++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 cpu.py create mode 100644 gpu.py diff --git a/cpu.py b/cpu.py new file mode 100644 index 0000000000..ab6bcce6e8 --- /dev/null +++ b/cpu.py @@ -0,0 +1,15 @@ +import dace +import numpy as np + +@dace.program +def cpu_getstarted(A, B): + return A + B + +if __name__ == "__main__": + #a = np.random.rand(2,3) + a = 10 + b = 20 + print ("before dace(CPU) (a,b)", a, b) + print("after dace(CPU)", cpu_getstarted(a, b)) + sdfg = cpu_getstarted.to_sdfg(a, b) + # sdfg.apply_gpu_transformations() \ No newline at end of file diff --git a/gpu.py b/gpu.py new file mode 100644 index 0000000000..3f9ddd202b --- /dev/null +++ b/gpu.py @@ -0,0 +1,25 @@ + +import dace +import numpy as np +import pytest +from dace.transformation.interstate import GPUTransformSDFG + + +@dace.program +def gpu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): + for i in dace.map[0:20]: # parallelization construct + C[i] = A[i] + B[i] + +if __name__ == '__main__': + # gpu_vector() + sdfg = gpu_vector_add.to_sdfg(simplify=False) # compiled SDFG + sdfg.apply_transformations(GPUTransformSDFG) + + # call with values + A = np.ones((20), dtype=np.int32) # 1,1,1,1,... + B = np.ones((20), dtype=np.int32) # 1,1,1,1,... + C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... + sdfg(A, B, C) + + ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... 
+ assert np.array_equal(ref, C) From 46a3c07cfac8dc465bed0535f8358454e700fb4d Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Thu, 18 Jul 2024 19:44:23 -0600 Subject: [PATCH 02/77] add cpu array test --- cpu_array.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 cpu_array.py diff --git a/cpu_array.py b/cpu_array.py new file mode 100644 index 0000000000..c1b33be7fc --- /dev/null +++ b/cpu_array.py @@ -0,0 +1,20 @@ +import dace +import numpy as np + +@dace.program +def cpu_getstarted(A, B, C): + C = A + B + return C + +if __name__ == "__main__": + #a = np.random.rand(2,3) + # a = 10 + # b = 20 + # call with values + A = np.ones((20), dtype=np.int32) # 1,1,1,1,... + B = np.ones((20), dtype=np.int32) # 1,1,1,1,... + C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... + print ("before dace(CPU) (a,b)", A, B, C) + print("after dace(CPU)", cpu_getstarted(A, B, C)) + sdfg = cpu_getstarted.to_sdfg(A, B, C) + # sdfg.apply_gpu_transformations() \ No newline at end of file From 809048c6a9f17990276bd815a35df75925e755ae Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 19 Jul 2024 20:49:26 -0600 Subject: [PATCH 03/77] add optimization, helper file-check_external_library_used.py, this is from the documentation online to check what library is present --- check_external_library_used.py | 13 +++++++++++++ cpu.py | 2 ++ cpu_array_optimize.py | 25 +++++++++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 check_external_library_used.py create mode 100644 cpu_array_optimize.py diff --git a/check_external_library_used.py b/check_external_library_used.py new file mode 100644 index 0000000000..b408009b08 --- /dev/null +++ b/check_external_library_used.py @@ -0,0 +1,13 @@ +from dace.libraries import blas + +print('BLAS calls will expand by default to', blas.default_implementation) + +if blas.IntelMKL.is_installed(): + blas.default_implementation = 'MKL' +elif blas.cuBLAS.is_installed(): + blas.default_implementation = 'cuBLAS' +elif 
blas.OpenBLAS.is_installed(): + blas.default_implementation = 'OpenBLAS' +elif not blas.BLAS.is_installed(): + # No BLAS library found, use the unoptimized native SDFG fallback + blas.default_implementation = 'pure' diff --git a/cpu.py b/cpu.py index ab6bcce6e8..47b81d5e25 100644 --- a/cpu.py +++ b/cpu.py @@ -12,4 +12,6 @@ def cpu_getstarted(A, B): print ("before dace(CPU) (a,b)", a, b) print("after dace(CPU)", cpu_getstarted(a, b)) sdfg = cpu_getstarted.to_sdfg(a, b) + + sdfg.save('save_cpu_sdfg.py', use_pickle=True) # sdfg.apply_gpu_transformations() \ No newline at end of file diff --git a/cpu_array_optimize.py b/cpu_array_optimize.py new file mode 100644 index 0000000000..67e0bd28d8 --- /dev/null +++ b/cpu_array_optimize.py @@ -0,0 +1,25 @@ +import dace +import numpy as np +from dace.transformation.optimizer import SDFGOptimizer + + +@dace.program +def cpu_getstarted_optimize(A, B, C): + C = A + B + return C + +if __name__ == "__main__": + #a = np.random.rand(2,3) + # a = 10 + # b = 20 + # call with values + A = np.ones((20), dtype=np.int32) # 1,1,1,1,... + B = np.ones((20), dtype=np.int32) # 1,1,1,1,... + C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... 
+ print ("before dace(CPU) (a,b)", A, B, C) + print("after dace(CPU)", cpu_getstarted_optimize(A, B, C)) + sdfg = cpu_getstarted_optimize.to_sdfg(A, B, C) + + # VISUALLY OPTIMIZE + sdfg = SDFGOptimizer(sdfg).optimize() + # sdfg.apply_gpu_transformations() \ No newline at end of file From 5c68cce086390f16f12e15a3106b29dbbfdbc407 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Mon, 22 Jul 2024 15:47:12 -0600 Subject: [PATCH 04/77] understood where is the source generated from, read codegen.py --- dace/codegen/codegen.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index d1427bf037..a329b69fea 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -17,7 +17,7 @@ from dace.codegen.instrumentation import InstrumentationProvider from dace.sdfg.state import SDFGState - +# include/* files, containing the signature header code. def generate_headers(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str: """ Generate a header file for the SDFG """ proto = "" @@ -34,7 +34,7 @@ def generate_headers(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str: proto += 'extern "C" void __program_%s(%sHandle_t handle%s);\n' % params return proto - +# sample/* files - contains the main() function. def generate_dummy(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str: """ Generates a C program calling this SDFG. Since we do not know the purpose/semantics of the program, we allocate @@ -147,7 +147,10 @@ def _get_codegen_targets(sdfg: SDFG, frame: framecode.DaCeCodeGenerator): if sdfg.instrument != dtypes.InstrumentationType.No_Instrumentation: disp.instrumentation[sdfg.instrument] = provider_mapping[sdfg.instrument] - +# 3 step process +# 1. Generate the code for the SDFG(.cpp file)(generate_code) +# 2. Generate the header file for the SDFG(.h file)(generate_headers) +# 3. 
Generate the main function to call the SDFG(.main file)(generate_dummy) def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]: """ Generates code as a list of code objects for a given SDFG. From 9a210f43c7e78240939e1b1df5d44bdda9b74727 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Tue, 23 Jul 2024 21:01:39 -0600 Subject: [PATCH 05/77] make more verbose comment --- dace/codegen/codegen.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index a329b69fea..5af4780321 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -148,9 +148,9 @@ def _get_codegen_targets(sdfg: SDFG, frame: framecode.DaCeCodeGenerator): disp.instrumentation[sdfg.instrument] = provider_mapping[sdfg.instrument] # 3 step process -# 1. Generate the code for the SDFG(.cpp file)(generate_code) -# 2. Generate the header file for the SDFG(.h file)(generate_headers) -# 3. Generate the main function to call the SDFG(.main file)(generate_dummy) +# 1. Generate the code for the SDFG(.cpp file)(generate_code)(sdfg.generate_code()[0]) +# 2. Generate the header file for the SDFG(.h file)(generate_headers)(sdfg.generate_code()[1]) +# 3. Generate the main function to call the SDFG(.main file)(generate_dummy)(sdfg.generate_code()[2]) def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]: """ Generates code as a list of code objects for a given SDFG. From 8763d5419e3488f4bb721dc16fa287a4b1524326 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Tue, 23 Jul 2024 21:04:06 -0600 Subject: [PATCH 06/77] Tried using a custom codegen following the tutorial guide on dace webpage, this commit didn't work. 
They also have something similar - adding Tensor core backend from Nvidia as an external codegen --- custom_codegen_external.py | 107 +++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 custom_codegen_external.py diff --git a/custom_codegen_external.py b/custom_codegen_external.py new file mode 100644 index 0000000000..0b84b77ec2 --- /dev/null +++ b/custom_codegen_external.py @@ -0,0 +1,107 @@ +import dace +from dace import registry +from dace.sdfg.scope import ScopeSubgraphView +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.targets.target import TargetCodeGenerator +from dace.codegen.targets.framecode import DaCeCodeGenerator +from dace.codegen.targets.cpp import sym2cpp + +@dace.program +def custom_kernel(A: dace.float64[20, 30]): + for i, j in dace.map[0:20:2, 0:30]: + A[i, j] += A[i, j] + + + +dace.ScheduleType.register('LoopyLoop') +dace.SCOPEDEFAULT_SCHEDULE[dace.ScheduleType.LoopyLoop] = dace.ScheduleType.Sequential +dace.SCOPEDEFAULT_STORAGE[dace.ScheduleType.LoopyLoop] = dace.StorageType.CPU_Heap + + +@registry.autoregister_params(name='loopy') +class MyCustomLoop(TargetCodeGenerator): + def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): + ################################################################ + # Define some locals: + # Can be used to call back to the frame-code generator + self.frame = frame_codegen + # Can be used to dispatch other code generators for allocation/nodes + self.dispatcher = frame_codegen.dispatcher + + ################################################################ + # Register handlers/hooks through dispatcher: Can be used for + # nodes, memory copy/allocation, scopes, states, and more. 
+ + # In this case, register scopes + self.dispatcher.register_map_dispatcher(dace.ScheduleType.LoopyLoop, self) + + # You can similarly use register_{array,copy,node,state}_dispatcher + + # A scope dispatcher will trigger a method called generate_scope whenever + # an SDFG has a scope with that schedule + def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, + callsite_stream: CodeIOStream): + # The parameters here are: + # sdfg: The SDFG we are currently generating. + # scope: The subgraph of the state containing only the scope (map contents) + # we want to generate the code for. + # state_id: The state in the SDFG the subgraph is taken from (i.e., + # `sdfg.node(state_id)` is the same as `scope.graph`) + # function_stream: A cursor to the global code (which can be used to define + # functions, hence the name). + # callsite_stream: A cursor to the current location in the code, most of + # the code is generated here. + + # We can get the map entry node from the scope graph + entry_node = scope.source_nodes()[0] + + # First, generate an opening brace (for instrumentation and dynamic map ranges) + callsite_stream.write('{', sdfg, state_id, entry_node) + + ################################################################ + # Generate specific code: We will generate a reversed loop with a + # comment for each dimension of the map. For the sake of simplicity, + # dynamic map ranges are not supported. + + for param, rng in zip(entry_node.map.params, entry_node.map.range): + # We use the sym2cpp function from the cpp support functions + # to convert symbolic expressions to proper C++ + begin, end, stride = (sym2cpp(r) for r in rng) + + # Every write is optionally (but recommended to be) tagged with + # 1-3 extra arguments, serving as line information to match + # SDFG, state, and graph nodes/edges to written code. 
+ callsite_stream.write(f'''// Loopy-loop {param} + for (auto {param} = {end}; {param} >= {begin}; {param} -= {stride}) {{''', + sdfg, state_id, entry_node + ) + + # NOTE: CodeIOStream will automatically take care of indentation for us. + + + # Now that the loops have been defined, use the dispatcher to invoke any + # code generator (including this one) that is registered to deal with + # the internal nodes in the subgraph. We skip the MapEntry node. + self.dispatcher.dispatch_subgraph(sdfg, scope, state_id, + function_stream, callsite_stream, + skip_entry_node=True) + + # NOTE: Since skip_exit_node above is set to False, closing braces will + # be automatically generated + +# Preview SDFG +sdfg = custom_kernel.to_sdfg() + +# Change schedule +for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry): + node.schedule = dace.ScheduleType.LoopyLoop + +# Code(sdfg.generate_code()[0].clean_code, language='cpp') + + +# display +from IPython.display import Code +from IPython.display import display +display(Code(sdfg.generate_code()[0].clean_code, language='cpp')) From e5ae4ee532ed497dc59b399166edacdfdd58a4cf Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Tue, 23 Jul 2024 23:47:41 -0600 Subject: [PATCH 07/77] make cpu, gpu, fpga tests to the most smallest and all doing vector addition --- cpu.py | 25 ++++++++++++++----------- cpu_array.py | 20 -------------------- fpga.py | 23 +++++++++++++++++++++++ gpu.py | 6 ++---- 4 files changed, 39 insertions(+), 35 deletions(-) delete mode 100644 cpu_array.py create mode 100644 fpga.py diff --git a/cpu.py b/cpu.py index 47b81d5e25..8b84cdedfc 100644 --- a/cpu.py +++ b/cpu.py @@ -1,17 +1,20 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
import dace import numpy as np @dace.program -def cpu_getstarted(A, B): - return A + B +def cpu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): + for i in dace.map[0:20]: # parallelization construct + C[i] = A[i] + B[i] -if __name__ == "__main__": - #a = np.random.rand(2,3) - a = 10 - b = 20 - print ("before dace(CPU) (a,b)", a, b) - print("after dace(CPU)", cpu_getstarted(a, b)) - sdfg = cpu_getstarted.to_sdfg(a, b) +if __name__ == '__main__': + sdfg = cpu_vector_add.to_sdfg(simplify=False) # compiled SDFG - sdfg.save('save_cpu_sdfg.py', use_pickle=True) - # sdfg.apply_gpu_transformations() \ No newline at end of file + # call with values + A = np.ones((20), dtype=np.int32) # 1,1,1,1,... + B = np.ones((20), dtype=np.int32) # 1,1,1,1,... + C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... + sdfg(A, B, C) + + # ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... + # assert np.array_equal(ref, C) diff --git a/cpu_array.py b/cpu_array.py deleted file mode 100644 index c1b33be7fc..0000000000 --- a/cpu_array.py +++ /dev/null @@ -1,20 +0,0 @@ -import dace -import numpy as np - -@dace.program -def cpu_getstarted(A, B, C): - C = A + B - return C - -if __name__ == "__main__": - #a = np.random.rand(2,3) - # a = 10 - # b = 20 - # call with values - A = np.ones((20), dtype=np.int32) # 1,1,1,1,... - B = np.ones((20), dtype=np.int32) # 1,1,1,1,... - C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... - print ("before dace(CPU) (a,b)", A, B, C) - print("after dace(CPU)", cpu_getstarted(A, B, C)) - sdfg = cpu_getstarted.to_sdfg(A, B, C) - # sdfg.apply_gpu_transformations() \ No newline at end of file diff --git a/fpga.py b/fpga.py new file mode 100644 index 0000000000..95188bae65 --- /dev/null +++ b/fpga.py @@ -0,0 +1,23 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+import dace +import numpy as np +import pytest +from dace.transformation.interstate import FPGATransformSDFG + +@dace.program +def fpga_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): + for i in dace.map[0:20]: # parallelization construct + C[i] = A[i] + B[i] + +if __name__ == '__main__': + sdfg = fpga_vector_add.to_sdfg(simplify=False) # compiled SDFG + sdfg.apply_transformations(FPGATransformSDFG) + + # call with values + A = np.ones((20), dtype=np.int32) # 1,1,1,1,... + B = np.ones((20), dtype=np.int32) # 1,1,1,1,... + C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... + sdfg(A, B, C) + + # ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... + # assert np.array_equal(ref, C) diff --git a/gpu.py b/gpu.py index 3f9ddd202b..d5066a4ba9 100644 --- a/gpu.py +++ b/gpu.py @@ -1,7 +1,6 @@ import dace import numpy as np -import pytest from dace.transformation.interstate import GPUTransformSDFG @@ -11,7 +10,6 @@ def gpu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): C[i] = A[i] + B[i] if __name__ == '__main__': - # gpu_vector() sdfg = gpu_vector_add.to_sdfg(simplify=False) # compiled SDFG sdfg.apply_transformations(GPUTransformSDFG) @@ -21,5 +19,5 @@ def gpu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... sdfg(A, B, C) - ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... - assert np.array_equal(ref, C) + # ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... 
+ # assert np.array_equal(ref, C) From a727cb9085278b78cf3e7ed0a109bf1e7fbc5676 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Wed, 24 Jul 2024 12:55:01 -0600 Subject: [PATCH 08/77] add debug comments to understand the SDFG --- dace/codegen/codegen.py | 1 + dace/codegen/targets/framecode.py | 39 +++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index 5af4780321..6a02a4a57d 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -233,6 +233,7 @@ def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]: # NOTE: THE SDFG IS ASSUMED TO BE FROZEN (not change) FROM THIS POINT ONWARDS # Generate frame code (and the rest of the code) + # (, generated_code/clean_code, ...)) (global_code, frame_code, used_targets, used_environments) = frame.generate_code(sdfg, None) target_objects = [ CodeObject(sdfg.name, diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 5b756b413c..1d8b874263 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -928,7 +928,10 @@ def generate_code(self, # Generate actual program body states_generated = self.generate_states(sdfg, global_stream, callsite_stream) - + # Loop over states_generated and print them + for state in states_generated: + print(state) + print("*" * 50) ####################################################################### # Sanity check @@ -969,16 +972,44 @@ def generate_code(self, header_global_stream.write(global_stream.getvalue()) header_global_stream.write(footer_global_stream.getvalue()) generated_header = header_global_stream.getvalue() - + # print("generated header:") + # print("#" * 50) + # print(generated_header) + # print("#" * 50) + # # print("Footer Stream:") + # # print("#" * 50) + # # print(footer_stream.getvalue()) + # # print("#" * 50) + + + all_code = CodeIOStream() all_code.write(function_signature) 
all_code.write(header_stream.getvalue()) all_code.write(callsite_stream.getvalue()) all_code.write(footer_stream.getvalue()) generated_code = all_code.getvalue() + # print("#" * 50) + # print("Function Signature:") + # print("#" * 50) + # print(function_signature) + # print("#" * 50) + # print("Header Stream:") + # print("#" * 50) + # header_stream.write("This is internal header, int x=10;") + # print(header_stream.getvalue()) + # print("#" * 50) + # print("callsite Code:") + # print("#" * 50) + # print(callsite_stream.getvalue()) + # print("#" * 50) + # print("Footer Stream:") + # print("#" * 50) + # print(footer_stream.getvalue()) + # print("#" * 50) else: - generated_header = global_stream.getvalue() - generated_code = callsite_stream.getvalue() + generated_header = global_stream.getvalue() # header + generated_code = callsite_stream.getvalue() # frame # Clean up generated code gotos = re.findall(r'goto (.*?);', generated_code) From a575ef880aceb6e06180f1ccb406b24bf1439fba Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Wed, 24 Jul 2024 18:29:41 -0600 Subject: [PATCH 09/77] basic structure is dumped, using node as of now, build fails as well --- dace/codegen/targets/__init__.py | 1 + dace/codegen/targets/ipu.py | 82 ++++++++++++++++++++++++++++++++ dace/config_schema.yml | 25 ++++++++++ dace/dtypes.py | 2 + graphcore.py | 20 ++++++++ 5 files changed, 130 insertions(+) create mode 100644 dace/codegen/targets/ipu.py create mode 100644 graphcore.py diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py index cd4d5f957f..5f01d11f08 100644 --- a/dace/codegen/targets/__init__.py +++ b/dace/codegen/targets/__init__.py @@ -9,3 +9,4 @@ from .mlir.mlir import MLIRCodeGen from .sve.codegen import SVECodeGen from .snitch import SnitchCodeGen +from .ipu import IPUCodeGen \ No newline at end of file diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py new file mode 100644 index 0000000000..2272c38413 --- /dev/null +++ 
b/dace/codegen/targets/ipu.py @@ -0,0 +1,82 @@ +# import +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +from copy import deepcopy +from dace.codegen.targets.framecode import DaCeCodeGenerator +from dace.sdfg.graph import MultiConnectorEdge +from dace.sdfg.state import ControlFlowRegion, SDFGState, StateSubgraphView +import functools +import itertools +import warnings + +from dace import data, dtypes, registry, memlet as mmlt, subsets, symbolic, Config +from dace.codegen import cppunparse, exceptions as cgx +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.targets import cpp, fpga +from dace.codegen.common import codeblock_to_cpp, sym2cpp, update_persistent_desc +from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute +from dace.codegen.dispatcher import DefinedType, TargetDispatcher +from dace.frontend import operations +from dace.sdfg import nodes, utils as sdutils +from dace.sdfg import (ScopeSubgraphView, SDFG, scope_contains_scope, is_array_stream_view, NodeNotExpandedError, + dynamic_map_inputs) +from dace.sdfg.scope import is_devicelevel_gpu, is_in_scope +from dace.sdfg.validation import validate_memlet_data +from typing import TYPE_CHECKING, Optional, Tuple, Union +from dace.codegen.codeobject import CodeObject +from dace.codegen.targets.cpu import CPUCodeGen + + +@registry.autoregister_params(name='ipu') +class IPUCodeGen(TargetCodeGenerator): + """ IPU(Graphcore) code generator. """ + target_name = 'ipu' + title = 'IPU' + language = 'cpp' + + def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): + self._codeobjects = [] # Holds any external files - src/cuda/xyz.cu, ... 
+ self._sdfg = sdfg + self._frame = frame_codegen + self._dispatcher = frame_codegen.dispatcher + self._dispatcher.register_node_dispatcher(self) + self._cpu_codegen: CPUCodeGen = self._dispatcher.get_generic_node_dispatcher() + + + + # __dace_init_ function is generated if True + @property + def has_initializer(self): + return False + + # __dace_exit_ function is generated if True + @property + def has_finalizer(self): + return False + + @staticmethod + def cmake_options(): + options = [] + + linker_flags = Config.get("compiler", "ipu", "libs") + + if linker_flags: + options.append(f'-DCMAKE_SHARED_LINKER_FLAGS="{linker_flags}"') + + + return options + + # This will generate the src/cuda/xyz.cu files and folders using "codeObjects" class. + # We don't need this now as we are mostly concerned about a single file codegen as of now. + def get_generated_codeobjects(self): + return self._codeobjects + + def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node:nodes.Node, function_stream: CodeIOStream, callsite_stream:CodeIOStream): + callsite_stream.write( + f''' + something is printed this is for testing! + ''' + , sdfg) + + # do codegen using CPU technique. + # self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + diff --git a/dace/config_schema.yml b/dace/config_schema.yml index da35e61997..d393d39258 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -264,6 +264,31 @@ required: If set to true, multiple connected components will generate "#pragma omp parallel sections" code around them. 
+ ############################################# + # IPU compiler + ipu: + type: dict + title: IPU + description: IPU compiler preferences + required: + executable: + type: str + default: '' + title: Compiler executable override + description: File path or name of compiler executable + + args: + type: str + title: Arguments + description: Compiler argument flags + default: '-std=c++14 -fPIC -Wall -Wextra -O3 -march=native -ffast-math -Wno-unused-parameter -Wno-unused-label' + default_Windows: '/O2 /fp:fast /arch:AVX2 /D_USRDLL /D_WINDLL /D__restrict__=__restrict' + + libs: + type: str + title: Additional libraries + description: Additional linked libraries required by target + default: '-lpoplar' ############################################# # GPU (CUDA/HIP) compiler diff --git a/dace/dtypes.py b/dace/dtypes.py index f04200e63b..5fc6893ba5 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -19,10 +19,12 @@ class DeviceType(aenum.AutoNumberEnum): CPU = () #: Multi-core CPU GPU = () #: GPU (AMD or NVIDIA) + IPU = () #: IPU (Graphcore) FPGA = () #: FPGA (Intel or Xilinx) Snitch = () #: Compute Cluster (RISC-V) + @undefined_safe_enum @extensible_enum class StorageType(aenum.AutoNumberEnum): diff --git a/graphcore.py b/graphcore.py new file mode 100644 index 0000000000..1cc61b60fe --- /dev/null +++ b/graphcore.py @@ -0,0 +1,20 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import numpy as np + +@dace.program +def ipu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): + for i in dace.map[0:20]: # parallelization construct + C[i] = A[i] + B[i] + +if __name__ == '__main__': + sdfg = ipu_vector_add.to_sdfg(simplify=False) # compiled SDFG + sdfg.apply_transformations(IPUTransformSDFG) + # call with values + A = np.ones((20), dtype=np.int32) # 1,1,1,1,... + B = np.ones((20), dtype=np.int32) # 1,1,1,1,... + C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... 
+ sdfg(A, B, C) + + # ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... + # assert np.array_equal(ref, C) From 5934537e629826b4d1dfce11c9f6973dd54e5a2f Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Wed, 24 Jul 2024 21:16:02 -0600 Subject: [PATCH 10/77] IPUTransformSDFG commented in python code, probably missing registration of the pass --- graphcore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphcore.py b/graphcore.py index 1cc61b60fe..31046d434b 100644 --- a/graphcore.py +++ b/graphcore.py @@ -9,7 +9,7 @@ def ipu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): if __name__ == '__main__': sdfg = ipu_vector_add.to_sdfg(simplify=False) # compiled SDFG - sdfg.apply_transformations(IPUTransformSDFG) + #sdfg.apply_transformations(IPUTransformSDFG) # call with values A = np.ones((20), dtype=np.int32) # 1,1,1,1,... B = np.ones((20), dtype=np.int32) # 1,1,1,1,... From 01c8bf5483955fb1c5141f82d4687ad3f98a31a9 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Wed, 24 Jul 2024 21:16:18 -0600 Subject: [PATCH 11/77] MPI basic test --- mpi.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 mpi.py diff --git a/mpi.py b/mpi.py new file mode 100644 index 0000000000..593fec2208 --- /dev/null +++ b/mpi.py @@ -0,0 +1,22 @@ + +import dace +import numpy as np +from dace.transformation.dataflow import MPITransformMap + + +@dace.program +def mpi_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): + for i in dace.map[0:20]: # parallelization construct + C[i] = A[i] + B[i] + +if __name__ == '__main__': + sdfg = mpi_vector_add.to_sdfg(simplify=False) # compiled SDFG + sdfg.apply_transformations(MPITransformMap) + + # call with values + A = np.ones((20), dtype=np.int32) # 1,1,1,1,... + B = np.ones((20), dtype=np.int32) # 1,1,1,1,... + C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... + sdfg(A, B, C) + # ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... 
+ # assert np.array_equal(ref, C) From 429737d3a1e75b552387996dd2a92379c96be616 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Wed, 24 Jul 2024 23:58:36 -0600 Subject: [PATCH 12/77] Implement the LoopyLoop custom codegen on Map, will revert in the next commit --- dace/codegen/targets/__init__.py | 2 +- dace/codegen/targets/ipu.py | 94 +++++++++++++++++++++++++++----- dace/dtypes.py | 7 ++- graphcore.py | 30 +++++++--- 4 files changed, 108 insertions(+), 25 deletions(-) diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py index 5f01d11f08..8f77081925 100644 --- a/dace/codegen/targets/__init__.py +++ b/dace/codegen/targets/__init__.py @@ -9,4 +9,4 @@ from .mlir.mlir import MLIRCodeGen from .sve.codegen import SVECodeGen from .snitch import SnitchCodeGen -from .ipu import IPUCodeGen \ No newline at end of file +from .ipu import MyCustomLoop \ No newline at end of file diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 2272c38413..f85143bd4a 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -26,11 +26,13 @@ from dace.codegen.targets.cpu import CPUCodeGen -@registry.autoregister_params(name='ipu') -class IPUCodeGen(TargetCodeGenerator): +# @registry.autoregister_params(name='ipu') +# class IPUCodeGen(TargetCodeGenerator): +@registry.autoregister_params(name='loopy') +class MyCustomLoop(TargetCodeGenerator): """ IPU(Graphcore) code generator. 
""" - target_name = 'ipu' - title = 'IPU' + target_name = 'loopy' + title = 'LOOPY' language = 'cpp' def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): @@ -38,8 +40,11 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): self._sdfg = sdfg self._frame = frame_codegen self._dispatcher = frame_codegen.dispatcher - self._dispatcher.register_node_dispatcher(self) - self._cpu_codegen: CPUCodeGen = self._dispatcher.get_generic_node_dispatcher() + # self._dispatcher.register_node_dispatcher(self) + # self._dispatcher.register_state_dispatcher(self) + self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.LoopyLoop, self) + + # self._cpu_codegen: CPUCodeGen = self._dispatcher.get_generic_node_dispatcher() @@ -69,14 +74,73 @@ def cmake_options(): # We don't need this now as we are mostly concerned about a single file codegen as of now. def get_generated_codeobjects(self): return self._codeobjects + + # def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, node:nodes.Node, function_stream: CodeIOStream, callsite_stream:CodeIOStream): + # callsite_stream.write( + # f''' + # Node! + # ''' + # , sdfg) + + # def generate_state(self, sdfg: SDFG, state: SDFGState, function_stream: CodeIOStream, callsite_stream: CodeIOStream, generate_state_footer: bool) -> None: + # callsite_stream.write( + # f''' + # State! + # ''' + # , sdfg) - def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node:nodes.Node, function_stream: CodeIOStream, callsite_stream:CodeIOStream): - callsite_stream.write( - f''' - something is printed this is for testing! - ''' - , sdfg) - - # do codegen using CPU technique. 
- # self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + # A scope dispatcher will trigger a method called generate_scope whenever + # an SDFG has a scope with that schedule + def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, + callsite_stream: CodeIOStream): + # The parameters here are: + # sdfg: The SDFG we are currently generating. + # scope: The subgraph of the state containing only the scope (map contents) + # we want to generate the code for. + # state_id: The state in the SDFG the subgraph is taken from (i.e., + # `sdfg.node(state_id)` is the same as `scope.graph`) + # function_stream: A cursor to the global code (which can be used to define + # functions, hence the name). + # callsite_stream: A cursor to the current location in the code, most of + # the code is generated here. + + # We can get the map entry node from the scope graph + entry_node = scope.source_nodes()[0] + + # First, generate an opening brace (for instrumentation and dynamic map ranges) + callsite_stream.write('{', sdfg, state_id, entry_node) + + ################################################################ + # Generate specific code: We will generate a reversed loop with a + # comment for each dimension of the map. For the sake of simplicity, + # dynamic map ranges are not supported. + + for param, rng in zip(entry_node.map.params, entry_node.map.range): + # We use the sym2cpp function from the cpp support functions + # to convert symbolic expressions to proper C++ + begin, end, stride = (sym2cpp(r) for r in rng) + + # Every write is optionally (but recommended to be) tagged with + # 1-3 extra arguments, serving as line information to match + # SDFG, state, and graph nodes/edges to written code. 
+ callsite_stream.write(f'''// Loopy-loop {param} + for (auto {param} = {end}; {param} >= {begin}; {param} -= {stride}) {{''', + sdfg, state_id, entry_node + ) + + # NOTE: CodeIOStream will automatically take care of indentation for us. + + # Now that the loops have been defined, use the dispatcher to invoke any + # code generator (including this one) that is registered to deal with + # the internal nodes in the subgraph. We skip the MapEntry node. + self._dispatcher.dispatch_subgraph(sdfg, cfg, scope, state_id, + function_stream, callsite_stream, + skip_entry_node=True, skip_exit_node=True) + # NOTE: Since skip_exit_node above is set to False, closing braces will + # be automatically generated + # Change schedule + # for node, _ in sdfg.all_nodes_recursive(): + # if isinstance(node, dtypes.nodes.MapEntry): + # node.schedule = dtypes.ScheduleType.LoopyLoop diff --git a/dace/dtypes.py b/dace/dtypes.py index 5fc6893ba5..bd1840e914 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -79,6 +79,7 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch = () Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping + LoopyLoop = () # A subset of GPU schedule types @@ -200,7 +201,8 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register, ScheduleType.FPGA_Device: StorageType.FPGA_Global, ScheduleType.SVE_Map: StorageType.CPU_Heap, - ScheduleType.Snitch: StorageType.Snitch_TCDM + ScheduleType.Snitch: StorageType.Snitch_TCDM, + ScheduleType.LoopyLoop: StorageType.CPU_Heap } # Maps from ScheduleType to default ScheduleType for sub-scopes @@ -221,7 +223,8 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.FPGA_Multi_Pumped: ScheduleType.FPGA_Device, ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, - ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore + ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, + ScheduleType.LoopyLoop: ScheduleType.Sequential } # 
Maps from StorageType to a preferred ScheduleType for helping determine schedules. diff --git a/graphcore.py b/graphcore.py index 31046d434b..d77a28b1e3 100644 --- a/graphcore.py +++ b/graphcore.py @@ -1,6 +1,10 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import dace import numpy as np +@dace.program +def simple(A: dace.float64[20, 30]): + for i, j in dace.map[0:20:2, 0:30]: + A[i, j] += A[i, j] @dace.program def ipu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): @@ -8,13 +12,25 @@ def ipu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): C[i] = A[i] + B[i] if __name__ == '__main__': - sdfg = ipu_vector_add.to_sdfg(simplify=False) # compiled SDFG - #sdfg.apply_transformations(IPUTransformSDFG) - # call with values - A = np.ones((20), dtype=np.int32) # 1,1,1,1,... - B = np.ones((20), dtype=np.int32) # 1,1,1,1,... - C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... - sdfg(A, B, C) +# sdfg = ipu_vector_add.to_sdfg(simplify=False) # compiled SDFG +# #sdfg.apply_transformations(IPUTransformSDFG) +# # call with values +# A = np.ones((20), dtype=np.int32) # 1,1,1,1,... +# B = np.ones((20), dtype=np.int32) # 1,1,1,1,... +# C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... +# sdfg(A, B, C) +# + # Preview SDFG + sdfg = simple.to_sdfg() + + +# Change schedule + for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry): + node.schedule = dace.ScheduleType.LoopyLoop + + + print (sdfg.generate_code()[0].clean_code) # ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... # assert np.array_equal(ref, C) From 77a738893d59d886810f319a0c569d85b79045be Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Wed, 24 Jul 2024 23:58:55 -0600 Subject: [PATCH 13/77] Revert "Implement the LoopyLoop custom codegen on Map, will revert in the next commit" This reverts commit 429737d3a1e75b552387996dd2a92379c96be616. 
--- dace/codegen/targets/__init__.py | 2 +- dace/codegen/targets/ipu.py | 94 +++++--------------------------- dace/dtypes.py | 7 +-- graphcore.py | 30 +++------- 4 files changed, 25 insertions(+), 108 deletions(-) diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py index 8f77081925..5f01d11f08 100644 --- a/dace/codegen/targets/__init__.py +++ b/dace/codegen/targets/__init__.py @@ -9,4 +9,4 @@ from .mlir.mlir import MLIRCodeGen from .sve.codegen import SVECodeGen from .snitch import SnitchCodeGen -from .ipu import MyCustomLoop \ No newline at end of file +from .ipu import IPUCodeGen \ No newline at end of file diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index f85143bd4a..2272c38413 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -26,13 +26,11 @@ from dace.codegen.targets.cpu import CPUCodeGen -# @registry.autoregister_params(name='ipu') -# class IPUCodeGen(TargetCodeGenerator): -@registry.autoregister_params(name='loopy') -class MyCustomLoop(TargetCodeGenerator): +@registry.autoregister_params(name='ipu') +class IPUCodeGen(TargetCodeGenerator): """ IPU(Graphcore) code generator. 
""" - target_name = 'loopy' - title = 'LOOPY' + target_name = 'ipu' + title = 'IPU' language = 'cpp' def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): @@ -40,11 +38,8 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): self._sdfg = sdfg self._frame = frame_codegen self._dispatcher = frame_codegen.dispatcher - # self._dispatcher.register_node_dispatcher(self) - # self._dispatcher.register_state_dispatcher(self) - self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.LoopyLoop, self) - - # self._cpu_codegen: CPUCodeGen = self._dispatcher.get_generic_node_dispatcher() + self._dispatcher.register_node_dispatcher(self) + self._cpu_codegen: CPUCodeGen = self._dispatcher.get_generic_node_dispatcher() @@ -74,73 +69,14 @@ def cmake_options(): # We don't need this now as we are mostly concerned about a single file codegen as of now. def get_generated_codeobjects(self): return self._codeobjects - - # def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, node:nodes.Node, function_stream: CodeIOStream, callsite_stream:CodeIOStream): - # callsite_stream.write( - # f''' - # Node! - # ''' - # , sdfg) - - # def generate_state(self, sdfg: SDFG, state: SDFGState, function_stream: CodeIOStream, callsite_stream: CodeIOStream, generate_state_footer: bool) -> None: - # callsite_stream.write( - # f''' - # State! - # ''' - # , sdfg) - # A scope dispatcher will trigger a method called generate_scope whenever - # an SDFG has a scope with that schedule - def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, - callsite_stream: CodeIOStream): - # The parameters here are: - # sdfg: The SDFG we are currently generating. - # scope: The subgraph of the state containing only the scope (map contents) - # we want to generate the code for. 
- # state_id: The state in the SDFG the subgraph is taken from (i.e., - # `sdfg.node(state_id)` is the same as `scope.graph`) - # function_stream: A cursor to the global code (which can be used to define - # functions, hence the name). - # callsite_stream: A cursor to the current location in the code, most of - # the code is generated here. - - # We can get the map entry node from the scope graph - entry_node = scope.source_nodes()[0] - - # First, generate an opening brace (for instrumentation and dynamic map ranges) - callsite_stream.write('{', sdfg, state_id, entry_node) - - ################################################################ - # Generate specific code: We will generate a reversed loop with a - # comment for each dimension of the map. For the sake of simplicity, - # dynamic map ranges are not supported. - - for param, rng in zip(entry_node.map.params, entry_node.map.range): - # We use the sym2cpp function from the cpp support functions - # to convert symbolic expressions to proper C++ - begin, end, stride = (sym2cpp(r) for r in rng) - - # Every write is optionally (but recommended to be) tagged with - # 1-3 extra arguments, serving as line information to match - # SDFG, state, and graph nodes/edges to written code. - callsite_stream.write(f'''// Loopy-loop {param} - for (auto {param} = {end}; {param} >= {begin}; {param} -= {stride}) {{''', - sdfg, state_id, entry_node - ) - - # NOTE: CodeIOStream will automatically take care of indentation for us. - - # Now that the loops have been defined, use the dispatcher to invoke any - # code generator (including this one) that is registered to deal with - # the internal nodes in the subgraph. We skip the MapEntry node. 
- self._dispatcher.dispatch_subgraph(sdfg, cfg, scope, state_id, - function_stream, callsite_stream, - skip_entry_node=True, skip_exit_node=True) + def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node:nodes.Node, function_stream: CodeIOStream, callsite_stream:CodeIOStream): + callsite_stream.write( + f''' + something is printed this is for testing! + ''' + , sdfg) + + # do codegen using CPU technique. + # self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - # NOTE: Since skip_exit_node above is set to False, closing braces will - # be automatically generated - # Change schedule - # for node, _ in sdfg.all_nodes_recursive(): - # if isinstance(node, dtypes.nodes.MapEntry): - # node.schedule = dtypes.ScheduleType.LoopyLoop diff --git a/dace/dtypes.py b/dace/dtypes.py index bd1840e914..5fc6893ba5 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -79,7 +79,6 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch = () Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping - LoopyLoop = () # A subset of GPU schedule types @@ -201,8 +200,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register, ScheduleType.FPGA_Device: StorageType.FPGA_Global, ScheduleType.SVE_Map: StorageType.CPU_Heap, - ScheduleType.Snitch: StorageType.Snitch_TCDM, - ScheduleType.LoopyLoop: StorageType.CPU_Heap + ScheduleType.Snitch: StorageType.Snitch_TCDM } # Maps from ScheduleType to default ScheduleType for sub-scopes @@ -223,8 +221,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.FPGA_Multi_Pumped: ScheduleType.FPGA_Device, ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, - ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, - ScheduleType.LoopyLoop: ScheduleType.Sequential + ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore } # Maps from StorageType to a preferred 
ScheduleType for helping determine schedules. diff --git a/graphcore.py b/graphcore.py index d77a28b1e3..31046d434b 100644 --- a/graphcore.py +++ b/graphcore.py @@ -1,10 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import dace import numpy as np -@dace.program -def simple(A: dace.float64[20, 30]): - for i, j in dace.map[0:20:2, 0:30]: - A[i, j] += A[i, j] @dace.program def ipu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): @@ -12,25 +8,13 @@ def ipu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): C[i] = A[i] + B[i] if __name__ == '__main__': -# sdfg = ipu_vector_add.to_sdfg(simplify=False) # compiled SDFG -# #sdfg.apply_transformations(IPUTransformSDFG) -# # call with values -# A = np.ones((20), dtype=np.int32) # 1,1,1,1,... -# B = np.ones((20), dtype=np.int32) # 1,1,1,1,... -# C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... -# sdfg(A, B, C) -# - # Preview SDFG - sdfg = simple.to_sdfg() - - -# Change schedule - for node, _ in sdfg.all_nodes_recursive(): - if isinstance(node, dace.nodes.MapEntry): - node.schedule = dace.ScheduleType.LoopyLoop - - - print (sdfg.generate_code()[0].clean_code) + sdfg = ipu_vector_add.to_sdfg(simplify=False) # compiled SDFG + #sdfg.apply_transformations(IPUTransformSDFG) + # call with values + A = np.ones((20), dtype=np.int32) # 1,1,1,1,... + B = np.ones((20), dtype=np.int32) # 1,1,1,1,... + C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... + sdfg(A, B, C) # ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... 
# assert np.array_equal(ref, C) From fa789383518f0dc61cf8f93aa0202801ea29dc7d Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Thu, 25 Jul 2024 15:11:09 -0600 Subject: [PATCH 14/77] Debug: Find what are different types of nodes and how they are organized --- dace/codegen/targets/ipu.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 2272c38413..a71cb9fd6f 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -39,7 +39,7 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): self._frame = frame_codegen self._dispatcher = frame_codegen.dispatcher self._dispatcher.register_node_dispatcher(self) - self._cpu_codegen: CPUCodeGen = self._dispatcher.get_generic_node_dispatcher() + #self._cpu_codegen: CPUCodeGen = self._dispatcher.get_generic_node_dispatcher() @@ -71,12 +71,23 @@ def get_generated_codeobjects(self): return self._codeobjects def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node:nodes.Node, function_stream: CodeIOStream, callsite_stream:CodeIOStream): - callsite_stream.write( - f''' - something is printed this is for testing! - ''' - , sdfg) - # do codegen using CPU technique. - # self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + if isinstance(node, nodes.Map): + callsite_stream.write( + f''' + Map! + ''' + , sdfg) + elif isinstance(node, nodes.AccessNode): + callsite_stream.write( + f''' + AccessNode! + ''' + , sdfg) + elif isinstance(node, nodes.CodeNode): + callsite_stream.write( + f''' + CodeNode! 
+ ''' + , sdfg) From d1af9714333b148b13693d5b2fc01d829fda385e Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Thu, 25 Jul 2024 15:20:35 -0600 Subject: [PATCH 15/77] Debug: make output more verbose from last commit --- dace/codegen/targets/ipu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index a71cb9fd6f..1793f1fa8a 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -75,19 +75,19 @@ def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVie if isinstance(node, nodes.Map): callsite_stream.write( f''' - Map! + Concurrency(Map/Consume)(omp loop)! ''' , sdfg) elif isinstance(node, nodes.AccessNode): callsite_stream.write( f''' - AccessNode! + AccessNode(container=array/stream)! ''' , sdfg) elif isinstance(node, nodes.CodeNode): callsite_stream.write( f''' - CodeNode! + CodeNode(Tasklet/nestedSDFG)! ''' , sdfg) From c3171fb8d435eaa5c1b8749c792d4e6b55202bb5 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Thu, 25 Jul 2024 18:35:24 -0600 Subject: [PATCH 16/77] print states(if-else, for) --- dace/codegen/targets/ipu.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 1793f1fa8a..38f87062f3 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -91,3 +91,17 @@ def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVie ''' , sdfg) +def generate_state(self, + sdfg:SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + function_stream: CodeIOStream, + callsite_stream:CodeIOStream, + generate_state_footer:bool = True): + + callsite_stream.write( + f''' + State(CFG/Loops/Conditionals(if else, for, ...)) + ''' + , sdfg) + \ No newline at end of file From ed6f63e9fdedac283926791187a2bb0ba28defbc Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 26 Jul 2024 12:14:27 -0600 Subject: [PATCH 17/77] convert from vector 
add to scalar add, name might be confusing --- graphcore.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/graphcore.py index 31046d434b..f3acd9628d 100644 --- a/graphcore.py +++ b/graphcore.py @@ -2,18 +2,22 @@ import dace import numpy as np +# @dace.program +# def ipu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): +# for i in dace.map[0:20]: # parallelization construct +# C[i] = A[i] + B[i] + @dace.program -def ipu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): - for i in dace.map[0:20]: # parallelization construct - C[i] = A[i] + B[i] +def ipu_vector_add(A: dace.int32, B: dace.int32, C: dace.int32): + C = A + B if __name__ == '__main__': sdfg = ipu_vector_add.to_sdfg(simplify=False) # compiled SDFG #sdfg.apply_transformations(IPUTransformSDFG) # call with values - A = np.ones((20), dtype=np.int32) # 1,1,1,1,... - B = np.ones((20), dtype=np.int32) # 1,1,1,1,... - C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... + A = np.int32(1) # 1,1,1,1,... + B = np.int32(1) # 1,1,1,1,... + C = np.int32(0) # 0,0,0,0,... sdfg(A, B, C) # ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... From c30f1f23afc78f043a24aa074de412661e079121 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 26 Jul 2024 12:19:04 -0600 Subject: [PATCH 18/77] some debug comments, found control_flow_tree code, ipu.py has a lot of experimental changes which try to understand the SDFGIR and the changes to make IPUCodeGen registry into the frame_targets.
--- dace/codegen/codegen.py | 3 + dace/codegen/targets/framecode.py | 3 +- dace/codegen/targets/ipu.py | 134 +++++++++++++++++++++++++----- 3 files changed, 118 insertions(+), 22 deletions(-) diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index 6a02a4a57d..62d7adeb08 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -246,6 +246,9 @@ def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]: ] # Create code objects for each target + print("Used targets:", used_targets) + print("Frame targets:", frame.targets) + print("Frame " + str(frame)) for tgt in used_targets: target_objects.extend(tgt.get_generated_codeobjects()) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 1d8b874263..181ea38c8f 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -473,7 +473,7 @@ def dispatch_state(state: SDFGState) -> str: opbar.next() states_generated.add(state) # For sanity check return stream.getvalue() - + callsite_stream.write("START CORE", sdfg) if sdfg.root_sdfg.using_experimental_blocks: # Use control flow blocks embedded in the SDFG to generate control flow. cft = cflow.structured_control_flow_tree_with_regions(sdfg, dispatch_state) @@ -496,6 +496,7 @@ def dispatch_state(state: SDFGState) -> str: [], [], [], [], False) callsite_stream.write(cft.as_cpp(self, sdfg.symbols), sdfg) + callsite_stream.write("END CORE", sdfg) opbar.done() diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 38f87062f3..565120b720 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -1,5 +1,6 @@ # import # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+from typing import TYPE_CHECKING from copy import deepcopy from dace.codegen.targets.framecode import DaCeCodeGenerator from dace.sdfg.graph import MultiConnectorEdge @@ -25,6 +26,10 @@ from dace.codegen.codeobject import CodeObject from dace.codegen.targets.cpu import CPUCodeGen +if TYPE_CHECKING: + from dace.codegen.targets.ipu import IPUCodeGen + from dace.codegen.targets.cpp import CPUCodeGen + @registry.autoregister_params(name='ipu') class IPUCodeGen(TargetCodeGenerator): @@ -33,25 +38,33 @@ class IPUCodeGen(TargetCodeGenerator): title = 'IPU' language = 'cpp' - def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): - self._codeobjects = [] # Holds any external files - src/cuda/xyz.cu, ... + def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._sdfg = sdfg self._frame = frame_codegen self._dispatcher = frame_codegen.dispatcher - self._dispatcher.register_node_dispatcher(self) - #self._cpu_codegen: CPUCodeGen = self._dispatcher.get_generic_node_dispatcher() + self._global_sdfg = sdfg + + # Register dispatchers + # self._cpu_codegen = self._dispatcher.get_generic_node_dispatcher() + # Register additional dispatchers + # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + # self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.MPI, self) + # self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.Sequential, self) + self._dispatcher.register_node_dispatcher(self) + def state_dispatch_predicate(self, sdfg, state): + return True # __dace_init_ function is generated if True @property def has_initializer(self): - return False + return True # __dace_exit_ function is generated if True @property def has_finalizer(self): - return False + return True @staticmethod def cmake_options(): @@ -68,7 +81,50 @@ def cmake_options(): # This will generate the src/cuda/xyz.cu files and folders using "codeObjects" class. # We don't need this now as we are mostly concerned about a single file codegen as of now. 
def get_generated_codeobjects(self): - return self._codeobjects + fileheader = CodeIOStream() + sdfg = self._global_sdfg + + # Adds + self._frame.generate_fileheader(self._global_sdfg, fileheader, 'poplar') + + # cuda/mpi seemed to be using this follow + params_comma = self._global_sdfg.init_signature(free_symbols=self._frame.free_symbols(self._global_sdfg)) + if params_comma: + params_comma = ', ' + params_comma + codelet_file_code = """ +// Copyright (c) 2018 Graphcore Ltd. All rights reserved. +// Copied from tut3_vertices from Poplar SDK tutorials + +#include + +class SumVertex : public poplar::Vertex { + public: + // Fields + poplar::Input> in; + poplar::Output out; + + // Compute function + bool compute() { + *out = 0; + for (const auto &v : in) { + *out += v; + } + return true; + } +}; +""" + + codeobj = CodeObject( + name=sdfg.name + '_codelets', + code=codelet_file_code, + language='cpp', + target=IPUCodeGen, + title='IPU', + target_type='ipu', + linkable=False) + + # Fill in the list + return [codeobj] def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node:nodes.Node, function_stream: CodeIOStream, callsite_stream:CodeIOStream): @@ -91,17 +147,53 @@ def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVie ''' , sdfg) -def generate_state(self, - sdfg:SDFG, - cfg: ControlFlowRegion, - state: SDFGState, - function_stream: CodeIOStream, - callsite_stream:CodeIOStream, - generate_state_footer:bool = True): - - callsite_stream.write( - f''' - State(CFG/Loops/Conditionals(if else, for, ...)) - ''' - , sdfg) - \ No newline at end of file + # def generate_state(self, + # sdfg:SDFG, + # cfg: ControlFlowRegion, + # state: SDFGState, + # function_stream: CodeIOStream, + # callsite_stream:CodeIOStream, + # generate_state_footer:bool = True): + + # callsite_stream.write( + # f''' + # State(CFG/Loops/Conditionals(if else, for, ...)) + # ''' + # , sdfg) + + # def generate_scope(self, sdfg: SDFG, cfg: 
ControlFlowRegion, dfg_scope: StateSubgraphView, state_id: int, + # function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + # # Take care of map header + # assert len(dfg_scope.source_nodes()) == 1 + # map_header: nodes.MapEntry = dfg_scope.source_nodes()[0] + + # function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', cfg, state_id, map_header) + + # # Add extra opening brace (dynamic map ranges, closed in MapExit + # # generator) + # callsite_stream.write('{', cfg, state_id, map_header) + + # if len(map_header.map.params) > 1: + # raise NotImplementedError('Multi-dimensional MPI maps are not supported') + + # state = cfg.state(state_id) + # symtypes = map_header.new_symbols(sdfg, state, state.symbols_defined_at(map_header)) + + # for var, r in zip(map_header.map.params, map_header.map.range): + # begin, end, skip = r + + # callsite_stream.write('{\n', cfg, state_id, map_header) + # callsite_stream.write( + # '%s %s = %s + __dace_comm_rank * (%s);\n' % + # (symtypes[var], var, cppunparse.pyexpr2cpp(symbolic.symstr(begin, cpp_mode=True)), + # cppunparse.pyexpr2cpp(symbolic.symstr(skip, cpp_mode=True))), cfg, state_id, map_header) + + # self._frame.allocate_arrays_in_scope(sdfg, cfg, map_header, function_stream, callsite_stream) + + # self._dispatcher.dispatch_subgraph(sdfg, + # cfg, + # dfg_scope, + # state_id, + # function_stream, + # callsite_stream, + # skip_entry_node=True) From 78f19af0b17a095fd9d0fa82ead5619bbcce43f5 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sat, 27 Jul 2024 13:44:43 -0600 Subject: [PATCH 19/77] 1. Fix IPUCodegen {used_targets}-{frame} error. The fix was to call 'self._frame.generate_state' recursively from ipu.generate_state. This goes into framecode.py and calls the recursive function which traverses substates and calls the codegen respectively. Based on this learning a point to note is to remember to call the recursive functions inside generate_*() functions. 
example is 'self._dispatcher.dispatch_subgraph' in 'generate_scope'. 2. Fix ipu/ipu 2 folders were created recursively. Fix - Remove 'target_type='ipu' from CodeObject. --- dace/codegen/targets/ipu.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 565120b720..f10cef0750 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -52,6 +52,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): # self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.MPI, self) # self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.Sequential, self) self._dispatcher.register_node_dispatcher(self) + self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) def state_dispatch_predicate(self, sdfg, state): return True @@ -120,7 +121,6 @@ class SumVertex : public poplar::Vertex { language='cpp', target=IPUCodeGen, title='IPU', - target_type='ipu', linkable=False) # Fill in the list @@ -147,19 +147,21 @@ def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVie ''' , sdfg) - # def generate_state(self, - # sdfg:SDFG, - # cfg: ControlFlowRegion, - # state: SDFGState, - # function_stream: CodeIOStream, - # callsite_stream:CodeIOStream, - # generate_state_footer:bool = True): + def generate_state(self, + sdfg:SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + function_stream: CodeIOStream, + callsite_stream:CodeIOStream, + generate_state_footer:bool = True): + # print state.label - # callsite_stream.write( - # f''' - # State(CFG/Loops/Conditionals(if else, for, ...)) - # ''' - # , sdfg) + callsite_stream.write( + ''' + State(CFG/Loops/Conditionals(if else, for, ...)){} + '''.format(state.label) + , sdfg) + self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) # def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: 
StateSubgraphView, state_id: int, # function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: From af38f47ba62acec813c1d3e5e57dbe092e153bde Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Mon, 29 Jul 2024 21:03:27 -0600 Subject: [PATCH 20/77] mpi_scalar.py, some debug comments, now move on to cpu only, don't look right now into GPU/MPI --- dace/codegen/targets/framecode.py | 27 ++++++++++++++------------- dace/codegen/targets/ipu.py | 9 +++++---- mpi_scalar.py | 17 +++++++++++++++++ 3 files changed, 36 insertions(+), 17 deletions(-) create mode 100644 mpi_scalar.py diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 181ea38c8f..5083432fd6 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -1012,19 +1012,20 @@ def generate_code(self, generated_header = global_stream.getvalue() # header generated_code = callsite_stream.getvalue() # frame - # Clean up generated code - gotos = re.findall(r'goto (.*?);', generated_code) - clean_code = '' - for line in generated_code.split('\n'): - # Empty line with semicolon - if re.match(r'^\s*;\s*', line): - continue - # Label that might be unused - label = re.findall(r'^\s*([a-zA-Z_][a-zA-Z_0-9]*):\s*[;]?\s*////.*$', line) - if len(label) > 0: - if label[0] not in gotos: - continue - clean_code += line + '\n' + # # Clean up generated code + # gotos = re.findall(r'goto (.*?);', generated_code) + # clean_code = '' + # for line in generated_code.split('\n'): + # # Empty line with semicolon + # if re.match(r'^\s*;\s*', line): + # continue + # # Label that might be unused + # label = re.findall(r'^\s*([a-zA-Z_][a-zA-Z_0-9]*):\s*[;]?\s*////.*$', line) + # if len(label) > 0: + # if label[0] not in gotos: + # continue + # clean_code += line + '\n' + clean_code = generated_code # Return the generated global and local code strings return (generated_header, clean_code, self._dispatcher.used_targets, self._dispatcher.used_environments) diff --git 
a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index f10cef0750..c7afe10e9c 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -48,11 +48,9 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): # self._cpu_codegen = self._dispatcher.get_generic_node_dispatcher() # Register additional dispatchers - # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) - # self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.MPI, self) # self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.Sequential, self) - self._dispatcher.register_node_dispatcher(self) - self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + # self._dispatcher.register_node_dispatcher(self) + # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) def state_dispatch_predicate(self, sdfg, state): return True @@ -126,6 +124,9 @@ class SumVertex : public poplar::Vertex { # Fill in the list return [codeobj] +############################################################################################################ +# IPU specific node/state generation +############################################################################################################ def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node:nodes.Node, function_stream: CodeIOStream, callsite_stream:CodeIOStream): if isinstance(node, nodes.Map): diff --git a/mpi_scalar.py b/mpi_scalar.py new file mode 100644 index 0000000000..7037908c31 --- /dev/null +++ b/mpi_scalar.py @@ -0,0 +1,17 @@ + +import dace +import numpy as np +from dace.transformation.dataflow import MPITransformMap + +@dace.program +def mpi_scalar_add(A: dace.int32, B: dace.int32, C: dace.int32): + C = A + B + +if __name__ == '__main__': + sdfg = mpi_scalar_add.to_sdfg(simplify=False) # compiled SDFG + sdfg.apply_transformations(MPITransformMap) + + A = np.int32(1) # 1,1,1,1,... 
+ B = np.int32(1) # 1,1,1,1,... + C = np.int32(0) # 0,0,0,0,... + sdfg(A, B, C) \ No newline at end of file From 908a0f9681e87f4409af71114707b866b7d55539 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Mon, 29 Jul 2024 23:59:00 -0600 Subject: [PATCH 21/77] partial code works, read cpu.py and generate_{node, state} --- dace/codegen/targets/framecode.py | 2 - dace/codegen/targets/ipu.py | 140 +++++++++++++++++++++--------- 2 files changed, 99 insertions(+), 43 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 5083432fd6..ab71932999 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -473,7 +473,6 @@ def dispatch_state(state: SDFGState) -> str: opbar.next() states_generated.add(state) # For sanity check return stream.getvalue() - callsite_stream.write("START CORE", sdfg) if sdfg.root_sdfg.using_experimental_blocks: # Use control flow blocks embedded in the SDFG to generate control flow. cft = cflow.structured_control_flow_tree_with_regions(sdfg, dispatch_state) @@ -496,7 +495,6 @@ def dispatch_state(state: SDFGState) -> str: [], [], [], [], False) callsite_stream.write(cft.as_cpp(self, sdfg.symbols), sdfg) - callsite_stream.write("END CORE", sdfg) opbar.done() diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index c7afe10e9c..0fd6ac37b6 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -43,14 +43,17 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._frame = frame_codegen self._dispatcher = frame_codegen.dispatcher self._global_sdfg = sdfg + self._generated_nodes = set() + self.calling_codegen = self # Register dispatchers # self._cpu_codegen = self._dispatcher.get_generic_node_dispatcher() # Register additional dispatchers # self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.Sequential, self) - # self._dispatcher.register_node_dispatcher(self) - # 
self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + self._dispatcher.register_node_dispatcher(self) + self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + def state_dispatch_predicate(self, sdfg, state): return True @@ -68,12 +71,9 @@ def has_finalizer(self): @staticmethod def cmake_options(): options = [] - - linker_flags = Config.get("compiler", "ipu", "libs") - - if linker_flags: - options.append(f'-DCMAKE_SHARED_LINKER_FLAGS="{linker_flags}"') - + + if Config.get("compiler", "ipu", "libs"): + options.append('-DCMAKE_SHARED_LINKER_FLAGS="{}"'.format(Config.get("compiler", "ipu", "libs"))) return options @@ -127,26 +127,22 @@ class SumVertex : public poplar::Vertex { ############################################################################################################ # IPU specific node/state generation ############################################################################################################ + # from cpu.py def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node:nodes.Node, function_stream: CodeIOStream, callsite_stream:CodeIOStream): + if isinstance(node, nodes.NestedSDFG): + # Dynamically obtain node generator according to class name + try: + gen = getattr(self, "_generate_" + type(node).__name__) + except AttributeError: + if isinstance(node, nodes.LibraryNode): + raise NodeNotExpandedError(sdfg, state_id, dfg.node_id(node)) + raise + # _generate_Tasklet() example - if isinstance(node, nodes.Map): - callsite_stream.write( - f''' - Concurrency(Map/Consume)(omp loop)! - ''' - , sdfg) - elif isinstance(node, nodes.AccessNode): - callsite_stream.write( - f''' - AccessNode(container=array/stream)! - ''' - , sdfg) - elif isinstance(node, nodes.CodeNode): - callsite_stream.write( - f''' - CodeNode(Tasklet/nestedSDFG)! 
- ''' - , sdfg) + gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + # Mark node as "generated" + self._generated_nodes.add(node) + # self._locals.clear_scope(self._ldepth + 1) def generate_state(self, sdfg:SDFG, @@ -157,21 +153,87 @@ def generate_state(self, generate_state_footer:bool = True): # print state.label - callsite_stream.write( - ''' - State(CFG/Loops/Conditionals(if else, for, ...)){} - '''.format(state.label) - , sdfg) + # callsite_stream.write( + # ''' + # State(CFG/Loops/Conditionals(if else, for, ...)){} + # '''.format(state.label) + # , sdfg) self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) + def generate_scope(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg_scope: ScopeSubgraphView, + state_id: int, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: + # Get the first entry node of Map + entry_node = dfg_scope.source_nodes()[0] + + # function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', cfg, state_id, entry_node) + callsite_stream.write('{', cfg, state_id, entry_node) + + # cpp.presynchronize_streams(sdfg, cfg, dfg_scope, state_id, entry_node, callsite_stream) #TODO: add some other function of own. + # Should we ? 
+ self.generate_node(sdfg, cfg, dfg_scope, state_id, entry_node, function_stream, callsite_stream) + # generated nested subgraphs + self._dispatcher.dispatch_subgraph(sdfg, + cfg, + dfg_scope, + state_id, + function_stream, + callsite_stream, + skip_entry_node=True) + +#### Helpers + def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label, state_struct=True): + # prepend = [] + # if state_struct: + # prepend = ['__state'] + # fsyms = node.sdfg.used_symbols(all_symbols=False, keep_defined_in_mapping=True) + # args = ', '.join(prepend + [argval for _, _, argval in memlet_references] + [ + # cpp.sym2cpp(symval) for symname, symval in sorted(node.symbol_mapping.items()) + # if symname in fsyms and symname not in sdfg.constants + # ]) + # return f'{sdfg_label}({args});' + args = '' + return f'{sdfg_label}({args});' #TODO: add args later + +#### Node Generators(What node to generate) - callback from generate_node() + def _generate_NestedSDFG( + self, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: ScopeSubgraphView, + state_id: int, + node: nodes.NestedSDFG, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + ): + state_dfg = cfg.nodes()[state_id] + # Emit nested SDFG as a separate function + nested_stream = CodeIOStream() + nested_global_stream = CodeIOStream() + + # unique name generation of function + sdfg_label = "%s_%d_%d_%d" % (node.sdfg.name, sdfg.cfg_id, state_id, dfg.node_id(node)) + + # Generate function call + codegen = self.calling_codegen + memlet_references = None # TODO: add memlet references later + callsite_stream.write(codegen.generate_nsdfg_call(sdfg, cfg, state_dfg, node, memlet_references, + sdfg_label), + cfg, state_id, node) + # callsite_stream.write(sdfg_label, cfg, state_id, node) + + + # def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSubgraphView, state_id: int, # function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - # # Take care of map header - # assert 
len(dfg_scope.source_nodes()) == 1 - # map_header: nodes.MapEntry = dfg_scope.source_nodes()[0] # function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', cfg, state_id, map_header) + # # Add extra opening brace (dynamic map ranges, closed in MapExit # # generator) # callsite_stream.write('{', cfg, state_id, map_header) @@ -182,6 +244,8 @@ def generate_state(self, # state = cfg.state(state_id) # symtypes = map_header.new_symbols(sdfg, state, state.symbols_defined_at(map_header)) + + #$$$$ First dace::copy() # for var, r in zip(map_header.map.params, map_header.map.range): # begin, end, skip = r @@ -192,11 +256,5 @@ def generate_state(self, # cppunparse.pyexpr2cpp(symbolic.symstr(skip, cpp_mode=True))), cfg, state_id, map_header) # self._frame.allocate_arrays_in_scope(sdfg, cfg, map_header, function_stream, callsite_stream) + # subgraphs_scope_call - # self._dispatcher.dispatch_subgraph(sdfg, - # cfg, - # dfg_scope, - # state_id, - # function_stream, - # callsite_stream, - # skip_entry_node=True) From 73fc0bb6280974fb8348e80e552583e370823ddf Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Wed, 31 Jul 2024 16:12:09 -0600 Subject: [PATCH 22/77] add mapping GC program to dace --- graphcore_mapped_dace.cpp | 96 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 graphcore_mapped_dace.cpp diff --git a/graphcore_mapped_dace.cpp b/graphcore_mapped_dace.cpp new file mode 100644 index 0000000000..176da32a73 --- /dev/null +++ b/graphcore_mapped_dace.cpp @@ -0,0 +1,96 @@ +// Copyright (c) 2018 Graphcore Ltd. All rights reserved. + +/* This file contains the completed version of Poplar tutorial 3. + See the Poplar user guide for details. 
+*/ + +#include +#include +#include +#include +using namespace poplar; +using namespace poplar::program; + +void func_device_pre(){ + + // Create the IPU model device + IPUModel ipuModel; + Device device = ipuModel.createDevice(); + Target target = device.getTarget(); + // Create the Graph object + +} +// graph +// codelets->tasklets +// data +// connect input-output + +void func_graph(){ + // init device and graph + Graph graph(target); + + // Add codelets to the graph + graph.addCodelets("tut3_codelets.cpp"); + + // CONTAINERS ->ARRAYS/MAPS + // Add variables to the graph + Tensor v1 = graph.addVariable(FLOAT, {4}, "v1"); + Tensor v2 = graph.addVariable(FLOAT, {4}, "v2"); + for (unsigned i = 0; i < 4; ++i) { + graph.setTileMapping(v1[i], i); + graph.setTileMapping(v2[i], i); + } + // Add steps to initialize the variables + Tensor c1 = graph.addConstant(FLOAT, {4}, {1.0, 1.5, 2.0, 2.5}); + graph.setTileMapping(c1, 0); + + + // parallel stuff -> MAP/CONSUME? + // Connect the codelets with data --> MEMLET part + ComputeSet computeSet = graph.addComputeSet("computeSet"); + for (unsigned i = 0; i < 4; ++i) { + VertexRef vtx = graph.addVertex(computeSet, "SumVertex"); + graph.connect(vtx["in"], v1.slice(i, 4)); + graph.connect(vtx["out"], v2[i]); + graph.setTileMapping(vtx, i); + graph.setPerfEstimate(vtx, 20); + } + +} + +// seems like function before calling __internal__ +void cfg() { + // Create a control program that is a sequence of steps + Sequence prog; + + prog.add(Copy(c1, v1)); + + // Add step to execute the compute set + prog.add(Execute(computeSet)); // ------------->>>>graph() + + // Add step to print out v2 + prog.add(PrintTensor("v2", v2)); +} + +void func_engine_and_cleanup(){ + + // Create the engine + Engine engine(graph, prog); + engine.load(device); + + // Run the control program + std::cout << "Running program\n"; + engine.run(0); + std::cout << "Program complete\n"; +} + +int main() { +// where is the dataflow graph supposed to get built? 
+ + func_device_pre(); // no + func_graph(); // yes (__internal__) + cfg(); // program(){__internal__} + func_engine_and_cleanup(); // no + + return 0; +} \ No newline at end of file From c9971470f37818d4c77536e25eea21a834a99e1c Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 2 Aug 2024 10:07:40 -0600 Subject: [PATCH 23/77] [WIP] Register array, copy, and add some code for generating the headers and the IPUDevice, this goes in __init__/exit part and not in the SDFG --- dace/codegen/targets/framecode.py | 10 +++++ dace/codegen/targets/ipu.py | 66 ++++++++++++++++++++++++++----- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index ab71932999..a3048e1835 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -148,6 +148,9 @@ def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend: if backend in headers: global_stream.write("\n".join("#include \"" + h + "\"" for h in headers[backend]), sdfg) + # GRAPHCORE + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) ######################################################### # Custom types datatypes = set() @@ -215,6 +218,13 @@ def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre for env in self.environments: self.statestruct.extend(env.state_fields) + # GRAPHCORE + self.statestruct.append('IPUModel ipuModel;') + self.statestruct.append('Device device = ipuModel.createDevice();') + self.statestruct.append('Target target = device.getTarget();') + self.statestruct.append('Graph graph(target);') + + # Instrumentation preamble if len(self._dispatcher.instrumentation) > 2: self.statestruct.append('dace::perf::Report report;') diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 0fd6ac37b6..1725242e2c 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -48,11 +48,15 @@ def 
__init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): # Register dispatchers # self._cpu_codegen = self._dispatcher.get_generic_node_dispatcher() - + ipu_storage = [dtypes.StorageType.Register] # Register additional dispatchers # self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.Sequential, self) self._dispatcher.register_node_dispatcher(self) self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + self._dispatcher.register_array_dispatcher(ipu_storage, self) + # Register IPU copies (all internal pairs) + for src_storage, dst_storage in itertools.product(ipu_storage, ipu_storage): + self._dispatcher.register_copy_dispatcher(src_storage, dst_storage, None, self) def state_dispatch_predicate(self, sdfg, state): @@ -129,6 +133,9 @@ class SumVertex : public poplar::Vertex { ############################################################################################################ # from cpu.py def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node:nodes.Node, function_stream: CodeIOStream, callsite_stream:CodeIOStream): + + self._dispatcher.dispatch_allocate(sdfg, cfg, dfg, state_id, node, node.desc(sdfg), function_stream, callsite_stream) + if isinstance(node, nodes.NestedSDFG): # Dynamically obtain node generator according to class name try: @@ -143,7 +150,7 @@ def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVie # Mark node as "generated" self._generated_nodes.add(node) # self._locals.clear_scope(self._ldepth + 1) - + def generate_state(self, sdfg:SDFG, cfg: ControlFlowRegion, @@ -151,14 +158,8 @@ def generate_state(self, function_stream: CodeIOStream, callsite_stream:CodeIOStream, generate_state_footer:bool = True): - # print state.label - - # callsite_stream.write( - # ''' - # State(CFG/Loops/Conditionals(if else, for, ...)){} - # '''.format(state.label) - # , sdfg) - self._frame.generate_state(sdfg, cfg, state, function_stream, 
callsite_stream, generate_state_footer=False) + + self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) def generate_scope(self, sdfg: SDFG, @@ -185,6 +186,51 @@ def generate_scope(self, callsite_stream, skip_entry_node=True) + def declare_array(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, + node: nodes.Node, + nodedesc: data.Data, + function_stream: CodeIOStream, + declaration_stream: CodeIOStream) -> None: + print("IN DECLARE_ARRAY") + fsymbols = self._frame.symbols_and_constants(sdfg) + # NOTE: `dfg` (state) will be None iff `nodedesc` is non-free symbol dependent + # (see `DaCeCodeGenerator.determine_allocation_lifetime` in `dace.codegen.targets.framecode`). + # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if + # `nodedesc` is a View and `dfg` is None. + if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): + raise NotImplementedError("The declare_array method should only be used for variables " + "that must have their declaration and allocation separate.") + + name = node.root_data + ptrname = cpp.ptr(name, nodedesc, sdfg, self._frame) + + if nodedesc.transient is False: + return + + # Check if array is already declared + if self._dispatcher.declared_arrays.has(ptrname): + return + + # Compute array size + arrsize = nodedesc.total_size + if not isinstance(nodedesc.dtype, dtypes.opaque): + arrsize_bytes = arrsize * nodedesc.dtype.bytes + + if (nodedesc.storage == dtypes.StorageType.Register): + ctypedef = dtypes.pointer(nodedesc.dtype).ctype + declaration_stream.write(f'{nodedesc.dtype.ctype} *{name} = nullptr;\n', cfg, state_id, node) + #Tensor c1 = graph.addConstant(FLOAT, {4}, {1.0, 1.5, 2.0, 2.5}); + declaration_stream.write(f'{nodedesc.dtype.ctype} {name}_const = graph.addConstant<{nodedesc.dtype.ctype}>({nodedesc.dtype.ctype.capitalize}, {arrsize}, 
{nodedesc.ctype}({nodedesc.dtype.ctype}));\n', cfg, state_id, node) + self._dispatcher.declared_arrays.add(name, DefinedType.Pointer, ctypedef) + return + else: + raise NotImplementedError("Unimplemented storage type " + str(nodedesc.storage)) + + #### Helpers def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label, state_struct=True): # prepend = [] From 01d0658bdd0becf8bd0834510cb5159adee146b4 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sun, 4 Aug 2024 10:09:20 -0600 Subject: [PATCH 24/77] use dace.DeviceType.IPU to check and emit headers in framecode, not tested might be buggy --- dace/codegen/targets/framecode.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index a3048e1835..099c9304a7 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -149,8 +149,21 @@ def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend: global_stream.write("\n".join("#include \"" + h + "\"" for h in headers[backend]), sdfg) # GRAPHCORE - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) + if (backend == dace.DeviceType.IPU): + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) ######################################################### # Custom types datatypes = set() From afbbdd1d6b16f359b8ee5c6a0b1d442b91c4bff1 Mon Sep 17 00:00:00 2001 From: 
Sameeranjoshi Date: Sun, 4 Aug 2024 10:09:45 -0600 Subject: [PATCH 25/77] learn sdfg by using the APIs and writing tests --- graphcore_dace/handcrafted_sdfg_scalar_add.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 graphcore_dace/handcrafted_sdfg_scalar_add.py diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py new file mode 100644 index 0000000000..f211be0aec --- /dev/null +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -0,0 +1,75 @@ +import dace +from dace.sdfg.propagation import propagate_states + +# Handcrafted SDFG for scalar addition + +def handcrafted_sdfg_scalar_add(): + sdfg = dace.SDFG('handcrafted_sdfg') + + + ################OTHER################ + # other data nodes + sdfg.add_symbol('Symbol', dace.int64) # symbol can't be added in a state + sdfg.add_constant('constant_bool', True) # constant + + ################DEPRICATED################ + # data(scalar, symbol, array, constant, stream, transient) - everything is depricated + sdfg.add_array('Array_normal', [2, 2], dace.int64, storage=dace.StorageType.Default, + transient=False) # normal array + sdfg.add_array('Array_transient', [2, 2], dace.int64, storage=dace.StorageType.Default, + transient=True) #Transiant + sdfg.add_array('Array_onGPU', [2, 2], dace.int64, storage=dace.StorageType.GPU_Global, + transient=False) #on GPU + # sdfg.add_stream('stream', dace.float32, transient=True, buffer_size=10) # stream + # sdfg.add_transient('transient', [2, 2], dace.int64) # transient + # sdfg.add_scalar('a_scalar', dace.int32) + + + ############################################ + root = sdfg.add_state('top_level_state', is_start_block=True, is_start_state=True) + ################USE THIS################ + A = root.add_access("Array_normal") + B = root.add_access("Array_transient") + C = root.add_access("Array_onGPU") + # D = root.add_access("stream") + # E = root.add_access("transient") + # F = 
root.add_access("a_scalar") + + ################MEMLET################ + + + # # state + middle = sdfg.add_state('middle_level_state', is_start_block=False, is_start_state=False) + exit = sdfg.add_state('bottom_level_state', is_start_block=False, is_start_state=False) + + # cfg + sdfg.add_edge(root, middle, dace.InterstateEdge()) + sdfg.add_edge(middle, exit, dace.InterstateEdge()) + + # dfg edges/ Memlets + root.add_nedge(A, B, dace.Memlet("Array_normal[0]")) + root.add_edge(B, None, C, None, dace.Memlet("Array_transient[0]")) + print("Total edges", root.number_of_edges()) + # root.add_nedge(A, middle, dace.Memlet("Array_normal[0]")) + + # # tasklet + # tasklet = root.add_tasklet('add', {'tmp_A', 'tmp_B'}, {'tmp_C'}, 'tmp_C = tmp_A + tmp_B', language=dace.Language.Python) + + # # edges inside DFG/Memlet + # root.add_edge(A, None, tasklet, "tmp_A", dace.Memlet("A[0]")) + # root.add_edge(B, None, tasklet, "tmp_B", dace.Memlet("B[0]")) + # root.add_edge(tasklet, "tmp_C", C, None, dace.Memlet("C[0]")) + + sdfg() # uncomment for upstream dace to codegen + code = sdfg.generate_code()[0].clean_code + +def structure(): + sdfg = dace.SDFG('structure') + state = sdfg.add_state('state') + + sdfg() + code = sdfg.generate_code()[0].clean_code + +if __name__ == "__main__": + # handcrafted_sdfg_scalar_add() + structure() \ No newline at end of file From cff27f4ef599690a23a9bf4d750da400d1763151 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sun, 4 Aug 2024 16:13:24 -0600 Subject: [PATCH 26/77] Add new test case, simple codes to understand writing SDFG by hand --- graphcore_dace/handcrafted_sdfg_scalar_add.py | 92 ++++++++++++++++++- 1 file changed, 89 insertions(+), 3 deletions(-) diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index f211be0aec..fbbd004557 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -1,5 +1,5 @@ import dace -from 
dace.sdfg.propagation import propagate_states +import numpy as np # Handcrafted SDFG for scalar addition @@ -69,7 +69,93 @@ def structure(): sdfg() code = sdfg.generate_code()[0].clean_code - + +# Compute C = A+B +# vector +def vector_add(): + sdfg = dace.SDFG('vector_add') + #########GLOBAL VARIABLES######### + # # data(vector add) + sdfg.add_array('A', [10], dace.float64) + sdfg.add_array('B', [10], dace.float64) + sdfg.add_array('C', [10], dace.float64) + + ###########STATE, CFG, GLOBAL DATA################ + # # add state + state = sdfg.add_state('sum', is_start_block=True) + a = state.add_read('A') + b = state.add_read('B') + c = state.add_write('C') + + ###########DFG################ + # Add nodes + # # map + add_entry, add_exit = state.add_map('add_map', dict(i='0:10'), schedule=dace.ScheduleType.Default) + # # tasklet + t1 = state.add_tasklet('add_scalar', {'_a', '_b'}, {'_c'}, '_c = _a + _b') + + # Add add_edge_pair(map mostly) + state.add_edge_pair(add_entry, t1, a, dace.Memlet.simple(a, 'i'), internal_connector='_a') + state.add_edge_pair(add_entry, t1, b, dace.Memlet.simple(b, 'i'), internal_connector='_b') + state.add_edge_pair(add_exit, t1, c, dace.Memlet.simple(c, 'i'), internal_connector='_c') + + ###########CODEGEN################ + + A = np.random.rand(10) + B = np.random.rand(10) + C = np.zeros(10) + + print(A) + print(B) + print(C) + sdfg(A, B, C) + print(C) + +# Compute C = A+B +# scalar +def scalar_add(): + sdfg = dace.SDFG('scalar_add') + #########GLOBAL VARIABLES######### + # # data(vector add) + sdfg.add_array('A', [1], dace.float64) + sdfg.add_array('B', [1], dace.float64) + sdfg.add_array('C', [1], dace.float64) + + ###########STATE, CFG, GLOBAL DATA################ + # # add state + state = sdfg.add_state('sum', is_start_block=True) + a = state.add_read('A') + b = state.add_read('B') + c = state.add_write('C') + + ###########DFG################ + # Add nodes + # # map + # add_entry, add_exit = state.add_map('add_map', dict(i='0:31'), 
schedule=dace.ScheduleType.Default) + # # tasklet + t1 = state.add_tasklet('add_scalar', {'_a', '_b'}, {'_c'}, '_c = _a + _b') + + # Add add_edge_pair(map mostly) + # state.add_edge_pair(add_entry, t1, a, dace.Memlet.simple(a, 'i')) + # state.add_edge_pair(add_entry, t1, b, dace.Memlet.simple(b, 'i')) + # state.add_edge_pair(add_exit, t1, c, dace.Memlet.simple(c, 'i')) + + # # Add memlet_path + # state.add_memlet_path(a, t1, dst_conn='_a', memlet=dace.Memlet(f"A[i]")) + # state.add_memlet_path(b, t1, dst_conn='_b', memlet=dace.Memlet(f"B[i]")) + # state.add_memlet_path(t1, c, src_conn='_c', memlet=dace.Memlet(f"C[i]")) + + # just add_edge + state.add_edge(a, None, t1, '_a', dace.Memlet(f"A[0]")) + state.add_edge(b, None, t1, '_b', dace.Memlet(f"B[0]")) + state.add_edge(t1, '_c', c, None, dace.Memlet(f"C[0]")) + + ###########CODEGEN################ + sdfg() + code = sdfg.generate_code()[0].clean_code + if __name__ == "__main__": # handcrafted_sdfg_scalar_add() - structure() \ No newline at end of file + # structure() + # scalar_add() + vector_add() From 3715101e1f1b6731596212471a86086856076c3b Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Mon, 5 Aug 2024 15:43:27 -0600 Subject: [PATCH 27/77] Add a new library, poplar --- dace/libraries/poplar/__init__.py | 7 +++++++ .../libraries/poplar/environments/__init__.py | 2 ++ dace/libraries/poplar/environments/poplar.py | 21 +++++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 dace/libraries/poplar/__init__.py create mode 100644 dace/libraries/poplar/environments/__init__.py create mode 100644 dace/libraries/poplar/environments/poplar.py diff --git a/dace/libraries/poplar/__init__.py b/dace/libraries/poplar/__init__.py new file mode 100644 index 0000000000..4aa3fba752 --- /dev/null +++ b/dace/libraries/poplar/__init__.py @@ -0,0 +1,7 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+from dace.library import register_library +from .nodes import * +from .environments import * +from .utils import * + +register_library(__name__, "poplar") diff --git a/dace/libraries/poplar/environments/__init__.py b/dace/libraries/poplar/environments/__init__.py new file mode 100644 index 0000000000..d0765769da --- /dev/null +++ b/dace/libraries/poplar/environments/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +from .poplar import * diff --git a/dace/libraries/poplar/environments/poplar.py b/dace/libraries/poplar/environments/poplar.py new file mode 100644 index 0000000000..c571fda64c --- /dev/null +++ b/dace/libraries/poplar/environments/poplar.py @@ -0,0 +1,21 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library + + +@dace.library.environment +class IPU: + + cmake_minimum_version = "3.6" + cmake_packages = ["IPU"] + cmake_files = [] + cmake_variables = {} + cmake_includes = [] + cmake_libraries = ["${IPU_CXX_LIBRARIES}"] + cmake_compile_flags = ["-I${IPU_CXX_HEADER_DIR}"] + cmake_link_flags = ["${IPU_LINKER_FLAGS}"] + + headers = ["poplar.h"] + state_fields = [] + init_code = "This is init code" + finalize_code = "This is finalize code;" # actually if we finalize in the dace program we break pytest :) + dependencies = [] From c9c64ddb54bb7a6eacb3efaaa72f1ec17ccd09e8 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Mon, 5 Aug 2024 22:59:25 -0600 Subject: [PATCH 28/77] Fix the issue with checking if the device targer is IPU. Code snippet below checks if the file is frame(.cpp vs .cu) if so matches target and dumps the headers. ``` if backend == 'frame': #on cpu.cpp file + for target in self.targets: + if target.target_name == 'ipu': ``` Some cosmetic changes removed the prints. 
--- dace/codegen/targets/framecode.py | 61 +++++++++++++++---------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 099c9304a7..9757297570 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -149,21 +149,23 @@ def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend: global_stream.write("\n".join("#include \"" + h + "\"" for h in headers[backend]), sdfg) # GRAPHCORE - if (backend == dace.DeviceType.IPU): - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) + if backend == 'frame': #on cpu.cpp file + for target in self.targets: + if target.target_name == 'ipu': + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) + global_stream.write('#include ', sdfg) ######################################################### # Custom types datatypes = set() @@ -951,9 +953,6 @@ def generate_code(self, states_generated = 
self.generate_states(sdfg, global_stream, callsite_stream) # Loop over states_generated and print them - for state in states_generated: - print(state) - print("*" * 50) ####################################################################### # Sanity check @@ -1033,19 +1032,19 @@ def generate_code(self, generated_header = global_stream.getvalue() # header generated_code = callsite_stream.getvalue() # frame - # # Clean up generated code - # gotos = re.findall(r'goto (.*?);', generated_code) - # clean_code = '' - # for line in generated_code.split('\n'): - # # Empty line with semicolon - # if re.match(r'^\s*;\s*', line): - # continue - # # Label that might be unused - # label = re.findall(r'^\s*([a-zA-Z_][a-zA-Z_0-9]*):\s*[;]?\s*////.*$', line) - # if len(label) > 0: - # if label[0] not in gotos: - # continue - # clean_code += line + '\n' + # Clean up generated code + gotos = re.findall(r'goto (.*?);', generated_code) + clean_code = '' + for line in generated_code.split('\n'): + # Empty line with semicolon + if re.match(r'^\s*;\s*', line): + continue + # Label that might be unused + label = re.findall(r'^\s*([a-zA-Z_][a-zA-Z_0-9]*):\s*[;]?\s*////.*$', line) + if len(label) > 0: + if label[0] not in gotos: + continue + clean_code += line + '\n' clean_code = generated_code # Return the generated global and local code strings From 30178f058d92409b83f64eb87cbc985884e715bc Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Mon, 12 Aug 2024 18:53:05 -0600 Subject: [PATCH 29/77] Comment the IPU type doesn't work as needs frontend support probably, add more handcrafted tests, remove the header generation from framecode.py and add into ipu.py --- dace/codegen/targets/framecode.py | 18 ---- dace/dtypes.py | 11 ++- graphcore_dace/handcrafted_sdfg_scalar_add.py | 95 +++++++++++++++---- 3 files changed, 84 insertions(+), 40 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 9757297570..c0c1804e04 100644 --- 
a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -148,24 +148,6 @@ def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend: if backend in headers: global_stream.write("\n".join("#include \"" + h + "\"" for h in headers[backend]), sdfg) - # GRAPHCORE - if backend == 'frame': #on cpu.cpp file - for target in self.targets: - if target.target_name == 'ipu': - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) - global_stream.write('#include ', sdfg) ######################################################### # Custom types datatypes = set() diff --git a/dace/dtypes.py b/dace/dtypes.py index 5fc6893ba5..cb55e24966 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -19,7 +19,7 @@ class DeviceType(aenum.AutoNumberEnum): CPU = () #: Multi-core CPU GPU = () #: GPU (AMD or NVIDIA) - IPU = () #: IPU (Graphcore) + # IPU = () #: IPU (Graphcore) FPGA = () #: FPGA (Intel or Xilinx) Snitch = () #: Compute Cluster (RISC-V) @@ -45,6 +45,7 @@ class StorageType(aenum.AutoNumberEnum): Snitch_TCDM = () #: Cluster-private memory Snitch_L2 = () #: External memory Snitch_SSR = () #: Memory accessed by SSR streamer + # IPU_Tile_Local = () #: IPU Tile-local memory @undefined_safe_enum @@ -79,6 +80,7 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch = () Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping + # IPU_Map = () #: IPU (Graphcore) # A subset of GPU schedule types @@ -200,7 +202,8 @@ class TilingType(aenum.AutoNumberEnum): 
ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register, ScheduleType.FPGA_Device: StorageType.FPGA_Global, ScheduleType.SVE_Map: StorageType.CPU_Heap, - ScheduleType.Snitch: StorageType.Snitch_TCDM + ScheduleType.Snitch: StorageType.Snitch_TCDM, + # ScheduleType.IPU_Map: StorageType.IPU_Tile_Local, } # Maps from ScheduleType to default ScheduleType for sub-scopes @@ -221,7 +224,8 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.FPGA_Multi_Pumped: ScheduleType.FPGA_Device, ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, - ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore + ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, + # ScheduleType.IPU_Map: ScheduleType.IPU_Map } # Maps from StorageType to a preferred ScheduleType for helping determine schedules. @@ -234,6 +238,7 @@ class TilingType(aenum.AutoNumberEnum): StorageType.GPU_Shared: ScheduleType.GPU_ThreadBlock, StorageType.FPGA_Global: ScheduleType.FPGA_Device, StorageType.SVE_Register: ScheduleType.SVE_Map, + # StorageType.IPU_Tile_Local: ScheduleType.IPU_Map } # Translation of types to C types diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index fbbd004557..3ef2260c83 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -1,8 +1,9 @@ import dace import numpy as np +from dace.transformation.interstate.gpu_transform_sdfg import GPUTransformSDFG -# Handcrafted SDFG for scalar addition +# SDFG APIs def handcrafted_sdfg_scalar_add(): sdfg = dace.SDFG('handcrafted_sdfg') @@ -70,8 +71,6 @@ def structure(): sdfg() code = sdfg.generate_code()[0].clean_code -# Compute C = A+B -# vector def vector_add(): sdfg = dace.SDFG('vector_add') #########GLOBAL VARIABLES######### @@ -90,7 +89,7 @@ def vector_add(): ###########DFG################ # Add nodes # # map - add_entry, add_exit = state.add_map('add_map', dict(i='0:10'), 
schedule=dace.ScheduleType.Default) + add_entry, add_exit = state.add_map('add_map', dict(i='0:10'), schedule=dace.ScheduleType.Sequential) # # tasklet t1 = state.add_tasklet('add_scalar', {'_a', '_b'}, {'_c'}, '_c = _a + _b') @@ -111,22 +110,26 @@ def vector_add(): sdfg(A, B, C) print(C) -# Compute C = A+B -# scalar def scalar_add(): sdfg = dace.SDFG('scalar_add') #########GLOBAL VARIABLES######### # # data(vector add) - sdfg.add_array('A', [1], dace.float64) - sdfg.add_array('B', [1], dace.float64) - sdfg.add_array('C', [1], dace.float64) + + # sdfg.add_array('A', [1], dace.float64) + # sdfg.add_array('B', [1], dace.float64) + # sdfg.add_array('C', [1], dace.float64) + sdfg.add_scalar("A_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) + sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) + sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) + sdfg.add_constant('constant', 1) + ###########STATE, CFG, GLOBAL DATA################ # # add state state = sdfg.add_state('sum', is_start_block=True) - a = state.add_read('A') - b = state.add_read('B') - c = state.add_write('C') + a = state.add_read('A_scalar') + b = state.add_read('B_scalar') + c = state.add_write('C_scalar') ###########DFG################ # Add nodes @@ -146,16 +149,70 @@ def scalar_add(): # state.add_memlet_path(t1, c, src_conn='_c', memlet=dace.Memlet(f"C[i]")) # just add_edge - state.add_edge(a, None, t1, '_a', dace.Memlet(f"A[0]")) - state.add_edge(b, None, t1, '_b', dace.Memlet(f"B[0]")) - state.add_edge(t1, '_c', c, None, dace.Memlet(f"C[0]")) + state.add_edge(a, None, t1, '_a', dace.Memlet(f"A_scalar")) + state.add_edge(b, None, t1, '_b', dace.Memlet(f"B_scalar")) + state.add_edge(t1, '_c', c, None, dace.Memlet(f"C_scalar")) + + # state.add_edge(a, None, t1, '_a', dace.Memlet(f"A[0]")) + # state.add_edge(b, None, t1, '_b', dace.Memlet(f"B[0]")) + # state.add_edge(t1, '_c', c, None, 
dace.Memlet(f"C[0]")) ###########CODEGEN################ - sdfg() - code = sdfg.generate_code()[0].clean_code + A = np.random.rand(1) + B = np.random.rand(1) + C = np.zeros(1) + print(A) + print(B) + print("Before", C) + sdfg(A, B, C) + print("After", C) + + +def only_state(): + sdfg = dace.SDFG('only_state') + sdfg.add_constant('constant_variable', 1) + sdfg.add_symbol('symbol_variable', dace.int64) + sdfg.add_array('A_array', [1], dace.float64) #, storage=dace.StorageType.IPU_Tile_Local, transient=False) + sdfg.add_array('B_array', [1], dace.float64) + sdfg.add_array('C_array', [1], dace.float64) + state1 = sdfg.add_state('state1' , is_start_state=True) + a = state1.add_read('A_array') + b = state1.add_read('B_array') + c = state1.add_write('C_array') + t = state1.add_tasklet('add', {'a', 'b'}, {'c'}, 'c = a + b') + state1.add_edge(a, None, t, 'a', dace.Memlet('A_array[0]')) + state1.add_edge(b, None, t, 'b', dace.Memlet('B_array[0]')) + state1.add_edge(t, 'c', c, None, dace.Memlet('C_array[0]')) + + # state2 = sdfg.add_state('state2') + # state3 = sdfg.add_state('state3') + # state4 = sdfg.add_state('state4') + + # # cfg/program::sequential + # sdfg.add_edge(state1, state2, dace.InterstateEdge()) + # sdfg.add_edge(state2, state3, dace.InterstateEdge()) + # sdfg.add_edge(state3, state4, dace.InterstateEdge()) + + + sdfg(A, B, C) + +#### Python +def add(A, B, C): + C = A + B +# main if __name__ == "__main__": # handcrafted_sdfg_scalar_add() # structure() - # scalar_add() - vector_add() + #add a,b,c values + A = np.random.rand(1) + B = np.random.rand(1) + C = np.zeros(1) + # print (A) + # print (B) + # add(A, B, C) + # print (C) + # only_state() + # print (C) + # vector_add() + scalar_add() \ No newline at end of file From 82d193d520fe34d08c1d5a0292b312168c29dacc Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Mon, 12 Aug 2024 18:54:13 -0600 Subject: [PATCH 30/77] Copied codegen from cpu.py, tweaked it and understood the structure of how cpu codegen works for a 
tasklet --- dace/codegen/targets/ipu.py | 652 +++++++++++++++++++++++------------- 1 file changed, 421 insertions(+), 231 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 1725242e2c..e092a67538 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -1,34 +1,35 @@ # import # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -from typing import TYPE_CHECKING +from io import StringIO +from typing import TYPE_CHECKING, Optional, Tuple, Union from copy import deepcopy -from dace.codegen.targets.framecode import DaCeCodeGenerator -from dace.sdfg.graph import MultiConnectorEdge -from dace.sdfg.state import ControlFlowRegion, SDFGState, StateSubgraphView -import functools -import itertools -import warnings - -from dace import data, dtypes, registry, memlet as mmlt, subsets, symbolic, Config +from dace import (data, dtypes, registry, memlet as mmlt, subsets, symbolic, Config) +from dace import dtypes, memlet as mm from dace.codegen import cppunparse, exceptions as cgx + from dace.codegen.prettycode import CodeIOStream +import dace.codegen.targets from dace.codegen.targets import cpp, fpga +from dace.codegen.targets.cpu import CPUCodeGen +from dace.codegen.targets.framecode import DaCeCodeGenerator from dace.codegen.common import codeblock_to_cpp, sym2cpp, update_persistent_desc from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute from dace.codegen.dispatcher import DefinedType, TargetDispatcher from dace.frontend import operations -from dace.sdfg import nodes, utils as sdutils from dace.sdfg import (ScopeSubgraphView, SDFG, scope_contains_scope, is_array_stream_view, NodeNotExpandedError, - dynamic_map_inputs) + dynamic_map_inputs, nodes, utils as sdutils) +from dace.sdfg import nodes, SDFG, SDFGState, ScopeSubgraphView, graph as gr from dace.sdfg.scope import is_devicelevel_gpu, is_in_scope +from dace.sdfg.state import ControlFlowRegion, SDFGState, 
StateSubgraphView +from dace.sdfg import graph, state, find_input_arraynode, find_output_arraynode +from dace.sdfg import nodes, SDFG, SDFGState, ScopeSubgraphView, graph as gr from dace.sdfg.validation import validate_memlet_data -from typing import TYPE_CHECKING, Optional, Tuple, Union -from dace.codegen.codeobject import CodeObject -from dace.codegen.targets.cpu import CPUCodeGen - -if TYPE_CHECKING: - from dace.codegen.targets.ipu import IPUCodeGen - from dace.codegen.targets.cpp import CPUCodeGen +from dace.sdfg.graph import MultiConnectorEdge +from dace.codegen.targets.sve import util as util +import copy +import functools +import itertools +import warnings @registry.autoregister_params(name='ipu') @@ -38,241 +39,387 @@ class IPUCodeGen(TargetCodeGenerator): title = 'IPU' language = 'cpp' - def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): - self._sdfg = sdfg - self._frame = frame_codegen - self._dispatcher = frame_codegen.dispatcher - self._global_sdfg = sdfg - self._generated_nodes = set() - self.calling_codegen = self + def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): + print("in IPUCodeGen") + self.has_generated_header = False + self.frame = frame_codegen + self.dispatcher = frame_codegen._dispatcher + self.cpu_codegen: dace.codegen.targets.CPUCodeGen = self.dispatcher.get_generic_node_dispatcher() + self._locals = cppunparse.CPPLocals() + # Scope depth (for defining locals) + self._ldepth = 0 + # Keep nested SDFG schedule when descending into it + self._toplevel_schedule = None - # Register dispatchers - # self._cpu_codegen = self._dispatcher.get_generic_node_dispatcher() - ipu_storage = [dtypes.StorageType.Register] - # Register additional dispatchers - # self._dispatcher.register_map_dispatcher(dtypes.ScheduleType.Sequential, self) - self._dispatcher.register_node_dispatcher(self) - self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) - 
self._dispatcher.register_array_dispatcher(ipu_storage, self) - # Register IPU copies (all internal pairs) - for src_storage, dst_storage in itertools.product(ipu_storage, ipu_storage): - self._dispatcher.register_copy_dispatcher(src_storage, dst_storage, None, self) + + # self.dispatcher.register_array_dispatcher(dtypes.StorageType.IPU_Tile_Local, self) + + # # Storage + # cpu_storage = [ + # dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.Register, + # dtypes.StorageType.IPU_Tile_Local + # ] + # # Dispatchers + # self.dispatcher.register_map_dispatcher(dace.ScheduleType.IPU_Map, self) + # self.dispatcher.register_node_dispatcher(self, self.is_ipu_map_scope) + self.dispatcher.register_node_dispatcher(self, self.is_node_tasklet) + # self.dispatcher.register_copy_dispatcher(dtypes.StorageType.Register, dtypes.StorageType.IPU_Tile_Local, None, func=self) + - def state_dispatch_predicate(self, sdfg, state): - return True - # __dace_init_ function is generated if True + # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + # self._dispatcher.register_array_dispatcher(ipu_storage, self) # allocate_array/deallocate_array + # for src_storage, dst_storage in itertools.product(ipu_storage, ipu_storage): + # self._dispatcher.register_copy_dispatcher(src_storage, dst_storage, None, self) + # self._dispatcher.register_map_dispatcher(dace.ScheduleType.IPU, self) + + def get_generated_codeobjects(self): + res = super().get_generated_codeobjects() + return res + + # __dace_init_ function @property def has_initializer(self): - return True + return False - # __dace_exit_ function is generated if True + # __dace_exit_ function @property def has_finalizer(self): - return True + return False @staticmethod def cmake_options(): options = [] - - if Config.get("compiler", "ipu", "libs"): - options.append('-DCMAKE_SHARED_LINKER_FLAGS="{}"'.format(Config.get("compiler", "ipu", "libs"))) - + # if Config.get("compiler", "ipu", 
"libs"): + # options.append('-DCMAKE_SHARED_LINKER_FLAGS="{}"'.format(Config.get("compiler", "ipu", "libs"))) return options - - # This will generate the src/cuda/xyz.cu files and folders using "codeObjects" class. - # We don't need this now as we are mostly concerned about a single file codegen as of now. - def get_generated_codeobjects(self): - fileheader = CodeIOStream() - sdfg = self._global_sdfg - - # Adds - self._frame.generate_fileheader(self._global_sdfg, fileheader, 'poplar') - - # cuda/mpi seemed to be using this follow - params_comma = self._global_sdfg.init_signature(free_symbols=self._frame.free_symbols(self._global_sdfg)) - if params_comma: - params_comma = ', ' + params_comma - codelet_file_code = """ -// Copyright (c) 2018 Graphcore Ltd. All rights reserved. -// Copied from tut3_vertices from Poplar SDK tutorials - -#include - -class SumVertex : public poplar::Vertex { - public: - // Fields - poplar::Input> in; - poplar::Output out; - - // Compute function - bool compute() { - *out = 0; - for (const auto &v : in) { - *out += v; - } - return true; - } -}; -""" - - codeobj = CodeObject( - name=sdfg.name + '_codelets', - code=codelet_file_code, - language='cpp', - target=IPUCodeGen, - title='IPU', - linkable=False) - - # Fill in the list - return [codeobj] - + + def is_node_tasklet(self, sdfg, state, node): + if isinstance(node, nodes.Tasklet): + return True + return False + + """ if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes(NestedSDFG, Consume, Map, LibraryNode) + if node.schedule == dtypes.ScheduleType.Sequential: + return True + return False + """ ############################################################################################################ # IPU specific node/state generation ############################################################################################################ - # from cpu.py - def generate_node(self, sdfg:SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node:nodes.Node, 
function_stream: CodeIOStream, callsite_stream:CodeIOStream): + # def copy_memory( + # self, + # sdfg: SDFG, + # cfg: ControlFlowRegion, + # dfg: StateSubgraphView, + # state_id: int, + # src_node: Union[nodes.Tasklet, nodes.AccessNode], + # dst_node: Union[nodes.Tasklet, nodes.AccessNode], + # edge: MultiConnectorEdge, + # function_stream: CodeIOStream, + # callsite_stream: CodeIOStream, + # ) -> None: + # return self.cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream) + # return super().copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream) + + # def declare_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, + # nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream) -> None: + # self.cpu_codegen.declare_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream) + + # def allocate_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, + # nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream, + # allocation_stream: CodeIOStream) -> None: + + # # if user provided this storage type, then we dump what they said. 
+ # if nodedesc.storage == dtypes.StorageType.IPU_Tile_Local: + # name = node.data + # size = nodedesc.total_size + # ipu_type = "FLOAT" + # self.dispatcher.defined_vars.add(name, DefinedType.Scalar, ipu_type) + # declaration_stream.write(f'_state->graph.addVariable({ipu_type}, [{size}], {name});', cfg, state_id, node) + # return + + # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream, + # allocation_stream) - self._dispatcher.dispatch_allocate(sdfg, cfg, dfg, state_id, node, node.desc(sdfg), function_stream, callsite_stream) - - if isinstance(node, nodes.NestedSDFG): - # Dynamically obtain node generator according to class name - try: - gen = getattr(self, "_generate_" + type(node).__name__) - except AttributeError: - if isinstance(node, nodes.LibraryNode): - raise NodeNotExpandedError(sdfg, state_id, dfg.node_id(node)) - raise - # _generate_Tasklet() example - - gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - # Mark node as "generated" - self._generated_nodes.add(node) - # self._locals.clear_scope(self._ldepth + 1) + # def deallocate_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, + # node: nodes.Node, nodedesc: data.Data, function_stream: CodeIOStream, + # callsite_stream: CodeIOStream) -> None: + # # unless any cpu allocations no need for IPUs + # pass + # # return self.cpu_codegen.deallocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, + # # callsite_stream) + + # def allocate_array(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, + # node: nodes.AccessNode, nodedesc: data.Array, function_stream: CodeIOStream, + # declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + # # Make sure the codegen includes the appropriate header files + # self.add_header(function_stream) + + # name = node.data + # print("ALLOCATE ARRAY - ", name) + # # # Based on the hardware, the total size must be 
16^2 + # # assert nodedesc.total_size == 16 * 16 + # # # Majority is detected by the strides of the data + # # maj = 'row' if nodedesc.strides[-1] == 1 else 'col' + + # # Write a fragment based on the storage type + # if nodedesc.storage == dace.StorageType.TensorCore_Accumulator: + # ctype = 'wmma::fragment' + # declaration_stream.write(f'{ctype} {name};', cfg, state_id, node) + # # else: + # # ctype = 'wmma::fragment'.format( + # # mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj) + # # declaration_stream.write(f'{ctype} {name};', cfg, state_id, node) - def generate_state(self, - sdfg:SDFG, - cfg: ControlFlowRegion, - state: SDFGState, - function_stream: CodeIOStream, - callsite_stream:CodeIOStream, - generate_state_footer:bool = True): - - self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) - - def generate_scope(self, - sdfg: SDFG, - cfg: ControlFlowRegion, - dfg_scope: ScopeSubgraphView, - state_id: int, - function_stream: CodeIOStream, - callsite_stream: CodeIOStream) -> None: - # Get the first entry node of Map - entry_node = dfg_scope.source_nodes()[0] - - # function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', cfg, state_id, entry_node) - callsite_stream.write('{', cfg, state_id, entry_node) - - # cpp.presynchronize_streams(sdfg, cfg, dfg_scope, state_id, entry_node, callsite_stream) #TODO: add some other function of own. - # Should we ? 
- self.generate_node(sdfg, cfg, dfg_scope, state_id, entry_node, function_stream, callsite_stream) - # generated nested subgraphs - self._dispatcher.dispatch_subgraph(sdfg, - cfg, - dfg_scope, - state_id, - function_stream, - callsite_stream, - skip_entry_node=True) - - def declare_array(self, - sdfg: SDFG, - cfg: ControlFlowRegion, - dfg: StateSubgraphView, - state_id: int, - node: nodes.Node, - nodedesc: data.Data, - function_stream: CodeIOStream, - declaration_stream: CodeIOStream) -> None: - print("IN DECLARE_ARRAY") - fsymbols = self._frame.symbols_and_constants(sdfg) - # NOTE: `dfg` (state) will be None iff `nodedesc` is non-free symbol dependent - # (see `DaCeCodeGenerator.determine_allocation_lifetime` in `dace.codegen.targets.framecode`). - # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if - # `nodedesc` is a View and `dfg` is None. - if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): - raise NotImplementedError("The declare_array method should only be used for variables " - "that must have their declaration and allocation separate.") - - name = node.root_data - ptrname = cpp.ptr(name, nodedesc, sdfg, self._frame) - - if nodedesc.transient is False: - return + # # # Add the ctype to defined_vars so that the codegen can properly pass + # # # fragments to functions as an object reference. + # self._dispatcher.defined_vars.add(name, DefinedType.Object, ctype) + # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, + # allocation_stream) + + def generate_node(self, sdfg: SDFG, cfg: state.ControlFlowRegion, state: SDFGState, state_id: int, node: nodes.Node, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): + """(TASKLET only) + 0. Declarations + 1. Generate pre tasklet + 2. Generate tasklet code + 3. Generate post tasklet + 4. 
Writes + """ + inner_stream, codegen = self.declarations(cfg, state_id, node, function_stream) + self.dispatcher.defined_vars.enter_scope(node) + ############################################################################################################ + self.pre_tasklet(sdfg, cfg, state, state_id, node, function_stream, callsite_stream, inner_stream, codegen) + self.tasklet(sdfg, cfg, state, state_id, node, function_stream, inner_stream) + after_memlets_stream = self.post_tasklet(sdfg, cfg, state, state_id, node, function_stream, inner_stream, codegen) + ############################################################################################################ + callsite_stream.write('{', cfg, state_id, node) + callsite_stream.write(inner_stream.getvalue(), cfg, state_id, node) + callsite_stream.write(after_memlets_stream.getvalue()) + callsite_stream.write('}', cfg, state_id, node) + self._locals.clear_scope(self._ldepth + 1) + self.dispatcher.defined_vars.exit_scope(node) + + def declarations(self, cfg, state_id, node, function_stream): + self.add_header(function_stream) + inner_stream = CodeIOStream() + state_dfg: SDFGState = cfg.nodes()[state_id] + codegen = self.cpu_codegen or self + return inner_stream,codegen + + def post_tasklet(self, sdfg, cfg, state, state_id, node, function_stream, inner_stream, codegen): + after_memlets_stream = CodeIOStream() + codegen.generate_tasklet_postamble(sdfg, cfg, state, state_id, node, function_stream, inner_stream, + after_memlets_stream) + # Process outgoing memlets + codegen.process_out_memlets(sdfg, cfg, state_id, node, state, self.dispatcher, inner_stream, True, function_stream) + return after_memlets_stream + + def tasklet(self, sdfg, cfg, state, state_id, node, function_stream, inner_stream): + inner_stream.write("\n ///////////////////\n", cfg, state_id, node) + # Currently cpu + self.unparse_ipu_tasklet(sdfg, cfg, state_id, state, node, function_stream, inner_stream, self._locals, + self._ldepth, 
self._toplevel_schedule) + inner_stream.write(" ///////////////////\n\n", cfg, state_id, node) + + def pre_tasklet(self, sdfg, cfg, state, state_id, node, function_stream, callsite_stream, inner_stream, codegen): + after_memlets_stream = CodeIOStream() + codegen.generate_tasklet_preamble(sdfg, cfg, state, state_id, node, function_stream, callsite_stream, + after_memlets_stream) + # SOME VARIABLE DECLARATIONS + # post-memlet tasklet-preamble code + + callsite_stream.write(after_memlets_stream.getvalue()) + self.add_pre_tasklet_declarations(sdfg, cfg, state_id, state, node, function_stream, inner_stream) - # Check if array is already declared - if self._dispatcher.declared_arrays.has(ptrname): - return + def unparse_ipu_tasklet(self, sdfg, cfg, state_id, dfg, node, function_stream, inner_stream, locals, ldepth, + toplevel_schedule): + # Change it later to IPU specific + self.cpu_codegen.unparse_tasklet(sdfg, cfg, state_id, dfg, node, function_stream, inner_stream, locals, ldepth, + toplevel_schedule) - # Compute array size - arrsize = nodedesc.total_size - if not isinstance(nodedesc.dtype, dtypes.opaque): - arrsize_bytes = arrsize * nodedesc.dtype.bytes - - if (nodedesc.storage == dtypes.StorageType.Register): - ctypedef = dtypes.pointer(nodedesc.dtype).ctype - declaration_stream.write(f'{nodedesc.dtype.ctype} *{name} = nullptr;\n', cfg, state_id, node) - #Tensor c1 = graph.addConstant(FLOAT, {4}, {1.0, 1.5, 2.0, 2.5}); - declaration_stream.write(f'{nodedesc.dtype.ctype} {name}_const = graph.addConstant<{nodedesc.dtype.ctype}>({nodedesc.dtype.ctype.capitalize}, {arrsize}, {nodedesc.ctype}({nodedesc.dtype.ctype}));\n', cfg, state_id, node) - self._dispatcher.declared_arrays.add(name, DefinedType.Pointer, ctypedef) - return - else: - raise NotImplementedError("Unimplemented storage type " + str(nodedesc.storage)) - - -#### Helpers - def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label, state_struct=True): - # prepend = [] - # if 
state_struct: - # prepend = ['__state'] - # fsyms = node.sdfg.used_symbols(all_symbols=False, keep_defined_in_mapping=True) - # args = ', '.join(prepend + [argval for _, _, argval in memlet_references] + [ - # cpp.sym2cpp(symval) for symname, symval in sorted(node.symbol_mapping.items()) - # if symname in fsyms and symname not in sdfg.constants - # ]) - # return f'{sdfg_label}({args});' - args = '' - return f'{sdfg_label}({args});' #TODO: add args later - -#### Node Generators(What node to generate) - callback from generate_node() - def _generate_NestedSDFG( - self, - sdfg: SDFG, - cfg: ControlFlowRegion, - dfg: ScopeSubgraphView, - state_id: int, - node: nodes.NestedSDFG, - function_stream: CodeIOStream, - callsite_stream: CodeIOStream, - ): - state_dfg = cfg.nodes()[state_id] - # Emit nested SDFG as a separate function - nested_stream = CodeIOStream() - nested_global_stream = CodeIOStream() - - # unique name generation of function - sdfg_label = "%s_%d_%d_%d" % (node.sdfg.name, sdfg.cfg_id, state_id, dfg.node_id(node)) + def add_pre_tasklet_declarations(self, sdfg, cfg, state_id, state, node, function_stream, inner_stream): - # Generate function call - codegen = self.calling_codegen - memlet_references = None # TODO: add memlet references later - callsite_stream.write(codegen.generate_nsdfg_call(sdfg, cfg, state_dfg, node, memlet_references, - sdfg_label), - cfg, state_id, node) - # callsite_stream.write(sdfg_label, cfg, state_id, node) + arrays = set() + for edge in state.in_edges(node): + u = edge.src + memlet = edge.data + src_node = state.memlet_path(edge)[0].src + + if edge.dst_conn: # Not (None or "") + if edge.dst_conn in arrays: # Disallow duplicates + raise SyntaxError("Duplicates found in memlets") + ctype = node.in_connectors[edge.dst_conn].ctype + # Special case: code->code + if isinstance(src_node, nodes.CodeNode): + shared_data_name = edge.data.data + if not shared_data_name: + # Very unique name. 
TODO: Make more intuitive + shared_data_name = '__dace_%d_%d_%d_%d_%s' % (cfg.cfg_id, state_id, state.node_id(src_node), + state.node_id(node), edge.src_conn) + + # Read variable from shared storage + defined_type, _ = self._dispatcher.defined_vars.get(shared_data_name) + if defined_type in (DefinedType.Scalar, DefinedType.Pointer): + assign_str = (f"const {ctype} {edge.dst_conn} = {shared_data_name};") + else: + assign_str = (f"const {ctype} &{edge.dst_conn} = {shared_data_name};") + inner_stream.write(assign_str, cfg, state_id, [edge.src, edge.dst]) + self._dispatcher.defined_vars.add(edge.dst_conn, defined_type, f"const {ctype}") + + else: + self.dispatcher.dispatch_copy( + src_node, + node, + edge, + sdfg, + cfg, + state, + state_id, + function_stream, + inner_stream, + ) + + # Also define variables in the C++ unparser scope + self._locals.define(edge.dst_conn, -1, self._ldepth + 1, ctype) + arrays.add(edge.dst_conn) + + # def generate_state(self, + # sdfg:SDFG, + # cfg: ControlFlowRegion, + # state: SDFGState, + # function_stream: CodeIOStream, + # callsite_stream:CodeIOStream, + # generate_state_footer:bool = True): + # debug_print_self(self) + # self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream) + + # def declare_array(self, + # sdfg: SDFG, + # cfg: ControlFlowRegion, + # dfg: StateSubgraphView, + # state_id: int, + # node: nodes.Node, + # nodedesc: data.Data, + # function_stream: CodeIOStream, + # declaration_stream: CodeIOStream) -> None: + # print("IN DECLARE_ARRAY") + # fsymbols = self._frame.symbols_and_constants(sdfg) + # # NOTE: `dfg` (state) will be None iff `nodedesc` is non-free symbol dependent + # # (see `DaCeCodeGenerator.determine_allocation_lifetime` in `dace.codegen.targets.framecode`). + # # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if + # # `nodedesc` is a View and `dfg` is None. 
+ # if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): + # raise NotImplementedError("The declare_array method should only be used for variables " + # "that must have their declaration and allocation separate.") + + # name = node.root_data + # ptrname = cpp.ptr(name, nodedesc, sdfg, self._frame) + + # if nodedesc.transient is False: + # return + + # # Check if array is already declared + # if self._dispatcher.declared_arrays.has(ptrname): + # return + + # # Compute array size + # arrsize = nodedesc.total_size + # if not isinstance(nodedesc.dtype, dtypes.opaque): + # arrsize_bytes = arrsize * nodedesc.dtype.bytes + + # if (nodedesc.storage == dtypes.StorageType.Register): + # ctypedef = dtypes.pointer(nodedesc.dtype).ctype + # declaration_stream.write(f'{nodedesc.dtype.ctype} *{name} = nullptr;\n', cfg, state_id, node) + # #Tensor c1 = graph.addConstant(FLOAT, {4}, {1.0, 1.5, 2.0, 2.5}); + # declaration_stream.write(f'{nodedesc.dtype.ctype} {name}_const = graph.addConstant<{nodedesc.dtype.ctype}>({nodedesc.dtype.ctype.capitalize}, {arrsize}, {nodedesc.ctype}({nodedesc.dtype.ctype}));\n', cfg, state_id, node) + # self._dispatcher.declared_arrays.add(name, DefinedType.Pointer, ctypedef) + # return + # else: + # raise NotImplementedError("Unimplemented storage type " + str(nodedesc.storage)) +############################################################################################################ +# #### Helpers + def add_header(self, function_stream: CodeIOStream): + if self.has_generated_header: + return + self.has_generated_header = True + + # headers + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") 
+ function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + function_stream.write("#include \n") + # namespace + function_stream.write(f'using namespace poplar; \n') + function_stream.write(f'using namespace poplar::program; \n') + + # def debug_print_self(self): + # print("IN GENERATE_STATE") + # # print below ones as well + # print("TargetDispatcher:", self._dispatcher) + # print("init_code", self._frame._initcode.getvalue()) + # print("exit_code", self._frame._exitcode.getvalue()) + # print("Len env:", len(self._frame.environments)) + # for _x in self._frame.statestruct: + # print("statestruct:", _x) + # print("environments:", self._frame.environments) + # print("targets:", self._frame.targets) + # print("to_allocate:", self._frame.to_allocate) + # print("where_allocated:", self._frame.where_allocated) + # print("fsyms:", self._frame.fsyms) + # print("_symbols_and_constants:", self._frame._symbols_and_constants) + # print("arglist:", self._frame.arglist) + # print ("DONE") + # print("DISPATCHER Data") + # print ("used_env", self._dispatcher.used_environments) + # print ("used_targets", self._frame.dispatcher.used_targets) + # print("DONE") + # ####### + # print("TargetCodeGenerator:", self) + # print("language", self.language) + # # print("TargetDispatcher:", self._dispatcher.used_targets) + + # def generate_scope(self, + # sdfg: SDFG, + # cfg: ControlFlowRegion, + # dfg_scope: ScopeSubgraphView, + # state_id: int, + # function_stream: CodeIOStream, + # callsite_stream: CodeIOStream) -> None: + # # Get the first entry node of Map + # entry_node = dfg_scope.source_nodes()[0] + + # # function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', cfg, state_id, entry_node) + # callsite_stream.write('{', cfg, state_id, entry_node) + + # # cpp.presynchronize_streams(sdfg, cfg, 
dfg_scope, state_id, entry_node, callsite_stream) #TODO: add some other function of own. + # # Should we ? + # # self.generate_node(sdfg, cfg, dfg_scope, state_id, entry_node, function_stream, callsite_stream) + # # generated nested subgraphs + # self._dispatcher.dispatch_subgraph(sdfg, + # cfg, + # dfg_scope, + # state_id, + # function_stream, + # callsite_stream, + # skip_entry_node=True) # def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSubgraphView, state_id: int, # function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -291,7 +438,7 @@ def _generate_NestedSDFG( # symtypes = map_header.new_symbols(sdfg, state, state.symbols_defined_at(map_header)) - #$$$$ First dace::copy() + # #$$$$ First dace::copy() # for var, r in zip(map_header.map.params, map_header.map.range): # begin, end, skip = r @@ -302,5 +449,48 @@ def _generate_NestedSDFG( # cppunparse.pyexpr2cpp(symbolic.symstr(skip, cpp_mode=True))), cfg, state_id, map_header) # self._frame.allocate_arrays_in_scope(sdfg, cfg, map_header, function_stream, callsite_stream) - # subgraphs_scope_call + + # This will generate the src/cuda/xyz.cu files and folders using "codeObjects" class. + # We don't need this now as we are mostly concerned about a single file codegen as of now. + # def get_generated_codeobjects(self): + # fileheader = CodeIOStream() + # sdfg = self._global_sdfg + + # # cuda/mpi seemed to be using this follow + # params_comma = self._global_sdfg.init_signature(free_symbols=self._frame.free_symbols(self._global_sdfg)) + # if params_comma: + # params_comma = ', ' + params_comma + # codelet_file_code = """ + # // Copyright (c) 2018 Graphcore Ltd. All rights reserved. 
+ # // Copied from tut3_vertices from Poplar SDK tutorials + + # #include + + # class SumVertex : public poplar::Vertex { + # public: + # // Fields + # poplar::Input> in; + # poplar::Output out; + + # // Compute function + # bool compute() { + # *out = 0; + # for (const auto &v : in) { + # *out += v; + # } + # return true; + # } + # }; + # """ + + # codeobj = CodeObject( + # name=sdfg.name + '_codelets', + # code=codelet_file_code, + # language='cpp', + # target=IPUCodeGen, + # title='IPU', + # linkable=False) + + # # Fill in the list + # return [codeobj] \ No newline at end of file From 8416d823ed6e4e3a5f10869d3a5e34db40e0ca7b Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Tue, 13 Aug 2024 11:39:27 -0600 Subject: [PATCH 31/77] Add IPU in dtypes.py --- dace/dtypes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index cb55e24966..e1759f6b7c 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -19,7 +19,7 @@ class DeviceType(aenum.AutoNumberEnum): CPU = () #: Multi-core CPU GPU = () #: GPU (AMD or NVIDIA) - # IPU = () #: IPU (Graphcore) + IPU = () #: IPU (Graphcore) FPGA = () #: FPGA (Intel or Xilinx) Snitch = () #: Compute Cluster (RISC-V) @@ -45,7 +45,7 @@ class StorageType(aenum.AutoNumberEnum): Snitch_TCDM = () #: Cluster-private memory Snitch_L2 = () #: External memory Snitch_SSR = () #: Memory accessed by SSR streamer - # IPU_Tile_Local = () #: IPU Tile-local memory + IPU_Tile_Local = () #: IPU Tile-local memory @undefined_safe_enum @@ -80,7 +80,7 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch = () Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping - # IPU_Map = () #: IPU (Graphcore) + IPU_Map = () #: IPU (Graphcore) # A subset of GPU schedule types From 6a254c312ff1d92e9408f2d2afa9cc9b34c3caf2 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Tue, 13 Aug 2024 11:40:29 -0600 Subject: [PATCH 32/77] Use IPU from StorageType in sdfg.add_scalar --- 
graphcore_dace/handcrafted_sdfg_scalar_add.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index 3ef2260c83..4f0b63ebe0 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -118,7 +118,7 @@ def scalar_add(): # sdfg.add_array('A', [1], dace.float64) # sdfg.add_array('B', [1], dace.float64) # sdfg.add_array('C', [1], dace.float64) - sdfg.add_scalar("A_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) + sdfg.add_scalar("A_scalar", dace.float64, storage=dace.StorageType.IPU_Tile_Local, transient=False) sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) sdfg.add_constant('constant', 1) From 3d6e96f54c8197a962dfb0c50f9e15d0d9193d2d Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Tue, 13 Aug 2024 11:49:31 -0600 Subject: [PATCH 33/77] Revert "Add IPU in dtypes.py" This reverts commit 8416d823ed6e4e3a5f10869d3a5e34db40e0ca7b. 
--- dace/dtypes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index e1759f6b7c..cb55e24966 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -19,7 +19,7 @@ class DeviceType(aenum.AutoNumberEnum): CPU = () #: Multi-core CPU GPU = () #: GPU (AMD or NVIDIA) - IPU = () #: IPU (Graphcore) + # IPU = () #: IPU (Graphcore) FPGA = () #: FPGA (Intel or Xilinx) Snitch = () #: Compute Cluster (RISC-V) @@ -45,7 +45,7 @@ class StorageType(aenum.AutoNumberEnum): Snitch_TCDM = () #: Cluster-private memory Snitch_L2 = () #: External memory Snitch_SSR = () #: Memory accessed by SSR streamer - IPU_Tile_Local = () #: IPU Tile-local memory + # IPU_Tile_Local = () #: IPU Tile-local memory @undefined_safe_enum @@ -80,7 +80,7 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch = () Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping - IPU_Map = () #: IPU (Graphcore) + # IPU_Map = () #: IPU (Graphcore) # A subset of GPU schedule types From 6efc81e01121b0164af25c8405e4fd95e2350c37 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Tue, 13 Aug 2024 11:49:53 -0600 Subject: [PATCH 34/77] Revert "Use IPU from StorageType in sdfg.add_scalar" This reverts commit 6a254c312ff1d92e9408f2d2afa9cc9b34c3caf2. 
--- graphcore_dace/handcrafted_sdfg_scalar_add.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index 4f0b63ebe0..3ef2260c83 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -118,7 +118,7 @@ def scalar_add(): # sdfg.add_array('A', [1], dace.float64) # sdfg.add_array('B', [1], dace.float64) # sdfg.add_array('C', [1], dace.float64) - sdfg.add_scalar("A_scalar", dace.float64, storage=dace.StorageType.IPU_Tile_Local, transient=False) + sdfg.add_scalar("A_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) sdfg.add_constant('constant', 1) From cc02d9bd170fa95721d134885a5b4fefff4b3abd Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Tue, 13 Aug 2024 13:15:41 -0600 Subject: [PATCH 35/77] Replace pre_tasklet with generate_read, former comes from cpu.py later is from sve/codegen.py the later is much readable and simpler to understand --- dace/codegen/targets/ipu.py | 67 ++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index e092a67538..ecf1c14f3d 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -191,7 +191,9 @@ def generate_node(self, sdfg: SDFG, cfg: state.ControlFlowRegion, state: SDFGSta inner_stream, codegen = self.declarations(cfg, state_id, node, function_stream) self.dispatcher.defined_vars.enter_scope(node) ############################################################################################################ - self.pre_tasklet(sdfg, cfg, state, state_id, node, function_stream, callsite_stream, inner_stream, codegen) + # self.pre_tasklet(sdfg, 
cfg, state, state_id, node, function_stream, callsite_stream, inner_stream, codegen) + for edge in state.in_edges(node): + self.generate_read(sdfg, state, edge, inner_stream) self.tasklet(sdfg, cfg, state, state_id, node, function_stream, inner_stream) after_memlets_stream = self.post_tasklet(sdfg, cfg, state, state_id, node, function_stream, inner_stream, codegen) ############################################################################################################ @@ -285,6 +287,69 @@ def add_pre_tasklet_declarations(self, sdfg, cfg, state_id, state, node, functio # Also define variables in the C++ unparser scope self._locals.define(edge.dst_conn, -1, self._ldepth + 1, ctype) arrays.add(edge.dst_conn) + + def generate_read(self, sdfg: SDFG, state: SDFGState, edge: graph.MultiConnectorEdge[mm.Memlet], + code: CodeIOStream): + """ + Responsible for generating code for reads into a Tasklet, given the ingoing edge. + """ + if edge.dst_conn is None: + return + src_node = state.memlet_path(edge)[0].src + dst_type = edge.dst.in_connectors[edge.dst_conn] + dst_name = edge.dst_conn + if isinstance(src_node, nodes.Tasklet): + ################## + # Code->Code edges + src_type = edge.src.out_connectors[edge.src_conn] + if util.is_vector(src_type) and util.is_vector(dst_type): + # Directly read from shared vector register + code.write(f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = {edge.data.data};') + elif util.is_scalar(src_type) and util.is_scalar(dst_type): + # Directly read from shared scalar register + code.write(f'{dst_type} {dst_name} = {edge.data.data};') + elif util.is_scalar(src_type) and util.is_vector(dst_type): + # Scalar broadcast from shared scalar register + code.write( + f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({edge.data.data});' + ) + else: + raise util.NotSupportedError('Unsupported Code->Code edge') + elif isinstance(src_node, nodes.AccessNode): + ################## + # Read from AccessNode 
+ desc = src_node.desc(sdfg) + if isinstance(desc, data.Array): + # Copy from array + if util.is_pointer(dst_type): + ################## + # Pointer reference + code.write( + f'{dst_type} {dst_name} = {cpp.cpp_ptr_expr(sdfg, edge.data, None, codegen=self.frame)};') + elif util.is_vector(dst_type): + raise util.NotSupportedError('Unsupported read from array which is vector type, util.is_vector()') + else: + ################## + # Scalar read from array + code.write(f'{dst_type} {dst_name} = {cpp.cpp_array_expr(sdfg, edge.data, codegen=self.frame)};') + elif isinstance(desc, data.Scalar): + # Refer to shared variable + src_type = desc.dtype + if util.is_vector(src_type) and util.is_vector(dst_type): + # Directly read from shared vector register + code.write(f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = {edge.data.data};') + elif util.is_scalar(src_type) and util.is_scalar(dst_type): + # Directly read from shared scalar register + code.write(f'{dst_type} {dst_name} = {edge.data.data};') + elif util.is_scalar(src_type) and util.is_vector(dst_type): + # Scalar broadcast from shared scalar register + code.write( + f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({edge.data.data});' + ) + else: + raise util.NotSupportedError('Unsupported Scalar->Code edge') + else: + raise util.NotSupportedError('Only copy from Tasklets and AccessNodes is supported') # def generate_state(self, # sdfg:SDFG, From 03560054a0a995f6f665dc59225c3d76eabf62cd Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 23 Aug 2024 13:37:00 -0600 Subject: [PATCH 36/77] create gpu_vector_add and cpu_* version --- graphcore_dace/handcrafted_sdfg_scalar_add.py | 69 +++++++++++++++++-- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index 3ef2260c83..fd89f31771 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ 
b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -110,8 +110,67 @@ def vector_add(): sdfg(A, B, C) print(C) -def scalar_add(): - sdfg = dace.SDFG('scalar_add') +def gpu_scalar_add(): + sdfg = dace.SDFG('gpu_scalar_add') + #########GLOBAL VARIABLES######### + # # data(vector add) + + # sdfg.add_array('A', [1], dace.float64) + # sdfg.add_array('B', [1], dace.float64) + # sdfg.add_array('C', [1], dace.float64) + sdfg.add_scalar("A_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + sdfg.add_constant('constant', 1) + + + ###########STATE, CFG, GLOBAL DATA################ + # # add state + state = sdfg.add_state('sum', is_start_block=True) + a = state.add_read('A_scalar') + b = state.add_read('B_scalar') + c = state.add_write('C_scalar') + + ###########DFG################ + # Add nodes + # # map + # add_entry, add_exit = state.add_map('add_map', dict(i='0:31'), schedule=dace.ScheduleType.Default) + # # tasklet + t1 = state.add_tasklet('add_scalar', {'_a', '_b'}, {'_c'}, '_c = _a + _b') + + # Add add_edge_pair(map mostly) + # state.add_edge_pair(add_entry, t1, a, dace.Memlet.simple(a, 'i')) + # state.add_edge_pair(add_entry, t1, b, dace.Memlet.simple(b, 'i')) + # state.add_edge_pair(add_exit, t1, c, dace.Memlet.simple(c, 'i')) + + # # Add memlet_path + # state.add_memlet_path(a, t1, dst_conn='_a', memlet=dace.Memlet(f"A[i]")) + # state.add_memlet_path(b, t1, dst_conn='_b', memlet=dace.Memlet(f"B[i]")) + # state.add_memlet_path(t1, c, src_conn='_c', memlet=dace.Memlet(f"C[i]")) + + # just add_edge + state.add_edge(a, None, t1, '_a', dace.Memlet(f"A_scalar")) + state.add_edge(b, None, t1, '_b', dace.Memlet(f"B_scalar")) + state.add_edge(t1, '_c', c, None, dace.Memlet(f"C_scalar")) + + + # state.add_edge(a, None, t1, '_a', dace.Memlet(f"A[0]")) + # 
state.add_edge(b, None, t1, '_b', dace.Memlet(f"B[0]")) + # state.add_edge(t1, '_c', c, None, dace.Memlet(f"C[0]")) + + ###########CODEGEN################ + A = np.random.rand(1) + B = np.random.rand(1) + C = np.zeros(1) + print(A) + print(B) + print("Before", C) + sdfg = sdfg(A, B, C) + sdfg.apply_transformations(GPUTransformSDFG) + print("After", C) + +def cpu_scalar_add(): + sdfg = dace.SDFG('cpu_scalar_add') #########GLOBAL VARIABLES######### # # data(vector add) @@ -152,6 +211,7 @@ def scalar_add(): state.add_edge(a, None, t1, '_a', dace.Memlet(f"A_scalar")) state.add_edge(b, None, t1, '_b', dace.Memlet(f"B_scalar")) state.add_edge(t1, '_c', c, None, dace.Memlet(f"C_scalar")) + # state.add_edge(a, None, t1, '_a', dace.Memlet(f"A[0]")) # state.add_edge(b, None, t1, '_b', dace.Memlet(f"B[0]")) @@ -164,10 +224,9 @@ def scalar_add(): print(A) print(B) print("Before", C) - sdfg(A, B, C) + sdfg = sdfg(A, B, C) print("After", C) - def only_state(): sdfg = dace.SDFG('only_state') sdfg.add_constant('constant_variable', 1) @@ -215,4 +274,4 @@ def add(A, B, C): # only_state() # print (C) # vector_add() - scalar_add() \ No newline at end of file + gpu_scalar_add() From 5596ef8a64d31a7c7130663daf7f6a7d1b76c96e Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 23 Aug 2024 20:23:15 -0600 Subject: [PATCH 37/77] Created the most simplest code for codegen of allocate_array and dispatch_copy. 
Allocate memory on device/host Copy memory from one to another Used Storage=IPU_Memory --- graphcore_dace/handcrafted_sdfg_scalar_add.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index fd89f31771..58c2a7fa5f 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -113,30 +113,30 @@ def vector_add(): def gpu_scalar_add(): sdfg = dace.SDFG('gpu_scalar_add') #########GLOBAL VARIABLES######### - # # data(vector add) - - # sdfg.add_array('A', [1], dace.float64) - # sdfg.add_array('B', [1], dace.float64) - # sdfg.add_array('C', [1], dace.float64) - sdfg.add_scalar("A_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) - sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) - sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) - sdfg.add_constant('constant', 1) + + sdfg.add_scalar("A_scalar_cpu", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("A_scalar_gpu", dace.float64, storage=dace.StorageType.GPU_Global, transient=True) + # sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + # sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + # sdfg.add_constant('constant', 1) - ###########STATE, CFG, GLOBAL DATA################ - # # add state + # ###########STATE, CFG, GLOBAL DATA################ + # # # add state state = sdfg.add_state('sum', is_start_block=True) - a = state.add_read('A_scalar') - b = state.add_read('B_scalar') - c = state.add_write('C_scalar') + a_cpu = state.add_read('A_scalar_cpu') + a_gpu = state.add_write('A_scalar_gpu') + + # b = state.add_read('B_scalar') + # c = state.add_write('C_scalar') + state.add_edge(a_cpu, None, 
a_gpu, None, dace.Memlet(f"A_scalar_cpu")) - ###########DFG################ - # Add nodes - # # map - # add_entry, add_exit = state.add_map('add_map', dict(i='0:31'), schedule=dace.ScheduleType.Default) - # # tasklet - t1 = state.add_tasklet('add_scalar', {'_a', '_b'}, {'_c'}, '_c = _a + _b') + # ###########DFG################ + # # Add nodes + # # # map + # # add_entry, add_exit = state.add_map('add_map', dict(i='0:31'), schedule=dace.ScheduleType.Default) + # # # tasklet + # t1 = state.add_tasklet('add_scalar', {'_a', '_b'}, {'_c'}, '_c = _a + _b') # Add add_edge_pair(map mostly) # state.add_edge_pair(add_entry, t1, a, dace.Memlet.simple(a, 'i')) @@ -148,10 +148,10 @@ def gpu_scalar_add(): # state.add_memlet_path(b, t1, dst_conn='_b', memlet=dace.Memlet(f"B[i]")) # state.add_memlet_path(t1, c, src_conn='_c', memlet=dace.Memlet(f"C[i]")) - # just add_edge - state.add_edge(a, None, t1, '_a', dace.Memlet(f"A_scalar")) - state.add_edge(b, None, t1, '_b', dace.Memlet(f"B_scalar")) - state.add_edge(t1, '_c', c, None, dace.Memlet(f"C_scalar")) + # # just add_edge + # state.add_edge(a, None, t1, '_a', dace.Memlet(f"A_scalar")) + # state.add_edge(b, None, t1, '_b', dace.Memlet(f"B_scalar")) + # state.add_edge(t1, '_c', c, None, dace.Memlet(f"C_scalar")) # state.add_edge(a, None, t1, '_a', dace.Memlet(f"A[0]")) @@ -165,7 +165,7 @@ def gpu_scalar_add(): print(A) print(B) print("Before", C) - sdfg = sdfg(A, B, C) + sdfg = sdfg(A) sdfg.apply_transformations(GPUTransformSDFG) print("After", C) From 2a424f34fbb63bf6ada227abc62fa5adfe21e41c Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 23 Aug 2024 20:24:53 -0600 Subject: [PATCH 38/77] Add IPU_Memory as new data type --- dace/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index cb55e24966..7aab0a61d8 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -45,7 +45,7 @@ class StorageType(aenum.AutoNumberEnum): Snitch_TCDM = () #: Cluster-private memory Snitch_L2 = 
() #: External memory Snitch_SSR = () #: Memory accessed by SSR streamer - # IPU_Tile_Local = () #: IPU Tile-local memory + IPU_Memory = () #: IPU Tile-local memory @undefined_safe_enum From cbfdc42416fed7f4888c048b890361ead66b22fd Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 23 Aug 2024 20:26:24 -0600 Subject: [PATCH 39/77] Added 1. register_array_dispatcher -> allocate_array/deallocate_array 2. register_copy_dispatcher -> copy_memory Notes: using ipu_storage vs gpu_storage breaks. links with last 2 commits --- dace/codegen/targets/ipu.py | 146 ++++++++++++++++++++++++++++++++---- 1 file changed, 130 insertions(+), 16 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index ecf1c14f3d..cc91d96bdf 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -5,6 +5,7 @@ from copy import deepcopy from dace import (data, dtypes, registry, memlet as mmlt, subsets, symbolic, Config) from dace import dtypes, memlet as mm +from dace import Memlet from dace.codegen import cppunparse, exceptions as cgx from dace.codegen.prettycode import CodeIOStream @@ -54,25 +55,25 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): # self.dispatcher.register_array_dispatcher(dtypes.StorageType.IPU_Tile_Local, self) - # # Storage - # cpu_storage = [ - # dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.Register, - # dtypes.StorageType.IPU_Tile_Local - # ] + # Storage + # ipu_storage = [dtypes.StorageType.IPU_Memory] + gpu_storage = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned, dtypes.StorageType.IPU_Memory] + + self.dispatcher.register_array_dispatcher(gpu_storage, self) # allocate_array/deallocate_array + for storage in gpu_storage: + for other_storage in gpu_storage: + self.dispatcher.register_copy_dispatcher(storage, other_storage, None, self) + self.dispatcher.register_copy_dispatcher(other_storage, storage, None, self) + 
+ # # Dispatchers # self.dispatcher.register_map_dispatcher(dace.ScheduleType.IPU_Map, self) # self.dispatcher.register_node_dispatcher(self, self.is_ipu_map_scope) - self.dispatcher.register_node_dispatcher(self, self.is_node_tasklet) - + # self.dispatcher.register_node_dispatcher(self, self.is_node_tasklet) # self.dispatcher.register_copy_dispatcher(dtypes.StorageType.Register, dtypes.StorageType.IPU_Tile_Local, None, func=self) - - - - # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) - # self._dispatcher.register_array_dispatcher(ipu_storage, self) # allocate_array/deallocate_array - # for src_storage, dst_storage in itertools.product(ipu_storage, ipu_storage): - # self._dispatcher.register_copy_dispatcher(src_storage, dst_storage, None, self) # self._dispatcher.register_map_dispatcher(dace.ScheduleType.IPU, self) + # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + def get_generated_codeobjects(self): res = super().get_generated_codeobjects() @@ -179,6 +180,119 @@ def is_node_tasklet(self, sdfg, state, node): # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, # allocation_stream) + def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: + self.add_header(function_stream) + dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) + + try: + self.dispatcher.defined_vars.get(dataname) + return + except KeyError: + pass # The variable was not defined, we can continue + + # Check if array is already declared + declared = False + try: + self.dispatcher.declared_arrays.get(dataname) + declared = True # Array was already declared in this or upper scopes + except KeyError: # Array not declared yet + pass + + if nodedesc.lifetime in 
(dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): + nodedesc = update_persistent_desc(nodedesc, sdfg) + + result_decl = StringIO() + result_alloc = StringIO() + arrsize = nodedesc.total_size + is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) + arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) + ctypedef = '%s *' % nodedesc.dtype.ctype + + # Different types of GPU arrays + if nodedesc.storage == dtypes.StorageType.GPU_Global: + if not declared: + result_decl.write('%s %s;\n' % (ctypedef, dataname)) + self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) + + if nodedesc.pool: + cudastream = getattr(node, '_cuda_stream', 'nullptr') + if cudastream != 'nullptr': + cudastream = f'__state->gpu_context->streams[{cudastream}]' + result_alloc.write( + f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {cudastream}));\n' + ) + self._emit_sync(result_alloc) + else: + # Strides are left to the user's discretion + result_alloc.write("malloc calls") + # result_alloc.write('DACE_GPU_CHECK(%sMalloc((void**)&%s, %s));\n' % + # (self.backend, dataname, arrsize_malloc)) + + if node.setzero: + result_alloc.write('DACE_GPU_CHECK(%sMemset(%s, 0, %s));\n' % (self.backend, dataname, arrsize_malloc)) + if isinstance(nodedesc, data.Array) and nodedesc.start_offset != 0: + result_alloc.write(f'{dataname} += {cpp.sym2cpp(nodedesc.start_offset)};\n') + elif nodedesc.storage == dtypes.StorageType.Register: + if is_dynamically_sized: + raise ValueError('Dynamic allocation of registers not allowed') + if nodedesc.start_offset != 0: + raise NotImplementedError('Start offset unsupported for registers') + szstr = ' = {0}' if node.setzero else '' + result_decl.write("%s %s[%s]%s;\n" % (nodedesc.dtype.ctype, dataname, sym2cpp(arrsize), szstr)) + self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) + elif nodedesc.storage == dtypes.StorageType.IPU_Memory: + 
result_decl.write("# #Tensor c1 = graph.addConstant(FLOAT, {4}, {1.0, 1.5, 2.0, 2.5});") # decl + result_alloc.write(" graph.setTileMapping(v1[i][j], i * 2 + j);") # Mapping on + else: + raise NotImplementedError("CUDA: Unimplemented storage type " + str(nodedesc.storage)) + + declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) + allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node) + + def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: + dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) + self.backend = 'cuda' # temporary hack + if nodedesc.storage == dtypes.StorageType.GPU_Global: + if not nodedesc.pool: # If pooled, will be freed somewhere else + callsite_stream.write('DACE_GPU_CHECK(%sFree(%s));\n' % (self.backend, dataname), cfg, state_id, node) + elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: + callsite_stream.write('DACE_GPU_CHECK(%sFreeHost(%s));\n' % (self.backend, dataname), cfg, state_id, node) + elif nodedesc.storage == dtypes.StorageType.GPU_Shared or \ + nodedesc.storage == dtypes.StorageType.Register: + pass # Do nothing + elif nodedesc.storage == dtypes.StorageType.IPU_Memory: + callsite_stream.write(" poplar::deallocate array") + else: + raise NotImplementedError + + def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + src_node: Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], + memlet: Memlet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + state = cfg.state(state_id) + if isinstance(src_node, nodes.Tasklet): + src_storage = dtypes.StorageType.Register + src_parent = state.entry_node(src_node) + dst_schedule = None if src_parent is None else src_parent.map.schedule + else: + src_storage = src_node.desc(sdfg).storage + 
+ if isinstance(dst_node, nodes.Tasklet): + dst_storage = dtypes.StorageType.Register + else: + dst_storage = dst_node.desc(sdfg).storage + + dst_parent = state.entry_node(dst_node) + dst_schedule = None if dst_parent is None else dst_parent.map.schedule + + callsite_stream.write("poplar::copy calls") + # # Emit actual copy + # self._emit_copy(state_id, src_node, src_storage, dst_node, dst_storage, dst_schedule, memlet, sdfg, cfg, dfg, + # callsite_stream) + def generate_node(self, sdfg: SDFG, cfg: state.ControlFlowRegion, state: SDFGState, state_id: int, node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream): """(TASKLET only) @@ -263,13 +377,13 @@ def add_pre_tasklet_declarations(self, sdfg, cfg, state_id, state, node, functio state.node_id(node), edge.src_conn) # Read variable from shared storage - defined_type, _ = self._dispatcher.defined_vars.get(shared_data_name) + defined_type, _ = self.dispatcher.defined_vars.get(shared_data_name) if defined_type in (DefinedType.Scalar, DefinedType.Pointer): assign_str = (f"const {ctype} {edge.dst_conn} = {shared_data_name};") else: assign_str = (f"const {ctype} &{edge.dst_conn} = {shared_data_name};") inner_stream.write(assign_str, cfg, state_id, [edge.src, edge.dst]) - self._dispatcher.defined_vars.add(edge.dst_conn, defined_type, f"const {ctype}") + self.dispatcher.defined_vars.add(edge.dst_conn, defined_type, f"const {ctype}") else: self.dispatcher.dispatch_copy( From 623ab3b1e9748e93a92b8e860020c333e99b1651 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sat, 24 Aug 2024 17:42:59 -0600 Subject: [PATCH 40/77] generate addVariable() API --- dace/codegen/targets/ipu.py | 131 ++++++++++++------ graphcore_dace/handcrafted_sdfg_scalar_add.py | 21 ++- 2 files changed, 107 insertions(+), 45 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index cc91d96bdf..eea0c13c44 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -179,6 +179,71 @@ 
def is_node_tasklet(self, sdfg, state, node): # self._dispatcher.defined_vars.add(name, DefinedType.Object, ctype) # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, # allocation_stream) +# def allocate_ipu_stream(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, +# node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, +# declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: +# dataname = node.data +# allocname = cpp.ptr(dataname, nodedesc, sdfg, self._frame) +# if nodedesc.storage == dtypes.StorageType.GPU_Global: +# fmtargs = { +# 'name': allocname, # TODO: Handle persistent streams +# 'allocname': allocname, +# 'type': nodedesc.dtype.ctype, +# 'is_pow2': sym2cpp(sympy.log(nodedesc.buffer_size, 2).is_Integer), +# 'location': '%s_%s_%s' % (cfg.cfg_id, state_id, dfg.node_id(node)) +# } + +# ctypedef = 'dace::GPUStream<{type}, {is_pow2}>'.format(**fmtargs) +# self._dispatcher.defined_vars.add(allocname, DefinedType.Stream, ctypedef) + +# if is_array_stream_view(sdfg, dfg, node): +# edges = dfg.out_edges(node) +# if len(edges) > 1: +# raise NotImplementedError("Cannot handle streams writing to multiple arrays.") + +# fmtargs['ptr'] = nodedesc.sink + ' + ' + cpp_array_expr( +# sdfg, edges[0].data, with_brackets=False, codegen=self._frame) + +# # Assuming 1D subset of sink/src +# # sym2cpp(edges[0].data.subset[-1]) +# fmtargs['size'] = sym2cpp(nodedesc.buffer_size) + +# # (important) Ensure GPU array is allocated before the stream +# datanode = dfg.out_edges(node)[0].dst +# sinkdesc = sdfg.arrays[datanode.data] +# self._dispatcher.dispatch_allocate(sdfg, cfg, dfg, state_id, datanode, sinkdesc, function_stream, +# allocation_stream) + +# function_stream.write( +# 'DACE_EXPORTED void __dace_alloc_{location}({type} *ptr, uint32_t size, dace::GPUStream<{type}, {is_pow2}>& result);' +# .format(**fmtargs), cfg, state_id, node) +# 
self._globalcode.write( +# """ +# DACE_EXPORTED void __dace_alloc_{location}({type} *ptr, uint32_t size, dace::GPUStream<{type}, {is_pow2}>& result); +# void __dace_alloc_{location}({type} *ptr, uint32_t size, dace::GPUStream<{type}, {is_pow2}>& result) {{ +# result = dace::AllocGPUArrayStreamView<{type}, {is_pow2}>(ptr, size); +# }}""".format(**fmtargs), cfg, state_id, node) +# declaration_stream.write('dace::GPUStream<{type}, {is_pow2}> {name};'.format(**fmtargs), cfg, state_id, +# node) +# allocation_stream.write('__dace_alloc_{location}({ptr}, {size}, {allocname});'.format(**fmtargs), cfg, +# state_id, node) +# else: +# fmtargs['size'] = sym2cpp(nodedesc.buffer_size) + +# function_stream.write( +# 'DACE_EXPORTED void __dace_alloc_{location}(uint32_t size, dace::GPUStream<{type}, {is_pow2}>& result);' +# .format(**fmtargs), cfg, state_id, node) +# self._globalcode.write( +# """ +# DACE_EXPORTED void __dace_alloc_{location}(uint32_t {size}, dace::GPUStream<{type}, {is_pow2}>& result); +# void __dace_alloc_{location}(uint32_t {size}, dace::GPUStream<{type}, {is_pow2}>& result) {{ +# result = dace::AllocGPUStream<{type}, {is_pow2}>({size}); +# }}""".format(**fmtargs), cfg, state_id, node) +# declaration_stream.write('dace::GPUStream<{type}, {is_pow2}> {name};'.format(**fmtargs), cfg, state_id, +# node) +# allocation_stream.write('__dace_alloc_{location}({size}, {allocname});'.format(**fmtargs), cfg, +# state_id, node) + def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, @@ -200,6 +265,21 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV except KeyError: # Array not declared yet pass + + # if isinstance(nodedesc, dace.data.Stream): + # return self.allocate_ipu_stream(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, + # allocation_stream) + + + #print nodedesc type + + #return 
self.allocate_poplar_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, + # allocation_stream) + # elif isinstance(nodedesc, dace.data.Scalar): + # return self.allocate_scalar(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, + # allocation_stream) + + if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) @@ -208,32 +288,15 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV arrsize = nodedesc.total_size is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) - ctypedef = '%s *' % nodedesc.dtype.ctype + ctypedef = nodedesc.dtype.ctype + shape = nodedesc.shape + # Different types of GPU arrays - if nodedesc.storage == dtypes.StorageType.GPU_Global: - if not declared: - result_decl.write('%s %s;\n' % (ctypedef, dataname)) - self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) - - if nodedesc.pool: - cudastream = getattr(node, '_cuda_stream', 'nullptr') - if cudastream != 'nullptr': - cudastream = f'__state->gpu_context->streams[{cudastream}]' - result_alloc.write( - f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {cudastream}));\n' - ) - self._emit_sync(result_alloc) - else: - # Strides are left to the user's discretion - result_alloc.write("malloc calls") - # result_alloc.write('DACE_GPU_CHECK(%sMalloc((void**)&%s, %s));\n' % - # (self.backend, dataname, arrsize_malloc)) - - if node.setzero: - result_alloc.write('DACE_GPU_CHECK(%sMemset(%s, 0, %s));\n' % (self.backend, dataname, arrsize_malloc)) - if isinstance(nodedesc, data.Array) and nodedesc.start_offset != 0: - result_alloc.write(f'{dataname} += {cpp.sym2cpp(nodedesc.start_offset)};\n') + if nodedesc.storage == dtypes.StorageType.IPU_Memory: + # Tensor c1 = graph.addConstant(FLOAT, 
{4}, {1.0, 1.5, 2.0, 2.5}); + result_alloc.write("Tensor %s = _state->graph.addVariable(%s, {%s});\n" % (dataname, nodedesc.dtype.ctype.capitalize(), sym2cpp(arrsize))) + self.dispatcher.defined_vars.add(dataname, DefinedType.ArrayInterface, ctypedef) elif nodedesc.storage == dtypes.StorageType.Register: if is_dynamically_sized: raise ValueError('Dynamic allocation of registers not allowed') @@ -242,11 +305,8 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV szstr = ' = {0}' if node.setzero else '' result_decl.write("%s %s[%s]%s;\n" % (nodedesc.dtype.ctype, dataname, sym2cpp(arrsize), szstr)) self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) - elif nodedesc.storage == dtypes.StorageType.IPU_Memory: - result_decl.write("# #Tensor c1 = graph.addConstant(FLOAT, {4}, {1.0, 1.5, 2.0, 2.5});") # decl - result_alloc.write(" graph.setTileMapping(v1[i][j], i * 2 + j);") # Mapping on else: - raise NotImplementedError("CUDA: Unimplemented storage type " + str(nodedesc.storage)) + raise NotImplementedError("IPU: Unimplemented storage type " + str(nodedesc.storage)) declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node) @@ -254,18 +314,9 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) - self.backend = 'cuda' # temporary hack - if nodedesc.storage == dtypes.StorageType.GPU_Global: - if not nodedesc.pool: # If pooled, will be freed somewhere else - callsite_stream.write('DACE_GPU_CHECK(%sFree(%s));\n' % (self.backend, dataname), cfg, state_id, node) - elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: - 
callsite_stream.write('DACE_GPU_CHECK(%sFreeHost(%s));\n' % (self.backend, dataname), cfg, state_id, node) - elif nodedesc.storage == dtypes.StorageType.GPU_Shared or \ - nodedesc.storage == dtypes.StorageType.Register: - pass # Do nothing - elif nodedesc.storage == dtypes.StorageType.IPU_Memory: - callsite_stream.write(" poplar::deallocate array") + if nodedesc.storage == dtypes.StorageType.IPU_Memory or \ + nodedesc.storage == dtypes.StorageType.Register: + pass # IPU variables are C++ objects and are automatically deallocated else: raise NotImplementedError diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index 58c2a7fa5f..76ff535e59 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -114,8 +114,10 @@ def gpu_scalar_add(): sdfg = dace.SDFG('gpu_scalar_add') #########GLOBAL VARIABLES######### - sdfg.add_scalar("A_scalar_cpu", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) - sdfg.add_scalar("A_scalar_gpu", dace.float64, storage=dace.StorageType.GPU_Global, transient=True) + sdfg.add_scalar("scalarNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_array("arrayNode", [10], dace.float64, storage=dace.StorageType.IPU_Memory, transient=False) + sdfg.add_stream("StreamNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=False) + sdfg.add_scalar("write_to_scalar", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) # sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) # sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) # sdfg.add_constant('constant', 1) @@ -124,12 +126,21 @@ def gpu_scalar_add(): # ###########STATE, CFG, GLOBAL DATA################ # # # add state state = sdfg.add_state('sum', is_start_block=True) - a_cpu = state.add_read('A_scalar_cpu') - a_gpu = 
state.add_write('A_scalar_gpu') + + scalar_read = state.add_read('scalarNode') + scalar_write = state.add_write('write_to_scalar') + array_ = state.add_read('arrayNode') + stream_ = state.add_read('StreamNode') + + + # b = state.add_read('B_scalar') # c = state.add_write('C_scalar') - state.add_edge(a_cpu, None, a_gpu, None, dace.Memlet(f"A_scalar_cpu")) + state.add_edge(scalar_read, None, scalar_write, None, dace.Memlet(f"scalarNode[0]")) + state.add_edge(array_, None, scalar_write, None, dace.Memlet(f"arrayNode[0]")) + state.add_edge(stream_, None, scalar_write, None, dace.Memlet(f"StreamNode[0]")) + # ###########DFG################ # # Add nodes From 6cd24ea3cec30559268bd5c91845d467f7752b22 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sun, 25 Aug 2024 11:57:12 -0600 Subject: [PATCH 41/77] Add support for IPU types. --- dace/codegen/targets/ipu.py | 9 +++--- dace/codegen/targets/ipu_files/ipu_utils.py | 25 +++++++++++++++ graphcore_dace/handcrafted_sdfg_scalar_add.py | 31 ++++++++++++++++++- 3 files changed, 60 insertions(+), 5 deletions(-) create mode 100644 dace/codegen/targets/ipu_files/ipu_utils.py diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index eea0c13c44..7267774841 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -26,7 +26,7 @@ from dace.sdfg import nodes, SDFG, SDFGState, ScopeSubgraphView, graph as gr from dace.sdfg.validation import validate_memlet_data from dace.sdfg.graph import MultiConnectorEdge -from dace.codegen.targets.sve import util as util +from dace.codegen.targets.ipu_files import ipu_utils as ipu_utils import copy import functools import itertools @@ -290,12 +290,13 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) ctypedef = nodedesc.dtype.ctype shape = nodedesc.shape - + print("Available keys in TYPE_TO_IPU:", ipu_utils.TYPE_TO_IPU.keys()) + print("Type of 
nodedesc.dtype.ctype:", nodedesc.dtype.ctype)
     # Different types of GPU arrays
     if nodedesc.storage == dtypes.StorageType.IPU_Memory:
-            # Tensor c1 = graph.addConstant(FLOAT, {4}, {1.0, 1.5, 2.0, 2.5});
-            result_alloc.write("Tensor %s = _state->graph.addVariable(%s, {%s});\n" % (dataname, nodedesc.dtype.ctype.capitalize(), sym2cpp(arrsize)))
+            # Tensor c1 = graph.addConstant(DOUBLE, {4});
+            result_alloc.write("Tensor %s = _state->graph.addVariable(%s, {%s});\n" % (dataname, ipu_utils.TYPE_TO_IPU[nodedesc.dtype], sym2cpp(arrsize)))
             self.dispatcher.defined_vars.add(dataname, DefinedType.ArrayInterface, ctypedef)
         elif nodedesc.storage == dtypes.StorageType.Register:
             if is_dynamically_sized:
diff --git a/dace/codegen/targets/ipu_files/ipu_utils.py b/dace/codegen/targets/ipu_files/ipu_utils.py
new file mode 100644
index 0000000000..3300ae1854
--- /dev/null
+++ b/dace/codegen/targets/ipu_files/ipu_utils.py
@@ -0,0 +1,25 @@
+
+"""
+Utils for the IPU target.
+"""
+
+import dace
+import dace.codegen.targets
+
+
+# Convert from DACE Types to IPU Types
+TYPE_TO_IPU = {
+    dace.bool: 'BOOL',
+    dace.int8: 'CHAR',
+    dace.int16: 'SHORT',
+    dace.int32: 'INT',
+    dace.int64: 'LONGLONG',  # LONG is not supported in IPU
+    dace.uint8: 'UNSIGNED_CHAR',
+    dace.uint16: 'UNSIGNED_SHORT',
+    dace.uint32: 'UNSIGNED_INT',
+    dace.uint64: 'UNSIGNED_LONGLONG',
+    dace.float16: 'HALF',
+    dace.float32: 'FLOAT',
+    dace.float64: 'DOUBLE',
+    dace.string: 'char*',  # Not sure if this is correct
+}
diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py
index 76ff535e59..8390d73b1b 100644
--- a/graphcore_dace/handcrafted_sdfg_scalar_add.py
+++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py
@@ -114,7 +114,18 @@ def gpu_scalar_add():
     sdfg = dace.SDFG('gpu_scalar_add')

     #########GLOBAL VARIABLES#########
-    sdfg.add_scalar("scalarNode", dace.float64, 
storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode1", dace.bool, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode2", dace.int32, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode3", dace.int64, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode4", dace.uint8, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode5", dace.uint64, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode6", dace.float16, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode7", dace.float32, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode8", dace.string, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode9", dace.int8, storage=dace.StorageType.IPU_Memory, transient=True) + + sdfg.add_array("arrayNode", [10], dace.float64, storage=dace.StorageType.IPU_Memory, transient=False) sdfg.add_stream("StreamNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=False) sdfg.add_scalar("write_to_scalar", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) @@ -128,6 +139,15 @@ def gpu_scalar_add(): state = sdfg.add_state('sum', is_start_block=True) scalar_read = state.add_read('scalarNode') + scalar_read1 = state.add_read('scalarNode1') + scalar_read2 = state.add_read('scalarNode2') + scalar_read3 = state.add_read('scalarNode3') + scalar_read4 = state.add_read('scalarNode4') + scalar_read5 = state.add_read('scalarNode5') + scalar_read6 = state.add_read('scalarNode6') + scalar_read7 = state.add_read('scalarNode7') + scalar_read8 = state.add_read('scalarNode8') + scalar_read9 = state.add_read('scalarNode9') scalar_write = state.add_write('write_to_scalar') array_ = state.add_read('arrayNode') stream_ = state.add_read('StreamNode') @@ -138,6 +158,15 @@ def gpu_scalar_add(): # b = 
state.add_read('B_scalar') # c = state.add_write('C_scalar') state.add_edge(scalar_read, None, scalar_write, None, dace.Memlet(f"scalarNode[0]")) + state.add_edge(scalar_read1, None, scalar_write, None, dace.Memlet(f"scalarNode1[0]")) + state.add_edge(scalar_read2, None, scalar_write, None, dace.Memlet(f"scalarNode2[0]")) + state.add_edge(scalar_read3, None, scalar_write, None, dace.Memlet(f"scalarNode3[0]")) + state.add_edge(scalar_read4, None, scalar_write, None, dace.Memlet(f"scalarNode4[0]")) + state.add_edge(scalar_read5, None, scalar_write, None, dace.Memlet(f"scalarNode5[0]")) + state.add_edge(scalar_read6, None, scalar_write, None, dace.Memlet(f"scalarNode6[0]")) + state.add_edge(scalar_read7, None, scalar_write, None, dace.Memlet(f"scalarNode7[0]")) + state.add_edge(scalar_read8, None, scalar_write, None, dace.Memlet(f"scalarNode8[0]")) + state.add_edge(scalar_read9, None, scalar_write, None, dace.Memlet(f"scalarNode9[0]")) state.add_edge(array_, None, scalar_write, None, dace.Memlet(f"arrayNode[0]")) state.add_edge(stream_, None, scalar_write, None, dace.Memlet(f"StreamNode[0]")) From e6610dae3afd42a879d7e6b1c99882a0fb5fafd5 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sun, 25 Aug 2024 14:05:33 -0600 Subject: [PATCH 42/77] Fix shape issue, next add streams support --- dace/codegen/codegen.py | 3 -- dace/codegen/targets/ipu.py | 46 ++++++++++--------- graphcore_dace/handcrafted_sdfg_scalar_add.py | 6 +-- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index 62d7adeb08..6a02a4a57d 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -246,9 +246,6 @@ def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]: ] # Create code objects for each target - print("Used targets:", used_targets) - print("Frame targets:", frame.targets) - print("Frame " + str(frame)) for tgt in used_targets: target_objects.extend(tgt.get_generated_codeobjects()) diff --git 
a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 7267774841..c6faebef0c 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -1,6 +1,7 @@ # import # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. from io import StringIO +import sympy from typing import TYPE_CHECKING, Optional, Tuple, Union from copy import deepcopy from dace import (data, dtypes, registry, memlet as mmlt, subsets, symbolic, Config) @@ -41,7 +42,6 @@ class IPUCodeGen(TargetCodeGenerator): language = 'cpp' def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): - print("in IPUCodeGen") self.has_generated_header = False self.frame = frame_codegen self.dispatcher = frame_codegen._dispatcher @@ -182,8 +182,9 @@ def is_node_tasklet(self, sdfg, state, node): # def allocate_ipu_stream(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, # node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, # declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: +# allocation_stream.write("// IPU Stream Allocation") # dataname = node.data -# allocname = cpp.ptr(dataname, nodedesc, sdfg, self._frame) +# allocname = cpp.ptr(dataname, nodedesc, sdfg, self.frame) # if nodedesc.storage == dtypes.StorageType.GPU_Global: # fmtargs = { # 'name': allocname, # TODO: Handle persistent streams @@ -249,6 +250,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: self.add_header(function_stream) + dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) try: @@ -269,16 +271,12 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV # if isinstance(nodedesc, dace.data.Stream): # return self.allocate_ipu_stream(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, 
declaration_stream, # allocation_stream) - - - #print nodedesc type - - #return self.allocate_poplar_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, - # allocation_stream) - # elif isinstance(nodedesc, dace.data.Scalar): - # return self.allocate_scalar(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, - # allocation_stream) - + if isinstance(nodedesc, dace.data.View): + return self._cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, + allocation_stream) + elif isinstance(nodedesc, dace.data.Reference): + return self._cpu_codegen.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream, + declaration_stream, allocation_stream) if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) @@ -287,17 +285,23 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV result_alloc = StringIO() arrsize = nodedesc.total_size is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) - arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) - ctypedef = nodedesc.dtype.ctype + #arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) + ctypedef = 'Tensor *' shape = nodedesc.shape - print("Available keys in TYPE_TO_IPU:", ipu_utils.TYPE_TO_IPU.keys()) - print("Type of nodedesc.dtype.ctype:", nodedesc.dtype.ctype) - # Different types of GPU arrays + # Different types of memories if nodedesc.storage == dtypes.StorageType.IPU_Memory: - # Tensor c1 = graph.addConstant(DOUBLE, {4}); - result_alloc.write("Tensor %s = _state->graph.addVariable(%s, {%s});\n" % (dataname, ipu_utils.TYPE_TO_IPU[nodedesc.dtype], sym2cpp(arrsize))) - self.dispatcher.defined_vars.add(dataname, DefinedType.ArrayInterface, ctypedef) + if not declared: + result_decl.write('%s %s;\n' % (ctypedef, dataname)) # Tensor *p; + 
self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) + + if nodedesc.pool: + raise NotImplementedError("Pool not implemented yet " + str(nodedesc.storage)) + else: + shape_poplar_format = ', '.join([str(sh) for sh in shape]) + result_alloc.write("%s = _state->graph.addVariable(%s, {%s});\n" % (dataname, ipu_utils.TYPE_TO_IPU[nodedesc.dtype], shape_poplar_format)) + + elif nodedesc.storage == dtypes.StorageType.Register: if is_dynamically_sized: raise ValueError('Dynamic allocation of registers not allowed') @@ -308,7 +312,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) else: raise NotImplementedError("IPU: Unimplemented storage type " + str(nodedesc.storage)) - + declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node) diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index 8390d73b1b..b8eaee0f81 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -126,8 +126,8 @@ def gpu_scalar_add(): sdfg.add_scalar("scalarNode9", dace.int8, storage=dace.StorageType.IPU_Memory, transient=True) - sdfg.add_array("arrayNode", [10], dace.float64, storage=dace.StorageType.IPU_Memory, transient=False) - sdfg.add_stream("StreamNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=False) + sdfg.add_array("arrayNode", [10, 10], dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_stream("StreamNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) sdfg.add_scalar("write_to_scalar", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) # sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) # sdfg.add_scalar("C_scalar", dace.float64, 
storage=dace.StorageType.GPU_Global, transient=False) @@ -167,7 +167,7 @@ def gpu_scalar_add(): state.add_edge(scalar_read7, None, scalar_write, None, dace.Memlet(f"scalarNode7[0]")) state.add_edge(scalar_read8, None, scalar_write, None, dace.Memlet(f"scalarNode8[0]")) state.add_edge(scalar_read9, None, scalar_write, None, dace.Memlet(f"scalarNode9[0]")) - state.add_edge(array_, None, scalar_write, None, dace.Memlet(f"arrayNode[0]")) + state.add_edge(array_, None, scalar_write, None, dace.Memlet(f"arrayNode[0, 0]")) state.add_edge(stream_, None, scalar_write, None, dace.Memlet(f"StreamNode[0]")) From a4261412cc16d6c9cf169e34d502c76507ea530b Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sun, 25 Aug 2024 15:38:18 -0600 Subject: [PATCH 43/77] Make more readable for humans. --- dace/codegen/targets/ipu.py | 131 +++++++++++------- graphcore_dace/handcrafted_sdfg_scalar_add.py | 4 +- 2 files changed, 80 insertions(+), 55 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index c6faebef0c..65314c6bf4 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -179,10 +179,74 @@ def is_node_tasklet(self, sdfg, state, node): # self._dispatcher.defined_vars.add(name, DefinedType.Object, ctype) # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, # allocation_stream) -# def allocate_ipu_stream(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, -# node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, -# declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: -# allocation_stream.write("// IPU Stream Allocation") + def allocate_ipu_scalar(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: + + result_decl = 
StringIO() + result_alloc = StringIO() + arrsize = nodedesc.total_size + is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) + #arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) + ctypedef = 'Tensor *' + shape = nodedesc.shape + dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) + + # Check if array is already declared + declared = self.dispatcher.declared_arrays.has(dataname) + # Different types of memories + if nodedesc.storage == dtypes.StorageType.IPU_Memory: + if not declared: + result_decl.write('%s %s;\n' % (ctypedef, dataname)) # Tensor *p; + self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) + + if nodedesc.pool: + raise NotImplementedError("Pool not implemented yet " + str(nodedesc.storage)) + else: + shape_poplar_format = ', '.join([str(sh) for sh in shape]) + result_alloc.write("%s = _state->graph.addVariable(%s, {%s});\n" % (dataname, ipu_utils.TYPE_TO_IPU[nodedesc.dtype], shape_poplar_format)) + else: + raise NotImplementedError("IPU: Unimplemented StorageType " + str(nodedesc.storage)) + + declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) + allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node) + + def allocate_ipu_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: + + result_decl = StringIO() + result_alloc = StringIO() + arrsize = nodedesc.total_size + is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) + #arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) + ctypedef = 'Tensor *' + shape = nodedesc.shape + dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) + + # Check if array is already declared + declared = self.dispatcher.declared_arrays.has(dataname) + # Different types of memories + if 
nodedesc.storage == dtypes.StorageType.IPU_Memory:
+            if not declared:
+                result_decl.write('%s %s;\n' % (ctypedef, dataname))  # Tensor *p;
+                self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef)
+
+            if nodedesc.pool:
+                raise NotImplementedError("Pool not implemented yet " + str(nodedesc.storage))
+            else:
+                shape_poplar_format = ', '.join([str(sh) for sh in shape])
+                result_alloc.write("%s = _state->graph.addVariable(%s, {%s});\n" % (dataname, ipu_utils.TYPE_TO_IPU[nodedesc.dtype], shape_poplar_format))
+        else:
+            raise NotImplementedError("IPU: Unimplemented StorageType " + str(nodedesc.storage))
+
+        declaration_stream.write(result_decl.getvalue(), cfg, state_id, node)
+        allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node)
+
+    def allocate_ipu_stream(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                        node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream,
+                        declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None:
+        raise NotImplementedError("IPU Stream not implemented yet")
 # dataname = node.data
 # allocname = cpp.ptr(dataname, nodedesc, sdfg, self.frame)
 # if nodedesc.storage == dtypes.StorageType.GPU_Global:
@@ -251,6 +315,9 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
                        declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None:
         self.add_header(function_stream)

+        if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External):
+            nodedesc = update_persistent_desc(nodedesc, sdfg)
+
         dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame)

         try:
@@ -259,62 +326,20 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
         except KeyError:
             pass  # The variable was not defined, we can continue

-        # Check if array is already declared
-        declared = False
-        try:
-            self.dispatcher.declared_arrays.get(dataname)
-            declared = True  # Array was already declared in 
this or upper scopes - except KeyError: # Array not declared yet - pass - - - # if isinstance(nodedesc, dace.data.Stream): - # return self.allocate_ipu_stream(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, - # allocation_stream) + if isinstance(nodedesc, dace.data.Stream): + return self.allocate_ipu_stream(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, + allocation_stream) if isinstance(nodedesc, dace.data.View): return self._cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) elif isinstance(nodedesc, dace.data.Reference): return self._cpu_codegen.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) - - if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): - nodedesc = update_persistent_desc(nodedesc, sdfg) - - result_decl = StringIO() - result_alloc = StringIO() - arrsize = nodedesc.total_size - is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) - #arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) - ctypedef = 'Tensor *' - shape = nodedesc.shape - - # Different types of memories - if nodedesc.storage == dtypes.StorageType.IPU_Memory: - if not declared: - result_decl.write('%s %s;\n' % (ctypedef, dataname)) # Tensor *p; - self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) - - if nodedesc.pool: - raise NotImplementedError("Pool not implemented yet " + str(nodedesc.storage)) - else: - shape_poplar_format = ', '.join([str(sh) for sh in shape]) - result_alloc.write("%s = _state->graph.addVariable(%s, {%s});\n" % (dataname, ipu_utils.TYPE_TO_IPU[nodedesc.dtype], shape_poplar_format)) - - - elif nodedesc.storage == dtypes.StorageType.Register: - if is_dynamically_sized: - raise ValueError('Dynamic allocation of registers not allowed') - if nodedesc.start_offset != 0: - raise 
NotImplementedError('Start offset unsupported for registers') - szstr = ' = {0}' if node.setzero else '' - result_decl.write("%s %s[%s]%s;\n" % (nodedesc.dtype.ctype, dataname, sym2cpp(arrsize), szstr)) - self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) - else: - raise NotImplementedError("IPU: Unimplemented storage type " + str(nodedesc.storage)) + elif isinstance(nodedesc, dace.data.Array): + return self.allocate_ipu_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) + elif isinstance(nodedesc, dace.data.Scalar): + return self.allocate_ipu_scalar(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) - declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) - allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node) def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index b8eaee0f81..603a7a8307 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -124,11 +124,11 @@ def gpu_scalar_add(): sdfg.add_scalar("scalarNode7", dace.float32, storage=dace.StorageType.IPU_Memory, transient=True) sdfg.add_scalar("scalarNode8", dace.string, storage=dace.StorageType.IPU_Memory, transient=True) sdfg.add_scalar("scalarNode9", dace.int8, storage=dace.StorageType.IPU_Memory, transient=True) - + sdfg.add_scalar("write_to_scalar", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) sdfg.add_array("arrayNode", [10, 10], dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) sdfg.add_stream("StreamNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) - sdfg.add_scalar("write_to_scalar", dace.float64, 
storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) # sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) # sdfg.add_constant('constant', 1) From 89722c3abe370cc1ac86cdf9fa720cc4f781a4d5 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sun, 25 Aug 2024 17:10:51 -0600 Subject: [PATCH 44/77] Implement setTileMapping using 'mapdataontile' Intelligently maps the data on tiles based on scalar/array nodes and maps to setTileMapping 1. Mapping a vertex will also be similar 2. This the not the most efficient mapping, currently arrays are just mapped equally on 10 tiles. 3. Add new test - simplified --- dace/codegen/targets/ipu.py | 105 ++++++++++++++++-- graphcore_dace/handcrafted_sdfg_scalar_add.py | 78 ++++++++++++- 2 files changed, 174 insertions(+), 9 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 65314c6bf4..2668fa0fb4 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -309,7 +309,92 @@ def allocate_ipu_stream(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubg # allocation_stream.write('__dace_alloc_{location}({size}, {allocname});'.format(**fmtargs), cfg, # state_id, node) - + def decidemapping(self, dataname, nodedesc, sdfg): + + # Get the shape of the data descriptor + shape = nodedesc.shape + # Get the total size of the data descriptor + size = nodedesc.total_size + + # CREATE a dictionary to store the mapping of the data to the tile + dataToTileMap = {} + # Get the number of tiles + numTiles = 10 + # Get the number of elements in the data descriptor + numElements = size + + if (numElements < numTiles): # special case + numTiles = numElements + + # Get the number of elements per tile + numElementsPerTile = numElements // numTiles + # Get the number of elements in the last tile + numElementsLastTile = numElements % numTiles + + # Loop over the 
number of tiles + for i in range(numTiles): + # Get the start index of the tile + start = i * numElementsPerTile + # Get the end index of the tile + end = start + numElementsPerTile + if (end - start > 1): + # Get the data of the tile with slicing + data = dataname + ".slice(" + "[" + str(start) + ":" + str(end) + "]" + ")" + else: + data = dataname + "[" + str(start) + "]" + + # Add the data to the tile mapping + dataToTileMap[data] = i + + # # Get the start index of the last tile + # start = numTiles * numElementsPerTile + # # Get the end index of the last tile + # end = start + numElementsLastTile + # # Get the data of the last tile + # data = dataname + "[" + str(start) + ":" + str(end) + "]" + # # Add the data to the tile mapping + # dataToTileMap[data] = numTiles - 1 + + return dataToTileMap + + # TODO:Similar mapVertexOntile + def mapdataontile(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: + if isinstance(nodedesc, dace.data.Array): + self.mapArrayOnTile(sdfg, cfg, state_id, node, nodedesc, allocation_stream) + elif isinstance(nodedesc, dace.data.Scalar): + self.mapScalarOnTile(sdfg, cfg, state_id, node, nodedesc, allocation_stream) + else: + raise NotImplementedError("Unimplemented mapping for this AccessNode: {}".format(type(nodedesc))) + + def mapArrayOnTile(self, sdfg, cfg, state_id, node, nodedesc, allocation_stream): + dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) + # Map array intelligently + spreadOverTiles = True + if spreadOverTiles: + dataToTileMap = self.decidemapping(dataname, nodedesc, sdfg) + # Map array over multiple tiles + # loop over the dataToTileMap and set the mapping + # import pprint + # pprint.pprint(dataToTileMap) + + for data, tilenumber in dataToTileMap.items(): + setTileMappingCall = f"_state->graph.setTileMapping({data}, 
{tilenumber});" + allocation_stream.write(setTileMappingCall, cfg, state_id, node) + else: + # Map array, given only 1 element maps on one tile + tilenumber = 0 + setTileMappingCall = f"_state->graph.setTileMapping({dataname}, {tilenumber});" + allocation_stream.write(setTileMappingCall, cfg, state_id, node) + + def mapScalarOnTile(self, sdfg, cfg, state_id, node, nodedesc, allocation_stream): + dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) + # Map scalar, given only 1 element maps on one tile + tilenumber = 0 + setTileMappingCall = f"_state->graph.setTileMapping({dataname}, {tilenumber});" + allocation_stream.write(setTileMappingCall, cfg, state_id, node) + def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: @@ -327,18 +412,24 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV pass # The variable was not defined, we can continue if isinstance(nodedesc, dace.data.Stream): - return self.allocate_ipu_stream(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, + self.allocate_ipu_stream(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) - if isinstance(nodedesc, dace.data.View): - return self._cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, + elif isinstance(nodedesc, dace.data.View): + self._cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) elif isinstance(nodedesc, dace.data.Reference): - return self._cpu_codegen.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream, + self._cpu_codegen.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) elif isinstance(nodedesc, 
dace.data.Array): - return self.allocate_ipu_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) + self.allocate_ipu_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) + self.mapdataontile(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) elif isinstance(nodedesc, dace.data.Scalar): - return self.allocate_ipu_scalar(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) + self.allocate_ipu_scalar(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) + self.mapdataontile(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) + else: + raise NotImplementedError("Unimplemented type: {}".format(type(nodedesc))) + + # Mapping on tiles def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index 603a7a8307..35a1755669 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -110,6 +110,79 @@ def vector_add(): sdfg(A, B, C) print(C) +def gpu_accessnode_test(): + sdfg = dace.SDFG('gpu_accessnode_test') + #########GLOBAL VARIABLES######### + + # sdfg.add_scalar("scalarNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode1", dace.bool, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode2", dace.int32, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode3", dace.int64, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode4", dace.uint8, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode5", dace.uint64, 
storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode6", dace.float16, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode7", dace.float32, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode8", dace.string, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode9", dace.int8, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("write_to_scalar", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + + sdfg.add_array("arrayNode", [10], dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_stream("StreamNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + + # sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + # sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + # sdfg.add_constant('constant', 1) + + + # ###########STATE, CFG, GLOBAL DATA################ + # # # add state + state = sdfg.add_state('sum', is_start_block=True) + + # scalar_read = state.add_read('scalarNode') + # scalar_read1 = state.add_read('scalarNode1') + # scalar_read2 = state.add_read('scalarNode2') + # scalar_read3 = state.add_read('scalarNode3') + # scalar_read4 = state.add_read('scalarNode4') + # scalar_read5 = state.add_read('scalarNode5') + # scalar_read6 = state.add_read('scalarNode6') + # scalar_read7 = state.add_read('scalarNode7') + # scalar_read8 = state.add_read('scalarNode8') + # scalar_read9 = state.add_read('scalarNode9') + scalar_write = state.add_write('write_to_scalar') + array_ = state.add_read('arrayNode') + # stream_ = state.add_read('StreamNode') + + + + + # b = state.add_read('B_scalar') + # c = state.add_write('C_scalar') + # state.add_edge(scalar_read, None, scalar_write, None, dace.Memlet(f"scalarNode[0]")) + # state.add_edge(scalar_read1, None, scalar_write, None, 
dace.Memlet(f"scalarNode1[0]")) + # state.add_edge(scalar_read2, None, scalar_write, None, dace.Memlet(f"scalarNode2[0]")) + # state.add_edge(scalar_read3, None, scalar_write, None, dace.Memlet(f"scalarNode3[0]")) + # state.add_edge(scalar_read4, None, scalar_write, None, dace.Memlet(f"scalarNode4[0]")) + # state.add_edge(scalar_read5, None, scalar_write, None, dace.Memlet(f"scalarNode5[0]")) + # state.add_edge(scalar_read6, None, scalar_write, None, dace.Memlet(f"scalarNode6[0]")) + # state.add_edge(scalar_read7, None, scalar_write, None, dace.Memlet(f"scalarNode7[0]")) + # state.add_edge(scalar_read8, None, scalar_write, None, dace.Memlet(f"scalarNode8[0]")) + # state.add_edge(scalar_read9, None, scalar_write, None, dace.Memlet(f"scalarNode9[0]")) + state.add_edge(array_, None, scalar_write, None, dace.Memlet(f"arrayNode[0]")) + # state.add_edge(stream_, None, scalar_write, None, dace.Memlet(f"StreamNode[0]")) + + + ###########CODEGEN################ + A = np.random.rand(1) + B = np.random.rand(1) + C = np.zeros(1) + print(A) + print(B) + print("Before", C) + sdfg = sdfg(A) + sdfg.apply_transformations(GPUTransformSDFG) + print("After", C) + + def gpu_scalar_add(): sdfg = dace.SDFG('gpu_scalar_add') #########GLOBAL VARIABLES######### @@ -126,7 +199,7 @@ def gpu_scalar_add(): sdfg.add_scalar("scalarNode9", dace.int8, storage=dace.StorageType.IPU_Memory, transient=True) sdfg.add_scalar("write_to_scalar", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) - sdfg.add_array("arrayNode", [10, 10], dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_array("arrayNode", [10], dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) sdfg.add_stream("StreamNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) # sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) @@ -167,7 +240,7 @@ def gpu_scalar_add(): state.add_edge(scalar_read7, None, scalar_write, None, 
dace.Memlet(f"scalarNode7[0]")) state.add_edge(scalar_read8, None, scalar_write, None, dace.Memlet(f"scalarNode8[0]")) state.add_edge(scalar_read9, None, scalar_write, None, dace.Memlet(f"scalarNode9[0]")) - state.add_edge(array_, None, scalar_write, None, dace.Memlet(f"arrayNode[0, 0]")) + state.add_edge(array_, None, scalar_write, None, dace.Memlet(f"arrayNode[0]")) state.add_edge(stream_, None, scalar_write, None, dace.Memlet(f"StreamNode[0]")) @@ -315,3 +388,4 @@ def add(A, B, C): # print (C) # vector_add() gpu_scalar_add() + gpu_accessnode_test() From 06d5dc34a5b784bb3b3f2127a31919204546d5dd Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Wed, 28 Aug 2024 16:10:53 -0600 Subject: [PATCH 45/77] Add poplar as a library --- dace/libraries/poplar/__init__.py | 2 +- dace/libraries/poplar/environments/poplar.py | 49 ++++++++--- dace/libraries/poplar/nodes/__init__.py | 1 + dace/libraries/poplar/nodes/popmm.py | 79 ++++++++++++++++++ tests/library/poplar/poplar_matmul.py | 88 ++++++++++++++++++++ 5 files changed, 207 insertions(+), 12 deletions(-) create mode 100644 dace/libraries/poplar/nodes/__init__.py create mode 100644 dace/libraries/poplar/nodes/popmm.py create mode 100644 tests/library/poplar/poplar_matmul.py diff --git a/dace/libraries/poplar/__init__.py b/dace/libraries/poplar/__init__.py index 4aa3fba752..728a102bac 100644 --- a/dace/libraries/poplar/__init__.py +++ b/dace/libraries/poplar/__init__.py @@ -2,6 +2,6 @@ from dace.library import register_library from .nodes import * from .environments import * -from .utils import * + register_library(__name__, "poplar") diff --git a/dace/libraries/poplar/environments/poplar.py b/dace/libraries/poplar/environments/poplar.py index c571fda64c..58930ab93e 100644 --- a/dace/libraries/poplar/environments/poplar.py +++ b/dace/libraries/poplar/environments/poplar.py @@ -1,21 +1,48 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+import os +from dace.config import Config import dace.library - +import ctypes.util +import warnings +from typing import Union @dace.library.environment class IPU: - cmake_minimum_version = "3.6" + cmake_minimum_version = None cmake_packages = ["IPU"] cmake_files = [] cmake_variables = {} cmake_includes = [] - cmake_libraries = ["${IPU_CXX_LIBRARIES}"] - cmake_compile_flags = ["-I${IPU_CXX_HEADER_DIR}"] - cmake_link_flags = ["${IPU_LINKER_FLAGS}"] - - headers = ["poplar.h"] - state_fields = [] - init_code = "This is init code" - finalize_code = "This is finalize code;" # actually if we finalize in the dace program we break pytest :) + cmake_libraries = [] + cmake_compile_flags = ["-std=c++11"] + cmake_link_flags = ["-lpoplar", "-lpoputil", "-lpoplin"] + headers = ["poplar/Engine.hpp", "poplar/Graph.hpp", "poplar/IPUModel.hpp", "poplin/MatMul.hpp", "poplin/codelets.hpp", "popops/codelets.hpp", "poputil/TileMapping.hpp"] + state_fields = [ + "// IPUModel APIs", + "IPUModel ipuModel;", + "Device device = ipuModel.createDevice();", + "Target target = device.getTarget();", + "// Create the Graph object", + "Graph graph(target);", + "popops::addCodelets(graph);", + "poplin::addCodelets(graph);", + "// Create a control program that is a sequence of steps", + "Sequence prog;" + + ] + init_code = """ + // IPUINIT. + // Nothing for now. 
+ """ + finalize_code = """ + auto engine = Engine{__state->graph, __state->prog, {{"debug.retainDebugInformation", "true"}}}; + engine.load(__state->device); + // Run the control program + std::cout << "Running program\n"; + engine.run(0); + std::cout << "Program complete\n"; + engine.printProfileSummary(std::cout, {{"showExecutionSteps", "true"}}); + return 0; + """ dependencies = [] + diff --git a/dace/libraries/poplar/nodes/__init__.py b/dace/libraries/poplar/nodes/__init__.py new file mode 100644 index 0000000000..89eb792be6 --- /dev/null +++ b/dace/libraries/poplar/nodes/__init__.py @@ -0,0 +1 @@ +from .popmm import IPUMatMul diff --git a/dace/libraries/poplar/nodes/popmm.py b/dace/libraries/poplar/nodes/popmm.py new file mode 100644 index 0000000000..b8616da1ec --- /dev/null +++ b/dace/libraries/poplar/nodes/popmm.py @@ -0,0 +1,79 @@ +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace import dtypes +from dace.symbolic import symstr +from dace.transformation.transformation import ExpandTransformation +from .. 
import environments +from dace.codegen.targets.ipu_files import ipu_utils as ipu_utils + + +@dace.library.expansion +class ExpandMMPopLib(ExpandTransformation): + + environments = [environments.poplar.IPU] + + @staticmethod + def expansion(node, parent_state, parent_sdfg): + (adesc, bdesc, cdesc) = node.validate(parent_sdfg, parent_state) + + A_poplar_type = ipu_utils.TYPE_TO_IPU[adesc.dtype] + B_poplar_type = ipu_utils.TYPE_TO_IPU[bdesc.dtype] + C_poplar_type = ipu_utils.TYPE_TO_IPU[cdesc.dtype] + + + init = f""" + // Add variables to the graph + Tensor m1 = graph.addVariable({A_poplar_type}, {900, 600}, "m1"); + Tensor m2 = graph.addVariable({B_poplar_type}, {600, 300}, "m2"); + Tensor m3 = graph.addVariable({C_poplar_type}, {300, 200}, "m3"); + poputil::mapTensorLinearly(__state->graph, m1); + poputil::mapTensorLinearly(__state->graph, m2); + poputil::mapTensorLinearly(__state->graph, m3); + + Tensor m4 = poplin::matMul(__state->graph, m1, m2, __state->prog, "m4"); + """ + + code = f""" + {init} + """ + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dtypes.Language.CPP) + return tasklet + + +@dace.library.node +class IPUMatMul(dace.sdfg.nodes.LibraryNode): + """Executes poplin::matMul. + """ + # Global properties + implementations = { + "MM": ExpandMMPopLib, + } + default_implementation = None + + def __init__(self, name): + super().__init__(name, inputs={"_inbufferA", "_inbufferB"}, outputs={"_outbufferC"}) + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer) of the three data descriptors in the + parent SDFG. 
+ """ + + inbufferA, inbufferB, outbufferC = None, None, None + for e in state.out_edges(self): + if e.src_conn == "_outbufferC": + outbufferC = sdfg.arrays[e.data.data] + for e in state.in_edges(self): + if e.dst_conn == "_inbufferA": + inbufferA = sdfg.arrays[e.data.data] + if e.dst_conn == "_inbufferB": + inbufferB = sdfg.arrays[e.data.data] + + + return (inbufferA, inbufferB, outbufferC) diff --git a/tests/library/poplar/poplar_matmul.py b/tests/library/poplar/poplar_matmul.py new file mode 100644 index 0000000000..76436d5708 --- /dev/null +++ b/tests/library/poplar/poplar_matmul.py @@ -0,0 +1,88 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.memlet import Memlet +import dace.libraries.poplar as poplar +import numpy as np +import pytest + +############################################################################### + +def make_sdfg(dtype): + + sdfg = dace.SDFG("poplar_matmul") + state = sdfg.add_state("matmul_state") + + sdfg.add_array('A', [10], dtype) + sdfg.add_array('B', [10], dtype) + sdfg.add_array('C', [10], dtype) + + a = state.add_access("A") + b = state.add_access("B") + c = state.add_access("C") + + poplar_mm_node = poplar.nodes.popmm.IPUMatMul("MATMUL") + poplar_mm_node.implementation = "MM" + + state.add_memlet_path(a, poplar_mm_node, dst_conn="_inbufferA", memlet=dace.Memlet(f"A")) + state.add_memlet_path(b, poplar_mm_node, dst_conn="_inbufferB", memlet=dace.Memlet(f"B")) + state.add_memlet_path(poplar_mm_node, c, src_conn="_outbufferC", memlet=dace.Memlet(f"C")) + + return sdfg + + +############################################################################### + + +# def _test_poplar(info, sdfg, dtype): + +# poplar_sdfg = sdfg.compile() + + + +@pytest.mark.poplar +def test_poplar(): + sdfg = make_sdfg(np.float64) + sdfg.compile() + print("Success!") + +############################################################################### + +# N = dace.symbol('N', dtype=dace.int64) + + +# 
@dace.program +# def dace_bcast(A: dace.float32[N]): +# dace.comm.Bcast(A, root=0) + + +# @pytest.mark.mpi +# def test_dace_bcast(): +# from mpi4py import MPI as MPI4PY +# comm = MPI4PY.COMM_WORLD +# rank = comm.Get_rank() +# commsize = comm.Get_size() +# mpi_sdfg = None +# if commsize < 2: +# raise ValueError("This test is supposed to be run with at least two processes!") +# for r in range(0, commsize): +# if r == rank: +# mpi_sdfg = dace_bcast.compile() +# comm.Barrier() + +# length = 128 +# if rank == 0: +# A = np.full([length], np.pi, dtype=np.float32) +# else: +# A = np.random.randn(length).astype(np.float32) + +# mpi_sdfg(A=A, N=length) + +# assert (np.allclose(A, np.full([length], np.pi, dtype=np.float32))) + + +############################################################################### + +if __name__ == "__main__": + test_poplar() + # test_dace_bcast() +############################################################################### From 296135318d7a8718ea0cc73bcdabbe99feef447d Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Thu, 29 Aug 2024 16:10:24 -0600 Subject: [PATCH 46/77] 1. Fix compilations issues from the previous commit. 2. Modify test to be "FLOAT" and not "DOUBLE" 3. Add new interface file which helps to add #include<> vs #include "". 
--- dace/libraries/poplar/environments/poplar.py | 36 +++++++++---------- .../poplar/include/poplar_dace_interface.h | 13 +++++++ dace/libraries/poplar/nodes/popmm.py | 7 ++-- tests/library/poplar/poplar_matmul.py | 2 +- 4 files changed, 34 insertions(+), 24 deletions(-) create mode 100644 dace/libraries/poplar/include/poplar_dace_interface.h diff --git a/dace/libraries/poplar/environments/poplar.py b/dace/libraries/poplar/environments/poplar.py index 58930ab93e..099d56ebe4 100644 --- a/dace/libraries/poplar/environments/poplar.py +++ b/dace/libraries/poplar/environments/poplar.py @@ -15,33 +15,31 @@ class IPU: cmake_includes = [] cmake_libraries = [] cmake_compile_flags = ["-std=c++11"] - cmake_link_flags = ["-lpoplar", "-lpoputil", "-lpoplin"] - headers = ["poplar/Engine.hpp", "poplar/Graph.hpp", "poplar/IPUModel.hpp", "poplin/MatMul.hpp", "poplin/codelets.hpp", "popops/codelets.hpp", "poputil/TileMapping.hpp"] + cmake_link_flags = ["-L -lpoplar -lpopops -lpoplin -lpoputil"] + headers = [ "../include/poplar_dace_interface.h"] state_fields = [ - "// IPUModel APIs", - "IPUModel ipuModel;", - "Device device = ipuModel.createDevice();", - "Target target = device.getTarget();", - "// Create the Graph object", - "Graph graph(target);", - "popops::addCodelets(graph);", - "poplin::addCodelets(graph);", - "// Create a control program that is a sequence of steps", - "Sequence prog;" - + "// IPUModel APIs", + "IPUModel ipuModel", + "Device device", + "Target target", + "Graph graph", + "Sequence prog", ] init_code = """ - // IPUINIT. - // Nothing for now. 
+ __state->device = __state->ipuModel.createDevice(); + __state->target = __state->device.getTarget(); + __state->graph = Graph(__state->target); + popops::addCodelets(__state->graph); + poplin::addCodelets(__state->graph); """ finalize_code = """ auto engine = Engine{__state->graph, __state->prog, {{"debug.retainDebugInformation", "true"}}}; - engine.load(__state->device); + engine.load(__state->device); // Run the control program - std::cout << "Running program\n"; + std::cout << "Running program"; engine.run(0); - std::cout << "Program complete\n"; - engine.printProfileSummary(std::cout, {{"showExecutionSteps", "true"}}); + std::cout << "Program complete"; + // engine.printProfileSummary(std::cout, {{"showExecutionSteps", "true"}}); return 0; """ dependencies = [] diff --git a/dace/libraries/poplar/include/poplar_dace_interface.h b/dace/libraries/poplar/include/poplar_dace_interface.h new file mode 100644 index 0000000000..e0ba919b7b --- /dev/null +++ b/dace/libraries/poplar/include/poplar_dace_interface.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace poplar; +using namespace poplar::program; \ No newline at end of file diff --git a/dace/libraries/poplar/nodes/popmm.py b/dace/libraries/poplar/nodes/popmm.py index b8616da1ec..dff9057fed 100644 --- a/dace/libraries/poplar/nodes/popmm.py +++ b/dace/libraries/poplar/nodes/popmm.py @@ -24,13 +24,12 @@ def expansion(node, parent_state, parent_sdfg): init = f""" // Add variables to the graph - Tensor m1 = graph.addVariable({A_poplar_type}, {900, 600}, "m1"); - Tensor m2 = graph.addVariable({B_poplar_type}, {600, 300}, "m2"); - Tensor m3 = graph.addVariable({C_poplar_type}, {300, 200}, "m3"); + Tensor m1 = __state->graph.addVariable(FLOAT, {900, 600}, "m1"); + Tensor m2 = __state->graph.addVariable(FLOAT, {600, 300}, "m2"); + Tensor m3 = __state->graph.addVariable(FLOAT, {300, 200}, "m3"); poputil::mapTensorLinearly(__state->graph, 
m1); poputil::mapTensorLinearly(__state->graph, m2); poputil::mapTensorLinearly(__state->graph, m3); - Tensor m4 = poplin::matMul(__state->graph, m1, m2, __state->prog, "m4"); """ diff --git a/tests/library/poplar/poplar_matmul.py b/tests/library/poplar/poplar_matmul.py index 76436d5708..dadbff835c 100644 --- a/tests/library/poplar/poplar_matmul.py +++ b/tests/library/poplar/poplar_matmul.py @@ -41,7 +41,7 @@ def make_sdfg(dtype): @pytest.mark.poplar def test_poplar(): - sdfg = make_sdfg(np.float64) + sdfg = make_sdfg(np.float32) sdfg.compile() print("Success!") From 75821d78dcb3b518a8a311411bd2457f868b4ec8 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Thu, 29 Aug 2024 16:27:03 -0600 Subject: [PATCH 47/77] Add missing ; --- dace/libraries/poplar/environments/poplar.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dace/libraries/poplar/environments/poplar.py b/dace/libraries/poplar/environments/poplar.py index 099d56ebe4..0c497260ad 100644 --- a/dace/libraries/poplar/environments/poplar.py +++ b/dace/libraries/poplar/environments/poplar.py @@ -19,11 +19,11 @@ class IPU: headers = [ "../include/poplar_dace_interface.h"] state_fields = [ "// IPUModel APIs", - "IPUModel ipuModel", - "Device device", - "Target target", - "Graph graph", - "Sequence prog", + "IPUModel ipuModel;", + "Device device;", + "Target target;", + "Graph graph;", + "Sequence prog;", ] init_code = """ __state->device = __state->ipuModel.createDevice(); From 1bb6ead1e8aa811e6fabf30a9aee8f6749bc7a96 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Thu, 29 Aug 2024 16:29:29 -0600 Subject: [PATCH 48/77] Fix missing directory paths --- dace/libraries/poplar/environments/poplar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/libraries/poplar/environments/poplar.py b/dace/libraries/poplar/environments/poplar.py index 0c497260ad..e710c50793 100644 --- a/dace/libraries/poplar/environments/poplar.py +++ 
b/dace/libraries/poplar/environments/poplar.py @@ -16,7 +16,7 @@ class IPU: cmake_libraries = [] cmake_compile_flags = ["-std=c++11"] cmake_link_flags = ["-L -lpoplar -lpopops -lpoplin -lpoputil"] - headers = [ "../include/poplar_dace_interface.h"] + headers = [ "../../include/poplar_dace_interface.h"] state_fields = [ "// IPUModel APIs", "IPUModel ipuModel;", From 3b47aaedabd2f4108635d175035ba2490b647518 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Thu, 29 Aug 2024 16:41:44 -0600 Subject: [PATCH 49/77] 1. Remove buggy/extra IPUModel and fix some more bug --- dace/codegen/targets/framecode.py | 8 ++++---- dace/libraries/poplar/nodes/popmm.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index c0c1804e04..cb53194458 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -216,10 +216,10 @@ def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre self.statestruct.extend(env.state_fields) # GRAPHCORE - self.statestruct.append('IPUModel ipuModel;') - self.statestruct.append('Device device = ipuModel.createDevice();') - self.statestruct.append('Target target = device.getTarget();') - self.statestruct.append('Graph graph(target);') + # self.statestruct.append('IPUModel ipuModel;') + # self.statestruct.append('Device device = ipuModel.createDevice();') + # self.statestruct.append('Target target = device.getTarget();') + # self.statestruct.append('Graph graph(target);') # Instrumentation preamble diff --git a/dace/libraries/poplar/nodes/popmm.py b/dace/libraries/poplar/nodes/popmm.py index dff9057fed..ad48239a02 100644 --- a/dace/libraries/poplar/nodes/popmm.py +++ b/dace/libraries/poplar/nodes/popmm.py @@ -24,9 +24,9 @@ def expansion(node, parent_state, parent_sdfg): init = f""" // Add variables to the graph - Tensor m1 = __state->graph.addVariable(FLOAT, {900, 600}, "m1"); - Tensor m2 = 
__state->graph.addVariable(FLOAT, {600, 300}, "m2"); - Tensor m3 = __state->graph.addVariable(FLOAT, {300, 200}, "m3"); + Tensor m1 = __state->graph.addVariable(FLOAT, '{'900, 600'}', "m1"); + Tensor m2 = __state->graph.addVariable(FLOAT, '{'600, 300'}', "m2"); + Tensor m3 = __state->graph.addVariable(FLOAT, '{'300, 200'}', "m3"); poputil::mapTensorLinearly(__state->graph, m1); poputil::mapTensorLinearly(__state->graph, m2); poputil::mapTensorLinearly(__state->graph, m3); From 7ceb8668db3c7f583dbb4c3f78cdd5c73c85f820 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Thu, 29 Aug 2024 16:46:20 -0600 Subject: [PATCH 50/77] Fix curly braces not found in code as python f{} strings don't interpret it use {{ instead --- dace/libraries/poplar/nodes/popmm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dace/libraries/poplar/nodes/popmm.py b/dace/libraries/poplar/nodes/popmm.py index ad48239a02..8854c068b3 100644 --- a/dace/libraries/poplar/nodes/popmm.py +++ b/dace/libraries/poplar/nodes/popmm.py @@ -24,9 +24,9 @@ def expansion(node, parent_state, parent_sdfg): init = f""" // Add variables to the graph - Tensor m1 = __state->graph.addVariable(FLOAT, '{'900, 600'}', "m1"); - Tensor m2 = __state->graph.addVariable(FLOAT, '{'600, 300'}', "m2"); - Tensor m3 = __state->graph.addVariable(FLOAT, '{'300, 200'}', "m3"); + Tensor m1 = __state->graph.addVariable(FLOAT, {{900, 600}}, "m1"); + Tensor m2 = __state->graph.addVariable(FLOAT, {{600, 300}}, "m2"); + Tensor m3 = __state->graph.addVariable(FLOAT, {{300, 200}}, "m3"); poputil::mapTensorLinearly(__state->graph, m1); poputil::mapTensorLinearly(__state->graph, m2); poputil::mapTensorLinearly(__state->graph, m3); From 3fa7a55130152c94ae470e6de064cecd21762ce0 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 30 Aug 2024 18:58:25 -0600 Subject: [PATCH 51/77] Fix 1. Compilation error (C++14/C++11). 2. Fix headers issue during compilation - extra ../ present 3. 
Fix libraries issue partially, almost there after this fix Fix IPUConfig.cmake, ipu-config.cmake not found error, was wrongly using IPU but poplar libraries start with poplar --- dace/libraries/poplar/environments/poplar.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dace/libraries/poplar/environments/poplar.py b/dace/libraries/poplar/environments/poplar.py index e710c50793..c367a11c05 100644 --- a/dace/libraries/poplar/environments/poplar.py +++ b/dace/libraries/poplar/environments/poplar.py @@ -9,14 +9,14 @@ class IPU: cmake_minimum_version = None - cmake_packages = ["IPU"] + cmake_packages = ["poplar"] # Find = POPLARConfig.cmake | poplar-config.cmake cmake_files = [] cmake_variables = {} cmake_includes = [] cmake_libraries = [] - cmake_compile_flags = ["-std=c++11"] - cmake_link_flags = ["-L -lpoplar -lpopops -lpoplin -lpoputil"] - headers = [ "../../include/poplar_dace_interface.h"] + cmake_compile_flags = [] + cmake_link_flags = ["-lpoplar -lpopops -lpoplin -lpoputil"] #-L/software/graphcore/poplar_sdk/3.3.0/poplar-ubuntu_20_04-3.3.0+7857-b67b751185/lib + headers = [ "../include/poplar_dace_interface.h"] state_fields = [ "// IPUModel APIs", "IPUModel ipuModel;", From 63fad929373cd83df37e897205632a4193a94535 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sat, 31 Aug 2024 19:09:01 -0600 Subject: [PATCH 52/77] Add arguments to the library, still we are not able to connect the inputs to the library and the inputs to SDFG, seems like we need to dig into some other ways of allocation of variables and such details. 
--- dace/libraries/poplar/nodes/popmm.py | 16 ++++++++++++++-- tests/library/poplar/poplar_matmul.py | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/dace/libraries/poplar/nodes/popmm.py b/dace/libraries/poplar/nodes/popmm.py index 8854c068b3..eee02fadf0 100644 --- a/dace/libraries/poplar/nodes/popmm.py +++ b/dace/libraries/poplar/nodes/popmm.py @@ -23,6 +23,11 @@ def expansion(node, parent_state, parent_sdfg): init = f""" + {{ + {A_poplar_type} A = {node.A_scalar_param}; + Tensor B = {node.B_scalar_param}; + Tensor C = {node.C_scalar_param}; + }} // Add variables to the graph Tensor m1 = __state->graph.addVariable(FLOAT, {{900, 600}}, "m1"); Tensor m2 = __state->graph.addVariable(FLOAT, {{600, 300}}, "m2"); @@ -54,9 +59,16 @@ class IPUMatMul(dace.sdfg.nodes.LibraryNode): "MM": ExpandMMPopLib, } default_implementation = None - - def __init__(self, name): + + A_scalar_param = dace.properties.Property(allow_none=False, default=0, desc="A scalar") + B_scalar_param = dace.properties.Property(allow_none=False, default=0, desc="B scalar") + C_scalar_param = dace.properties.Property(allow_none=False, default=0, desc="C scalar") + + def __init__(self, name, A_scalar_param, B_scalar_param, C_scalar_param): super().__init__(name, inputs={"_inbufferA", "_inbufferB"}, outputs={"_outbufferC"}) + self.A_scalar_param = A_scalar_param + self.B_scalar_param = B_scalar_param + self.C_scalar_param = C_scalar_param def validate(self, sdfg, state): """ diff --git a/tests/library/poplar/poplar_matmul.py b/tests/library/poplar/poplar_matmul.py index dadbff835c..f163fa226e 100644 --- a/tests/library/poplar/poplar_matmul.py +++ b/tests/library/poplar/poplar_matmul.py @@ -20,7 +20,7 @@ def make_sdfg(dtype): b = state.add_access("B") c = state.add_access("C") - poplar_mm_node = poplar.nodes.popmm.IPUMatMul("MATMUL") + poplar_mm_node = poplar.nodes.popmm.IPUMatMul("MATMUL", A_scalar_param=10, B_scalar_param=10, C_scalar_param=0) poplar_mm_node.implementation = "MM" 
state.add_memlet_path(a, poplar_mm_node, dst_conn="_inbufferA", memlet=dace.Memlet(f"A")) From 3bae6551663705ad66fa466096102204bc8a4ca4 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Mon, 2 Sep 2024 10:25:57 -0600 Subject: [PATCH 53/77] Fix link time libraries vs compile time libraries issue --- dace/libraries/poplar/environments/poplar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/libraries/poplar/environments/poplar.py b/dace/libraries/poplar/environments/poplar.py index c367a11c05..cf2462ec7d 100644 --- a/dace/libraries/poplar/environments/poplar.py +++ b/dace/libraries/poplar/environments/poplar.py @@ -13,9 +13,9 @@ class IPU: cmake_files = [] cmake_variables = {} cmake_includes = [] - cmake_libraries = [] + cmake_libraries = ["poplar", "popops", "poplin", "poputil"] cmake_compile_flags = [] - cmake_link_flags = ["-lpoplar -lpopops -lpoplin -lpoputil"] #-L/software/graphcore/poplar_sdk/3.3.0/poplar-ubuntu_20_04-3.3.0+7857-b67b751185/lib + cmake_link_flags = [] #-L/software/graphcore/poplar_sdk/3.3.0/poplar-ubuntu_20_04-3.3.0+7857-b67b751185/lib headers = [ "../include/poplar_dace_interface.h"] state_fields = [ "// IPUModel APIs", From 15d802363a7da4bb3e8fe9212e5073d2ad964dc6 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 13 Sep 2024 11:13:01 -0600 Subject: [PATCH 54/77] changes to test, ipu_test is now the new base, added state dump, next one generates the golden code for poplar. 
--- dace/codegen/codegen.py | 4 + dace/codegen/targets/framecode.py | 2 + dace/codegen/targets/ipu.py | 411 +++++++++++++++--- dace/dtypes.py | 11 +- graphcore_dace/handcrafted_sdfg_scalar_add.py | 184 +++++++- graphcore_dace/ipu_test.py | 192 ++++++++ 6 files changed, 733 insertions(+), 71 deletions(-) create mode 100644 graphcore_dace/ipu_test.py diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index 6a02a4a57d..56125bb6fd 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -250,6 +250,10 @@ def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]: target_objects.extend(tgt.get_generated_codeobjects()) # Ensure that no new targets were dynamically added + print("\nused_targets = ", frame._dispatcher.used_targets) + print("\nframe.targets = ", frame.targets) + print("\nframe=", frame) + assert frame._dispatcher.used_targets == (frame.targets - {frame}) # add a header file for calling the SDFG diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index cb53194458..b83f1d7783 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -891,6 +891,7 @@ def generate_code(self, # Allocate outer-level transients self.allocate_arrays_in_scope(sdfg, sdfg, sdfg, global_stream, callsite_stream) + callsite_stream.write('called allocate_arrays_in_scope outer\n', sdfg) # Define constants as top-level-allocated for cname, (ctype, _) in sdfg.constants_prop.items(): @@ -946,6 +947,7 @@ def generate_code(self, # Deallocate transients self.deallocate_arrays_in_scope(sdfg, sdfg, sdfg, global_stream, callsite_stream) + callsite_stream.write('called deallocate_arrays_in_scope internal transient\n', sdfg) # Now that we have all the information about dependencies, generate # header and footer diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 2668fa0fb4..4907a243e2 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -1,8 +1,9 @@ # import 
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. from io import StringIO +from dace.codegen.codeobject import CodeObject import sympy -from typing import TYPE_CHECKING, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union from copy import deepcopy from dace import (data, dtypes, registry, memlet as mmlt, subsets, symbolic, Config) from dace import dtypes, memlet as mm @@ -33,38 +34,91 @@ import itertools import warnings +if TYPE_CHECKING: + from dace.codegen.targets.framecode import DaCeCodeGenerator + from dace.codegen.targets.cpu import CPUCodeGen +import pdb; +def is_ipu_kernel(sdfg, state): + """ + Returns whether the given state is an FPGA kernel and should be dispatched + to the FPGA code generator. + + :return: True if this is an FPGA kernel, False otherwise. + """ + # pdb.set_trace() + data_nodes = state.data_nodes() + at_least_one_fpga_array = False + for n in data_nodes: + desc = n.desc(sdfg) + print(desc.storage.name, desc.storage, desc) + if desc.storage == dtypes.StorageType.IPU_Memory: + at_least_one_fpga_array = True + if isinstance(desc, data.Scalar): + continue + if desc.storage != dtypes.StorageType.IPU_Memory: + return False + return at_least_one_fpga_array + @registry.autoregister_params(name='ipu') class IPUCodeGen(TargetCodeGenerator): """ IPU(Graphcore) code generator. 
""" target_name = 'ipu' title = 'IPU' language = 'cpp' + _in_device_code = False + + def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): - def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): + self.program_name = sdfg.name + self.has_generated_header = False self.frame = frame_codegen self.dispatcher = frame_codegen._dispatcher - self.cpu_codegen: dace.codegen.targets.CPUCodeGen = self.dispatcher.get_generic_node_dispatcher() - self._locals = cppunparse.CPPLocals() + self.cpu_codegen: Optional['CPUCodeGen'] = None + # self._locals = cppunparse.CPPLocals() # Scope depth (for defining locals) self._ldepth = 0 # Keep nested SDFG schedule when descending into it self._toplevel_schedule = None + self._localcode = CodeIOStream() + self._globalcode = CodeIOStream() + self._initcode = CodeIOStream() + self._exitcode = CodeIOStream() + self._global_sdfg: SDFG = sdfg + self._arglists: Dict[nodes.MapEntry, Dict[str, data.Data]] = {} + # Keep track of current "scope entry/exit" code streams for extra + # code generation + self.scope_entry_stream = self._initcode + self.scope_exit_stream = self._exitcode + self._ipu_streams, self._ipu_events = 0, 0 + self._kernels_dependencies = dict() + self._kernels_names_to_id = dict() + self._num_kernels = 0 + self._host_codes = [] + self._kernel_codes = [] + # Register dispatchers + self.cpu_codegen = self.dispatcher.get_generic_node_dispatcher() + + self.dispatcher.register_state_dispatcher(self, predicate=is_ipu_kernel) # self.dispatcher.register_array_dispatcher(dtypes.StorageType.IPU_Tile_Local, self) # Storage # ipu_storage = [dtypes.StorageType.IPU_Memory] - gpu_storage = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned, dtypes.StorageType.IPU_Memory] + ipu_storage = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned, dtypes.StorageType.IPU_Memory] - self.dispatcher.register_array_dispatcher(gpu_storage, self) # 
allocate_array/deallocate_array - for storage in gpu_storage: - for other_storage in gpu_storage: + self.dispatcher.register_array_dispatcher(ipu_storage, self) # allocate_array/deallocate_array + for storage in ipu_storage: + for other_storage in dtypes.StorageType: self.dispatcher.register_copy_dispatcher(storage, other_storage, None, self) self.dispatcher.register_copy_dispatcher(other_storage, storage, None, self) + + + + # # Dispatchers # self.dispatcher.register_map_dispatcher(dace.ScheduleType.IPU_Map, self) @@ -74,21 +128,117 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): # self._dispatcher.register_map_dispatcher(dace.ScheduleType.IPU, self) # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + # def preprocess(self, sdfg: SDFG) -> None: + # # hack to get the ipu codegen to work + # # self._toplevel_schedule = dtypes.ScheduleType.IPU_SCHEDULE def get_generated_codeobjects(self): - res = super().get_generated_codeobjects() - return res + + execution_mode = Config.get("compiler", "xilinx", "mode") + + kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self.program_name) + if execution_mode == "software_emulation": + kernel_file_name += "_sw_emu.xclbin\"" + xcl_emulation_mode = "\"sw_emu\"" + xilinx_sdx = "DACE_VITIS_DIR" + elif execution_mode == "hardware_emulation": + kernel_file_name += "_hw_emu.xclbin\"" + xcl_emulation_mode = "\"hw_emu\"" + xilinx_sdx = "DACE_VITIS_DIR" + elif execution_mode == "hardware" or execution_mode == "simulation": + kernel_file_name += "_hw.xclbin\"" + xcl_emulation_mode = None + xilinx_sdx = None + else: + raise cgx.CodegenError("Unknown Xilinx execution mode: {}".format(execution_mode)) + + set_env_vars = "" + set_str = "dace::set_environment_variable(\"{}\", {});\n" + unset_str = "dace::unset_environment_variable(\"{}\");\n" + set_env_vars += (set_str.format("XCL_EMULATION_MODE", xcl_emulation_mode) + if xcl_emulation_mode is not None else 
unset_str.format("XCL_EMULATION_MODE")) + set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx) + if xilinx_sdx is not None else unset_str.format("XILINX_SDX")) + set_env_vars += set_str.format( + "EMCONFIG_PATH", + "DACE_BINARY_DIR") if execution_mode == 'hardware_emulation' else unset_str.format("EMCONFIG_PATH") + + host_code = CodeIOStream() + host_code.write("""\ +#include "dace/xilinx/host.h" +#include "dace/dace.h" +#include "dace/xilinx/stream.h" +""") + host_code.write("\n\n") + + self.frame.generate_fileheader(self._global_sdfg, host_code, 'xilinx_host') + + params_comma = self._global_sdfg.init_signature(free_symbols=self.frame.free_symbols(self._global_sdfg)) + if params_comma: + params_comma = ', ' + params_comma + + host_code.write(""" +DACE_EXPORTED int __dace_init_xilinx({sdfg_state_name} *__state{signature}) {{ + {environment_variables} + + __state->fpga_context = new dace_fpga_context(); + __state->fpga_context->Get().MakeProgram({kernel_file_name}); + return 0; +}} + +DACE_EXPORTED int __dace_exit_xilinx({sdfg_state_name} *__state) {{ + delete __state->fpga_context; + return 0; +}} + +{host_code}""".format(signature=params_comma, + sdfg=self._global_sdfg, + sdfg_state_name=cpp.mangle_dace_state_struct_name(self._global_sdfg), + environment_variables=set_env_vars, + kernel_file_name=kernel_file_name, + host_code="".join([ + "{separator}\n// Kernel: {kernel_name}" + "\n{separator}\n\n{code}\n\n".format(separator="/" * 79, kernel_name=name, code=code) + for (name, code) in self._host_codes + ]))) + + host_code_obj = CodeObject(self.program_name, + host_code.getvalue(), + "cpp", + IPUCodeGen, + "IPU", + target_type="host") + + kernel_code_objs = [ + CodeObject(kernel_name, + code, + "cpp", + IPUCodeGen, + "IPU", + target_type="device") for (kernel_name, code) in self._kernel_codes + ] + + res = super().get_generated_codeobjects() # Not sure why is this object here, fix it later. 
+ print(res) + return [host_code_obj] + kernel_code_objs + res # __dace_init_ function @property def has_initializer(self): - return False + return True # __dace_exit_ function @property def has_finalizer(self): + return True + + def state_dispatch_predicate(self, sdfg, state): + if self._toplevel_schedule == dtypes.ScheduleType.IPU_SCHEDULE: + print("TRUE SAMEERAN") + return True return False + @staticmethod def cmake_options(): options = [] @@ -637,63 +787,192 @@ def generate_read(self, sdfg: SDFG, state: SDFGState, edge: graph.MultiConnector else: raise util.NotSupportedError('Only copy from Tasklets and AccessNodes is supported') - # def generate_state(self, - # sdfg:SDFG, - # cfg: ControlFlowRegion, - # state: SDFGState, - # function_stream: CodeIOStream, - # callsite_stream:CodeIOStream, - # generate_state_footer:bool = True): - # debug_print_self(self) - # self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream) - - # def declare_array(self, - # sdfg: SDFG, - # cfg: ControlFlowRegion, - # dfg: StateSubgraphView, - # state_id: int, - # node: nodes.Node, - # nodedesc: data.Data, - # function_stream: CodeIOStream, - # declaration_stream: CodeIOStream) -> None: - # print("IN DECLARE_ARRAY") - # fsymbols = self._frame.symbols_and_constants(sdfg) - # # NOTE: `dfg` (state) will be None iff `nodedesc` is non-free symbol dependent - # # (see `DaCeCodeGenerator.determine_allocation_lifetime` in `dace.codegen.targets.framecode`). - # # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if - # # `nodedesc` is a View and `dfg` is None. 
- # if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): - # raise NotImplementedError("The declare_array method should only be used for variables " - # "that must have their declaration and allocation separate.") - - # name = node.root_data - # ptrname = cpp.ptr(name, nodedesc, sdfg, self._frame) - - # if nodedesc.transient is False: - # return - - # # Check if array is already declared - # if self._dispatcher.declared_arrays.has(ptrname): - # return - - # # Compute array size - # arrsize = nodedesc.total_size - # if not isinstance(nodedesc.dtype, dtypes.opaque): - # arrsize_bytes = arrsize * nodedesc.dtype.bytes - - # if (nodedesc.storage == dtypes.StorageType.Register): - # ctypedef = dtypes.pointer(nodedesc.dtype).ctype - # declaration_stream.write(f'{nodedesc.dtype.ctype} *{name} = nullptr;\n', cfg, state_id, node) - # #Tensor c1 = graph.addConstant(FLOAT, {4}, {1.0, 1.5, 2.0, 2.5}); - # declaration_stream.write(f'{nodedesc.dtype.ctype} {name}_const = graph.addConstant<{nodedesc.dtype.ctype}>({nodedesc.dtype.ctype.capitalize}, {arrsize}, {nodedesc.ctype}({nodedesc.dtype.ctype}));\n', cfg, state_id, node) - # self._dispatcher.declared_arrays.add(name, DefinedType.Pointer, ctypedef) - # return - # else: - # raise NotImplementedError("Unimplemented storage type " + str(nodedesc.storage)) - + def generate_state(self, + sdfg:SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + function_stream: CodeIOStream, + callsite_stream:CodeIOStream, + generate_state_footer:bool = True): + + print("IPU STATE\n") + # disp = self.dispatcher.get_scope_dispatcher(dtypes.ScheduleType.Unrolled) + ipu_disp = self.dispatcher.get_state_dispatcher(sdfg, state=state) + cpu_disp = self.cpu_codegen + self.dispatcher._used_targets.add(ipu_disp) + self.dispatcher._used_targets.add(cpu_disp) + + state_id = state.block_id + + if IPUCodeGen._in_device_code: + print("IN DEVICE CODE") + subgraphs = dace.sdfg.concurrent_subgraphs(state) + 
self.generate_nested_state(sdfg, cfg, state, state.label, subgraphs, function_stream, callsite_stream) + NotImplementedError("IPU Device codegen not supported") + else: + print("IN HOST CODE") + kernels = [] # List of tuples (subgraph, kernel_id) + # Start a new state code generation: reset previous dependencies if any + self._kernels_dependencies.clear() + self._kernels_names_to_id.clear() + + # For now only 1 kernel. + kernels = [(state, 0)] + self._num_kernels = len(kernels) + + state_parameters = [] + state_host_header_stream = CodeIOStream() + state_host_body_stream = CodeIOStream() + instrumentation_stream = CodeIOStream() + + # Kernels are now sorted considering their dependencies + for kern, kern_id in kernels: + callsite_stream.write("\n SJJ: kernel started") + # Generate all kernels in this state + subgraphs = dace.sdfg.concurrent_subgraphs(kern) + single_sgs: list(ScopeSubgraphView) = [] + for sg in subgraphs: + if sg is not None: + single_sgs.append(sg) + # skip multigraphs for now + + shared_transients = set(sdfg.shared_transients()) + # Allocate global memory transients, unless they are shared with + # other states + all_transients = set(kern.all_transients()) + allocated = set(shared_transients) + for node in kern.data_nodes(): + data = node.desc(sdfg) + if node.data not in all_transients or node.data in allocated: + continue + if (data.storage == dtypes.StorageType.IPU_Memory and not isinstance(data, data.View)): + print("Allocating data") + allocated.add(node.data) + self.dispatcher.dispatch_allocate(sdfg, cfg, kern, state_id, node, data, function_stream, + callsite_stream) + callsite_stream.write("\n SJJ: Data allocated") + # Create a unique kernel name to avoid name clashes + # If this kernels comes from a Nested SDFG, use that name also + if sdfg.parent_nsdfg_node is not None: + kernel_name = f"{sdfg.parent_nsdfg_node.label}_{state.label}_{kern_id}_{cfg.cfg_id}" + else: + kernel_name = f"{state.label}_{kern_id}_{cfg.cfg_id}" + + # Add kernel 
name to the list of kernels + self._kernels_names_to_id[kernel_name] = kern_id + # Generate kernel code + self.generate_kernel(sdfg, cfg, state, kernel_name, single_sgs, function_stream, callsite_stream, + state_host_header_stream, state_host_body_stream, instrumentation_stream, + state_parameters, kern_id) + callsite_stream.write("\n SJJ: Kernel generated") + + kernel_host_stream = CodeIOStream() + self.generate_host_function(sdfg, cfg, state, state_id, function_stream, callsite_stream, state_host_header_stream, state_host_body_stream, instrumentation_stream, kernel_host_stream) + + # Store code strings to be passed to compilation phase + self._host_codes.append((kernel_name, kernel_host_stream.getvalue())) + + # self.frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) + ############################################################################################################ # #### Helpers - + ## Generate the global function here + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[mmlt.Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + self.dispatcher.dispatch_copy(src_node, dst_node, edge, sdfg, cfg, state_dfg, state_id, function_stream, + callsite_stream) + + def generate_nested_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: dace.SDFGState, nest_name: str, + subgraphs: List[ScopeSubgraphView], function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: + + for sg in subgraphs: + self.dispatcher.dispatch_subgraph(sdfg, + cfg, + sg, + sdfg.node_id(state), + function_stream, + callsite_stream, + skip_entry_node=False) + + def generate_host_function(self, sdfg, cfg, state, state_id, function_stream, callsite_stream, state_host_header_stream, state_host_body_stream, instrumentation_stream, kernel_host_stream): + # Basic arguments 
setting + kernel_args_call_host = [] + kernel_args_opencl = [] + # Include state in args + kernel_args_opencl.append(f"{cpp.mangle_dace_state_struct_name(self._global_sdfg)} *__state") + kernel_args_call_host.append(f"__state") + + # real code starts + + host_function_name = f"__dace_runstate_{cfg.cfg_id}_{state.name}_{state_id}" + function_stream.write("\n\nDACE_EXPORTED void {}({});\n\n".format(host_function_name, + ", ".join(kernel_args_opencl))) + + # add generated header information + kernel_host_stream.write(state_host_header_stream.getvalue()) + + kernel_host_stream.write(f"""\ + DACE_EXPORTED void {host_function_name}({', '.join(kernel_args_opencl)}) {{""") + + kernel_host_stream.write(f"""\ + hlslib::ocl::Program program = __state->fpga_context->Get().CurrentlyLoadedProgram();\ + """) + # Create a vector to collect all events that are being generated to allow + # waiting before exiting this state + kernel_host_stream.write("std::vector all_events;") + + # Kernels invocations + kernel_host_stream.write(state_host_body_stream.getvalue()) + + # Wait for all events + kernel_host_stream.write("hlslib::ocl::WaitForEvents(all_events);") + + kernel_host_stream.write("}\n") + + callsite_stream.write("{}({});".format(host_function_name, ", ".join(kernel_args_call_host))) + + def generate_kernel(self, + sdfg: dace.SDFG, + cfg: ControlFlowRegion, + state: dace.SDFGState, + kernel_name: str, + subgraphs: list, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + state_host_header_stream: CodeIOStream, + state_host_body_stream: CodeIOStream, + instrumentation_stream: CodeIOStream, + state_parameters: list, + kernel_id: int = None): + """ + Entry point for generating an FPGA Kernel out of the given subgraphs. + + :param sdfg: + :param state: + :param kernel_name: the generated kernel name. + :param subgraphs: the connected components that constitute this kernel. + :param function_stream: CPU code stream, contains global declarations. 
+ :param callsite_stream: CPU code stream, contains code for invoking kernels, ... + :param state_host_header_stream: Device-specific host code stream: contains the host code + for the state global declarations. + :param state_host_body_stream: Device-specific host code stream: contains all the code related + to this state, for creating transient buffers, spawning kernels, and synchronizing them. + :param instrumentation_stream: Code for profiling kernel execution time. + :param state_parameters: a list of parameters that must be passed to the state. It will get populated + considering all the parameters needed by the kernels in this state. + :param kernel_id: Unique ID of this kernels as computed in the generate_state function + """ + kernel_stream = CodeIOStream() + # # Actual kernel code generation + # self.generate_kernel_internal(sdfg, cfg, state, kernel_name, predecessors, subgraphs, kernel_stream, + # state_host_header_stream, state_host_body_stream, instrumentation_stream, + # function_stream, callsite_stream, state_parameters) + kernel_stream.write(f"// Kernel {kernel_name} called here", sdfg, state) + # Store code strings to be passed to compilation phase + self._kernel_codes.append((kernel_name, kernel_stream.getvalue())) + def add_header(self, function_stream: CodeIOStream): if self.has_generated_header: return diff --git a/dace/dtypes.py b/dace/dtypes.py index 7aab0a61d8..6d2ebed7e8 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -19,7 +19,7 @@ class DeviceType(aenum.AutoNumberEnum): CPU = () #: Multi-core CPU GPU = () #: GPU (AMD or NVIDIA) - # IPU = () #: IPU (Graphcore) + IPU = () #: IPU (Graphcore) FPGA = () #: FPGA (Intel or Xilinx) Snitch = () #: Compute Cluster (RISC-V) @@ -80,8 +80,11 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch = () Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping - # IPU_Map = () #: IPU (Graphcore) + IPU_SCHEDULE = () #: IPU (Graphcore) +IPU_SCHEDULES = [ + ScheduleType.IPU_SCHEDULE, +] # A 
subset of GPU schedule types GPU_SCHEDULES = [ @@ -203,7 +206,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.FPGA_Device: StorageType.FPGA_Global, ScheduleType.SVE_Map: StorageType.CPU_Heap, ScheduleType.Snitch: StorageType.Snitch_TCDM, - # ScheduleType.IPU_Map: StorageType.IPU_Tile_Local, + # ScheduleType.IPU_SCHEDULE: StorageType.IPU_Memory, } # Maps from ScheduleType to default ScheduleType for sub-scopes @@ -238,7 +241,7 @@ class TilingType(aenum.AutoNumberEnum): StorageType.GPU_Shared: ScheduleType.GPU_ThreadBlock, StorageType.FPGA_Global: ScheduleType.FPGA_Device, StorageType.SVE_Register: ScheduleType.SVE_Map, - # StorageType.IPU_Tile_Local: ScheduleType.IPU_Map + # StorageType.IPU_Memory: ScheduleType.IPU_SCHEDULE, } # Translation of types to C types diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py index 35a1755669..5eb745c000 100644 --- a/graphcore_dace/handcrafted_sdfg_scalar_add.py +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -372,6 +372,187 @@ def only_state(): def add(A, B, C): C = A + B + + +def allocate_data(sdfg): + + # data + sdfg.add_array('A', + shape=[20], + dtype=dace.int32, + storage=dace.StorageType.CPU_Heap, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=20) + sdfg.add_array('B', + shape=[20], + dtype=dace.int32, + storage=dace.StorageType.CPU_Heap, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=20) + # Add a C array + sdfg.add_array('C', + shape=[20], + dtype=dace.int32, + storage=dace.StorageType.CPU_Heap, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=20) + + # add a _tmp1 accessnode with transient state, shape 1 and dtype int32 + sdfg.add_array('_tmp1', + shape=[1], + dtype=dace.int32, + 
storage=dace.StorageType.Register, + location=None, + transient=True, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=1) + + # me, mx = state.add_map('outer', dict(i='0:2')) + # nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'a'}, {'b'}) + # state.add_memlet_path(rnode, me, nsdfg_node, dst_conn='a', memlet=dace.Memlet.simple('A', 'i')) + # state.add_memlet_path(nsdfg_node, mx, wnode, src_conn='b', memlet=dace.Memlet.simple('A', 'i')) + +def gpu_vector_add_python_copy(): + + # # add a _tmp1 accessnode with transient state, shape 1 and dtype int32 + # sdfg.add_array('_tmp1_outer', + # shape=[1], + # dtype=dace.int32, + # storage=dace.StorageType.Register, + # location=None, + # transient=True, + # strides=[1], + # offset=[0], + # lifetime=dace.AllocationLifetime.Scope, + # debuginfo=None, total_size=1) + def nested() -> dace.SDFG: + # Inner SDFG + nsdfg = dace.SDFG('nested') + nsdfg.add_array('a', [1], dace.int32) + nsdfg.add_array('b', [1], dace.int32) + nsdfg.add_array('c', [1], dace.int32) + nsdfg.add_transient('t', [1], dace.int32) + + # init state + ninitstate = nsdfg.add_state() + # a,b->t state + nstate = nsdfg.add_state() + irnode = nstate.add_read('a') + irnodeb = nstate.add_read('b') + task = nstate.add_tasklet('t1', {'inp1', 'inp2'}, {'out'}, 'out = inp1 + inp2') + iwnode = nstate.add_write('t') + nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('a', '0')) + nstate.add_edge(irnodeb, None, task, 'inp2', dace.Memlet.simple('b', '0')) + nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('t', '0')) + + # t->c state + first_state = nstate + nstate = nsdfg.add_state() + irnode = nstate.add_read('t') + task = nstate.add_tasklet('t2', {'inp1'}, {'out1'}, 'out1 = inp1') + iwnode = nstate.add_write('c') + nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('t', '0')) + nstate.add_edge(task, 'out1', iwnode, None, dace.Memlet.simple('c', '0')) + + nsdfg.add_edge(ninitstate, 
first_state, dace.InterstateEdge()) + nsdfg.add_edge(first_state, nstate, dace.InterstateEdge()) + return nsdfg + + + ############################################################### + # Outer SDFG + sdfg = dace.SDFG('gpu_vector_add_python_copy') + # data + sdfg.add_array('A_outer', + shape=[20], + dtype=dace.int32, + storage=dace.StorageType.CPU_Heap, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=20) + sdfg.add_array('B_outer', + shape=[20], + dtype=dace.int32, + storage=dace.StorageType.CPU_Heap, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=20) + # Add a C array + sdfg.add_array('C_outer', + shape=[20], + dtype=dace.int32, + storage=dace.StorageType.CPU_Heap, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=20) + + sdfg.add_symbol('i', dace.int32) + + # State machine + initstate = sdfg.add_state("init") + state = sdfg.add_state() + rnode = state.add_read('A_outer') + rnodeb = state.add_read('B_outer') + wnode = state.add_write('C_outer') + me, mx = state.add_map('map_parallelizn', dict(i='0:20')) + nsdfg_node = state.add_nested_sdfg(nested(), None, {'a', 'b'}, {'c'}, schedule=dace.ScheduleType.Sequential) + state.add_memlet_path(rnode, me, nsdfg_node, dst_conn='a', memlet=dace.Memlet.simple('A_outer', 'i')) + state.add_memlet_path(rnodeb, me, nsdfg_node, dst_conn='b', memlet=dace.Memlet.simple('B_outer', 'i')) + state.add_memlet_path(nsdfg_node, mx, wnode, src_conn='c', memlet=dace.Memlet.simple('C_outer', 'i')) + + # add state edges + sdfg.add_edge(initstate, state, dace.InterstateEdge()) + + ###########CODEGEN################ + A = np.random.rand(20) + B = np.random.rand(20) + C = np.zeros(20) + print("A Values:", A) + print("B Values:", B) + print("C Values:", C) + + sdfg = sdfg(A, B, C) + + +# def 
gpu_vec_add_python(): + +# @dace.program +# def gpu_vector_add(A: dace.int32, B: dace.int32, C: dace.int32): +# for i in dace.map[0:20]: # parallelization construct +# C[i] = A[i] + B[i] + +# sdfg = gpu_vector_add.to_sdfg(simplify=False) # compiled SDFG +# sdfg.apply_transformations(GPUTransformSDFG) + +# # call with values +# A = np.ones((20), dtype=np.int32) # 1,1,1,1,... +# B = np.ones((20), dtype=np.int32) # 1,1,1,1,... +# C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... +# sdfg(A, B, C) + # main if __name__ == "__main__": # handcrafted_sdfg_scalar_add() @@ -387,5 +568,6 @@ def add(A, B, C): # only_state() # print (C) # vector_add() - gpu_scalar_add() + # gpu_scalar_add() gpu_accessnode_test() + # gpu_vector_add_python_copy() diff --git a/graphcore_dace/ipu_test.py b/graphcore_dace/ipu_test.py new file mode 100644 index 0000000000..244ae9204c --- /dev/null +++ b/graphcore_dace/ipu_test.py @@ -0,0 +1,192 @@ +import dace +import numpy as np +from dace.transformation.interstate.gpu_transform_sdfg import GPUTransformSDFG + + +def nested() -> dace.SDFG: + # Inner SDFG + nsdfg = dace.SDFG('nested') + nsdfg.add_array('a', [1], dace.int32) + nsdfg.add_array('b', [1], dace.int32) + nsdfg.add_array('c', [1], dace.int32) + nsdfg.add_transient('t', [1], dace.int32) + + # init state + ninitstate = nsdfg.add_state() + # a,b->t state + nstate = nsdfg.add_state() + irnode = nstate.add_read('a') + irnodeb = nstate.add_read('b') + task = nstate.add_tasklet('t1', {'inp1', 'inp2'}, {'out'}, 'out = inp1 + inp2') + iwnode = nstate.add_write('t') + nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('a', '0')) + nstate.add_edge(irnodeb, None, task, 'inp2', dace.Memlet.simple('b', '0')) + nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('t', '0')) + + # t->c state + first_state = nstate + nstate = nsdfg.add_state() + irnode = nstate.add_read('t') + task = nstate.add_tasklet('t2', {'inp1'}, {'out1'}, 'out1 = inp1') + iwnode = nstate.add_write('c') + 
nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('t', '0')) + nstate.add_edge(task, 'out1', iwnode, None, dace.Memlet.simple('c', '0')) + + nsdfg.add_edge(ninitstate, first_state, dace.InterstateEdge()) + nsdfg.add_edge(first_state, nstate, dace.InterstateEdge()) + + return nsdfg + +def ipu_vector_add_python_copy(): + + ############################################################### + # Outer SDFG + sdfg = dace.SDFG('gpu_vector_add_python_copy') + # data + sdfg.add_array('A_outer', + shape=[20], + dtype=dace.int32, + storage=dace.StorageType.IPU_Memory, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=20) + sdfg.add_array('B_outer', + shape=[20], + dtype=dace.int32, + storage=dace.StorageType.IPU_Memory, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=20) + # Add a C array + sdfg.add_array('C_outer', + shape=[20], + dtype=dace.int32, + storage=dace.StorageType.IPU_Memory, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=20) + + sdfg.add_symbol('i', dace.int32) + + # State machine + initstate = sdfg.add_state("init") + state = sdfg.add_state() + rnode = state.add_read('A_outer') + rnodeb = state.add_read('B_outer') + wnode = state.add_write('C_outer') + me, mx = state.add_map('map_parallelizn', dict(i='0:20')) #, schedule=dace.ScheduleType.IPU_SCHEDULE) + nsdfg_node = state.add_nested_sdfg(nested(), None, {'a', 'b'}, {'c'}, schedule=dace.ScheduleType.Sequential) + state.add_memlet_path(rnode, me, nsdfg_node, dst_conn='a', memlet=dace.Memlet.simple('A_outer', 'i')) + state.add_memlet_path(rnodeb, me, nsdfg_node, dst_conn='b', memlet=dace.Memlet.simple('B_outer', 'i')) + state.add_memlet_path(nsdfg_node, mx, wnode, src_conn='c', memlet=dace.Memlet.simple('C_outer', 'i')) + + # add state edges + 
sdfg.add_edge(initstate, state, dace.InterstateEdge()) + + ###########CODEGEN################ + A = np.random.rand(20) + B = np.random.rand(20) + C = np.zeros(20) + print("A Values:", A) + print("B Values:", B) + print("C Values:", C) + + sdfg = sdfg(A, B, C) + +def ipu_test1(): + nsdfg = dace.SDFG('ipu_test1') + # data + nsdfg.add_array('a', + shape=[1], + dtype=dace.int32, + storage=dace.StorageType.IPU_Memory, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=1) + nsdfg.add_array('b', + shape=[1], + dtype=dace.int32, + storage=dace.StorageType.IPU_Memory, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=1) + # Add a C array + nsdfg.add_array('c', + shape=[1], + dtype=dace.int32, + storage=dace.StorageType.IPU_Memory, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=1) + + + nsdfg.add_symbol('i', dace.int32) + # nsdfg.add_transient('t', [1], dace.int32) + nsdfg.add_array('t', + shape=[1], + dtype=dace.int32, + storage=dace.StorageType.IPU_Memory, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=1) + + + # init state + ninitstate = nsdfg.add_state() + # a,b->t state + nstate = nsdfg.add_state() + irnode = nstate.add_read('a') + irnodeb = nstate.add_read('b') + task = nstate.add_tasklet('t1', {'inp1', 'inp2'}, {'out'}, 'out = inp1 + inp2') + iwnode = nstate.add_write('t') + nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('a', '0')) + nstate.add_edge(irnodeb, None, task, 'inp2', dace.Memlet.simple('b', '0')) + nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('t', '0')) + + # t->c state + first_state = nstate + # nstate = nsdfg.add_state() + # irnode = nstate.add_read('t') + # task = 
nstate.add_tasklet('t2', {'inp1'}, {'out1'}, 'out1 = inp1') + # iwnode = nstate.add_write('c') + # nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('t', '0')) + # nstate.add_edge(task, 'out1', iwnode, None, dace.Memlet.simple('c', '0')) + + nsdfg.add_edge(ninitstate, first_state, dace.InterstateEdge()) + # nsdfg.add_edge(first_state, nstate, dace.InterstateEdge()) + ###########CODEGEN################ + A = np.random.rand(20) + B = np.random.rand(20) + C = np.zeros(20) + # codeobjects = nsdfg(A, B, C).generate_code() + code = nsdfg(A, B, C).generate_code(recompile=False)[0].clean_code + + +# main +if __name__ == "__main__": + ipu_test1() + # nested() + # ipu_vector_add_python_copy() + + \ No newline at end of file From 5207d2d05c2f4757f5d06d3e4dc874952d42c4d0 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 13 Sep 2024 15:12:44 -0600 Subject: [PATCH 55/77] 1. Insert all the golden file code from Poplar example. 2. In the Host/ side the pipeline + functions are present. 3. The pipeline is not yet set, next patch might set it. 4. Lots of bugs still 5. 
Cerete Host/ Device/ files along with cpu/ --- dace/codegen/targets/ipu.py | 332 +++++++++++++++++++++++------------- 1 file changed, 209 insertions(+), 123 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 4907a243e2..7a2e2eb8da 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -29,6 +29,9 @@ from dace.sdfg.validation import validate_memlet_data from dace.sdfg.graph import MultiConnectorEdge from dace.codegen.targets.ipu_files import ipu_utils as ipu_utils +from dace.codegen.targets.cpp import (codeblock_to_cpp, cpp_array_expr, memlet_copy_to_absolute_strides, sym2cpp, + synchronize_streams, unparse_cr, mangle_dace_state_struct_name) + import copy import functools import itertools @@ -128,79 +131,208 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): # self._dispatcher.register_map_dispatcher(dace.ScheduleType.IPU, self) # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) - # def preprocess(self, sdfg: SDFG) -> None: + def preprocess(self, sdfg: SDFG) -> None: + + #create a new string + str_decl = StringIO() + str_decl = f""" +// Declare variables +optional device; // Declaration +Graph graph; // Declaration +map tensors; // Declaration +map programs; // Declaration +OptionFlags ENGINE_OPTIONS; // Declaration +map programIds; // Declaration +vector programsList; // Declaration +vector hostData; // Declaration +Engine engine; +""" + # Add above code to the statestruct + self.frame.statestruct.append(str_decl) + # # hack to get the ipu codegen to work # # self._toplevel_schedule = dtypes.ScheduleType.IPU_SCHEDULE def get_generated_codeobjects(self): - - execution_mode = Config.get("compiler", "xilinx", "mode") - - kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self.program_name) - if execution_mode == "software_emulation": - kernel_file_name += "_sw_emu.xclbin\"" - xcl_emulation_mode = "\"sw_emu\"" - xilinx_sdx = "DACE_VITIS_DIR" - elif 
execution_mode == "hardware_emulation": - kernel_file_name += "_hw_emu.xclbin\"" - xcl_emulation_mode = "\"hw_emu\"" - xilinx_sdx = "DACE_VITIS_DIR" - elif execution_mode == "hardware" or execution_mode == "simulation": - kernel_file_name += "_hw.xclbin\"" - xcl_emulation_mode = None - xilinx_sdx = None - else: - raise cgx.CodegenError("Unknown Xilinx execution mode: {}".format(execution_mode)) - - set_env_vars = "" - set_str = "dace::set_environment_variable(\"{}\", {});\n" - unset_str = "dace::unset_environment_variable(\"{}\");\n" - set_env_vars += (set_str.format("XCL_EMULATION_MODE", xcl_emulation_mode) - if xcl_emulation_mode is not None else unset_str.format("XCL_EMULATION_MODE")) - set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx) - if xilinx_sdx is not None else unset_str.format("XILINX_SDX")) - set_env_vars += set_str.format( - "EMCONFIG_PATH", - "DACE_BINARY_DIR") if execution_mode == 'hardware_emulation' else unset_str.format("EMCONFIG_PATH") - - host_code = CodeIOStream() - host_code.write("""\ -#include "dace/xilinx/host.h" -#include "dace/dace.h" -#include "dace/xilinx/stream.h" -""") - host_code.write("\n\n") - - self.frame.generate_fileheader(self._global_sdfg, host_code, 'xilinx_host') - + fileheader = CodeIOStream() + fileheader.write(""" + #include + #include + #include + #include + #include + #include + #include + + #include + #include + #include + #include + #include + #include + #include + + using ::std::map; + using ::std::optional; + using ::std::string; + using ::std::vector; + + using ::poplar::Device; + using ::poplar::DeviceManager; + using ::poplar::Engine; + using ::poplar::FLOAT; + using ::poplar::Graph; + using ::poplar::OptionFlags; + using ::poplar::TargetType; + using ::poplar::Tensor; + using ::poplar::program::Copy; + using ::poplar::program::Execute; + using ::poplar::program::Program; + using ::poplar::program::Repeat; + using ::poplar::program::Sequence; + + + """) + fileheader.write("\n\n") + constdefines = 
CodeIOStream() + constdefines.write("""const auto NUM_DATA_ITEMS = 200000;""") + constdefines.write("\n\n") + params_comma = self._global_sdfg.init_signature(free_symbols=self.frame.free_symbols(self._global_sdfg)) if params_comma: params_comma = ', ' + params_comma - + + host_code = CodeIOStream() host_code.write(""" -DACE_EXPORTED int __dace_init_xilinx({sdfg_state_name} *__state{signature}) {{ - {environment_variables} +#include + +{file_header} +{const_defines} + +DACE_EXPORTED int __dace_init_ipu({sdfg_state_name} *__state{params}); +DACE_EXPORTED int __dace_exit_ipu({sdfg_state_name} *__state); + +// {other_globalcode} + +int __dace_init_ipu({sdfg_state_name} *__state{params}) {{ + + __state->tensors = map{{}}; // Assignment + __state->programs = map{{}}; // Assignment + ENGINE_OPTIONS = OptionFlags{{ + // Assignment + {{"target.saveArchive", "archive.a"}}, + {{"debug.instrument", "true"}}, + {{"debug.instrumentCompute", "true"}}, + {{"debug.instrumentControlFlow", "true"}}, + {{"debug.computeInstrumentationLevel", "tile"}}, + {{"debug.outputAllSymbols", "true"}}, + {{"autoReport.all", "true"}}, + {{"autoReport.outputSerializedGraph", "true"}}, + {{"debug.retainDebugInformation", "true"}}, + }}; + __state->programIds = map(); // Assignment + __state->programsList = vector(__state->programs.size()); // Assignment + int index = 0; + for (auto &nameToProgram : __state->programs) {{ + __state->programIds[nameToProgram.first] = index; + __state->programsList[index] = nameToProgram.second; + index++; + }} + __state->hostData = vector(NUM_DATA_ITEMS, 1); // Assignment - __state->fpga_context = new dace_fpga_context(); - __state->fpga_context->Get().MakeProgram({kernel_file_name}); return 0; }} -DACE_EXPORTED int __dace_exit_xilinx({sdfg_state_name} *__state) {{ - delete __state->fpga_context; +int __dace_exit_ipu({sdfg_state_name} *__state) {{ return 0; }} -{host_code}""".format(signature=params_comma, - sdfg=self._global_sdfg, - 
sdfg_state_name=cpp.mangle_dace_state_struct_name(self._global_sdfg), - environment_variables=set_env_vars, - kernel_file_name=kernel_file_name, - host_code="".join([ +auto __dace_getIpuDevice({sdfg_state_name} *__state, const unsigned int numIpus = 1) -> optional +{{ + DeviceManager manager = DeviceManager::createDeviceManager(); + optional device = std::nullopt; + for (auto &d : manager.getDevices(TargetType::IPU, numIpus)) {{ + std::cout << "Trying to attach to IPU " << d.getId(); + if (d.attach()) {{ + std::cout << " - attached" << std::endl; + device = {{std::move(d)}}; + break; + }} else {{ + std::cout << std::endl << "Error attaching to device" << std::endl; + }} + }} + return device; +}} + +auto __dace_createGraphAndAddCodelets({sdfg_state_name} *__state, const optional &device) -> Graph +{{ + Graph graph; // Declaration + graph = poplar::Graph(device->getTarget()); // Assignment + + // Add our custom codelet, building from CPP source + // with the given popc compiler options + graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); + + // Add the codelets for the popops librarys + popops::addCodelets(graph); + return graph; +}} + +auto __dace_buildComputeGraph({sdfg_state_name} *__state, Graph &graph, map &tensors, + map &programs, const int numTiles) +{{ + // Add tensors + tensors["data"] = graph.addVariable(poplar::FLOAT, {{NUM_DATA_ITEMS}}, "data"); + poputil::mapTensorLinearly(graph, tensors["data"]); + + // // Add programs and wire up data + // const auto NumElemsPerTile = NUM_DATA_ITEMS / numTiles; + // auto cs = graph.addComputeSet("loopBody"); + // for (auto tileNum = 0; tileNum < numTiles; tileNum++) {{ + // const auto sliceEnd = + // std::min((tileNum + 1) * NumElemsPerTile, (int)NUM_DATA_ITEMS); + // const auto sliceStart = tileNum * NumElemsPerTile; + + // auto v = graph.addVertex( + // cs, "SkeletonVertex", + // {{{{"data", tensors["data"].slice(sliceStart, sliceEnd)}}}}); + // graph.setInitialValue(v["howMuchToAdd"], 
tileNum); + // graph.setPerfEstimate(v, + // 100); // Ideally you'd get this as right as + // possible + // graph.setTileMapping(v, tileNum); + // }} + // auto executeIncrementVertex = Execute(cs); + + // auto mainProgram = Repeat(10, executeIncrementVertex, "repeat10x"); + // programs["main"] = mainProgram; // Program 0 will be the main program +}} + + +auto __dace_defineDataStreams({sdfg_state_name} *__state, Graph &graph, map &tensors, + map &programs) +{{ + auto toIpuStream = graph.addHostToDeviceFIFO("TO_IPU", FLOAT, NUM_DATA_ITEMS); + auto fromIpuStream = + graph.addDeviceToHostFIFO("FROM_IPU", FLOAT, NUM_DATA_ITEMS); + + auto copyToIpuProgram = Copy(toIpuStream, tensors["data"]); + auto copyToHostProgram = Copy(tensors["data"], fromIpuStream); + + programs["copy_to_ipu"] = copyToIpuProgram; + programs["copy_to_host"] = copyToHostProgram; +}} + +{host_code_seperator}""".format(params=params_comma, + sdfg_state_name=mangle_dace_state_struct_name(self._global_sdfg), + other_globalcode=self._globalcode.getvalue(), + file_header=fileheader.getvalue(), + const_defines = constdefines.getvalue(), + sdfg=self._global_sdfg, + host_code_seperator="".join([ "{separator}\n// Kernel: {kernel_name}" "\n{separator}\n\n{code}\n\n".format(separator="/" * 79, kernel_name=name, code=code) - for (name, code) in self._host_codes - ]))) + for (name, code) in self._host_codes]))) host_code_obj = CodeObject(self.program_name, host_code.getvalue(), @@ -209,6 +341,7 @@ def get_generated_codeobjects(self): "IPU", target_type="host") + # Device object kernel_code_objs = [ CodeObject(kernel_name, code, @@ -217,20 +350,18 @@ def get_generated_codeobjects(self): "IPU", target_type="device") for (kernel_name, code) in self._kernel_codes ] - - res = super().get_generated_codeobjects() # Not sure why is this object here, fix it later. 
- print(res) - return [host_code_obj] + kernel_code_objs + res + + return [host_code_obj] + kernel_code_objs # __dace_init_ function @property def has_initializer(self): - return True + return False # __dace_exit_ function @property def has_finalizer(self): - return True + return False def state_dispatch_predicate(self, sdfg, state): if self._toplevel_schedule == dtypes.ScheduleType.IPU_SCHEDULE: @@ -794,7 +925,6 @@ def generate_state(self, function_stream: CodeIOStream, callsite_stream:CodeIOStream, generate_state_footer:bool = True): - print("IPU STATE\n") # disp = self.dispatcher.get_scope_dispatcher(dtypes.ScheduleType.Unrolled) ipu_disp = self.dispatcher.get_state_dispatcher(sdfg, state=state) @@ -1002,29 +1132,30 @@ def add_header(self, function_stream: CodeIOStream): # def debug_print_self(self): # print("IN GENERATE_STATE") + # # print below ones as well - # print("TargetDispatcher:", self._dispatcher) - # print("init_code", self._frame._initcode.getvalue()) - # print("exit_code", self._frame._exitcode.getvalue()) - # print("Len env:", len(self._frame.environments)) - # for _x in self._frame.statestruct: + # print("TargetDispatcher:", self.dispatcher) + # print("init_code", self.frame._initcode.getvalue()) + # print("exit_code", self.frame._exitcode.getvalue()) + # print("Len env:", len(self.frame.environments)) + # for _x in self.frame.statestruct: # print("statestruct:", _x) - # print("environments:", self._frame.environments) - # print("targets:", self._frame.targets) - # print("to_allocate:", self._frame.to_allocate) - # print("where_allocated:", self._frame.where_allocated) - # print("fsyms:", self._frame.fsyms) - # print("_symbols_and_constants:", self._frame._symbols_and_constants) - # print("arglist:", self._frame.arglist) + # print("environments:", self.frame.environments) + # print("targets:", self.frame.targets) + # print("to_allocate:", self.frame.to_allocate) + # print("where_allocated:", self.frame.where_allocated) + # print("fsyms:", 
self.frame.fsyms) + # print("_symbols_and_constants:", self.frame._symbols_and_constants) + # print("arglist:", self.frame.arglist) # print ("DONE") # print("DISPATCHER Data") - # print ("used_env", self._dispatcher.used_environments) - # print ("used_targets", self._frame.dispatcher.used_targets) + # print ("used_env", self.dispatcher.used_environments) + # print ("used_targets", self.frame.dispatcher.used_targets) # print("DONE") # ####### # print("TargetCodeGenerator:", self) # print("language", self.language) - # # print("TargetDispatcher:", self._dispatcher.used_targets) + # print("TargetDispatcher:", self._dispatcher.used_targets) # def generate_scope(self, # sdfg: SDFG, @@ -1079,48 +1210,3 @@ def add_header(self, function_stream: CodeIOStream): # cppunparse.pyexpr2cpp(symbolic.symstr(skip, cpp_mode=True))), cfg, state_id, map_header) # self._frame.allocate_arrays_in_scope(sdfg, cfg, map_header, function_stream, callsite_stream) - - - # This will generate the src/cuda/xyz.cu files and folders using "codeObjects" class. - # We don't need this now as we are mostly concerned about a single file codegen as of now. - # def get_generated_codeobjects(self): - # fileheader = CodeIOStream() - # sdfg = self._global_sdfg - - # # cuda/mpi seemed to be using this follow - # params_comma = self._global_sdfg.init_signature(free_symbols=self._frame.free_symbols(self._global_sdfg)) - # if params_comma: - # params_comma = ', ' + params_comma - # codelet_file_code = """ - # // Copyright (c) 2018 Graphcore Ltd. All rights reserved. 
- # // Copied from tut3_vertices from Poplar SDK tutorials - - # #include - - # class SumVertex : public poplar::Vertex { - # public: - # // Fields - # poplar::Input> in; - # poplar::Output out; - - # // Compute function - # bool compute() { - # *out = 0; - # for (const auto &v : in) { - # *out += v; - # } - # return true; - # } - # }; - # """ - - # codeobj = CodeObject( - # name=sdfg.name + '_codelets', - # code=codelet_file_code, - # language='cpp', - # target=IPUCodeGen, - # title='IPU', - # linkable=False) - - # # Fill in the list - # return [codeobj] \ No newline at end of file From f6ac62f48bf2f82a6c5553e752b716e7190ac9f5 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 13 Sep 2024 16:16:06 -0600 Subject: [PATCH 56/77] fix bug where dace_init_target_ was missing --- dace/codegen/targets/ipu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 7a2e2eb8da..977705881e 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -356,12 +356,12 @@ def get_generated_codeobjects(self): # __dace_init_ function @property def has_initializer(self): - return False + return True # __dace_exit_ function @property def has_finalizer(self): - return False + return True def state_dispatch_predicate(self, sdfg, state): if self._toplevel_schedule == dtypes.ScheduleType.IPU_SCHEDULE: From e563d47bb7159c6c73cdc2991f842b47bab22da8 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 13 Sep 2024 17:21:37 -0600 Subject: [PATCH 57/77] Revert "Add arguments to the library, still we are not able to connect the inputs to the library and the inputs to SDFG, seems like we need to dig into some other ways of allocation of variables and such details." Reverting this as this is making it hard to understand a node as of now. This reverts commit 63fad929373cd83df37e897205632a4193a94535. 
--- dace/libraries/poplar/nodes/popmm.py | 16 ++-------------- tests/library/poplar/poplar_matmul.py | 2 +- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/dace/libraries/poplar/nodes/popmm.py b/dace/libraries/poplar/nodes/popmm.py index eee02fadf0..8854c068b3 100644 --- a/dace/libraries/poplar/nodes/popmm.py +++ b/dace/libraries/poplar/nodes/popmm.py @@ -23,11 +23,6 @@ def expansion(node, parent_state, parent_sdfg): init = f""" - {{ - {A_poplar_type} A = {node.A_scalar_param}; - Tensor B = {node.B_scalar_param}; - Tensor C = {node.C_scalar_param}; - }} // Add variables to the graph Tensor m1 = __state->graph.addVariable(FLOAT, {{900, 600}}, "m1"); Tensor m2 = __state->graph.addVariable(FLOAT, {{600, 300}}, "m2"); @@ -59,16 +54,9 @@ class IPUMatMul(dace.sdfg.nodes.LibraryNode): "MM": ExpandMMPopLib, } default_implementation = None - - A_scalar_param = dace.properties.Property(allow_none=False, default=0, desc="A scalar") - B_scalar_param = dace.properties.Property(allow_none=False, default=0, desc="B scalar") - C_scalar_param = dace.properties.Property(allow_none=False, default=0, desc="C scalar") - - def __init__(self, name, A_scalar_param, B_scalar_param, C_scalar_param): + + def __init__(self, name): super().__init__(name, inputs={"_inbufferA", "_inbufferB"}, outputs={"_outbufferC"}) - self.A_scalar_param = A_scalar_param - self.B_scalar_param = B_scalar_param - self.C_scalar_param = C_scalar_param def validate(self, sdfg, state): """ diff --git a/tests/library/poplar/poplar_matmul.py b/tests/library/poplar/poplar_matmul.py index f163fa226e..dadbff835c 100644 --- a/tests/library/poplar/poplar_matmul.py +++ b/tests/library/poplar/poplar_matmul.py @@ -20,7 +20,7 @@ def make_sdfg(dtype): b = state.add_access("B") c = state.add_access("C") - poplar_mm_node = poplar.nodes.popmm.IPUMatMul("MATMUL", A_scalar_param=10, B_scalar_param=10, C_scalar_param=0) + poplar_mm_node = poplar.nodes.popmm.IPUMatMul("MATMUL") poplar_mm_node.implementation = "MM" 
state.add_memlet_path(a, poplar_mm_node, dst_conn="_inbufferA", memlet=dace.Memlet(f"A")) From 940f6bc347511996b07789bb1894ca1b43ca0316 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sun, 15 Sep 2024 08:45:30 -0600 Subject: [PATCH 58/77] Add library node, register it, modify test for the same, goal is to have IPUCodegen + library codegen together. --- dace/codegen/targets/ipu.py | 8 ++++++- tests/library/poplar/poplar_matmul.py | 34 ++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 977705881e..774fe6851e 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -126,12 +126,13 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): # # Dispatchers # self.dispatcher.register_map_dispatcher(dace.ScheduleType.IPU_Map, self) # self.dispatcher.register_node_dispatcher(self, self.is_ipu_map_scope) + self.dispatcher.register_node_dispatcher(self, self.is_node_library_node) # self.dispatcher.register_node_dispatcher(self, self.is_node_tasklet) # self.dispatcher.register_copy_dispatcher(dtypes.StorageType.Register, dtypes.StorageType.IPU_Tile_Local, None, func=self) # self._dispatcher.register_map_dispatcher(dace.ScheduleType.IPU, self) # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) - def preprocess(self, sdfg: SDFG) -> None: + def preprocess(self, sdfg: SDFG) -> None: #create a new string str_decl = StringIO() @@ -382,6 +383,11 @@ def is_node_tasklet(self, sdfg, state, node): return True return False + def is_node_library_node(self, sdfg, state, node): + if isinstance(node, nodes.LibraryNode): + return True + return False + """ if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes(NestedSDFG, Consume, Map, LibraryNode) if node.schedule == dtypes.ScheduleType.Sequential: return True diff --git a/tests/library/poplar/poplar_matmul.py b/tests/library/poplar/poplar_matmul.py index 
dadbff835c..adbcdf2ee4 100644 --- a/tests/library/poplar/poplar_matmul.py +++ b/tests/library/poplar/poplar_matmul.py @@ -11,10 +11,38 @@ def make_sdfg(dtype): sdfg = dace.SDFG("poplar_matmul") state = sdfg.add_state("matmul_state") + sdfg.add_array('A', + shape=[1], + dtype=dace.int32, + storage=dace.StorageType.IPU_Memory, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=1) + sdfg.add_array('B', + shape=[1], + dtype=dace.int32, + storage=dace.StorageType.IPU_Memory, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=1) + # Add a C array + sdfg.add_array('C', + shape=[1], + dtype=dace.int32, + storage=dace.StorageType.IPU_Memory, + location=None, + transient=False, + strides=[1], + offset=[0], + lifetime=dace.AllocationLifetime.Scope, + debuginfo=None, total_size=1) - sdfg.add_array('A', [10], dtype) - sdfg.add_array('B', [10], dtype) - sdfg.add_array('C', [10], dtype) a = state.add_access("A") b = state.add_access("B") From 2ffc0c9520b1460ce00b16dfd63886a0df55200a Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Sun, 15 Sep 2024 08:46:53 -0600 Subject: [PATCH 59/77] Supress the building process --- dace/sdfg/sdfg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 3e5f58a413..9ec5808c05 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2243,8 +2243,8 @@ def compile(self, output_file=None, validate=True) -> 'CompiledSDFG': sdfg = self # Compile the code and get the shared library path - shared_library = compiler.configure_and_compile(program_folder, sdfg.name) - + # shared_library = compiler.configure_and_compile(program_folder, sdfg.name) + shared_library = "/dev/null" # TODO: Revert this HACK to supress the compiler output. 
# If provided, save output to path or filename if output_file is not None: if os.path.isdir(output_file): From ad13cfc85438a5f93ad1e02018a911edbd396c77 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Mon, 16 Sep 2024 10:00:41 -0600 Subject: [PATCH 60/77] Attempt to add Node dispatcher --- dace/codegen/targets/ipu.py | 79 +++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 774fe6851e..f787b7f8c4 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -100,6 +100,7 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): self._num_kernels = 0 self._host_codes = [] self._kernel_codes = [] + self._generated_nodes = [] # Register dispatchers @@ -126,8 +127,8 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): # # Dispatchers # self.dispatcher.register_map_dispatcher(dace.ScheduleType.IPU_Map, self) # self.dispatcher.register_node_dispatcher(self, self.is_ipu_map_scope) - self.dispatcher.register_node_dispatcher(self, self.is_node_library_node) - # self.dispatcher.register_node_dispatcher(self, self.is_node_tasklet) + # self.dispatcher.register_node_dispatcher(self, self.is_node_library_node) + self.dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) # self.dispatcher.register_copy_dispatcher(dtypes.StorageType.Register, dtypes.StorageType.IPU_Tile_Local, None, func=self) # self._dispatcher.register_map_dispatcher(dace.ScheduleType.IPU, self) # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) @@ -366,7 +367,6 @@ def has_finalizer(self): def state_dispatch_predicate(self, sdfg, state): if self._toplevel_schedule == dtypes.ScheduleType.IPU_SCHEDULE: - print("TRUE SAMEERAN") return True return False @@ -384,15 +384,13 @@ def is_node_tasklet(self, sdfg, state, node): return False def is_node_library_node(self, sdfg, state, node): + print("NODE is = ", 
type(node).__name__) if isinstance(node, nodes.LibraryNode): - return True + return True return False - """ if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes(NestedSDFG, Consume, Map, LibraryNode) - if node.schedule == dtypes.ScheduleType.Sequential: - return True - return False - """ + def node_dispatch_predicate(self, sdfg, state, node): + return True ############################################################################################################ # IPU specific node/state generation ############################################################################################################ @@ -752,30 +750,44 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView # self._emit_copy(state_id, src_node, src_storage, dst_node, dst_storage, dst_schedule, memlet, sdfg, cfg, dfg, # callsite_stream) - def generate_node(self, sdfg: SDFG, cfg: state.ControlFlowRegion, state: SDFGState, state_id: int, node: nodes.Node, - function_stream: CodeIOStream, callsite_stream: CodeIOStream): - """(TASKLET only) - 0. Declarations - 1. Generate pre tasklet - 2. Generate tasklet code - 3. Generate post tasklet - 4. 
Writes - """ - inner_stream, codegen = self.declarations(cfg, state_id, node, function_stream) - self.dispatcher.defined_vars.enter_scope(node) - ############################################################################################################ - # self.pre_tasklet(sdfg, cfg, state, state_id, node, function_stream, callsite_stream, inner_stream, codegen) - for edge in state.in_edges(node): - self.generate_read(sdfg, state, edge, inner_stream) - self.tasklet(sdfg, cfg, state, state_id, node, function_stream, inner_stream) - after_memlets_stream = self.post_tasklet(sdfg, cfg, state, state_id, node, function_stream, inner_stream, codegen) - ############################################################################################################ - callsite_stream.write('{', cfg, state_id, node) - callsite_stream.write(inner_stream.getvalue(), cfg, state_id, node) - callsite_stream.write(after_memlets_stream.getvalue()) - callsite_stream.write('}', cfg, state_id, node) - self._locals.clear_scope(self._ldepth + 1) - self.dispatcher.defined_vars.exit_scope(node) + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + print("Generating node: ", node.label) + # Dynamically obtain node generator according to class name + # gen = getattr(self, '_generate_' + type(node).__name__, False) + # if gen is not False: # Not every node type has a code generator here + # gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + # return + + # self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + + # def generate_node(self, sdfg: SDFG, cfg: state.ControlFlowRegion, state: SDFGState, state_id: int, node: nodes.Node, + # function_stream: CodeIOStream, callsite_stream: CodeIOStream): + # """(TASKLET only) + # 0. Declarations + # 1. Generate pre tasklet + # 2. 
Generate tasklet code + # 3. Generate post tasklet + # 4. Writes + # """ + # callsite_stream.write(f"// Generating node {node.label}\n") + # inner_stream, codegen = self.declarations(cfg, state_id, node, function_stream) + # self.dispatcher.defined_vars.enter_scope(node) + # ############################################################################################################ + # # self.pre_tasklet(sdfg, cfg, state, state_id, node, function_stream, callsite_stream, inner_stream, codegen) + # for edge in state.in_edges(node): + # self.generate_read(sdfg, state, edge, inner_stream) + # callsite_stream.write('SJJ:TASKLET', cfg, state_id, node) + # function_stream.write("SJJ:TASKLET Call {0}() {{\n".format(node.label), cfg, state_id, node) + # self.tasklet(sdfg, cfg, state, state_id, node, function_stream, inner_stream) + # after_memlets_stream = self.post_tasklet(sdfg, cfg, state, state_id, node, function_stream, inner_stream, codegen) + # ############################################################################################################ + # callsite_stream.write('{', cfg, state_id, node) + # callsite_stream.write(inner_stream.getvalue(), cfg, state_id, node) + # callsite_stream.write(after_memlets_stream.getvalue()) + # callsite_stream.write('}', cfg, state_id, node) + # self._locals.clear_scope(self._ldepth + 1) + # self.dispatcher.defined_vars.exit_scope(node) def declarations(self, cfg, state_id, node, function_stream): self.add_header(function_stream) @@ -812,6 +824,7 @@ def pre_tasklet(self, sdfg, cfg, state, state_id, node, function_stream, callsit def unparse_ipu_tasklet(self, sdfg, cfg, state_id, dfg, node, function_stream, inner_stream, locals, ldepth, toplevel_schedule): # Change it later to IPU specific + function_stream.write(f"SJJ: {node.label}() {{\n", cfg, state_id, node) self.cpu_codegen.unparse_tasklet(sdfg, cfg, state_id, dfg, node, function_stream, inner_stream, locals, ldepth, toplevel_schedule) From 
6d5189f26c317fcc35346019c806cc09d66034ec Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Wed, 18 Sep 2024 22:49:04 -0600 Subject: [PATCH 61/77] Turn off the node dispatcher and generate a state using some code from fpga and cuda. This creates a codegen which dumps both library + nodes --- dace/codegen/targets/framecode.py | 50 +++++++++++++++++++++++++++++++ dace/codegen/targets/ipu.py | 22 ++++++++++++-- 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index b83f1d7783..dfd2c1d52f 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -412,6 +412,56 @@ def generate_external_memory_management(self, sdfg: SDFG, callsite_stream: CodeI # Footer callsite_stream.write('}', sdfg) + def generate_ipu_state(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + global_stream: CodeIOStream, + callsite_stream: CodeIOStream, + generate_state_footer: bool = True): + callsite_stream.write(f'// GENIPU_STATE() {state.label} ({state.block_id})\n', sdfg) + sid = state.block_id + + # Emit internal transient array allocation + self.allocate_arrays_in_scope(sdfg, cfg, state, global_stream, callsite_stream) + + callsite_stream.write('\n') + + # Invoke all instrumentation providers + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_state_begin(sdfg, state, callsite_stream, global_stream) + + ##################### + # Create dataflow graph for state's children. + + # DFG to code scheme: Only generate code for nodes whose all + # dependencies have been executed (topological sort). + # For different connected components, run them concurrently. 
+ + components = dace.sdfg.concurrent_subgraphs(state) + + if len(components) <= 1: + self._dispatcher.dispatch_subgraph(sdfg, cfg, state, sid, global_stream, callsite_stream, + skip_entry_node=False) + else: + callsite_stream.write("{") + self._dispatcher.dispatch_subgraph(sdfg, cfg, c, sid, global_stream, callsite_stream, + skip_entry_node=False) + callsite_stream.write("}") + + ##################### + # Write state footer + + if generate_state_footer: + # Emit internal transient array deallocation + self.deallocate_arrays_in_scope(sdfg, state.parent_graph, state, global_stream, callsite_stream) + + # Invoke all instrumentation providers + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_state_end(sdfg, state, callsite_stream, global_stream) + def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index f787b7f8c4..6354f69b32 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -128,7 +128,7 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): # self.dispatcher.register_map_dispatcher(dace.ScheduleType.IPU_Map, self) # self.dispatcher.register_node_dispatcher(self, self.is_ipu_map_scope) # self.dispatcher.register_node_dispatcher(self, self.is_node_library_node) - self.dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) + # self.dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) # self.dispatcher.register_copy_dispatcher(dtypes.StorageType.Register, dtypes.StorageType.IPU_Tile_Local, None, func=self) # self._dispatcher.register_map_dispatcher(dace.ScheduleType.IPU, self) # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) @@ -955,11 +955,29 @@ def generate_state(self, if IPUCodeGen._in_device_code: print("IN DEVICE CODE") + + to_allocate = dace.sdfg.local_transients(sdfg, state, None) + allocated = set() subgraphs = 
dace.sdfg.concurrent_subgraphs(state) + + for node in state.data_nodes(): + data = node.desc(sdfg) + if node.data not in to_allocate or node.data in allocated: + continue + # Make sure there are no global transients in the nested state + # that are thus not gonna be allocated + if data.storage == dtypes.StorageType.IPU_Memory and not isinstance(data, data.View): + raise cgx.CodegenError("Cannot allocate global memory from device code.") + allocated.add(node.data) + # Allocate transients + self._dispatcher.dispatch_allocate(sdfg, cfg, state, state_id, node, data, function_stream, + callsite_stream) + self.generate_nested_state(sdfg, cfg, state, state.label, subgraphs, function_stream, callsite_stream) - NotImplementedError("IPU Device codegen not supported") + else: print("IN HOST CODE") + self.frame.generate_ipu_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) kernels = [] # List of tuples (subgraph, kernel_id) # Start a new state code generation: reset previous dependencies if any self._kernels_dependencies.clear() From e3193c443f58db81cc4ca23af05813420c12021c Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Fri, 20 Sep 2024 17:23:26 -0600 Subject: [PATCH 62/77] Revert "Supress the building process" This reverts commit 2ffc0c9520b1460ce00b16dfd63886a0df55200a. --- dace/sdfg/sdfg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 9ec5808c05..3e5f58a413 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2243,8 +2243,8 @@ def compile(self, output_file=None, validate=True) -> 'CompiledSDFG': sdfg = self # Compile the code and get the shared library path - # shared_library = compiler.configure_and_compile(program_folder, sdfg.name) - shared_library = "/dev/null" # TODO: Revert this HACK to supress the compiler output. 
+ shared_library = compiler.configure_and_compile(program_folder, sdfg.name) + # If provided, save output to path or filename if output_file is not None: if os.path.isdir(output_file): From 6ff38a63cf0e868cf029dd80e281eccabf3474b3 Mon Sep 17 00:00:00 2001 From: Sameeranjoshi Date: Mon, 30 Sep 2024 12:47:23 -0600 Subject: [PATCH 63/77] Move the headers to a common runtime include/ folder dace/runtime/include, debugging issue on real IPU machine, pushing incomplete changes" --- dace/codegen/targets/framecode.py | 26 +- dace/codegen/targets/ipu.py | 533 ++++++++++++---------- dace/runtime/include/dace/dace.h | 3 + dace/runtime/include/dace/poplar/host.h | 6 + dace/runtime/include/dace/poplar_common.h | 5 + dace/runtime/include/dace/poplar_device.h | 6 + dace/runtime/include/dace/poplar_host.h | 54 +++ 7 files changed, 377 insertions(+), 256 deletions(-) create mode 100644 dace/runtime/include/dace/poplar/host.h create mode 100644 dace/runtime/include/dace/poplar_common.h create mode 100644 dace/runtime/include/dace/poplar_device.h create mode 100644 dace/runtime/include/dace/poplar_host.h diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index dfd2c1d52f..fd8997d01d 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -411,7 +411,7 @@ def generate_external_memory_management(self, sdfg: SDFG, callsite_stream: CodeI # Footer callsite_stream.write('}', sdfg) - + def generate_ipu_state(self, sdfg: SDFG, cfg: ControlFlowRegion, @@ -439,16 +439,16 @@ def generate_ipu_state(self, # dependencies have been executed (topological sort). # For different connected components, run them concurrently. 
- components = dace.sdfg.concurrent_subgraphs(state) + # components = dace.sdfg.concurrent_subgraphs(state) - if len(components) <= 1: - self._dispatcher.dispatch_subgraph(sdfg, cfg, state, sid, global_stream, callsite_stream, - skip_entry_node=False) - else: - callsite_stream.write("{") - self._dispatcher.dispatch_subgraph(sdfg, cfg, c, sid, global_stream, callsite_stream, - skip_entry_node=False) - callsite_stream.write("}") + # if len(components) <= 1: + # self._dispatcher.dispatch_subgraph(sdfg, cfg, state, sid, global_stream, callsite_stream, + # skip_entry_node=False) + # else: + # callsite_stream.write("{") + # self._dispatcher.dispatch_subgraph(sdfg, cfg, c, sid, global_stream, callsite_stream, + # skip_entry_node=False) + # callsite_stream.write("}") ##################### # Write state footer @@ -556,7 +556,7 @@ def dispatch_state(state: SDFGState) -> str: opbar.done() # Write exit label - callsite_stream.write(f'__state_exit_{sdfg.cfg_id}:;', sdfg) + # callsite_stream.write(f'__state_exit_{sdfg.cfg_id}:;', sdfg) return states_generated @@ -941,7 +941,7 @@ def generate_code(self, # Allocate outer-level transients self.allocate_arrays_in_scope(sdfg, sdfg, sdfg, global_stream, callsite_stream) - callsite_stream.write('called allocate_arrays_in_scope outer\n', sdfg) + # callsite_stream.write('called allocate_arrays_in_scope outer\n', sdfg) # Define constants as top-level-allocated for cname, (ctype, _) in sdfg.constants_prop.items(): @@ -997,7 +997,7 @@ def generate_code(self, # Deallocate transients self.deallocate_arrays_in_scope(sdfg, sdfg, sdfg, global_stream, callsite_stream) - callsite_stream.write('called deallocate_arrays_in_scope internal transient\n', sdfg) + # callsite_stream.write('called deallocate_arrays_in_scope internal transient\n', sdfg) # Now that we have all the information about dependencies, generate # header and footer diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 6354f69b32..1f05500527 100644 --- 
a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -134,205 +134,140 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) def preprocess(self, sdfg: SDFG) -> None: + self.frame.statestruct.append('dace_poplar_context *poplar_context;') - #create a new string - str_decl = StringIO() - str_decl = f""" -// Declare variables -optional device; // Declaration -Graph graph; // Declaration -map tensors; // Declaration -map programs; // Declaration -OptionFlags ENGINE_OPTIONS; // Declaration -map programIds; // Declaration -vector programsList; // Declaration -vector hostData; // Declaration -Engine engine; -""" +# #create a new string +# str_decl = StringIO() +# str_decl = f""" +# optional device; +# Graph graph; +# map tensors; +# map programs; +# OptionFlags engineOptions; +# map programIds; +# vector programsList; +# vector hostData; +# """ # Add above code to the statestruct - self.frame.statestruct.append(str_decl) + # self.frame.statestruct.append(str_decl) - # # hack to get the ipu codegen to work - # # self._toplevel_schedule = dtypes.ScheduleType.IPU_SCHEDULE + pass def get_generated_codeobjects(self): - fileheader = CodeIOStream() - fileheader.write(""" - #include - #include - #include - #include - #include - #include - #include - - #include - #include - #include - #include - #include - #include - #include - - using ::std::map; - using ::std::optional; - using ::std::string; - using ::std::vector; - - using ::poplar::Device; - using ::poplar::DeviceManager; - using ::poplar::Engine; - using ::poplar::FLOAT; - using ::poplar::Graph; - using ::poplar::OptionFlags; - using ::poplar::TargetType; - using ::poplar::Tensor; - using ::poplar::program::Copy; - using ::poplar::program::Execute; - using ::poplar::program::Program; - using ::poplar::program::Repeat; - using ::poplar::program::Sequence; - - - """) - fileheader.write("\n\n") - constdefines = 
CodeIOStream() - constdefines.write("""const auto NUM_DATA_ITEMS = 200000;""") - constdefines.write("\n\n") + # structdecl = CodeIOStream() + # structdecl.write(f""" + # optional device; + # Graph graph; + # map tensors; + # map programs; + # OptionFlags engineOptions; + # map programIds; + # vector programsList; + # vector hostData; + # """) + + # fileheader = CodeIOStream() + # fileheader.write(""" + # #include + # #include + # #include + # #include + # #include + # #include + + # #include + # #include + # #include + # #include + # #include + # #include + # #include + + # using ::std::map; + # using ::std::optional; + # using ::std::string; + # using ::std::vector; + + # using ::poplar::Device; + # using ::poplar::DeviceManager; + # using ::poplar::Engine; + # using ::poplar::FLOAT; + # using ::poplar::Graph; + # using ::poplar::OptionFlags; + # using ::poplar::TargetType; + # using ::poplar::Tensor; + # using ::poplar::program::Copy; + # using ::poplar::program::Program; + # using ::poplar::program::Execute; + # using ::poplar::program::Repeat; + # """) + # fileheader.write("\n\n") + # constdefines = CodeIOStream() + # constdefines.write("""const auto NUM_DATA_ITEMS = 200000;""") + # constdefines.write("\n\n") params_comma = self._global_sdfg.init_signature(free_symbols=self.frame.free_symbols(self._global_sdfg)) if params_comma: params_comma = ', ' + params_comma + host_code = CodeIOStream() host_code.write(""" -#include - +#include "dace/poplar/host.h" +#include "dace/dace.h" +""") + + fileheader = CodeIOStream() + self.frame.generate_fileheader(self._global_sdfg, fileheader, 'poplar') + + host_code.write(""" {file_header} -{const_defines} - -DACE_EXPORTED int __dace_init_ipu({sdfg_state_name} *__state{params}); -DACE_EXPORTED int __dace_exit_ipu({sdfg_state_name} *__state); // {other_globalcode} -int __dace_init_ipu({sdfg_state_name} *__state{params}) {{ - - __state->tensors = map{{}}; // Assignment - __state->programs = map{{}}; // Assignment - 
ENGINE_OPTIONS = OptionFlags{{ - // Assignment - {{"target.saveArchive", "archive.a"}}, - {{"debug.instrument", "true"}}, - {{"debug.instrumentCompute", "true"}}, - {{"debug.instrumentControlFlow", "true"}}, - {{"debug.computeInstrumentationLevel", "tile"}}, - {{"debug.outputAllSymbols", "true"}}, - {{"autoReport.all", "true"}}, - {{"autoReport.outputSerializedGraph", "true"}}, - {{"debug.retainDebugInformation", "true"}}, - }}; - __state->programIds = map(); // Assignment - __state->programsList = vector(__state->programs.size()); // Assignment - int index = 0; - for (auto &nameToProgram : __state->programs) {{ - __state->programIds[nameToProgram.first] = index; - __state->programsList[index] = nameToProgram.second; - index++; - }} - __state->hostData = vector(NUM_DATA_ITEMS, 1); // Assignment - +DACE_EXPORTED int __dace_init_ipu({sdfg_state_name} *__state{params}) {{ + __state->poplar_context = new dace_poplar_context(); return 0; }} -int __dace_exit_ipu({sdfg_state_name} *__state) {{ +DACE_EXPORTED int __dace_exit_ipu({sdfg_state_name} *__state) {{ + delete __state->poplar_context; return 0; }} -auto __dace_getIpuDevice({sdfg_state_name} *__state, const unsigned int numIpus = 1) -> optional +DACE_EXPORTED auto getIpuDevice(const unsigned int numIpus = 1) -> optional {{ - DeviceManager manager = DeviceManager::createDeviceManager(); - optional device = std::nullopt; - for (auto &d : manager.getDevices(TargetType::IPU, numIpus)) {{ - std::cout << "Trying to attach to IPU " << d.getId(); - if (d.attach()) {{ - std::cout << " - attached" << std::endl; - device = {{std::move(d)}}; - break; - }} else {{ - std::cout << std::endl << "Error attaching to device" << std::endl; + DeviceManager manager = DeviceManager::createDeviceManager(); + optional device = std::nullopt; + for (auto &d : manager.getDevices(TargetType::IPU, numIpus)) {{ + std::cout << "Trying to attach to IPU " << d.getId(); + if (d.attach()) {{ + std::cout << " - attached" << std::endl; + device = 
{{std::move(d)}}; + break; + }} else {{ + std::cout << std::endl << "Error attaching to device" << std::endl; + }} }} - }} - return device; -}} - -auto __dace_createGraphAndAddCodelets({sdfg_state_name} *__state, const optional &device) -> Graph -{{ - Graph graph; // Declaration - graph = poplar::Graph(device->getTarget()); // Assignment - - // Add our custom codelet, building from CPP source - // with the given popc compiler options - graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); - - // Add the codelets for the popops librarys - popops::addCodelets(graph); - return graph; -}} - -auto __dace_buildComputeGraph({sdfg_state_name} *__state, Graph &graph, map &tensors, - map &programs, const int numTiles) -{{ - // Add tensors - tensors["data"] = graph.addVariable(poplar::FLOAT, {{NUM_DATA_ITEMS}}, "data"); - poputil::mapTensorLinearly(graph, tensors["data"]); - - // // Add programs and wire up data - // const auto NumElemsPerTile = NUM_DATA_ITEMS / numTiles; - // auto cs = graph.addComputeSet("loopBody"); - // for (auto tileNum = 0; tileNum < numTiles; tileNum++) {{ - // const auto sliceEnd = - // std::min((tileNum + 1) * NumElemsPerTile, (int)NUM_DATA_ITEMS); - // const auto sliceStart = tileNum * NumElemsPerTile; - - // auto v = graph.addVertex( - // cs, "SkeletonVertex", - // {{{{"data", tensors["data"].slice(sliceStart, sliceEnd)}}}}); - // graph.setInitialValue(v["howMuchToAdd"], tileNum); - // graph.setPerfEstimate(v, - // 100); // Ideally you'd get this as right as - // possible - // graph.setTileMapping(v, tileNum); - // }} - // auto executeIncrementVertex = Execute(cs); - - // auto mainProgram = Repeat(10, executeIncrementVertex, "repeat10x"); - // programs["main"] = mainProgram; // Program 0 will be the main program + return device; }} - -auto __dace_defineDataStreams({sdfg_state_name} *__state, Graph &graph, map &tensors, - map &programs) +DACE_EXPORTED auto defineDataStreams({sdfg_state_name} &__state) {{ - auto toIpuStream = 
graph.addHostToDeviceFIFO("TO_IPU", FLOAT, NUM_DATA_ITEMS); - auto fromIpuStream = - graph.addDeviceToHostFIFO("FROM_IPU", FLOAT, NUM_DATA_ITEMS); + auto toIpuStream = __state.poplar_context->graph.addHostToDeviceFIFO("TO_IPU", FLOAT, NUM_DATA_ITEMS); + auto fromIpuStream = __state.poplar_context->graph.addDeviceToHostFIFO("FROM_IPU", FLOAT, NUM_DATA_ITEMS); - auto copyToIpuProgram = Copy(toIpuStream, tensors["data"]); - auto copyToHostProgram = Copy(tensors["data"], fromIpuStream); - - programs["copy_to_ipu"] = copyToIpuProgram; - programs["copy_to_host"] = copyToHostProgram; + __state.poplar_context->programs["copy_to_ipu"] = Copy(toIpuStream, __state.poplar_context->tensors["data"]); + __state.poplar_context->programs["copy_to_host"] = Copy(__state.poplar_context->tensors["data"], fromIpuStream); }} {host_code_seperator}""".format(params=params_comma, sdfg_state_name=mangle_dace_state_struct_name(self._global_sdfg), other_globalcode=self._globalcode.getvalue(), file_header=fileheader.getvalue(), - const_defines = constdefines.getvalue(), sdfg=self._global_sdfg, host_code_seperator="".join([ - "{separator}\n// Kernel: {kernel_name}" + "{separator}\n// Dataflow graph building: {kernel_name}" "\n{separator}\n\n{code}\n\n".format(separator="/" * 79, kernel_name=name, code=code) for (name, code) in self._host_codes]))) @@ -977,72 +912,110 @@ def generate_state(self, else: print("IN HOST CODE") - self.frame.generate_ipu_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) - kernels = [] # List of tuples (subgraph, kernel_id) - # Start a new state code generation: reset previous dependencies if any - self._kernels_dependencies.clear() - self._kernels_names_to_id.clear() - - # For now only 1 kernel. 
- kernels = [(state, 0)] - self._num_kernels = len(kernels) - - state_parameters = [] - state_host_header_stream = CodeIOStream() - state_host_body_stream = CodeIOStream() - instrumentation_stream = CodeIOStream() - - # Kernels are now sorted considering their dependencies - for kern, kern_id in kernels: - callsite_stream.write("\n SJJ: kernel started") - # Generate all kernels in this state - subgraphs = dace.sdfg.concurrent_subgraphs(kern) - single_sgs: list(ScopeSubgraphView) = [] - for sg in subgraphs: - if sg is not None: - single_sgs.append(sg) - # skip multigraphs for now - - shared_transients = set(sdfg.shared_transients()) - # Allocate global memory transients, unless they are shared with - # other states - all_transients = set(kern.all_transients()) - allocated = set(shared_transients) - for node in kern.data_nodes(): - data = node.desc(sdfg) - if node.data not in all_transients or node.data in allocated: - continue - if (data.storage == dtypes.StorageType.IPU_Memory and not isinstance(data, data.View)): - print("Allocating data") - allocated.add(node.data) - self.dispatcher.dispatch_allocate(sdfg, cfg, kern, state_id, node, data, function_stream, - callsite_stream) - callsite_stream.write("\n SJJ: Data allocated") - # Create a unique kernel name to avoid name clashes - # If this kernels comes from a Nested SDFG, use that name also - if sdfg.parent_nsdfg_node is not None: - kernel_name = f"{sdfg.parent_nsdfg_node.label}_{state.label}_{kern_id}_{cfg.cfg_id}" - else: - kernel_name = f"{state.label}_{kern_id}_{cfg.cfg_id}" - - # Add kernel name to the list of kernels - self._kernels_names_to_id[kernel_name] = kern_id - # Generate kernel code - self.generate_kernel(sdfg, cfg, state, kernel_name, single_sgs, function_stream, callsite_stream, - state_host_header_stream, state_host_body_stream, instrumentation_stream, - state_parameters, kern_id) - callsite_stream.write("\n SJJ: Kernel generated") - - kernel_host_stream = CodeIOStream() - 
self.generate_host_function(sdfg, cfg, state, state_id, function_stream, callsite_stream, state_host_header_stream, state_host_body_stream, instrumentation_stream, kernel_host_stream) - - # Store code strings to be passed to compilation phase - self._host_codes.append((kernel_name, kernel_host_stream.getvalue())) - - # self.frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) + # self.frame.generate_ipu_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) + self.generate_ipu_cpuside_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) ############################################################################################################ # #### Helpers + + def generate_ipu_cpuside_state(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + generate_state_footer: bool = True): + sid = state.block_id + + callsite_stream.write(f'// Ipu pipeline \n', sdfg) + callsite_stream.write(f""" + // Data initialization + __state->poplar_context->hostData = vector(NUM_DATA_ITEMS, 1); + + // Real code pipeline starts from here. + std::cout << "STEP 1: Connecting to an IPU device" << std::endl; + __state->poplar_context->device = getIpuDevice(1); + if (!__state->poplar_context->device.has_value()) {{ + std::cerr << "Could not attach to an IPU device. Aborting" << std::endl; + return; + }} + """) + ##################### + # Create dataflow graph for state's children. + + + # Start a new state code generation: reset previous dependencies if any + self._kernels_dependencies.clear() + self._kernels_names_to_id.clear() + + # For now only 1 kernel. 
+ kernels = [(state, 0)] + + + state_host_header_stream = CodeIOStream() + state_host_body_stream = CodeIOStream() + instrumentation_stream = CodeIOStream() + + for kern, kern_id in kernels: + if sdfg.parent_nsdfg_node is not None: + kernel_name = f"{sdfg.parent_nsdfg_node.label}_{state.label}_{kern_id}_{cfg.cfg_id}" + else: + kernel_name = f"{state.label}_{kern_id}_{cfg.cfg_id}" + self._kernels_names_to_id[kernel_name] = kern_id + + kernel_host_stream = CodeIOStream() + function_stream.write(f"// kernel_name = {kernel_name}\n") + self.generate_host_function(sdfg, cfg, state, sid, function_stream, callsite_stream, state_host_header_stream, state_host_body_stream, instrumentation_stream, kernel_host_stream) + + # Store code strings to be passed to compilation phase + self._host_codes.append((kernel_name, kernel_host_stream.getvalue())) + + ##################### + # Write state footer(After kernel call?) + callsite_stream.write(f""" + std::cout << "STEP 3: Define data streams" << std::endl; + defineDataStreams(*__state); // Pass the state directly + + std::cout << "STEP 4: Create engine and compile graph" << std::endl; + __state->poplar_context->engineOptions = OptionFlags{{ + {{"target.saveArchive", "archive.a"}}, + {{"debug.instrument", "true"}}, + {{"debug.instrumentCompute", "true"}}, + {{"debug.instrumentControlFlow", "true"}}, + {{"debug.computeInstrumentationLevel", "tile"}}, + {{"debug.outputAllSymbols", "true"}}, + {{"autoReport.all", "true"}}, + {{"autoReport.outputSerializedGraph", "true"}}, + {{"debug.retainDebugInformation", "true"}}, + }}; + + __state->poplar_context->programIds = map(); + __state->poplar_context->programsList = vector(__state->poplar_context->programs.size()); // Removing the size causes segfault + int index = 0; + for (auto &nameToProgram : __state->poplar_context->programs) {{ + __state->poplar_context->programIds[nameToProgram.first] = index; + __state->poplar_context->programsList[index] = nameToProgram.second; + index++; + }} + + 
// Now construct the Engine using the constructor + auto engine = Engine(__state->poplar_context->graph, __state->poplar_context->programsList, __state->poplar_context->engineOptions); + + std::cout << "STEP 5: Load compiled graph onto the IPU tiles" << std::endl; + engine.load(*__state->poplar_context->device); + // engine.enableExecutionProfiling(); + + std::cout << "STEP 6: Attach data streams" << std::endl; + + engine.connectStream("TO_IPU", __state->poplar_context->hostData.data()); + engine.connectStream("FROM_IPU", __state->poplar_context->hostData.data()); + + std::cout << "STEP 7: Run programs" << std::endl; + engine.run(__state->poplar_context->programIds["copy_to_ipu"]); // Copy to IPU + engine.run(__state->poplar_context->programIds["main"]); // Main program + engine.run(__state->poplar_context->programIds["copy_to_host"]); // Copy from IPU + """) + ## Generate the global function here def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[mmlt.Memlet], @@ -1068,37 +1041,111 @@ def generate_host_function(self, sdfg, cfg, state, state_id, function_stream, ca kernel_args_call_host = [] kernel_args_opencl = [] # Include state in args - kernel_args_opencl.append(f"{cpp.mangle_dace_state_struct_name(self._global_sdfg)} *__state") - kernel_args_call_host.append(f"__state") + kernel_args_opencl.append(f"{cpp.mangle_dace_state_struct_name(self._global_sdfg)} &__state") + kernel_args_call_host.append(f"*__state") # real code starts + host_function_name = f"kernel_buildComputeGraph" - host_function_name = f"__dace_runstate_{cfg.cfg_id}_{state.name}_{state_id}" - function_stream.write("\n\nDACE_EXPORTED void {}({});\n\n".format(host_function_name, - ", ".join(kernel_args_opencl))) - + callsite_stream.write("////////////////////////////////////////KERNEL") + callsite_stream.write("std::cout << \"STEP 2: Building the compute graph\" << 
std::endl;") + callsite_stream.write("{}({});".format(host_function_name, ", ".join(kernel_args_call_host))) + callsite_stream.write("////////////////////////////////////////") + + # function_stream.write("\n\nDACE_EXPORTED auto {}({});\n\n".format(host_function_name, + # ", ".join(kernel_args_opencl))) + + #/////////////////////////// # add generated header information kernel_host_stream.write(state_host_header_stream.getvalue()) kernel_host_stream.write(f"""\ DACE_EXPORTED void {host_function_name}({', '.join(kernel_args_opencl)}) {{""") - kernel_host_stream.write(f"""\ - hlslib::ocl::Program program = __state->fpga_context->Get().CurrentlyLoadedProgram();\ - """) - # Create a vector to collect all events that are being generated to allow - # waiting before exiting this state - kernel_host_stream.write("std::vector all_events;") + # kernel_host_stream.write(f"""\ + # hlslib::ocl::Program program = __state->poplar_context->fpga_context->Get().CurrentlyLoadedProgram();\ + # """) + # # Create a vector to collect all events that are being generated to allow + # # waiting before exiting this state + # kernel_host_stream.write("std::vector all_events;") - # Kernels invocations - kernel_host_stream.write(state_host_body_stream.getvalue()) + # # Kernels invocations + # kernel_host_stream.write(state_host_body_stream.getvalue()) - # Wait for all events - kernel_host_stream.write("hlslib::ocl::WaitForEvents(all_events);") + # # Wait for all events + # kernel_host_stream.write("hlslib::ocl::WaitForEvents(all_events);") + + # write the kernel_host_stream withe the commands I have copied + kernel_host_stream.write(f"""\ + std::cout << " STEP 2.1: Create graph and compile codelets" << std::endl; + + // Step 1: Create graph and add codelets + __state.poplar_context->graph = poplar::Graph(__state.poplar_context->device->getTarget()); + __state.poplar_context->graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); + 
popops::addCodelets(__state.poplar_context->graph); + + // Step 2: Add data to the graph + std::cout << " STEP 2.2: Add data to the graph" << std::endl; + __state.poplar_context->tensors["data"] = __state.poplar_context->graph.addVariable(poplar::FLOAT, {{NUM_DATA_ITEMS}}, "data"); + poputil::mapTensorLinearly(__state.poplar_context->graph, __state.poplar_context->tensors["data"]); + + const int numTiles = __state.poplar_context->device->getTarget().getNumTiles(); + // Add programs and wire up data + const auto NumElemsPerTile = NUM_DATA_ITEMS / numTiles; + auto cs = __state.poplar_context->graph.addComputeSet("loopBody"); + + for (auto tileNum = 0; tileNum < numTiles; tileNum++) {{ + const auto sliceEnd = std::min((tileNum + 1) * NumElemsPerTile, (int)NUM_DATA_ITEMS); + const auto sliceStart = tileNum * NumElemsPerTile; + + auto v = __state.poplar_context->graph.addVertex(cs, "SkeletonVertex", {{"data", __state.poplar_context->tensors["data"].slice(sliceStart, sliceEnd)}}); + __state.poplar_context->graph.setInitialValue(v["howMuchToAdd"], tileNum); + __state.poplar_context->graph.setPerfEstimate(v, 100); + __state.poplar_context->graph.setTileMapping(v, tileNum); + }} + + __state.poplar_context->programs["main"] = Repeat(10, Execute(cs)); + """) kernel_host_stream.write("}\n") + + + """ + COMMENT + DACE_EXPORTED auto kernel_buildComputeGraph({sdfg_state_name} &__state) + {{ + std::cout << " STEP 2.1: Create graph and compile codelets" << std::endl; + + // Step 1: Create graph and add codelets + __state.poplar_context->graph = poplar::Graph(__state.poplar_context->device->getTarget()); + __state.poplar_context->graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); + popops::addCodelets(__state.poplar_context->graph); + + // Step 2: Add data to the graph + std::cout << " STEP 2.2: Add data to the graph" << std::endl; + __state.poplar_context->tensors["data"] = __state.poplar_context->graph.addVariable(poplar::FLOAT, {{NUM_DATA_ITEMS}}, 
"data"); + poputil::mapTensorLinearly(__state.poplar_context->graph, __state.poplar_context->tensors["data"]); + + const int numTiles = __state.poplar_context->device->getTarget().getNumTiles(); + // Add programs and wire up data + const auto NumElemsPerTile = NUM_DATA_ITEMS / numTiles; + auto cs = __state.poplar_context->graph.addComputeSet("loopBody"); + + for (auto tileNum = 0; tileNum < numTiles; tileNum++) {{ + const auto sliceEnd = std::min((tileNum + 1) * NumElemsPerTile, (int)NUM_DATA_ITEMS); + const auto sliceStart = tileNum * NumElemsPerTile; + + auto v = __state.poplar_context->graph.addVertex(cs, "SkeletonVertex", {{"data", __state.poplar_context->tensors["data"].slice(sliceStart, sliceEnd)}}); + __state.poplar_context->graph.setInitialValue(v["howMuchToAdd"], tileNum); + __state.poplar_context->graph.setPerfEstimate(v, 100); + __state.poplar_context->graph.setTileMapping(v, tileNum); + }} + + __state.poplar_context->programs["main"] = Repeat(10, Execute(cs)); + }} + """ - callsite_stream.write("{}({});".format(host_function_name, ", ".join(kernel_args_call_host))) + def generate_kernel(self, sdfg: dace.SDFG, diff --git a/dace/runtime/include/dace/dace.h b/dace/runtime/include/dace/dace.h index 960aece94c..c7dca1ae8f 100644 --- a/dace/runtime/include/dace/dace.h +++ b/dace/runtime/include/dace/dace.h @@ -41,6 +41,9 @@ #include "intel_fpga/host.h" #endif +// TODO:use conditional compilation later for now include poplar always +#include "poplar_common.h" + #include "fpga_common.h" #endif // __DACE_RUNTIME_H diff --git a/dace/runtime/include/dace/poplar/host.h b/dace/runtime/include/dace/poplar/host.h new file mode 100644 index 0000000000..9611281449 --- /dev/null +++ b/dace/runtime/include/dace/poplar/host.h @@ -0,0 +1,6 @@ +// Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+#pragma once + +#include // Must be included after hlslib/xilinx/OpenCL.h +#include +#include diff --git a/dace/runtime/include/dace/poplar_common.h b/dace/runtime/include/dace/poplar_common.h new file mode 100644 index 0000000000..7ba9841d5d --- /dev/null +++ b/dace/runtime/include/dace/poplar_common.h @@ -0,0 +1,5 @@ +#pragma once + +// Defined as a struct rather than a class for C compatibility with OpenCL +// For definition, see poplar_host.h +struct dace_poplar_context; diff --git a/dace/runtime/include/dace/poplar_device.h b/dace/runtime/include/dace/poplar_device.h new file mode 100644 index 0000000000..f3aba7b0b9 --- /dev/null +++ b/dace/runtime/include/dace/poplar_device.h @@ -0,0 +1,6 @@ +// Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +#pragma once + +// Defined as a struct rather than a class for C compatibility with OpenCL +// For definition, see fpga_host.h +struct dace_fpga_context; diff --git a/dace/runtime/include/dace/poplar_host.h b/dace/runtime/include/dace/poplar_host.h new file mode 100644 index 0000000000..59829cc379 --- /dev/null +++ b/dace/runtime/include/dace/poplar_host.h @@ -0,0 +1,54 @@ +// Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+#pragma once + +#include +#include + +// file headers +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using ::std::map; +using ::std::optional; +using ::std::string; +using ::std::vector; + +using ::poplar::Device; +using ::poplar::DeviceManager; +using ::poplar::Engine; +using ::poplar::FLOAT; +using ::poplar::Graph; +using ::poplar::OptionFlags; +using ::poplar::TargetType; +using ::poplar::Tensor; +using ::poplar::program::Copy; +using ::poplar::program::Program; +using ::poplar::program::Execute; +using ::poplar::program::Repeat; + +// Constants +const auto NUM_DATA_ITEMS = 200000; + +// Struct +struct dace_poplar_context { + optional device; + Graph graph; + map tensors; + map programs; + OptionFlags engineOptions; + map programIds; + vector programsList; + vector hostData; +}; \ No newline at end of file From ba32abeb5a16bab189c89a94fcf0c47ac5b58741 Mon Sep 17 00:00:00 2001 From: Sameeran joshi Date: Mon, 30 Sep 2024 20:20:07 +0000 Subject: [PATCH 64/77] some temporary changes --- dace/codegen/targets/ipu.py | 3 +-- dace/runtime/include/dace/dace.h | 4 ++-- dace/runtime/include/dace/poplar_host.h | 4 ++++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 1f05500527..e394506ee7 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -213,8 +213,7 @@ def get_generated_codeobjects(self): host_code = CodeIOStream() host_code.write(""" -#include "dace/poplar/host.h" -#include "dace/dace.h" +#include """) fileheader = CodeIOStream() diff --git a/dace/runtime/include/dace/dace.h b/dace/runtime/include/dace/dace.h index c7dca1ae8f..6f91a922e4 100644 --- a/dace/runtime/include/dace/dace.h +++ b/dace/runtime/include/dace/dace.h @@ -41,8 +41,8 @@ #include "intel_fpga/host.h" #endif -// TODO:use conditional compilation later for now include poplar always -#include "poplar_common.h" +// // 
TODO:use conditional compilation later for now include poplar always +#include "poplar_host.h" #include "fpga_common.h" diff --git a/dace/runtime/include/dace/poplar_host.h b/dace/runtime/include/dace/poplar_host.h index 59829cc379..2dd53ff8c5 100644 --- a/dace/runtime/include/dace/poplar_host.h +++ b/dace/runtime/include/dace/poplar_host.h @@ -4,6 +4,10 @@ #include #include +// dace headers +#include +#include + // file headers #include #include From eeaa7d128c4935efeaa80fd2cb1ec8fcfc240e68 Mon Sep 17 00:00:00 2001 From: Sameeran joshi Date: Mon, 30 Sep 2024 22:24:55 +0000 Subject: [PATCH 65/77] Resolve errors in compilation when using includes from runtime libraries --- dace/codegen/targets/ipu.py | 41 ++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index e394506ee7..5a56efbf0e 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -135,6 +135,7 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): def preprocess(self, sdfg: SDFG) -> None: self.frame.statestruct.append('dace_poplar_context *poplar_context;') + # #create a new string # str_decl = StringIO() @@ -215,14 +216,13 @@ def get_generated_codeobjects(self): host_code.write(""" #include """) - fileheader = CodeIOStream() self.frame.generate_fileheader(self._global_sdfg, fileheader, 'poplar') host_code.write(""" {file_header} -// {other_globalcode} +{other_globalcode} DACE_EXPORTED int __dace_init_ipu({sdfg_state_name} *__state{params}) {{ __state->poplar_context = new dace_poplar_context(); @@ -911,6 +911,14 @@ def generate_state(self, else: print("IN HOST CODE") + function_stream.write(""" + +// hack to make the files compile by forward declaring the functions +extern "C" auto getIpuDevice(const unsigned int numIpus = 1) -> optional; +extern "C" void defineDataStreams(ipu_test1_state_t &__state); +extern "C" void kernel_buildComputeGraph(ipu_test1_state_t 
&__state); + + """) # self.frame.generate_ipu_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) self.generate_ipu_cpuside_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) @@ -1080,7 +1088,7 @@ def generate_host_function(self, sdfg, cfg, state, state_id, function_stream, ca // Step 1: Create graph and add codelets __state.poplar_context->graph = poplar::Graph(__state.poplar_context->device->getTarget()); - __state.poplar_context->graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); + //__state.poplar_context->graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); popops::addCodelets(__state.poplar_context->graph); // Step 2: Add data to the graph @@ -1091,20 +1099,19 @@ def generate_host_function(self, sdfg, cfg, state, state_id, function_stream, ca const int numTiles = __state.poplar_context->device->getTarget().getNumTiles(); // Add programs and wire up data const auto NumElemsPerTile = NUM_DATA_ITEMS / numTiles; - auto cs = __state.poplar_context->graph.addComputeSet("loopBody"); - - for (auto tileNum = 0; tileNum < numTiles; tileNum++) {{ - const auto sliceEnd = std::min((tileNum + 1) * NumElemsPerTile, (int)NUM_DATA_ITEMS); - const auto sliceStart = tileNum * NumElemsPerTile; - - auto v = __state.poplar_context->graph.addVertex(cs, "SkeletonVertex", {{"data", __state.poplar_context->tensors["data"].slice(sliceStart, sliceEnd)}}); - __state.poplar_context->graph.setInitialValue(v["howMuchToAdd"], tileNum); - __state.poplar_context->graph.setPerfEstimate(v, 100); - __state.poplar_context->graph.setTileMapping(v, tileNum); - }} - - __state.poplar_context->programs["main"] = Repeat(10, Execute(cs)); - """) + //auto cs = __state.poplar_context->graph.addComputeSet("loopBody"); + // + //for (auto tileNum = 0; tileNum < numTiles; tileNum++) {{ + // const auto sliceEnd = std::min((tileNum + 1) * NumElemsPerTile, (int)NUM_DATA_ITEMS); + // const 
auto sliceStart = tileNum * NumElemsPerTile; + // auto v = __state.poplar_context->graph.addVertex(cs, "SkeletonVertex", {{"data", __state.poplar_context->tensors["data"].slice(sliceStart, sliceEnd)}}); + // __state.poplar_context->graph.setInitialValue(v["howMuchToAdd"], tileNum); + // __state.poplar_context->graph.setPerfEstimate(v, 100); + // __state.poplar_context->graph.setTileMapping(v, tileNum); + //}} + // + //__state.poplar_context->programs["main"] = Repeat(10, Execute(cs)); + // """) kernel_host_stream.write("}\n") From 1a21f8eea4448fc6af6bcb1906699d799774d58f Mon Sep 17 00:00:00 2001 From: Sameeran joshi Date: Mon, 30 Sep 2024 22:44:59 +0000 Subject: [PATCH 66/77] Fix bug - wasn't generating proper kernel names, was not generic, tested on another testcase --- dace/codegen/targets/ipu.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 5a56efbf0e..b57e6d9110 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -911,14 +911,18 @@ def generate_state(self, else: print("IN HOST CODE") - function_stream.write(""" - -// hack to make the files compile by forward declaring the functions -extern "C" auto getIpuDevice(const unsigned int numIpus = 1) -> optional; -extern "C" void defineDataStreams(ipu_test1_state_t &__state); -extern "C" void kernel_buildComputeGraph(ipu_test1_state_t &__state); - - """) + sdfg_state_name = cpp.mangle_dace_state_struct_name(self._global_sdfg) + print("SDFG STATE NAME: ", sdfg_state_name) + formatted_string = """ + + // hack to make the files compile by forward declaring the functions + extern "C" auto getIpuDevice(const unsigned int numIpus = 1) -> optional; + extern "C" void defineDataStreams({sdfg_state_name} &__state); + extern "C" void kernel_buildComputeGraph({sdfg_state_name} &__state); + """.format(sdfg_state_name=sdfg_state_name) + + function_stream.write(formatted_string) + # 
self.frame.generate_ipu_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) self.generate_ipu_cpuside_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) From 9290acde7d631ba318a7d571d1c167909ebc4432 Mon Sep 17 00:00:00 2001 From: Sameeran joshi Date: Thu, 3 Oct 2024 16:09:08 +0000 Subject: [PATCH 67/77] Support addVariables() and mapLinearlyOnTiles(), currently works only for transients and scope must be inside state --- dace/codegen/targets/ipu.py | 155 +++++------------------------------- graphcore_dace/ipu_test.py | 30 +++---- 2 files changed, 36 insertions(+), 149 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index b57e6d9110..965ec3effe 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -51,17 +51,17 @@ def is_ipu_kernel(sdfg, state): """ # pdb.set_trace() data_nodes = state.data_nodes() - at_least_one_fpga_array = False + at_least_one_ipu_allocated_array = False for n in data_nodes: desc = n.desc(sdfg) - print(desc.storage.name, desc.storage, desc) + # print(desc.storage.name, desc.storage, desc) if desc.storage == dtypes.StorageType.IPU_Memory: - at_least_one_fpga_array = True + at_least_one_ipu_allocated_array = True if isinstance(desc, data.Scalar): continue if desc.storage != dtypes.StorageType.IPU_Memory: return False - return at_least_one_fpga_array + return at_least_one_ipu_allocated_array @registry.autoregister_params(name='ipu') class IPUCodeGen(TargetCodeGenerator): @@ -111,8 +111,7 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): # Storage # ipu_storage = [dtypes.StorageType.IPU_Memory] - ipu_storage = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned, dtypes.StorageType.IPU_Memory] - + ipu_storage = [dtypes.StorageType.IPU_Memory] self.dispatcher.register_array_dispatcher(ipu_storage, self) # allocate_array/deallocate_array for storage in ipu_storage: 
for other_storage in dtypes.StorageType: @@ -135,83 +134,13 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): def preprocess(self, sdfg: SDFG) -> None: self.frame.statestruct.append('dace_poplar_context *poplar_context;') - - -# #create a new string -# str_decl = StringIO() -# str_decl = f""" -# optional device; -# Graph graph; -# map tensors; -# map programs; -# OptionFlags engineOptions; -# map programIds; -# vector programsList; -# vector hostData; -# """ - # Add above code to the statestruct - # self.frame.statestruct.append(str_decl) - pass def get_generated_codeobjects(self): - # structdecl = CodeIOStream() - # structdecl.write(f""" - # optional device; - # Graph graph; - # map tensors; - # map programs; - # OptionFlags engineOptions; - # map programIds; - # vector programsList; - # vector hostData; - # """) - - # fileheader = CodeIOStream() - # fileheader.write(""" - # #include - # #include - # #include - # #include - # #include - # #include - - # #include - # #include - # #include - # #include - # #include - # #include - # #include - - # using ::std::map; - # using ::std::optional; - # using ::std::string; - # using ::std::vector; - - # using ::poplar::Device; - # using ::poplar::DeviceManager; - # using ::poplar::Engine; - # using ::poplar::FLOAT; - # using ::poplar::Graph; - # using ::poplar::OptionFlags; - # using ::poplar::TargetType; - # using ::poplar::Tensor; - # using ::poplar::program::Copy; - # using ::poplar::program::Program; - # using ::poplar::program::Execute; - # using ::poplar::program::Repeat; - # """) - # fileheader.write("\n\n") - # constdefines = CodeIOStream() - # constdefines.write("""const auto NUM_DATA_ITEMS = 200000;""") - # constdefines.write("\n\n") - params_comma = self._global_sdfg.init_signature(free_symbols=self.frame.free_symbols(self._global_sdfg)) if params_comma: params_comma = ', ' + params_comma - - + host_code = CodeIOStream() host_code.write(""" #include @@ -617,7 +546,6 @@ def mapScalarOnTile(self, 
sdfg, cfg, state_id, node, nodedesc, allocation_stream def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: - self.add_header(function_stream) if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) @@ -658,7 +586,7 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap nodedesc.storage == dtypes.StorageType.Register: pass # IPU variables are C++ objects and are automatically deallocated else: - raise NotImplementedError + raise NotImplementedError("Unimplemented deallocate() for StorageType " + str(nodedesc.storage)) def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, src_node: Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], @@ -1072,34 +1000,29 @@ def generate_host_function(self, sdfg, cfg, state, state_id, function_stream, ca kernel_host_stream.write(f"""\ DACE_EXPORTED void {host_function_name}({', '.join(kernel_args_opencl)}) {{""") - - # kernel_host_stream.write(f"""\ - # hlslib::ocl::Program program = __state->poplar_context->fpga_context->Get().CurrentlyLoadedProgram();\ - # """) - # # Create a vector to collect all events that are being generated to allow - # # waiting before exiting this state - # kernel_host_stream.write("std::vector all_events;") - - # # Kernels invocations - # kernel_host_stream.write(state_host_body_stream.getvalue()) - - # # Wait for all events - # kernel_host_stream.write("hlslib::ocl::WaitForEvents(all_events);") # write the kernel_host_stream withe the commands I have copied kernel_host_stream.write(f"""\ - std::cout << " STEP 2.1: Create graph and compile codelets" << std::endl; + std::cout << " STEP 2.1: Create graph and 
compile codelets" << std::endl; // Step 1: Create graph and add codelets __state.poplar_context->graph = poplar::Graph(__state.poplar_context->device->getTarget()); //__state.poplar_context->graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); popops::addCodelets(__state.poplar_context->graph); - + """) + + kernel_host_stream.write(""" // Step 2: Add data to the graph - std::cout << " STEP 2.2: Add data to the graph" << std::endl; - __state.poplar_context->tensors["data"] = __state.poplar_context->graph.addVariable(poplar::FLOAT, {{NUM_DATA_ITEMS}}, "data"); + std::cout << " STEP 2.2: Add data to the graph" << std::endl;""") + # Emit internal transient array allocation + # __state.poplar_context->tensors["data"] = __state.poplar_context->graph.addVariable(poplar::FLOAT, {{NUM_DATA_ITEMS}}, "data"); + self.frame.allocate_arrays_in_scope(sdfg, cfg, state, function_stream, kernel_host_stream) + kernel_host_stream.write('\n') + + kernel_host_stream.write(""" poputil::mapTensorLinearly(__state.poplar_context->graph, __state.poplar_context->tensors["data"]); - + """) + kernel_host_stream.write(""" const int numTiles = __state.poplar_context->device->getTarget().getNumTiles(); // Add programs and wire up data const auto NumElemsPerTile = NUM_DATA_ITEMS / numTiles; @@ -1119,43 +1042,7 @@ def generate_host_function(self, sdfg, cfg, state, state_id, function_stream, ca kernel_host_stream.write("}\n") - - """ - COMMENT - DACE_EXPORTED auto kernel_buildComputeGraph({sdfg_state_name} &__state) - {{ - std::cout << " STEP 2.1: Create graph and compile codelets" << std::endl; - - // Step 1: Create graph and add codelets - __state.poplar_context->graph = poplar::Graph(__state.poplar_context->device->getTarget()); - __state.poplar_context->graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); - popops::addCodelets(__state.poplar_context->graph); - - // Step 2: Add data to the graph - std::cout << " STEP 2.2: Add data to the graph" 
<< std::endl; - __state.poplar_context->tensors["data"] = __state.poplar_context->graph.addVariable(poplar::FLOAT, {{NUM_DATA_ITEMS}}, "data"); - poputil::mapTensorLinearly(__state.poplar_context->graph, __state.poplar_context->tensors["data"]); - - const int numTiles = __state.poplar_context->device->getTarget().getNumTiles(); - // Add programs and wire up data - const auto NumElemsPerTile = NUM_DATA_ITEMS / numTiles; - auto cs = __state.poplar_context->graph.addComputeSet("loopBody"); - - for (auto tileNum = 0; tileNum < numTiles; tileNum++) {{ - const auto sliceEnd = std::min((tileNum + 1) * NumElemsPerTile, (int)NUM_DATA_ITEMS); - const auto sliceStart = tileNum * NumElemsPerTile; - - auto v = __state.poplar_context->graph.addVertex(cs, "SkeletonVertex", {{"data", __state.poplar_context->tensors["data"].slice(sliceStart, sliceEnd)}}); - __state.poplar_context->graph.setInitialValue(v["howMuchToAdd"], tileNum); - __state.poplar_context->graph.setPerfEstimate(v, 100); - __state.poplar_context->graph.setTileMapping(v, tileNum); - }} - - __state.poplar_context->programs["main"] = Repeat(10, Execute(cs)); - }} - """ - - + self.frame.deallocate_arrays_in_scope(sdfg, cfg, state, function_stream, callsite_stream) def generate_kernel(self, sdfg: dace.SDFG, diff --git a/graphcore_dace/ipu_test.py b/graphcore_dace/ipu_test.py index 244ae9204c..29da3c9a41 100644 --- a/graphcore_dace/ipu_test.py +++ b/graphcore_dace/ipu_test.py @@ -110,7 +110,7 @@ def ipu_test1(): dtype=dace.int32, storage=dace.StorageType.IPU_Memory, location=None, - transient=False, + transient=True, strides=[1], offset=[0], lifetime=dace.AllocationLifetime.Scope, @@ -120,22 +120,22 @@ def ipu_test1(): dtype=dace.int32, storage=dace.StorageType.IPU_Memory, location=None, - transient=False, - strides=[1], - offset=[0], - lifetime=dace.AllocationLifetime.Scope, - debuginfo=None, total_size=1) - # Add a C array - nsdfg.add_array('c', - shape=[1], - dtype=dace.int32, - storage=dace.StorageType.IPU_Memory, - 
location=None, - transient=False, + transient=True, strides=[1], offset=[0], lifetime=dace.AllocationLifetime.Scope, debuginfo=None, total_size=1) + # # Add a C array + # nsdfg.add_array('c', + # shape=[1], + # dtype=dace.int32, + # storage=dace.StorageType.IPU_Memory, + # location=None, + # transient=True, + # strides=[1], + # offset=[0], + # lifetime=dace.AllocationLifetime.State, + # debuginfo=None, total_size=1) nsdfg.add_symbol('i', dace.int32) @@ -145,7 +145,7 @@ def ipu_test1(): dtype=dace.int32, storage=dace.StorageType.IPU_Memory, location=None, - transient=False, + transient=True, strides=[1], offset=[0], lifetime=dace.AllocationLifetime.Scope, @@ -189,4 +189,4 @@ def ipu_test1(): # nested() # ipu_vector_add_python_copy() - \ No newline at end of file + From d1971bdf01bbe1a5a845b9a824cbb555348c50ba Mon Sep 17 00:00:00 2001 From: Sameeran Joshi Date: Thu, 10 Oct 2024 21:57:05 +0000 Subject: [PATCH 68/77] cosmetic changes, remove Dead code, iondent --- dace/codegen/targets/ipu.py | 103 ++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 45 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 965ec3effe..040ca31e12 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -257,21 +257,64 @@ def node_dispatch_predicate(self, sdfg, state, node): ############################################################################################################ # IPU specific node/state generation ############################################################################################################ - # def copy_memory( - # self, - # sdfg: SDFG, - # cfg: ControlFlowRegion, - # dfg: StateSubgraphView, - # state_id: int, - # src_node: Union[nodes.Tasklet, nodes.AccessNode], - # dst_node: Union[nodes.Tasklet, nodes.AccessNode], - # edge: MultiConnectorEdge, - # function_stream: CodeIOStream, - # callsite_stream: CodeIOStream, - # ) -> None: - # return self.cpu_codegen.copy_memory(sdfg, cfg, 
dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream) - # return super().copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream) + + # def declare_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, + # nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream) -> None: + # self.cpu_codegen.declare_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream) + + # def allocate_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, + # nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream, + # allocation_stream: CodeIOStream) -> None: + + # # if user provided this storage type, then we dump what they said. + # if nodedesc.storage == dtypes.StorageType.IPU_Tile_Local: + # name = node.data + # size = nodedesc.total_size + # ipu_type = "FLOAT" + # self.dispatcher.defined_vars.add(name, DefinedType.Scalar, ipu_type) + # declaration_stream.write(f'_state->graph.addVariable({ipu_type}, [{size}], {name});', cfg, state_id, node) + # return + + # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream, + # allocation_stream) + + # def deallocate_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, + # node: nodes.Node, nodedesc: data.Data, function_stream: CodeIOStream, + # callsite_stream: CodeIOStream) -> None: + # # unless any cpu allocations no need for IPUs + # pass + # # return self.cpu_codegen.deallocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, + # # callsite_stream) + + # def allocate_array(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, + # node: nodes.AccessNode, nodedesc: data.Array, function_stream: CodeIOStream, + # declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + # # Make sure 
the codegen includes the appropriate header files + # self.add_header(function_stream) + + # name = node.data + # print("ALLOCATE ARRAY - ", name) + # # # Based on the hardware, the total size must be 16^2 + # # assert nodedesc.total_size == 16 * 16 + # # # Majority is detected by the strides of the data + # # maj = 'row' if nodedesc.strides[-1] == 1 else 'col' + + # # Write a fragment based on the storage type + # if nodedesc.storage == dace.StorageType.TensorCore_Accumulator: + # ctype = 'wmma::fragment' + # declaration_stream.write(f'{ctype} {name};', cfg, state_id, node) + # # else: + # # ctype = 'wmma::fragment'.format( + # # mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj) + # # declaration_stream.write(f'{ctype} {name};', cfg, state_id, node) + + # # # Add the ctype to defined_vars so that the codegen can properly pass + # # # fragments to functions as an object reference. + # self._dispatcher.defined_vars.add(name, DefinedType.Object, ctype) + # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, + # allocation_stream) + # def declare_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, # nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream) -> None: # self.cpu_codegen.declare_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream) @@ -506,7 +549,7 @@ def decidemapping(self, dataname, nodedesc, sdfg): return dataToTileMap # TODO:Similar mapVertexOntile - def mapdataontile(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + def mapdataontile(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: if isinstance(nodedesc, dace.data.Array): @@ -578,7 +621,6 @@ def 
allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV # Mapping on tiles - def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -1162,32 +1204,3 @@ def add_header(self, function_stream: CodeIOStream): # function_stream, # callsite_stream, # skip_entry_node=True) - - # def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSubgraphView, state_id: int, - # function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - - # function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', cfg, state_id, map_header) - - - # # Add extra opening brace (dynamic map ranges, closed in MapExit - # # generator) - # callsite_stream.write('{', cfg, state_id, map_header) - - # if len(map_header.map.params) > 1: - # raise NotImplementedError('Multi-dimensional MPI maps are not supported') - - # state = cfg.state(state_id) - # symtypes = map_header.new_symbols(sdfg, state, state.symbols_defined_at(map_header)) - - - # #$$$$ First dace::copy() - # for var, r in zip(map_header.map.params, map_header.map.range): - # begin, end, skip = r - - # callsite_stream.write('{\n', cfg, state_id, map_header) - # callsite_stream.write( - # '%s %s = %s + __dace_comm_rank * (%s);\n' % - # (symtypes[var], var, cppunparse.pyexpr2cpp(symbolic.symstr(begin, cpp_mode=True)), - # cppunparse.pyexpr2cpp(symbolic.symstr(skip, cpp_mode=True))), cfg, state_id, map_header) - - # self._frame.allocate_arrays_in_scope(sdfg, cfg, map_header, function_stream, callsite_stream) From 835bd81828cbf30f92f4982c96ec203e00328518 Mon Sep 17 00:00:00 2001 From: Sameeran Joshi Date: Thu, 10 Oct 2024 22:02:09 +0000 Subject: [PATCH 69/77] Fix bug in is_ipu_kernel, was failing for tests where the first accessnode is not IPU_Memory --- dace/codegen/targets/ipu.py | 8 +++----- 1 file changed, 3 
insertions(+), 5 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 040ca31e12..067b69c3ba 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -44,10 +44,10 @@ def is_ipu_kernel(sdfg, state): """ - Returns whether the given state is an FPGA kernel and should be dispatched - to the FPGA code generator. + Returns whether the given state is an IPU kernel and should be dispatched + to the IPU code generator. - :return: True if this is an FPGA kernel, False otherwise. + :return: True if this is an IPU kernel, False otherwise. """ # pdb.set_trace() data_nodes = state.data_nodes() @@ -59,8 +59,6 @@ def is_ipu_kernel(sdfg, state): at_least_one_ipu_allocated_array = True if isinstance(desc, data.Scalar): continue - if desc.storage != dtypes.StorageType.IPU_Memory: - return False return at_least_one_ipu_allocated_array @registry.autoregister_params(name='ipu') From 0e43bb64c9015992a8f7e4f111c8a7870e8a67e3 Mon Sep 17 00:00:00 2001 From: Sameeran Joshi Date: Fri, 11 Oct 2024 01:14:17 +0000 Subject: [PATCH 70/77] Fix mapping and variable allocation, remove Dead code --- dace/codegen/targets/ipu.py | 151 ++++++------------------------------ 1 file changed, 23 insertions(+), 128 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 067b69c3ba..4ba95b8944 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -256,118 +256,6 @@ def node_dispatch_predicate(self, sdfg, state, node): # IPU specific node/state generation ############################################################################################################ - - # def declare_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, - # nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream) -> None: - # self.cpu_codegen.declare_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream) - - # def 
allocate_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, - # nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream, - # allocation_stream: CodeIOStream) -> None: - - # # if user provided this storage type, then we dump what they said. - # if nodedesc.storage == dtypes.StorageType.IPU_Tile_Local: - # name = node.data - # size = nodedesc.total_size - # ipu_type = "FLOAT" - # self.dispatcher.defined_vars.add(name, DefinedType.Scalar, ipu_type) - # declaration_stream.write(f'_state->graph.addVariable({ipu_type}, [{size}], {name});', cfg, state_id, node) - # return - - # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream, - # allocation_stream) - - # def deallocate_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, - # node: nodes.Node, nodedesc: data.Data, function_stream: CodeIOStream, - # callsite_stream: CodeIOStream) -> None: - # # unless any cpu allocations no need for IPUs - # pass - # # return self.cpu_codegen.deallocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, - # # callsite_stream) - - # def allocate_array(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, - # node: nodes.AccessNode, nodedesc: data.Array, function_stream: CodeIOStream, - # declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): - # # Make sure the codegen includes the appropriate header files - # self.add_header(function_stream) - - # name = node.data - # print("ALLOCATE ARRAY - ", name) - # # # Based on the hardware, the total size must be 16^2 - # # assert nodedesc.total_size == 16 * 16 - # # # Majority is detected by the strides of the data - # # maj = 'row' if nodedesc.strides[-1] == 1 else 'col' - - # # Write a fragment based on the storage type - # if nodedesc.storage == dace.StorageType.TensorCore_Accumulator: - # ctype = 'wmma::fragment' - # 
declaration_stream.write(f'{ctype} {name};', cfg, state_id, node) - # # else: - # # ctype = 'wmma::fragment'.format( - # # mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj) - # # declaration_stream.write(f'{ctype} {name};', cfg, state_id, node) - - # # # Add the ctype to defined_vars so that the codegen can properly pass - # # # fragments to functions as an object reference. - # self._dispatcher.defined_vars.add(name, DefinedType.Object, ctype) - # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, - # allocation_stream) - - # def declare_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, - # nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream) -> None: - # self.cpu_codegen.declare_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream) - - # def allocate_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, - # nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream, - # allocation_stream: CodeIOStream) -> None: - - # # if user provided this storage type, then we dump what they said. 
- # if nodedesc.storage == dtypes.StorageType.IPU_Tile_Local: - # name = node.data - # size = nodedesc.total_size - # ipu_type = "FLOAT" - # self.dispatcher.defined_vars.add(name, DefinedType.Scalar, ipu_type) - # declaration_stream.write(f'_state->graph.addVariable({ipu_type}, [{size}], {name});', cfg, state_id, node) - # return - - # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream, - # allocation_stream) - - # def deallocate_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, - # node: nodes.Node, nodedesc: data.Data, function_stream: CodeIOStream, - # callsite_stream: CodeIOStream) -> None: - # # unless any cpu allocations no need for IPUs - # pass - # # return self.cpu_codegen.deallocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, - # # callsite_stream) - - # def allocate_array(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, - # node: nodes.AccessNode, nodedesc: data.Array, function_stream: CodeIOStream, - # declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): - # # Make sure the codegen includes the appropriate header files - # self.add_header(function_stream) - - # name = node.data - # print("ALLOCATE ARRAY - ", name) - # # # Based on the hardware, the total size must be 16^2 - # # assert nodedesc.total_size == 16 * 16 - # # # Majority is detected by the strides of the data - # # maj = 'row' if nodedesc.strides[-1] == 1 else 'col' - - # # Write a fragment based on the storage type - # if nodedesc.storage == dace.StorageType.TensorCore_Accumulator: - # ctype = 'wmma::fragment' - # declaration_stream.write(f'{ctype} {name};', cfg, state_id, node) - # # else: - # # ctype = 'wmma::fragment'.format( - # # mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj) - # # declaration_stream.write(f'{ctype} {name};', cfg, state_id, node) - - # # # Add the ctype to defined_vars so that the codegen can properly 
pass - # # # fragments to functions as an object reference. - # self._dispatcher.defined_vars.add(name, DefinedType.Object, ctype) - # self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, - # allocation_stream) def allocate_ipu_scalar(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: @@ -409,27 +297,36 @@ def allocate_ipu_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgr arrsize = nodedesc.total_size is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) #arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) - ctypedef = 'Tensor *' + ctypedef = 'Tensor' shape = nodedesc.shape dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) # Check if array is already declared declared = self.dispatcher.declared_arrays.has(dataname) + # # if user provided this storage type, then we dump what they said. 
+ # if nodedesc.storage == dtypes.StorageType.IPU_Tile_Local: + # name = node.data + # size = nodedesc.total_size + # ipu_type = "FLOAT" + # self.dispatcher.defined_vars.add(name, DefinedType.Scalar, ipu_type) + # declaration_stream.write(f'_state->graph.addVariable({ipu_type}, [{size}], {name});', cfg, state_id, node) + # return + # Different types of memories if nodedesc.storage == dtypes.StorageType.IPU_Memory: if not declared: result_decl.write('%s %s;\n' % (ctypedef, dataname)) # Tensor *p; - self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) + self.dispatcher.defined_vars.add(dataname, DefinedType.Object, ctypedef) if nodedesc.pool: raise NotImplementedError("Pool not implemented yet " + str(nodedesc.storage)) else: shape_poplar_format = ', '.join([str(sh) for sh in shape]) - result_alloc.write("%s = _state->graph.addVariable(%s, {%s});\n" % (dataname, ipu_utils.TYPE_TO_IPU[nodedesc.dtype], shape_poplar_format)) + result_alloc.write("__state.poplar_context->tensors[\"%s\"] = __state.poplar_context->graph.addVariable(%s, {%s}, \"%s\");\n" % (dataname, ipu_utils.TYPE_TO_IPU[nodedesc.dtype], shape_poplar_format, dataname)) else: raise NotImplementedError("IPU: Unimplemented StorageType " + str(nodedesc.storage)) - declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) + # declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node) def allocate_ipu_stream(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, @@ -560,22 +457,20 @@ def mapdataontile(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV def mapArrayOnTile(self, sdfg, cfg, state_id, node, nodedesc, allocation_stream): dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) # Map array intelligently - spreadOverTiles = True - if spreadOverTiles: + setTileMappingCall = StringIO() + spreadOverTilesManually = False + + if spreadOverTilesManually: 
dataToTileMap = self.decidemapping(dataname, nodedesc, sdfg) # Map array over multiple tiles - # loop over the dataToTileMap and set the mapping - # import pprint - # pprint.pprint(dataToTileMap) - + # loop over the dataToTileMap and set the mapping for data, tilenumber in dataToTileMap.items(): - setTileMappingCall = f"_state->graph.setTileMapping({data}, {tilenumber});" - allocation_stream.write(setTileMappingCall, cfg, state_id, node) + setTileMappingCall.write(f"_state->graph.setTileMapping({data}, {tilenumber});") else: - # Map array, given only 1 element maps on one tile - tilenumber = 0 - setTileMappingCall = f"_state->graph.setTileMapping({dataname}, {tilenumber});" - allocation_stream.write(setTileMappingCall, cfg, state_id, node) + # Map linearly over tiles, let poplar decide + setTileMappingCall.write(f"poputil::mapTensorLinearly(__state.poplar_context->graph, __state.poplar_context->tensors[\"{dataname}\"]);") + + allocation_stream.write(setTileMappingCall.getvalue(), cfg, state_id, node) def mapScalarOnTile(self, sdfg, cfg, state_id, node, nodedesc, allocation_stream): dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame) From 1164d69ce808799393f23a218f68dc46999838e8 Mon Sep 17 00:00:00 2001 From: Sameeran Joshi Date: Fri, 11 Oct 2024 02:07:36 +0000 Subject: [PATCH 71/77] Try adding generate_node() - fails as the predicate fails, as there is no schedule= in code --- dace/codegen/targets/ipu.py | 42 +++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 4ba95b8944..bd29d58cf8 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -124,8 +124,7 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): # # Dispatchers # self.dispatcher.register_map_dispatcher(dace.ScheduleType.IPU_Map, self) # self.dispatcher.register_node_dispatcher(self, self.is_ipu_map_scope) - # 
self.dispatcher.register_node_dispatcher(self, self.is_node_library_node) - # self.dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) + self.dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) # self.dispatcher.register_copy_dispatcher(dtypes.StorageType.Register, dtypes.StorageType.IPU_Tile_Local, None, func=self) # self._dispatcher.register_map_dispatcher(dace.ScheduleType.IPU, self) # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) @@ -251,7 +250,11 @@ def is_node_library_node(self, sdfg, state, node): return False def node_dispatch_predicate(self, sdfg, state, node): - return True + if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes + if node.schedule in dtypes.IPU_SCHEDULES: + return True + return False + ############################################################################################################ # IPU specific node/state generation ############################################################################################################ @@ -549,15 +552,28 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + method_name = "_generate_" + type(node).__name__ print("Generating node: ", node.label) - # Dynamically obtain node generator according to class name - # gen = getattr(self, '_generate_' + type(node).__name__, False) - # if gen is not False: # Not every node type has a code generator here - # gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - # return + print("Method name: ", method_name) + # Fake inheritance... 
use this class' method if it exists, + # otherwise fall back on CPU codegen + if hasattr(self, method_name): + + if hasattr(node, "schedule") and node.schedule not in [ + dtypes.ScheduleType.Default, dtypes.ScheduleType.IPU_SCHEDULE]: + warnings.warn("Found schedule {} on {} node in FPGA code. " + "Ignoring.".format(node.schedule, + type(node).__name__)) + + getattr(self, method_name)(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + else: + old_codegen = self._cpu_codegen.calling_codegen + self._cpu_codegen.calling_codegen = self - # self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - + self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + + self._cpu_codegen.calling_codegen = old_codegen + # def generate_node(self, sdfg: SDFG, cfg: state.ControlFlowRegion, state: SDFGState, state_id: int, node: nodes.Node, # function_stream: CodeIOStream, callsite_stream: CodeIOStream): # """(TASKLET only) @@ -788,7 +804,11 @@ def generate_state(self, # self.frame.generate_ipu_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) self.generate_ipu_cpuside_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) - + + def _generate_Tasklet(self, *args, **kwargs): + # Call CPU implementation with this code generator as callback + self._cpu_codegen._generate_Tasklet(*args, codegen=self, **kwargs) + ############################################################################################################ # #### Helpers From 530e298f4116d0bb497653a5807b3972f02e9242 Mon Sep 17 00:00:00 2001 From: Sameeran Joshi Date: Thu, 17 Oct 2024 20:48:45 +0000 Subject: [PATCH 72/77] Add vector add test for dace and poplar --- graphcore_dace/SkeletonForIpu.cpp | 186 ++++++++++++++++++++++++++ graphcore_dace/vector_add_constant.py | 48 +++++++ 2 files changed, 234 insertions(+) create mode 100644 
graphcore_dace/SkeletonForIpu.cpp create mode 100644 graphcore_dace/vector_add_constant.py diff --git a/graphcore_dace/SkeletonForIpu.cpp b/graphcore_dace/SkeletonForIpu.cpp new file mode 100644 index 0000000000..1848b3cf5e --- /dev/null +++ b/graphcore_dace/SkeletonForIpu.cpp @@ -0,0 +1,186 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +using ::std::map; +using ::std::vector; +using ::std::string; +using ::std::optional; + +using ::poplar::FLOAT; +using ::poplar::OptionFlags; +using ::poplar::Tensor; +using ::poplar::Graph; +using ::poplar::Engine; +using ::poplar::Device; +using ::poplar::DeviceManager; +using ::poplar::TargetType; +using ::poplar::program::Program; +using ::poplar::program::Sequence; +using ::poplar::program::Copy; +using ::poplar::program::Repeat; +using ::poplar::program::Execute; + + +const auto NUM_DATA_ITEMS = 10; +const auto HOW_MUCH_TO_ADD = 2.0f; +const auto NUM_TILES_IN_GC = 10; + + +auto getIpuDevice(const unsigned int numIpus = 1) -> optional { + DeviceManager manager = DeviceManager::createDeviceManager(); + optional device = std::nullopt; + for (auto &d : manager.getDevices(TargetType::IPU, numIpus)) { + std::cout << "Trying to attach to IPU " << d.getId(); + if (d.attach()) { + std::cout << " - attached" << std::endl; + device = {std::move(d)}; + break; + } else { + std::cout << std::endl << "Error attaching to device" << std::endl; + } + } + return device; +} + +auto createGraphAndAddCodelets(const optional &device) -> Graph { + auto graph = poplar::Graph(device->getTarget()); + + // Add our custom codelet, building from CPP source + // with the given popc compiler options + graph.addCodelets({"codelets/SkeletonCodelets.cpp"}, "-O3 -I codelets"); + + // Add the codelets for the popops librarys + popops::addCodelets(graph); + return graph; +} + +auto buildComputeGraph(Graph &graph, map &tensors, map &programs, const int numTiles) { + // Add 
tensors + tensors["data"] = graph.addVariable(poplar::FLOAT, {NUM_DATA_ITEMS}, "data"); + poputil::mapTensorLinearly(graph, tensors["data"]); + + + // Add programs and wire up data + const auto NumElemsPerTile = NUM_DATA_ITEMS / numTiles; + auto cs = graph.addComputeSet("loopBody"); + for (auto tileNum = 0; tileNum < numTiles; tileNum++) { + const auto sliceEnd = std::min((tileNum + 1) * NumElemsPerTile, (int) NUM_DATA_ITEMS); + const auto sliceStart = tileNum * NumElemsPerTile; + + auto v = graph.addVertex(cs, "SkeletonVertex", { + {"data", tensors["data"].slice(sliceStart, sliceEnd)} + }); + graph.setInitialValue(v["howMuchToAdd"], HOW_MUCH_TO_ADD); + // graph.setPerfEstimate(v, 100); // Ideally you'd get this as right as possible + graph.setTileMapping(v, tileNum); + } + auto executeIncrementVertex = Execute(cs); + + // auto mainProgram = Repeat(1, executeIncrementVertex, "repeat1x"); + programs["main"] = executeIncrementVertex; // Program 0 will be the main program +} + +auto defineDataStreams(Graph &graph, map &tensors, map &programs) { + auto toIpuStream = graph.addHostToDeviceFIFO("TO_IPU", FLOAT, NUM_DATA_ITEMS); + auto fromIpuStream = graph.addDeviceToHostFIFO("FROM_IPU", FLOAT, NUM_DATA_ITEMS); + + auto copyToIpuProgram = Copy(toIpuStream, tensors["data"]); + auto copyToHostProgram = Copy(tensors["data"], fromIpuStream); + + programs["copy_to_ipu"] = copyToIpuProgram; + programs["copy_to_host"] = copyToHostProgram; +} + +auto serializeGraph(const Graph &graph) { + std::ofstream graphSerOfs; + graphSerOfs.open("serialized_graph.capnp", std::ofstream::out | std::ofstream::trunc); + + graph.serialize(graphSerOfs, poplar::SerializationFormat::Binary); + graphSerOfs.close(); +} + +int main(int argc, char *argv[]) { + std::cout << "STEP 1: Connecting to an IPU device" << std::endl; + auto device = getIpuDevice(1); + if (!device.has_value()) { + std::cerr << "Could not attach to an IPU device. 
Aborting" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "STEP 2: Create graph and compile codelets" << std::endl; + auto graph = createGraphAndAddCodelets(device); + + + std::cout << "STEP 3: Building the compute graph" << std::endl; + auto tensors = map{}; + auto programs = map{}; + buildComputeGraph(graph, tensors, programs, NUM_TILES_IN_GC /* numTiles */); + + std::cout << "STEP 4: Define data streams" << std::endl; + defineDataStreams(graph, tensors, programs); + + std::cout << "STEP 5: Create engine and compile graph" << std::endl; + auto ENGINE_OPTIONS = OptionFlags{ + {"target.saveArchive", "archive.a"}, + {"autoReport.all", "true"}, + {"autoReport.outputSerializedGraph", "true"}, + }; + + auto programIds = map(); + auto programsList = vector(programs.size()); + int index = 0; + for (auto &nameToProgram: programs) { + programIds[nameToProgram.first] = index; + programsList[index] = nameToProgram.second; + index++; + } + auto engine = Engine(graph, programsList, ENGINE_OPTIONS); + + std::cout << "STEP 6: Load compiled graph onto the IPU tiles" << std::endl; + engine.load(*device); + engine.enableExecutionProfiling(); + + + std::cout << "STEP 7: Attach data streams" << std::endl; + auto hostData = vector(NUM_DATA_ITEMS, 1.0f); + // print before + std::cout << "\nBefore: "; + for (auto i = 0; i < NUM_DATA_ITEMS; i++) { + std::cout << hostData[i] << " "; + } + std::cout << "\nHow much to add: " << HOW_MUCH_TO_ADD << std::endl; + engine.connectStream("TO_IPU", hostData.data()); + engine.connectStream("FROM_IPU", hostData.data()); + + std::cout << "\nSTEP 8: Run programs" << std::endl; + engine.run(programIds["copy_to_ipu"]); // Copy to IPU + engine.run(programIds["main"]); // Main program + engine.run(programIds["copy_to_host"]); // Copy from IPU + + std::cout << "\nSTEP 9: Check results" << std::endl; + // print hostData to see the result + for (auto i = 0; i < NUM_DATA_ITEMS; i++) { + std::cout << hostData[i] << " "; + } + + + std::cout << "\nSTEP 
10: Capture debug and profile info" << std::endl; + // serializeGraph(graph); + // engine.printProfileSummary(std::cout, + // OptionFlags{{"showExecutionSteps", "false"}}); + + return EXIT_SUCCESS; +} diff --git a/graphcore_dace/vector_add_constant.py b/graphcore_dace/vector_add_constant.py new file mode 100644 index 0000000000..12c54a9360 --- /dev/null +++ b/graphcore_dace/vector_add_constant.py @@ -0,0 +1,48 @@ +import dace +import numpy as np + +def vector_add_constant_sdfg(): + # Define the SDFG + sdfg = dace.SDFG('vector_add_constant_sdfg') + + # Add arrays + sdfg.add_array('A', [10], dace.float64) + sdfg.add_array('B', [10], dace.float64) + sdfg.add_array('C', [10], dace.float64) + + # Add state + state = sdfg.add_state('compute_state') + + # Add read and write nodes + A_read = state.add_read('A') + B_read = state.add_read('B') + C_write = state.add_write('C') + + # # Add map + map_entry, map_exit = state.add_map('add_map', dict(i='0:10')) + + # # Add tasklet + tasklet = state.add_tasklet('add_constant', {'a_in', 'b_in'}, {'c_out'}, 'c_out = a_in + b_in') + + # # Connect nodes with memlets + state.add_memlet_path(A_read, map_entry, tasklet, dst_conn='a_in', memlet=dace.Memlet('A[i]')) + state.add_memlet_path(B_read, map_entry, tasklet, dst_conn='b_in', memlet=dace.Memlet('B[i]')) + state.add_memlet_path(tasklet, map_exit, C_write, src_conn='c_out', memlet=dace.Memlet('C[i]')) + + + # Runtime code + # Initialize data + A = np.ones(10, dtype=np.float64) + B = np.ones(10, dtype=np.float64) + C = np.zeros(10, dtype=np.float64) + + # Run the SDFG + sdfg(A=A, B=B, C=C) + + # Print the result + print(A) + print(B) + print(C) + +if __name__ == "__main__": + vector_add_constant_sdfg() \ No newline at end of file From 1334015ebc2cf085e428798ef89f2688b7d0a913 Mon Sep 17 00:00:00 2001 From: Sameeran Joshi Date: Fri, 18 Oct 2024 21:57:50 +0000 Subject: [PATCH 73/77] Add scalar code using vector of size 1 --- graphcore_dace/scalar_1_add_constant.py | 49 
+++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 graphcore_dace/scalar_1_add_constant.py diff --git a/graphcore_dace/scalar_1_add_constant.py b/graphcore_dace/scalar_1_add_constant.py new file mode 100644 index 0000000000..06675cee0a --- /dev/null +++ b/graphcore_dace/scalar_1_add_constant.py @@ -0,0 +1,49 @@ +import dace +import numpy as np + + +def array_add_constant_sdfg(): + # Define the SDFG + sdfg = dace.SDFG('array_add_constant_sdfg') + + # Add arrays + sdfg.add_array('A', [1], dace.float64, storage=dace.StorageType.IPU_Memory, transient=False) + sdfg.add_array('B', [1], dace.float64, storage=dace.StorageType.IPU_Memory, transient=False) + sdfg.add_array('C', [1], dace.float64, storage=dace.StorageType.IPU_Memory, transient=False) + + # Add state + state = sdfg.add_state('compute_state') + + # Add read and write nodes + A_read = state.add_read('A') + B_read = state.add_read('B') + C_write = state.add_write('C') + + # Add map + map_entry, map_exit = state.add_map('map', dict(i='0:1')) + + # Add tasklet + tasklet = state.add_tasklet('add_constant', {'a_in', 'b_in'}, {'c_out'}, 'c_out = a_in + b_in') + + # Connect nodes with memlets + state.add_memlet_path(A_read, map_entry, tasklet, dst_conn='a_in', memlet=dace.Memlet('A[i]')) + state.add_memlet_path(B_read, map_entry, tasklet, dst_conn='b_in', memlet=dace.Memlet('B[i]')) + state.add_memlet_path(tasklet, map_exit, C_write, src_conn='c_out', memlet=dace.Memlet('C[i]')) + + # Runtime code + # Initialize data + A = np.ones(1, dtype=np.float64) + B = np.ones(1, dtype=np.float64) + C = np.zeros(1, dtype=np.float64) + + # Run the SDFG + sdfg(A=A, B=B, C=C) + + # Print the result + print("A:", A) + print("B:", B) + print("C:", C) + +if __name__ == "__main__": + array_add_constant_sdfg() + From 7e54d70cc8c145b21c246ed0bbf5173651bd4f1b Mon Sep 17 00:00:00 2001 From: Sameeran Joshi Date: Fri, 18 Oct 2024 22:28:01 +0000 Subject: [PATCH 74/77] Remove prints --- dace/codegen/codegen.py | 3 --- 
dace/codegen/targets/ipu.py | 7 ------- 2 files changed, 10 deletions(-) diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index 56125bb6fd..36403d01b9 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -250,9 +250,6 @@ def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]: target_objects.extend(tgt.get_generated_codeobjects()) # Ensure that no new targets were dynamically added - print("\nused_targets = ", frame._dispatcher.used_targets) - print("\nframe.targets = ", frame.targets) - print("\nframe=", frame) assert frame._dispatcher.used_targets == (frame.targets - {frame}) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index bd29d58cf8..31b441f38e 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -244,7 +244,6 @@ def is_node_tasklet(self, sdfg, state, node): return False def is_node_library_node(self, sdfg, state, node): - print("NODE is = ", type(node).__name__) if isinstance(node, nodes.LibraryNode): return True return False @@ -553,8 +552,6 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: method_name = "_generate_" + type(node).__name__ - print("Generating node: ", node.label) - print("Method name: ", method_name) # Fake inheritance... 
use this class' method if it exists, # otherwise fall back on CPU codegen if hasattr(self, method_name): @@ -757,7 +754,6 @@ def generate_state(self, function_stream: CodeIOStream, callsite_stream:CodeIOStream, generate_state_footer:bool = True): - print("IPU STATE\n") # disp = self.dispatcher.get_scope_dispatcher(dtypes.ScheduleType.Unrolled) ipu_disp = self.dispatcher.get_state_dispatcher(sdfg, state=state) cpu_disp = self.cpu_codegen @@ -767,7 +763,6 @@ def generate_state(self, state_id = state.block_id if IPUCodeGen._in_device_code: - print("IN DEVICE CODE") to_allocate = dace.sdfg.local_transients(sdfg, state, None) allocated = set() @@ -789,9 +784,7 @@ def generate_state(self, self.generate_nested_state(sdfg, cfg, state, state.label, subgraphs, function_stream, callsite_stream) else: - print("IN HOST CODE") sdfg_state_name = cpp.mangle_dace_state_struct_name(self._global_sdfg) - print("SDFG STATE NAME: ", sdfg_state_name) formatted_string = """ // hack to make the files compile by forward declaring the functions From 3b1f4c79288e01b290374521156443fb33b5dd86 Mon Sep 17 00:00:00 2001 From: Sameeran Joshi Date: Sun, 20 Oct 2024 23:31:22 +0000 Subject: [PATCH 75/77] new tests 1.copy a -> b on both IPU and dace test --- graphcore_dace/copy_a_b_skeletonIPU.cpp | 219 ++++++++++++++++++++++++ graphcore_dace/copy_a_to_b.py | 45 +++++ graphcore_dace/scalar_1_add_constant.py | 2 +- 3 files changed, 265 insertions(+), 1 deletion(-) create mode 100644 graphcore_dace/copy_a_b_skeletonIPU.cpp create mode 100644 graphcore_dace/copy_a_to_b.py diff --git a/graphcore_dace/copy_a_b_skeletonIPU.cpp b/graphcore_dace/copy_a_b_skeletonIPU.cpp new file mode 100644 index 0000000000..9995547f33 --- /dev/null +++ b/graphcore_dace/copy_a_b_skeletonIPU.cpp @@ -0,0 +1,219 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +using namespace poplar; +using namespace poplar::program; + +using ::std::map; 
+using ::std::vector; +using ::std::string; +using ::std::optional; + +// using ::poplar::FLOAT; +// using ::poplar::OptionFlags; +// using ::poplar::Tensor; +// using ::poplar::Graph; +// using ::poplar::Engine; +// using ::poplar::Device; +// using ::poplar::DeviceManager; +// using ::poplar::TargetType; +// using ::poplar::program::Program; +// using ::poplar::program::Sequence; +// using ::poplar::program::Copy; +// using ::poplar::program::Repeat; +// using ::poplar::program::Execute; + + +const auto NUM_DATA_ITEMS = 1; + +auto getIpuDevice(const unsigned int numIpus = 1) -> optional { + DeviceManager manager = DeviceManager::createDeviceManager(); + optional device = std::nullopt; + for (auto &d : manager.getDevices(TargetType::IPU, numIpus)) { + std::cout << "Trying to attach to IPU " << d.getId(); + if (d.attach()) { + std::cout << " - attached" << std::endl; + device = {std::move(d)}; + break; + } else { + std::cout << std::endl << "Error attaching to device" << std::endl; + } + } + return device; +} + +auto createGraphAndAddCodelets(const optional &device) -> Graph { + auto graph = poplar::Graph(device->getTarget()); + + // Add our custom codelet, building from CPP source + // with the given popc compiler options + // graph.addCodelets({"codelets/SkeletonCodelets.cpp"}, "-O3 -I codelets"); + + // Add the codelets for the popops librarys + // popops::addCodelets(graph); + return graph; +} + +auto buildComputeGraph(Graph &graph, map &tensors, map &programs) { + // Add tensors + tensors["v1"] = graph.addVariable(poplar::FLOAT, {NUM_DATA_ITEMS}, "v1"); + poputil::mapTensorLinearly(graph, tensors["v1"]); + + tensors["v2"] = graph.addVariable(poplar::FLOAT, {NUM_DATA_ITEMS}, "v2"); + poputil::mapTensorLinearly(graph, tensors["v2"]); // both v1 v2 will be on same tile + + // real magic happens here + auto copyprogram = Copy(tensors["v1"], tensors["v2"]); // tile to tile + programs["main"] = copyprogram; + + // print_before = program::PrintTensor("v1-debug", v1); 
+ // programs["print_before"] = print_before; + + // print_after = program::PrintTensor("v2-debug", v2); + // programs["print_after"] = print_after; + +} + +auto defineDataStreams(Graph &graph, map &tensors, map &programs) { + auto toIpuStream = graph.addHostToDeviceFIFO("TO_IPU", FLOAT, NUM_DATA_ITEMS); + auto fromIpuStream = graph.addDeviceToHostFIFO("FROM_IPU", FLOAT, NUM_DATA_ITEMS); + + auto copyToIpuProgramv1 = Copy(toIpuStream, tensors["v1"]); // host->device + auto copyToIpuProgramv2 = Copy(toIpuStream, tensors["v2"]); + + // print these tensors + auto copyToHostProgramv1 = Copy(tensors["v1"], fromIpuStream); + auto copyToHostProgramv2 = Copy(tensors["v2"], fromIpuStream); // device->host + + // auto printit_v1 = PrintTensor("v1-debug", tensors["v1"]); + // auto printit_v2 = PrintTensor("v2-debug", tensors["v2"]); + // auto printit_v1_after = PrintTensor("v1-debug-after", tensors["v1"]); + // auto printit_v2_after = PrintTensor("v2-debug-after", tensors["v2"]); + // programs["print_v1_before"] = printit_v1; + // programs["print_v2_before"] = printit_v2; + // programs["print_v1_after"] = printit_v1_after; + // programs["print_v2_after"] = printit_v2_after; + + programs["copy_to_ipu_v1"] = copyToIpuProgramv1; + programs["copy_to_ipu_v2"] = copyToIpuProgramv2; + programs["copy_to_host_v1"] = copyToHostProgramv1; + programs["copy_to_host_v2"] = copyToHostProgramv2; + +} + +auto serializeGraph(const Graph &graph) { + std::ofstream graphSerOfs; + graphSerOfs.open("serialized_graph.capnp", std::ofstream::out | std::ofstream::trunc); + + graph.serialize(graphSerOfs, poplar::SerializationFormat::Binary); + graphSerOfs.close(); +} + +void print_data(std::vector &v1_host, std::vector& v2_host) { + std::cout << "v1: "; + for (auto i = 0; i < NUM_DATA_ITEMS; i++) { + std::cout << v1_host[i] << " "; + } + std::cout << std::endl; + std::cout << "v2: "; + for (auto i = 0; i < NUM_DATA_ITEMS; i++) { + std::cout << v2_host[i] << " "; + } + std::cout << std::endl; + +} + +int 
main(int argc, char *argv[]) { + std::cout << "STEP 1: Connecting to an IPU device" << std::endl; + auto device = getIpuDevice(1); + if (!device.has_value()) { + std::cerr << "Could not attach to an IPU device. Aborting" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "STEP 2: Create graph and compile codelets" << std::endl; + auto graph = createGraphAndAddCodelets(device); + + + std::cout << "STEP 3: Building the compute graph" << std::endl; + auto tensors = map{}; + auto programs = map{}; + buildComputeGraph(graph, tensors, programs); + + std::cout << "STEP 4: Define data streams" << std::endl; + defineDataStreams(graph, tensors, programs); + + std::cout << "STEP 5: Create engine and compile graph" << std::endl; + auto ENGINE_OPTIONS = OptionFlags{ + {"target.saveArchive", "archive.a"}, + {"autoReport.all", "true"}, + {"autoReport.outputSerializedGraph", "true"}, + }; + + auto programIds = map(); + auto programsList = vector(programs.size()); + int index = 0; + for (auto &nameToProgram: programs) { + programIds[nameToProgram.first] = index; + programsList[index] = nameToProgram.second; + index++; + } + auto engine = Engine(graph, programsList, ENGINE_OPTIONS); + + std::cout << "STEP 6: Load compiled graph onto the IPU tiles" << std::endl; + engine.load(*device); + engine.enableExecutionProfiling(); + + std::cout << "STEP 7: Attach data streams(host to device data)" << std::endl; + auto v1_host = vector(NUM_DATA_ITEMS, 100.0f); // v1 = 1 + auto v2_host = vector(NUM_DATA_ITEMS, 0.0f); // v2 = 0 + vector vector_stream_in; + vector_stream_in.insert(vector_stream_in.end(), v1_host.begin(), + v1_host.end()); + vector_stream_in.insert(vector_stream_in.end(), v2_host.begin(), v2_host.end()); + + auto v1_host_out = vector(NUM_DATA_ITEMS, 0.0f); // Output buffer for v1 + auto v2_host_out = vector(NUM_DATA_ITEMS, 0.0f); // Output buffer for v2 + vector vector_stream_out; + vector_stream_out.insert(vector_stream_out.end(), v1_host_out.begin(), v1_host_out.end()); + 
vector_stream_out.insert(vector_stream_out.end(), v2_host_out.begin(), v2_host_out.end()); + + // print before + std::cout << "\nBefore: \n"; + print_data(v1_host, v2_host); + + engine.connectStream("TO_IPU", vector_stream_in.data(), vector_stream_in.data() + vector_stream_in.size()); + engine.connectStream("FROM_IPU", vector_stream_out.data(), vector_stream_out.data() + vector_stream_out.size()); + + std::cout << "\nSTEP 8: Run programs" << std::endl; + engine.run(programIds["copy_to_ipu_v1"]); // Copy to IPU + engine.run(programIds["copy_to_ipu_v2"]); // Copy to IPU + // engine.run(programIds["print_v1_before"]); // Print v1 + // engine.run(programIds["print_v2_before"]); // Print v2 + engine.run(programIds["main"]); // Main program + // engine.run(programIds["print_v1_after"]); // Print v1 + // engine.run(programIds["print_v2_after"]); // Print v2 + engine.run(programIds["copy_to_host_v1"]); // Copy from IPU + engine.run(programIds["copy_to_host_v2"]); // Copy from IPU + + std::cout << "\nSTEP 9: Check results after\n" << std::endl; + v1_host_out.assign(vector_stream_out.begin(), vector_stream_out.begin() + NUM_DATA_ITEMS); + v2_host_out.assign(vector_stream_out.begin() + NUM_DATA_ITEMS, vector_stream_out.end()); + print_data(v1_host_out, v2_host_out); + + return EXIT_SUCCESS; +} + diff --git a/graphcore_dace/copy_a_to_b.py b/graphcore_dace/copy_a_to_b.py new file mode 100644 index 0000000000..06a0cc8dd5 --- /dev/null +++ b/graphcore_dace/copy_a_to_b.py @@ -0,0 +1,45 @@ +import dace +import numpy as np + + +def copy_a_to_b(): + # Define the SDFG + sdfg = dace.SDFG('copy_a_to_b') + + # Add arrays + sdfg.add_array('A', [1], dace.float64) + sdfg.add_array('C', [1], dace.float64) + + # Add state + state = sdfg.add_state('compute_state') + + # Add read and write nodes + A_read = state.add_read('A') + C_write = state.add_write('C') + + # add edge + state.add_edge(A_read, None, C_write, None, dace.Memlet('A[0] -> C[0]')) + + 
############################################################### + # Runtime code + # Initialize data + A = np.ones(1, dtype=np.float64) + C = np.zeros(1, dtype=np.float64) + + # PRINT BEFORE + print("\nBefore") + print("A:", A) + print("C:", C) + + # Run the SDFG + sdfg(A=A, C=C) + + # Print the result + print ("\nAfter") + print("A:", A) + print("C:", C) + + ############################################################### +if __name__ == "__main__": + copy_a_to_b() + diff --git a/graphcore_dace/scalar_1_add_constant.py b/graphcore_dace/scalar_1_add_constant.py index 06675cee0a..2e9bf58788 100644 --- a/graphcore_dace/scalar_1_add_constant.py +++ b/graphcore_dace/scalar_1_add_constant.py @@ -20,7 +20,7 @@ def array_add_constant_sdfg(): C_write = state.add_write('C') # Add map - map_entry, map_exit = state.add_map('map', dict(i='0:1')) + map_entry, map_exit = state.add_map('map', dict(i='0:1'), schedule=dace.ScheduleType.Sequential) # Add tasklet tasklet = state.add_tasklet('add_constant', {'a_in', 'b_in'}, {'c_out'}, 'c_out = a_in + b_in') From 7537466acf1b7847e6e7707b04c35e6078124fb3 Mon Sep 17 00:00:00 2001 From: Sameeran Joshi Date: Mon, 21 Oct 2024 02:00:06 +0000 Subject: [PATCH 76/77] Add IPU_Memory to accessNode --- graphcore_dace/copy_a_to_b.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/graphcore_dace/copy_a_to_b.py b/graphcore_dace/copy_a_to_b.py index 06a0cc8dd5..294cb5f6eb 100644 --- a/graphcore_dace/copy_a_to_b.py +++ b/graphcore_dace/copy_a_to_b.py @@ -7,9 +7,9 @@ def copy_a_to_b(): sdfg = dace.SDFG('copy_a_to_b') # Add arrays - sdfg.add_array('A', [1], dace.float64) - sdfg.add_array('C', [1], dace.float64) - + sdfg.add_array('A', [1], dace.float64, storage=dace.StorageType.IPU_Memory) + sdfg.add_array('C', [1], dace.float64, storage=dace.StorageType.IPU_Memory) + # Add state state = sdfg.add_state('compute_state') From 6db5588eedd580ad2fc6807491b9460f66ff39ed Mon Sep 17 00:00:00 2001 From: Sameeran Joshi Date: Mon, 21 Oct 
2024 02:05:26 +0000 Subject: [PATCH 77/77] Most of the codegen is correct, generate_node() doesn't trigger, copy_memory() triggers --- dace/codegen/dispatcher.py | 2 + dace/codegen/targets/ipu.py | 521 +++++++++++++++++++++++++++++------- 2 files changed, 421 insertions(+), 102 deletions(-) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 3ac9e097f8..926154423f 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -444,6 +444,8 @@ def dispatch_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVi state = cfg.state(state_id) disp = self.get_node_dispatcher(sdfg, state, node) self._used_targets.add(disp) + # print debugging for the dispatcher + print("SJJ: Dispatching node", node, "to", disp) disp.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) def get_scope_dispatcher(self, schedule: dtypes.ScheduleType) -> target.TargetCodeGenerator: diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py index 31b441f38e..0cf5a07beb 100644 --- a/dace/codegen/targets/ipu.py +++ b/dace/codegen/targets/ipu.py @@ -1,5 +1,6 @@ # import # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+import inspect from io import StringIO from dace.codegen.codeobject import CodeObject import sympy @@ -98,17 +99,14 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): self._num_kernels = 0 self._host_codes = [] self._kernel_codes = [] - self._generated_nodes = [] + self._generated_nodes = set() + self._locals = cppunparse.CPPLocals() # Register dispatchers - self.cpu_codegen = self.dispatcher.get_generic_node_dispatcher() + self.cpu_codegen = self.dispatcher.get_generic_node_dispatcher() self.dispatcher.register_state_dispatcher(self, predicate=is_ipu_kernel) - # self.dispatcher.register_array_dispatcher(dtypes.StorageType.IPU_Tile_Local, self) - - # Storage - # ipu_storage = [dtypes.StorageType.IPU_Memory] ipu_storage = [dtypes.StorageType.IPU_Memory] self.dispatcher.register_array_dispatcher(ipu_storage, self) # allocate_array/deallocate_array for storage in ipu_storage: @@ -116,18 +114,11 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): self.dispatcher.register_copy_dispatcher(storage, other_storage, None, self) self.dispatcher.register_copy_dispatcher(other_storage, storage, None, self) - - - - - # # Dispatchers - # self.dispatcher.register_map_dispatcher(dace.ScheduleType.IPU_Map, self) + # self.dispatcher.register_map_dispatcher(dace.ScheduleType.Default, self) # self.dispatcher.register_node_dispatcher(self, self.is_ipu_map_scope) - self.dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) - # self.dispatcher.register_copy_dispatcher(dtypes.StorageType.Register, dtypes.StorageType.IPU_Tile_Local, None, func=self) - # self._dispatcher.register_map_dispatcher(dace.ScheduleType.IPU, self) - # self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + # self.dispatcher.register_node_dispatcher(self) + # self.dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) def preprocess(self, sdfg: SDFG) -> None: self.frame.statestruct.append('dace_poplar_context 
*poplar_context;') @@ -196,24 +187,29 @@ def get_generated_codeobjects(self): "\n{separator}\n\n{code}\n\n".format(separator="/" * 79, kernel_name=name, code=code) for (name, code) in self._host_codes]))) - host_code_obj = CodeObject(self.program_name, - host_code.getvalue(), - "cpp", - IPUCodeGen, - "IPU", - target_type="host") - - # Device object - kernel_code_objs = [ - CodeObject(kernel_name, - code, - "cpp", - IPUCodeGen, - "IPU", - target_type="device") for (kernel_name, code) in self._kernel_codes - ] + # only generate ipu/file.cpp when it's an IPU kernel, else only cpu/file.cpp + if is_ipu_kernel(self._global_sdfg, self._global_sdfg.node(0)): + host_code_obj = CodeObject(self.program_name, + host_code.getvalue(), + "cpp", + IPUCodeGen, + "IPU", + target_type="host") + return [host_code_obj] + else: + return [] + + # # Device object + # kernel_code_objs = [ + # CodeObject(kernel_name, + # code, + # "cpp", + # IPUCodeGen, + # "IPU", + # target_type="device") for (kernel_name, code) in self._kernel_codes + # ] - return [host_code_obj] + kernel_code_objs + # __dace_init_ function @property @@ -249,10 +245,13 @@ def is_node_library_node(self, sdfg, state, node): return False def node_dispatch_predicate(self, sdfg, state, node): + return True + retval = False if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes if node.schedule in dtypes.IPU_SCHEDULES: - return True - return False + retval = True + print("Node dispatch predicate: ", retval) + return retval ############################################################################################################ # IPU specific node/state generation @@ -293,7 +292,6 @@ def allocate_ipu_scalar(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubg def allocate_ipu_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: - 
result_decl = StringIO() result_alloc = StringIO() arrsize = nodedesc.total_size @@ -305,15 +303,7 @@ def allocate_ipu_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgr # Check if array is already declared declared = self.dispatcher.declared_arrays.has(dataname) - # # if user provided this storage type, then we dump what they said. - # if nodedesc.storage == dtypes.StorageType.IPU_Tile_Local: - # name = node.data - # size = nodedesc.total_size - # ipu_type = "FLOAT" - # self.dispatcher.defined_vars.add(name, DefinedType.Scalar, ipu_type) - # declaration_stream.write(f'_state->graph.addVariable({ipu_type}, [{size}], {name});', cfg, state_id, node) - # return - + # Different types of memories if nodedesc.storage == dtypes.StorageType.IPU_Memory: if not declared: @@ -484,7 +474,7 @@ def mapScalarOnTile(self, sdfg, cfg, state_id, node, nodedesc, allocation_stream def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: - + allocation_stream.write("// Allocating array %s\n" % node.data, cfg, state_id, node) if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) @@ -524,17 +514,19 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap pass # IPU variables are C++ objects and are automatically deallocated else: raise NotImplementedError("Unimplemented deallocate() for StorageType " + str(nodedesc.storage)) - + def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, src_node: Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], - memlet: Memlet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + edge: MultiConnectorEdge[mm.Memlet], 
function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + callsite_stream.write("// Copying from {} (name: {}) to {} (name: {}) with edge: {}\n".format( + src_node, src_node.label, dst_node, dst_node.label, edge), cfg, state_id) state = cfg.state(state_id) if isinstance(src_node, nodes.Tasklet): src_storage = dtypes.StorageType.Register src_parent = state.entry_node(src_node) dst_schedule = None if src_parent is None else src_parent.map.schedule else: - src_storage = src_node.desc(sdfg).storage + src_storage = src_node.desc(sdfg).storage if isinstance(dst_node, nodes.Tasklet): dst_storage = dtypes.StorageType.Register @@ -552,25 +544,32 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: method_name = "_generate_" + type(node).__name__ - # Fake inheritance... use this class' method if it exists, - # otherwise fall back on CPU codegen - if hasattr(self, method_name): - - if hasattr(node, "schedule") and node.schedule not in [ - dtypes.ScheduleType.Default, dtypes.ScheduleType.IPU_SCHEDULE]: - warnings.warn("Found schedule {} on {} node in FPGA code. 
" - "Ignoring.".format(node.schedule, - type(node).__name__)) - - getattr(self, method_name)(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - else: - old_codegen = self._cpu_codegen.calling_codegen - self._cpu_codegen.calling_codegen = self - - self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - - self._cpu_codegen.calling_codegen = old_codegen - + # print(method_name) + # function_stream.write(f"//SJJ: Generating node {node.label}, method name = {method_name} \n") + callsite_stream.write("// Generating Node: " + str(node) + ", Type: " + type(node).__name__ + ", Details: " + repr(node) + "\n", sdfg, state_id) + + try: + gen = getattr(self, "_generate_" + type(node).__name__) + except AttributeError: + if isinstance(node, nodes.LibraryNode): + raise NodeNotExpandedError(sdfg, state_id, dfg.node_id(node)) + raise + gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + # Mark node as "generated" + self._generated_nodes.add(node) + self._locals.clear_scope(self._ldepth + 1) + + + # else: + # old_codegen = self._cpu_codegen.calling_codegen + # self._cpu_codegen.calling_codegen = self + + # self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + + # self._cpu_codegen.calling_codegen = old_codegen + # Dynamically obtain node generator according to class name + + # def generate_node(self, sdfg: SDFG, cfg: state.ControlFlowRegion, state: SDFGState, state_id: int, node: nodes.Node, # function_stream: CodeIOStream, callsite_stream: CodeIOStream): # """(TASKLET only) @@ -761,12 +760,14 @@ def generate_state(self, self.dispatcher._used_targets.add(cpu_disp) state_id = state.block_id + subgraphs = dace.sdfg.concurrent_subgraphs(state) if IPUCodeGen._in_device_code: + print("device code") to_allocate = dace.sdfg.local_transients(sdfg, state, None) allocated = set() - subgraphs = dace.sdfg.concurrent_subgraphs(state) + for node in state.data_nodes(): 
data = node.desc(sdfg) @@ -778,11 +779,11 @@ def generate_state(self, raise cgx.CodegenError("Cannot allocate global memory from device code.") allocated.add(node.data) # Allocate transients - self._dispatcher.dispatch_allocate(sdfg, cfg, state, state_id, node, data, function_stream, + self.dispatcher.dispatch_allocate(sdfg, cfg, state, state_id, node, data, function_stream, callsite_stream) self.generate_nested_state(sdfg, cfg, state, state.label, subgraphs, function_stream, callsite_stream) - + else: sdfg_state_name = cpp.mangle_dace_state_struct_name(self._global_sdfg) formatted_string = """ @@ -795,16 +796,355 @@ def generate_state(self, function_stream.write(formatted_string) + self.generate_nested_state(sdfg, cfg, state, state.label, subgraphs, function_stream, callsite_stream) + # self.frame.generate_ipu_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) self.generate_ipu_cpuside_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) - def _generate_Tasklet(self, *args, **kwargs): - # Call CPU implementation with this code generator as callback - self._cpu_codegen._generate_Tasklet(*args, codegen=self, **kwargs) - + + def _generate_MapEntry( + self, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, + node: nodes.MapEntry, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + ): + callsite_stream.write(f"// Generating MapEntry {node.label}\n") + state_dfg = cfg.state(state_id) + map_params = node.map.params + + result = callsite_stream + map_header = "" + + # Encapsulate map with a C scope + # TODO: Refactor out of MapEntry generation (generate_scope_header?) 
+ callsite_stream.write('{', cfg, state_id, node) + + # Define all input connectors of this map entry + for e in dynamic_map_inputs(state_dfg, node): + if e.data.data != e.dst_conn: + callsite_stream.write( + self.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), cfg, + state_id, node) + + inner_stream = CodeIOStream() + self.generate_scope_preamble(sdfg, dfg, state_id, function_stream, callsite_stream, inner_stream) + + # Instrumentation: Pre-scope + instr = self._dispatcher.instrumentation[node.map.instrument] + if instr is not None: + instr.on_scope_entry(sdfg, state_dfg, node, callsite_stream, inner_stream, function_stream) + + # TODO: Refactor to generate_scope_preamble once a general code + # generator (that CPU inherits from) is implemented + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): + # OpenMP header + in_persistent = False + if node.map.schedule == dtypes.ScheduleType.CPU_Multicore: + in_persistent = is_in_scope(sdfg, state_dfg, node, [dtypes.ScheduleType.CPU_Persistent]) + if in_persistent: + # If already in a #pragma omp parallel, no need to use it twice + map_header += "#pragma omp for" + # TODO(later): barriers and map_header += " nowait" + else: + map_header += "#pragma omp parallel for" + + elif node.map.schedule == dtypes.ScheduleType.CPU_Persistent: + map_header += "#pragma omp parallel" + + # OpenMP schedule properties + if not in_persistent: + if node.map.omp_schedule != dtypes.OMPScheduleType.Default: + schedule = " schedule(" + if node.map.omp_schedule == dtypes.OMPScheduleType.Static: + schedule += "static" + elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic: + schedule += "dynamic" + elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided: + schedule += "guided" + else: + raise ValueError("Unknown OpenMP schedule type") + if node.map.omp_chunk_size > 0: + schedule += f", {node.map.omp_chunk_size}" + schedule += ")" + map_header += schedule 
+ + if node.map.omp_num_threads > 0: + map_header += f" num_threads({node.map.omp_num_threads})" + + # OpenMP nested loop properties + if node.map.schedule == dtypes.ScheduleType.CPU_Multicore and node.map.collapse > 1: + map_header += ' collapse(%d)' % node.map.collapse + + if node.map.unroll: + if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): + raise ValueError("An OpenMP map cannot be unrolled (" + node.map.label + ")") + + result.write(map_header, cfg, state_id, node) + + if node.map.schedule == dtypes.ScheduleType.CPU_Persistent: + result.write('{\n', cfg, state_id, node) + + # Find if bounds are used within the scope + scope = state_dfg.scope_subgraph(node, False, False) + fsyms = self._frame.free_symbols(scope) + # Include external edges + for n in scope.nodes(): + for e in state_dfg.all_edges(n): + fsyms |= e.data.used_symbols(False, e) + fsyms = set(map(str, fsyms)) + + ntid_is_used = '__omp_num_threads' in fsyms + tid_is_used = node.map.params[0] in fsyms + if tid_is_used or ntid_is_used: + function_stream.write('#include ', cfg, state_id, node) + if tid_is_used: + result.write(f'auto {node.map.params[0]} = omp_get_thread_num();', cfg, state_id, node) + if ntid_is_used: + result.write(f'auto __omp_num_threads = omp_get_num_threads();', cfg, state_id, node) + else: + # Emit nested loops + for i, r in enumerate(node.map.range): + var = map_params[i] + begin, end, skip = r + + if node.map.unroll: + result.write("#pragma unroll", cfg, state_id, node) + + result.write( + "for (auto %s = %s; %s < %s; %s += %s) {\n" % + (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)), + cfg, + state_id, + node, + ) + + callsite_stream.write(inner_stream.getvalue()) + + # Emit internal transient array allocation + self._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, result) + + + def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + 
node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + callsite_stream.write(f"// Mapping MapExit {node.label} \n") + self.cpu_codegen._generate_MapExit(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + + def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + callsite_stream.write(f"// Generating node {node.label} using {inspect.currentframe().f_code.co_name} \n") + self.cpu_codegen._generate_Tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + + def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + # print metadata + callsite_stream.write(f"// Generating node {node.label} using {inspect.currentframe().f_code.co_name} \n") + #print current function name + + state_dfg: SDFGState = cfg.nodes()[state_id] + + + sdict = state_dfg.scope_dict() + for edge in state_dfg.in_edges(node): + predecessor, _, _, _, memlet = edge + if memlet.data is None: + continue # If the edge has to be skipped + + # Determines if this path ends here or has a definite source (array) node + memlet_path = state_dfg.memlet_path(edge) + if memlet_path[-1].dst == node: + src_node = memlet_path[0].src + # Only generate code in case this is the innermost scope + # (copies are generated at the inner scope, where both arrays exist) + if (scope_contains_scope(sdict, src_node, node) and sdict[src_node] != sdict[node]): + self.dispatcher.dispatch_copy( + src_node, + node, + edge, + sdfg, + cfg, + dfg, + state_id, + function_stream, + callsite_stream, + ) + + # Process outgoing memlets (array-to-array write should be emitted + # from the first leading edge out of the array) + self.process_out_memlets( + sdfg, + cfg, + state_id, + node, + dfg, + 
self.dispatcher, + callsite_stream, + False, + function_stream, + ) + ############################################################################################################ # #### Helpers + def process_out_memlets(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + state_id: int, + node: nodes.Node, + dfg: StateSubgraphView, + dispatcher: TargetDispatcher, + result: CodeIOStream, + locals_defined: bool, + function_stream: CodeIOStream, + skip_wcr: bool = False, + codegen: Optional[TargetCodeGenerator] = None): + + codegen = codegen if codegen is not None else self + state: SDFGState = cfg.nodes()[state_id] + scope_dict = state.scope_dict() + + for edge in dfg.out_edges(node): + + _, uconn, v, _, memlet = edge + if skip_wcr and memlet.wcr is not None: + continue + dst_edge = dfg.memlet_path(edge)[-1] + dst_node = dst_edge.dst + + # Target is neither a data nor a tasklet node + if isinstance(node, nodes.AccessNode) and (not isinstance(dst_node, nodes.AccessNode) + and not isinstance(dst_node, nodes.CodeNode)): + continue + + # Skip array->code (will be handled as a tasklet input) + if isinstance(node, nodes.AccessNode) and isinstance(v, nodes.CodeNode): + continue + + # code->code (e.g., tasklet to tasklet) + if isinstance(dst_node, nodes.CodeNode) and edge.src_conn: + shared_data_name = edge.data.data + if not shared_data_name: + # Very unique name. TODO: Make more intuitive + shared_data_name = '__dace_%d_%d_%d_%d_%s' % (cfg.cfg_id, state_id, dfg.node_id(node), + dfg.node_id(dst_node), edge.src_conn) + + result.write( + "%s = %s;" % (shared_data_name, edge.src_conn), + cfg, + state_id, + [edge.src, edge.dst], + ) + continue + + # If the memlet is not pointing to a data node (e.g. 
tasklet), then + # the tasklet will take care of the copy + if not isinstance(dst_node, nodes.AccessNode): + continue + # If the memlet is pointing into an array in an inner scope, then + # the inner scope (i.e., the output array) must handle it + if scope_dict[node] != scope_dict[dst_node] and scope_contains_scope(scope_dict, node, dst_node): + continue + + # Array to tasklet (path longer than 1, handled at tasklet entry) + if node == dst_node: + continue + + # Tasklet -> array + if isinstance(node, nodes.CodeNode): + if not uconn: + raise SyntaxError("Cannot copy memlet without a local connector: {} to {}".format( + str(edge.src), str(edge.dst))) + + conntype = node.out_connectors[uconn] + is_scalar = not isinstance(conntype, dtypes.pointer) + if isinstance(conntype, dtypes.pointer) and sdfg.arrays[memlet.data].dtype == conntype: + is_scalar = True # Pointer to pointer assignment + is_stream = isinstance(sdfg.arrays[memlet.data], data.Stream) + is_refset = isinstance(sdfg.arrays[memlet.data], data.Reference) and dst_edge.dst_conn == 'set' + + if (is_scalar and not memlet.dynamic and not is_stream) or is_refset: + out_local_name = " __" + uconn + in_local_name = uconn + if not locals_defined: + out_local_name = self.memlet_ctor(sdfg, memlet, node.out_connectors[uconn], True) + in_memlets = [d for _, _, _, _, d in dfg.in_edges(node)] + assert len(in_memlets) == 1 + in_local_name = self.memlet_ctor(sdfg, in_memlets[0], node.out_connectors[uconn], False) + + if memlet.wcr is not None: + nc = not cpp.is_write_conflicted(dfg, edge, sdfg_schedule=self._toplevel_schedule) + write_expr = codegen.write_and_resolve_expr( + sdfg, memlet, nc, out_local_name, in_local_name, dtype=node.out_connectors[uconn]) + ";" + else: + if isinstance(node, nodes.NestedSDFG): + # This case happens with nested SDFG outputs, + # which we skip since the memlets are references + continue + desc = sdfg.arrays[memlet.data] + ptrname = cpp.ptr(memlet.data, desc, sdfg, self._frame) + is_global = 
desc.lifetime in (dtypes.AllocationLifetime.Global, + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) + try: + defined_type, _ = self.dispatcher.declared_arrays.get(ptrname, is_global=is_global) + except KeyError: + defined_type, _ = self.dispatcher.defined_vars.get(ptrname, is_global=is_global) + + if defined_type == DefinedType.Scalar: + mname = cpp.ptr(memlet.data, desc, sdfg, self._frame) + write_expr = f"{mname} = {in_local_name};" + elif defined_type == DefinedType.Pointer and is_refset: + mname = cpp.ptr(memlet.data, desc, sdfg, self._frame) + write_expr = f"{mname} = {in_local_name};" + elif (defined_type == DefinedType.ArrayInterface and not isinstance(desc, data.View)): + # Special case: No need to write anything between + # array interfaces going out + try: + deftype, _ = self.dispatcher.defined_vars.get(in_local_name) + except KeyError: + deftype = None + if deftype == DefinedType.ArrayInterface: + continue + array_expr = cpp.cpp_array_expr(sdfg, memlet, with_brackets=False, codegen=self._frame) + decouple_array_interfaces = Config.get_bool("compiler", "xilinx", + "decouple_array_interfaces") + ptr_str = fpga.fpga_ptr( # we are on fpga, since this is array interface + memlet.data, + desc, + sdfg, + memlet.subset, + True, + None, + None, + True, + decouple_array_interfaces=decouple_array_interfaces) + write_expr = f"*({ptr_str} + {array_expr}) = {in_local_name};" + else: + desc_dtype = desc.dtype + expr = cpp.cpp_array_expr(sdfg, memlet, codegen=self._frame) + write_expr = codegen.make_ptr_assignment(in_local_name, conntype, expr, desc_dtype) + + # Write out + result.write(write_expr, cfg, state_id, node) + + # Dispatch array-to-array outgoing copies here + elif isinstance(node, nodes.AccessNode): + if dst_node != node and not isinstance(dst_node, nodes.Tasklet): + dispatcher.dispatch_copy( + node, + dst_node, + edge, + sdfg, + cfg, + dfg, + state_id, + function_stream, + result, + ) + def generate_ipu_cpuside_state(self, 
sdfg: SDFG, cfg: ControlFlowRegion, @@ -904,6 +1244,7 @@ def generate_ipu_cpuside_state(self, """) ## Generate the global function here + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[mmlt.Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -949,13 +1290,14 @@ def generate_host_function(self, sdfg, cfg, state, state_id, function_stream, ca kernel_host_stream.write(f"""\ DACE_EXPORTED void {host_function_name}({', '.join(kernel_args_opencl)}) {{""") + # BODY OF THE FUNCTION # write the kernel_host_stream withe the commands I have copied kernel_host_stream.write(f"""\ std::cout << " STEP 2.1: Create graph and compile codelets" << std::endl; // Step 1: Create graph and add codelets __state.poplar_context->graph = poplar::Graph(__state.poplar_context->device->getTarget()); - //__state.poplar_context->graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); + __state.poplar_context->graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); popops::addCodelets(__state.poplar_context->graph); """) @@ -1084,29 +1426,4 @@ def add_header(self, function_stream: CodeIOStream): # ####### # print("TargetCodeGenerator:", self) # print("language", self.language) - # print("TargetDispatcher:", self._dispatcher.used_targets) - - # def generate_scope(self, - # sdfg: SDFG, - # cfg: ControlFlowRegion, - # dfg_scope: ScopeSubgraphView, - # state_id: int, - # function_stream: CodeIOStream, - # callsite_stream: CodeIOStream) -> None: - # # Get the first entry node of Map - # entry_node = dfg_scope.source_nodes()[0] - - # # function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', cfg, state_id, entry_node) - # callsite_stream.write('{', cfg, state_id, entry_node) - - # # cpp.presynchronize_streams(sdfg, cfg, dfg_scope, state_id, entry_node, callsite_stream) #TODO: add some 
other function of own. - # # Should we ? - # # self.generate_node(sdfg, cfg, dfg_scope, state_id, entry_node, function_stream, callsite_stream) - # # generated nested subgraphs - # self._dispatcher.dispatch_subgraph(sdfg, - # cfg, - # dfg_scope, - # state_id, - # function_stream, - # callsite_stream, - # skip_entry_node=True) + # print("TargetDispatcher:", self.dispatcher.used_targets)