diff --git a/check_external_library_used.py b/check_external_library_used.py
new file mode 100644
index 0000000000..b408009b08
--- /dev/null
+++ b/check_external_library_used.py
@@ -0,0 +1,13 @@
+from dace.libraries import blas
+
+print('BLAS calls will expand by default to', blas.default_implementation)
+
+if blas.IntelMKL.is_installed():
+    blas.default_implementation = 'MKL'
+elif blas.cuBLAS.is_installed():
+    blas.default_implementation = 'cuBLAS'
+elif blas.OpenBLAS.is_installed():
+    blas.default_implementation = 'OpenBLAS'
+elif not blas.BLAS.is_installed():
+    # No BLAS library found, use the unoptimized native SDFG fallback
+    blas.default_implementation = 'pure'
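# Editor's sketch (not part of the patch): how the selected default
# implementation takes effect. `matmul` is a hypothetical program; any
# BLAS-backed operation expands the same way.
#
#     import dace
#     import numpy as np
#     from dace.libraries import blas
#
#     @dace.program
#     def matmul(A: dace.float64[32, 32], B: dace.float64[32, 32]):
#         return A @ B
#
#     sdfg = matmul.to_sdfg()
#     sdfg.expand_library_nodes()  # MatMul expands to blas.default_implementation
#     C = sdfg(A=np.random.rand(32, 32), B=np.random.rand(32, 32))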
+ print ("before dace(CPU) (a,b)", A, B, C) + print("after dace(CPU)", cpu_getstarted_optimize(A, B, C)) + sdfg = cpu_getstarted_optimize.to_sdfg(A, B, C) + + # VISUALLY OPTIMIZE + sdfg = SDFGOptimizer(sdfg).optimize() + # sdfg.apply_gpu_transformations() \ No newline at end of file diff --git a/custom_codegen_external.py b/custom_codegen_external.py new file mode 100644 index 0000000000..0b84b77ec2 --- /dev/null +++ b/custom_codegen_external.py @@ -0,0 +1,107 @@ +import dace +from dace import registry +from dace.sdfg.scope import ScopeSubgraphView +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.targets.target import TargetCodeGenerator +from dace.codegen.targets.framecode import DaCeCodeGenerator +from dace.codegen.targets.cpp import sym2cpp + +@dace.program +def custom_kernel(A: dace.float64[20, 30]): + for i, j in dace.map[0:20:2, 0:30]: + A[i, j] += A[i, j] + + + +dace.ScheduleType.register('LoopyLoop') +dace.SCOPEDEFAULT_SCHEDULE[dace.ScheduleType.LoopyLoop] = dace.ScheduleType.Sequential +dace.SCOPEDEFAULT_STORAGE[dace.ScheduleType.LoopyLoop] = dace.StorageType.CPU_Heap + + +@registry.autoregister_params(name='loopy') +class MyCustomLoop(TargetCodeGenerator): + def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): + ################################################################ + # Define some locals: + # Can be used to call back to the frame-code generator + self.frame = frame_codegen + # Can be used to dispatch other code generators for allocation/nodes + self.dispatcher = frame_codegen.dispatcher + + ################################################################ + # Register handlers/hooks through dispatcher: Can be used for + # nodes, memory copy/allocation, scopes, states, and more. + + # In this case, register scopes + self.dispatcher.register_map_dispatcher(dace.ScheduleType.LoopyLoop, self) + + # You can similarly use register_{array,copy,node,state}_dispatcher + + # A scope dispatcher will trigger a method called generate_scope whenever + # an SDFG has a scope with that schedule + def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, + callsite_stream: CodeIOStream): + # The parameters here are: + # sdfg: The SDFG we are currently generating. + # scope: The subgraph of the state containing only the scope (map contents) + # we want to generate the code for. + # state_id: The state in the SDFG the subgraph is taken from (i.e., + # `sdfg.node(state_id)` is the same as `scope.graph`) + # function_stream: A cursor to the global code (which can be used to define + # functions, hence the name). + # callsite_stream: A cursor to the current location in the code, most of + # the code is generated here. + + # We can get the map entry node from the scope graph + entry_node = scope.source_nodes()[0] + + # First, generate an opening brace (for instrumentation and dynamic map ranges) + callsite_stream.write('{', sdfg, state_id, entry_node) + + ################################################################ + # Generate specific code: We will generate a reversed loop with a + # comment for each dimension of the map. For the sake of simplicity, + # dynamic map ranges are not supported. 
diff --git a/custom_codegen_external.py b/custom_codegen_external.py
new file mode 100644
index 0000000000..0b84b77ec2
--- /dev/null
+++ b/custom_codegen_external.py
@@ -0,0 +1,107 @@
+import dace
+from dace import registry
+from dace.sdfg.scope import ScopeSubgraphView
+from dace.codegen.prettycode import CodeIOStream
+from dace.codegen.targets.target import TargetCodeGenerator
+from dace.codegen.targets.framecode import DaCeCodeGenerator
+from dace.codegen.targets.cpp import sym2cpp
+
+@dace.program
+def custom_kernel(A: dace.float64[20, 30]):
+    for i, j in dace.map[0:20:2, 0:30]:
+        A[i, j] += A[i, j]
+
+
+dace.ScheduleType.register('LoopyLoop')
+dace.SCOPEDEFAULT_SCHEDULE[dace.ScheduleType.LoopyLoop] = dace.ScheduleType.Sequential
+dace.SCOPEDEFAULT_STORAGE[dace.ScheduleType.LoopyLoop] = dace.StorageType.CPU_Heap
+
+
+@registry.autoregister_params(name='loopy')
+class MyCustomLoop(TargetCodeGenerator):
+    def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG):
+        ################################################################
+        # Define some locals:
+        # Can be used to call back to the frame-code generator
+        self.frame = frame_codegen
+        # Can be used to dispatch other code generators for allocation/nodes
+        self.dispatcher = frame_codegen.dispatcher
+
+        ################################################################
+        # Register handlers/hooks through the dispatcher: can be used for
+        # nodes, memory copy/allocation, scopes, states, and more.
+
+        # In this case, register scopes
+        self.dispatcher.register_map_dispatcher(dace.ScheduleType.LoopyLoop, self)
+
+        # You can similarly use register_{array,copy,node,state}_dispatcher
+
+    # A scope dispatcher will trigger a method called generate_scope whenever
+    # an SDFG has a scope with that schedule
+    def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView,
+                       state_id: int, function_stream: CodeIOStream,
+                       callsite_stream: CodeIOStream):
+        # The parameters here are:
+        # sdfg: The SDFG we are currently generating.
+        # scope: The subgraph of the state containing only the scope (map contents)
+        #        we want to generate the code for.
+        # state_id: The state in the SDFG the subgraph is taken from (i.e.,
+        #           `sdfg.node(state_id)` is the same as `scope.graph`)
+        # function_stream: A cursor to the global code (which can be used to define
+        #                  functions, hence the name).
+        # callsite_stream: A cursor to the current location in the code; most of
+        #                  the code is generated here.
+
+        # We can get the map entry node from the scope graph
+        entry_node = scope.source_nodes()[0]
+
+        # First, generate an opening brace (for instrumentation and dynamic map ranges)
+        callsite_stream.write('{', sdfg, state_id, entry_node)
+
+        ################################################################
+        # Generate specific code: We will generate a reversed loop with a
+        # comment for each dimension of the map. For the sake of simplicity,
+        # dynamic map ranges are not supported.
+
+        for param, rng in zip(entry_node.map.params, entry_node.map.range):
+            # We use the sym2cpp function from the cpp support functions
+            # to convert symbolic expressions to proper C++
+            begin, end, stride = (sym2cpp(r) for r in rng)
+
+            # Every write is optionally (but recommended to be) tagged with
+            # 1-3 extra arguments, serving as line information to match
+            # SDFG, state, and graph nodes/edges to written code.
+            callsite_stream.write(f'''// Loopy-loop {param}
+            for (auto {param} = {end}; {param} >= {begin}; {param} -= {stride}) {{''',
+                                  sdfg, state_id, entry_node)
+
+            # NOTE: CodeIOStream will automatically take care of indentation for us.
+
+        # Now that the loops have been defined, use the dispatcher to invoke any
+        # code generator (including this one) that is registered to deal with
+        # the internal nodes in the subgraph. We skip the MapEntry node.
+        self.dispatcher.dispatch_subgraph(sdfg, scope, state_id,
+                                          function_stream, callsite_stream,
+                                          skip_entry_node=True)
+
+        # NOTE: Since skip_exit_node (default: False) was not set above,
+        # closing braces are generated automatically.
+
+# Preview SDFG
+sdfg = custom_kernel.to_sdfg()
+
+# Change schedule
+for node, _ in sdfg.all_nodes_recursive():
+    if isinstance(node, dace.nodes.MapEntry):
+        node.schedule = dace.ScheduleType.LoopyLoop
+
+# Display the generated code
+from IPython.display import Code, display
+display(Code(sdfg.generate_code()[0].clean_code, language='cpp'))
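# Editor's sketch (not part of the patch; assumes a working C++ toolchain):
# the reversed loops must still compute the same result, i.e., double every
# even row of A.
#
#     import numpy as np
#     A = np.random.rand(20, 30)
#     expected = A.copy()
#     expected[::2] *= 2      # the map covers i = 0, 2, ..., 18
#     sdfg(A=A)
#     assert np.allclose(A, expected)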
diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py
index d1427bf037..36403d01b9 100644
--- a/dace/codegen/codegen.py
+++ b/dace/codegen/codegen.py
@@ -17,7 +17,7 @@
 from dace.codegen.instrumentation import InstrumentationProvider
 from dace.sdfg.state import SDFGState
 
-
+# include/* files: contain the signature header code.
 def generate_headers(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str:
     """ Generate a header file for the SDFG """
     proto = ""
@@ -34,7 +34,7 @@ def generate_headers(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str:
     proto += 'extern "C" void __program_%s(%sHandle_t handle%s);\n' % params
     return proto
 
-
+# sample/* files: contain a main() function that calls the SDFG.
 def generate_dummy(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str:
     """ Generates a C program calling this SDFG. Since we do not know the
         purpose/semantics of the program, we allocate
@@ -147,7 +147,10 @@ def _get_codegen_targets(sdfg: SDFG, frame: framecode.DaCeCodeGenerator):
         if sdfg.instrument != dtypes.InstrumentationType.No_Instrumentation:
             disp.instrumentation[sdfg.instrument] = provider_mapping[sdfg.instrument]
 
-
+# Three-step process (indices refer to the returned list of code objects):
+# 1. Code for the SDFG itself  (.cpp file, generate_code,    sdfg.generate_code()[0])
+# 2. Header file for the SDFG  (.h file,   generate_headers, sdfg.generate_code()[1])
+# 3. Sample main() calling it  (.cpp file, generate_dummy,   sdfg.generate_code()[2])
 def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]:
     """ Generates code as a list of code objects for a given SDFG.
@@ -230,6 +233,7 @@ def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]:
     # NOTE: THE SDFG IS ASSUMED TO BE FROZEN (not change) FROM THIS POINT ONWARDS
 
     # Generate frame code (and the rest of the code)
+    # Returns (global_code, frame_code/clean_code, used_targets, used_environments)
     (global_code, frame_code, used_targets, used_environments) = frame.generate_code(sdfg, None)
     target_objects = [
         CodeObject(sdfg.name,
@@ -246,6 +250,7 @@ def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]:
         target_objects.extend(tgt.get_generated_codeobjects())
 
     # Ensure that no new targets were dynamically added
+    assert frame._dispatcher.used_targets == (frame.targets - {frame})
 
     # add a header file for calling the SDFG
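# Editor's sketch (not part of the patch): inspecting the generated code
# objects; the list order is an assumption based on the comment above.
#
#     import dace
#
#     @dace.program
#     def triple(x: dace.float64[10]):
#         x *= 3
#
#     for cobj in triple.to_sdfg().generate_code():
#         print(cobj.name, cobj.language, cobj.target_type)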
diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py
index 3ac9e097f8..926154423f 100644
--- a/dace/codegen/dispatcher.py
+++ b/dace/codegen/dispatcher.py
@@ -444,6 +444,8 @@ def dispatch_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView,
         state = cfg.state(state_id)
         disp = self.get_node_dispatcher(sdfg, state, node)
         self._used_targets.add(disp)
+        # Debug print: show which target code generator each node dispatches to
+        print("SJJ: Dispatching node", node, "to", disp)
         disp.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
 
     def get_scope_dispatcher(self, schedule: dtypes.ScheduleType) -> target.TargetCodeGenerator:
diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py
index cd4d5f957f..5f01d11f08 100644
--- a/dace/codegen/targets/__init__.py
+++ b/dace/codegen/targets/__init__.py
@@ -9,3 +9,4 @@
 from .mlir.mlir import MLIRCodeGen
 from .sve.codegen import SVECodeGen
 from .snitch import SnitchCodeGen
+from .ipu import IPUCodeGen
\ No newline at end of file
diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py
index 5b756b413c..fd8997d01d 100644
--- a/dace/codegen/targets/framecode.py
+++ b/dace/codegen/targets/framecode.py
@@ -215,6 +215,13 @@ def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stream: CodeIOStream):
         for env in self.environments:
             self.statestruct.extend(env.state_fields)
 
+        # GRAPHCORE: fields that would place the Poplar objects in the state struct
+        # self.statestruct.append('IPUModel ipuModel;')
+        # self.statestruct.append('Device device = ipuModel.createDevice();')
+        # self.statestruct.append('Target target = device.getTarget();')
+        # self.statestruct.append('Graph graph(target);')
+
         # Instrumentation preamble
         if len(self._dispatcher.instrumentation) > 2:
             self.statestruct.append('dace::perf::Report report;')
@@ -404,7 +411,57 @@ def generate_external_memory_management(self, sdfg: SDFG, callsite_stream: CodeIOStream):
         # Footer
         callsite_stream.write('}', sdfg)
 
+    def generate_ipu_state(self,
+                           sdfg: SDFG,
+                           cfg: ControlFlowRegion,
+                           state: SDFGState,
+                           global_stream: CodeIOStream,
+                           callsite_stream: CodeIOStream,
+                           generate_state_footer: bool = True):
+        callsite_stream.write(f'// GENIPU_STATE() {state.label} ({state.block_id})\n', sdfg)
+        sid = state.block_id
+
+        # Emit internal transient array allocation
+        self.allocate_arrays_in_scope(sdfg, cfg, state, global_stream, callsite_stream)
+
+        callsite_stream.write('\n')
+
+        # Invoke all instrumentation providers
+        for instr in self._dispatcher.instrumentation.values():
+            if instr is not None:
+                instr.on_state_begin(sdfg, state, callsite_stream, global_stream)
+
+        #####################
+        # Create dataflow graph for state's children.
+
+        # DFG to code scheme: Only generate code for nodes whose dependencies
+        # have all been executed (topological sort).
+        # Different connected components run concurrently.
+
+        # components = dace.sdfg.concurrent_subgraphs(state)
+        # if len(components) <= 1:
+        #     self._dispatcher.dispatch_subgraph(sdfg, cfg, state, sid, global_stream, callsite_stream,
+        #                                        skip_entry_node=False)
+        # else:
+        #     for c in components:
+        #         callsite_stream.write("{")
+        #         self._dispatcher.dispatch_subgraph(sdfg, cfg, c, sid, global_stream, callsite_stream,
+        #                                            skip_entry_node=False)
+        #         callsite_stream.write("}")
+
+        #####################
+        # Write state footer
+
+        if generate_state_footer:
+            # Emit internal transient array deallocation
+            self.deallocate_arrays_in_scope(sdfg, state.parent_graph, state, global_stream, callsite_stream)
+
+            # Invoke all instrumentation providers
+            for instr in self._dispatcher.instrumentation.values():
+                if instr is not None:
+                    instr.on_state_end(sdfg, state, callsite_stream, global_stream)
+
     def generate_state(self,
                        sdfg: SDFG,
                        cfg: ControlFlowRegion,
@@ -473,7 +530,6 @@ def dispatch_state(state: SDFGState) -> str:
                 opbar.next()
             states_generated.add(state)  # For sanity check
             return stream.getvalue()
-
         if sdfg.root_sdfg.using_experimental_blocks:
             # Use control flow blocks embedded in the SDFG to generate control flow.
             cft = cflow.structured_control_flow_tree_with_regions(sdfg, dispatch_state)
@@ -500,7 +556,7 @@ def dispatch_state(state: SDFGState) -> str:
         opbar.done()
 
         # Write exit label
-        callsite_stream.write(f'__state_exit_{sdfg.cfg_id}:;', sdfg)
+        # callsite_stream.write(f'__state_exit_{sdfg.cfg_id}:;', sdfg)
 
         return states_generated
 
@@ -885,6 +941,7 @@ def generate_code(self,
 
         # Allocate outer-level transients
         self.allocate_arrays_in_scope(sdfg, sdfg, sdfg, global_stream, callsite_stream)
+        # callsite_stream.write('called allocate_arrays_in_scope outer\n', sdfg)
 
         # Define constants as top-level-allocated
         for cname, (ctype, _) in sdfg.constants_prop.items():
@@ -928,7 +985,7 @@ def generate_code(self,
 
         # Generate actual program body
         states_generated = self.generate_states(sdfg, global_stream, callsite_stream)
-
+        # TODO: loop over states_generated and print them (debugging)
         #######################################################################
         # Sanity check
@@ -940,6 +997,7 @@ def generate_code(self,
 
         # Deallocate transients
         self.deallocate_arrays_in_scope(sdfg, sdfg, sdfg, global_stream, callsite_stream)
+        # callsite_stream.write('called deallocate_arrays_in_scope internal transient\n', sdfg)
 
         # Now that we have all the information about dependencies, generate
         # header and footer
@@ -969,16 +1027,44 @@ def generate_code(self,
             header_global_stream.write(global_stream.getvalue())
             header_global_stream.write(footer_global_stream.getvalue())
             generated_header = header_global_stream.getvalue()
-
+            # Debug prints for the assembled code pieces
+            # print("generated header:")
+            # print(generated_header)
 
             all_code = CodeIOStream()
             all_code.write(function_signature)
             all_code.write(header_stream.getvalue())
             all_code.write(callsite_stream.getvalue())
             all_code.write(footer_stream.getvalue())
             generated_code = all_code.getvalue()
+            # print("Function Signature:")
+            # print(function_signature)
+            # print("Header Stream:")
+            # print(header_stream.getvalue())
+            # print("Callsite Code:")
+            # print(callsite_stream.getvalue())
+            # print("Footer Stream:")
+            # print(footer_stream.getvalue())
         else:
-            generated_header = global_stream.getvalue()
-            generated_code = callsite_stream.getvalue()
+            generated_header = global_stream.getvalue()  # header
+            generated_code = callsite_stream.getvalue()  # frame
 
         # Clean up generated code
         gotos = re.findall(r'goto (.*?);', generated_code)
@@ -993,6 +1079,7 @@ def generate_code(self,
             if label[0] not in gotos:
                 continue
             clean_code += line + '\n'
+        clean_code = generated_code  # NOTE: bypasses the goto cleanup above, keeping the raw code
 
         # Return the generated global and local code strings
         return (generated_header, clean_code, self._dispatcher.used_targets, self._dispatcher.used_environments)
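# Editor's note: the statestruct list above is how targets inject fields into
# the generated state struct. A minimal sketch (hypothetical field name),
# mirroring what IPUCodeGen.preprocess() does in ipu.py below:
#
#     def preprocess(self, sdfg):
#         self.frame.statestruct.append('MyRuntimeContext *my_context;')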
print("#" * 50) else: - generated_header = global_stream.getvalue() - generated_code = callsite_stream.getvalue() + generated_header = global_stream.getvalue() # header + generated_code = callsite_stream.getvalue() # frame # Clean up generated code gotos = re.findall(r'goto (.*?);', generated_code) @@ -993,6 +1079,7 @@ def generate_code(self, if label[0] not in gotos: continue clean_code += line + '\n' + clean_code = generated_code # Return the generated global and local code strings return (generated_header, clean_code, self._dispatcher.used_targets, self._dispatcher.used_environments) diff --git a/dace/codegen/targets/ipu.py b/dace/codegen/targets/ipu.py new file mode 100644 index 0000000000..0cf5a07beb --- /dev/null +++ b/dace/codegen/targets/ipu.py @@ -0,0 +1,1429 @@ +# import +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import inspect +from io import StringIO +from dace.codegen.codeobject import CodeObject +import sympy +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union +from copy import deepcopy +from dace import (data, dtypes, registry, memlet as mmlt, subsets, symbolic, Config) +from dace import dtypes, memlet as mm +from dace import Memlet +from dace.codegen import cppunparse, exceptions as cgx + +from dace.codegen.prettycode import CodeIOStream +import dace.codegen.targets +from dace.codegen.targets import cpp, fpga +from dace.codegen.targets.cpu import CPUCodeGen +from dace.codegen.targets.framecode import DaCeCodeGenerator +from dace.codegen.common import codeblock_to_cpp, sym2cpp, update_persistent_desc +from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute +from dace.codegen.dispatcher import DefinedType, TargetDispatcher +from dace.frontend import operations +from dace.sdfg import (ScopeSubgraphView, SDFG, scope_contains_scope, is_array_stream_view, NodeNotExpandedError, + dynamic_map_inputs, nodes, utils as sdutils) +from dace.sdfg import nodes, SDFG, SDFGState, ScopeSubgraphView, graph as gr +from dace.sdfg.scope import is_devicelevel_gpu, is_in_scope +from dace.sdfg.state import ControlFlowRegion, SDFGState, StateSubgraphView +from dace.sdfg import graph, state, find_input_arraynode, find_output_arraynode +from dace.sdfg import nodes, SDFG, SDFGState, ScopeSubgraphView, graph as gr +from dace.sdfg.validation import validate_memlet_data +from dace.sdfg.graph import MultiConnectorEdge +from dace.codegen.targets.ipu_files import ipu_utils as ipu_utils +from dace.codegen.targets.cpp import (codeblock_to_cpp, cpp_array_expr, memlet_copy_to_absolute_strides, sym2cpp, + synchronize_streams, unparse_cr, mangle_dace_state_struct_name) + +import copy +import functools +import itertools +import warnings + +if TYPE_CHECKING: + from dace.codegen.targets.framecode import DaCeCodeGenerator + from dace.codegen.targets.cpu import CPUCodeGen +import pdb; + +def is_ipu_kernel(sdfg, state): + """ + Returns whether the given state is an IPU kernel and should be dispatched + to the IPU code generator. + + :return: True if this is an IPU kernel, False otherwise. 
+ """ + # pdb.set_trace() + data_nodes = state.data_nodes() + at_least_one_ipu_allocated_array = False + for n in data_nodes: + desc = n.desc(sdfg) + # print(desc.storage.name, desc.storage, desc) + if desc.storage == dtypes.StorageType.IPU_Memory: + at_least_one_ipu_allocated_array = True + if isinstance(desc, data.Scalar): + continue + return at_least_one_ipu_allocated_array + +@registry.autoregister_params(name='ipu') +class IPUCodeGen(TargetCodeGenerator): + """ IPU(Graphcore) code generator. """ + target_name = 'ipu' + title = 'IPU' + language = 'cpp' + _in_device_code = False + + def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG): + + self.program_name = sdfg.name + + self.has_generated_header = False + self.frame = frame_codegen + self.dispatcher = frame_codegen._dispatcher + self.cpu_codegen: Optional['CPUCodeGen'] = None + # self._locals = cppunparse.CPPLocals() + # Scope depth (for defining locals) + self._ldepth = 0 + # Keep nested SDFG schedule when descending into it + self._toplevel_schedule = None + self._localcode = CodeIOStream() + self._globalcode = CodeIOStream() + self._initcode = CodeIOStream() + self._exitcode = CodeIOStream() + self._global_sdfg: SDFG = sdfg + self._arglists: Dict[nodes.MapEntry, Dict[str, data.Data]] = {} + # Keep track of current "scope entry/exit" code streams for extra + # code generation + self.scope_entry_stream = self._initcode + self.scope_exit_stream = self._exitcode + self._ipu_streams, self._ipu_events = 0, 0 + self._kernels_dependencies = dict() + self._kernels_names_to_id = dict() + self._num_kernels = 0 + self._host_codes = [] + self._kernel_codes = [] + self._generated_nodes = set() + self._locals = cppunparse.CPPLocals() + + + # Register dispatchers + self.cpu_codegen = self.dispatcher.get_generic_node_dispatcher() + + self.dispatcher.register_state_dispatcher(self, predicate=is_ipu_kernel) + ipu_storage = [dtypes.StorageType.IPU_Memory] + self.dispatcher.register_array_dispatcher(ipu_storage, self) # allocate_array/deallocate_array + for storage in ipu_storage: + for other_storage in dtypes.StorageType: + self.dispatcher.register_copy_dispatcher(storage, other_storage, None, self) + self.dispatcher.register_copy_dispatcher(other_storage, storage, None, self) + + # # Dispatchers + # self.dispatcher.register_map_dispatcher(dace.ScheduleType.Default, self) + # self.dispatcher.register_node_dispatcher(self, self.is_ipu_map_scope) + # self.dispatcher.register_node_dispatcher(self) + # self.dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) + + def preprocess(self, sdfg: SDFG) -> None: + self.frame.statestruct.append('dace_poplar_context *poplar_context;') + pass + + def get_generated_codeobjects(self): + params_comma = self._global_sdfg.init_signature(free_symbols=self.frame.free_symbols(self._global_sdfg)) + if params_comma: + params_comma = ', ' + params_comma + + host_code = CodeIOStream() + host_code.write(""" +#include +""") + fileheader = CodeIOStream() + self.frame.generate_fileheader(self._global_sdfg, fileheader, 'poplar') + + host_code.write(""" +{file_header} + +{other_globalcode} + +DACE_EXPORTED int __dace_init_ipu({sdfg_state_name} *__state{params}) {{ + __state->poplar_context = new dace_poplar_context(); + return 0; +}} + +DACE_EXPORTED int __dace_exit_ipu({sdfg_state_name} *__state) {{ + delete __state->poplar_context; + return 0; +}} + +DACE_EXPORTED auto getIpuDevice(const unsigned int numIpus = 1) -> optional +{{ + DeviceManager manager = DeviceManager::createDeviceManager(); + 
+
+    def get_generated_codeobjects(self):
+        params_comma = self._global_sdfg.init_signature(free_symbols=self.frame.free_symbols(self._global_sdfg))
+        if params_comma:
+            params_comma = ', ' + params_comma
+
+        host_code = CodeIOStream()
+        host_code.write("""
+#include <poplar/DeviceManager.hpp>
+""")
+        fileheader = CodeIOStream()
+        self.frame.generate_fileheader(self._global_sdfg, fileheader, 'poplar')
+
+        host_code.write("""
+{file_header}
+
+{other_globalcode}
+
+DACE_EXPORTED int __dace_init_ipu({sdfg_state_name} *__state{params}) {{
+    __state->poplar_context = new dace_poplar_context();
+    return 0;
+}}
+
+DACE_EXPORTED int __dace_exit_ipu({sdfg_state_name} *__state) {{
+    delete __state->poplar_context;
+    return 0;
+}}
+
+DACE_EXPORTED auto getIpuDevice(const unsigned int numIpus = 1) -> optional<Device>
+{{
+    DeviceManager manager = DeviceManager::createDeviceManager();
+    optional<Device> device = std::nullopt;
+    for (auto &d : manager.getDevices(TargetType::IPU, numIpus)) {{
+        std::cout << "Trying to attach to IPU " << d.getId();
+        if (d.attach()) {{
+            std::cout << " - attached" << std::endl;
+            device = {{std::move(d)}};
+            break;
+        }} else {{
+            std::cout << std::endl << "Error attaching to device" << std::endl;
+        }}
+    }}
+    return device;
+}}
+
+DACE_EXPORTED auto defineDataStreams({sdfg_state_name} &__state)
+{{
+    auto toIpuStream = __state.poplar_context->graph.addHostToDeviceFIFO("TO_IPU", FLOAT, NUM_DATA_ITEMS);
+    auto fromIpuStream = __state.poplar_context->graph.addDeviceToHostFIFO("FROM_IPU", FLOAT, NUM_DATA_ITEMS);
+
+    __state.poplar_context->programs["copy_to_ipu"] = Copy(toIpuStream, __state.poplar_context->tensors["data"]);
+    __state.poplar_context->programs["copy_to_host"] = Copy(__state.poplar_context->tensors["data"], fromIpuStream);
+}}
+
+{host_code_separator}""".format(params=params_comma,
+                                sdfg_state_name=mangle_dace_state_struct_name(self._global_sdfg),
+                                other_globalcode=self._globalcode.getvalue(),
+                                file_header=fileheader.getvalue(),
+                                sdfg=self._global_sdfg,
+                                host_code_separator="".join([
+                                    "{separator}\n// Dataflow graph building: {kernel_name}"
+                                    "\n{separator}\n\n{code}\n\n".format(
+                                        separator="/" * 79, kernel_name=name, code=code)
+                                    for (name, code) in self._host_codes])))
+
+        # Only generate ipu/<file>.cpp when it is an IPU kernel; otherwise only cpu/<file>.cpp
+        if is_ipu_kernel(self._global_sdfg, self._global_sdfg.node(0)):
+            host_code_obj = CodeObject(self.program_name,
+                                       host_code.getvalue(),
+                                       "cpp",
+                                       IPUCodeGen,
+                                       "IPU",
+                                       target_type="host")
+            return [host_code_obj]
+        else:
+            return []
+
+        # # Device objects
+        # kernel_code_objs = [
+        #     CodeObject(kernel_name, code, "cpp", IPUCodeGen, "IPU", target_type="device")
+        #     for (kernel_name, code) in self._kernel_codes
+        # ]
+
+    # __dace_init_ function
+    @property
+    def has_initializer(self):
+        return True
+
+    # __dace_exit_ function
+    @property
+    def has_finalizer(self):
+        return True
+
+    def state_dispatch_predicate(self, sdfg, state):
+        return self._toplevel_schedule == dtypes.ScheduleType.IPU_SCHEDULE
+
+    @staticmethod
+    def cmake_options():
+        options = []
+        # if Config.get("compiler", "ipu", "libs"):
+        #     options.append('-DCMAKE_SHARED_LINKER_FLAGS="{}"'.format(Config.get("compiler", "ipu", "libs")))
+        return options
+
+    def is_node_tasklet(self, sdfg, state, node):
+        return isinstance(node, nodes.Tasklet)
+
+    def is_node_library_node(self, sdfg, state, node):
+        return isinstance(node, nodes.LibraryNode)
+
+    def node_dispatch_predicate(self, sdfg, state, node):
+        return True  # NOTE: accept all nodes for now; the schedule check below is disabled
+        # retval = False
+        # if hasattr(node, 'schedule'):  # NOTE: Works on nodes and scopes
+        #     if node.schedule in dtypes.IPU_SCHEDULES:
+        #         retval = True
+        # print("Node dispatch predicate: ", retval)
+        # return retval
+
+############################################################################################################
+# IPU-specific node/state generation
+############################################################################################################
+
+    def allocate_ipu_scalar(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                            node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream,
+                            declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None:
+        result_decl = StringIO()
+        result_alloc = StringIO()
+        arrsize = nodedesc.total_size
+        is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants)
+        # arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype)
+        ctypedef = 'Tensor *'
+        shape = nodedesc.shape
+        dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame)
+
+        # Check if array is already declared
+        declared = self.dispatcher.declared_arrays.has(dataname)
+
+        # Different types of memories
+        if nodedesc.storage == dtypes.StorageType.IPU_Memory:
+            if not declared:
+                result_decl.write('%s %s;\n' % (ctypedef, dataname))  # Tensor *p;
+                self.dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef)
+
+            if nodedesc.pool:
+                raise NotImplementedError("Pool not implemented yet " + str(nodedesc.storage))
+            else:
+                shape_poplar_format = ', '.join([str(sh) for sh in shape])
+                result_alloc.write("%s = _state->graph.addVariable(%s, {%s});\n" %
+                                   (dataname, ipu_utils.TYPE_TO_IPU[nodedesc.dtype], shape_poplar_format))
+        else:
+            raise NotImplementedError("IPU: Unimplemented StorageType " + str(nodedesc.storage))
+
+        declaration_stream.write(result_decl.getvalue(), cfg, state_id, node)
+        allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node)
+
+    def allocate_ipu_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                           node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream,
+                           declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None:
+        result_decl = StringIO()
+        result_alloc = StringIO()
+        arrsize = nodedesc.total_size
+        is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants)
+        # arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype)
+        ctypedef = 'Tensor'
+        shape = nodedesc.shape
+        dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame)
+
+        # Check if array is already declared
+        declared = self.dispatcher.declared_arrays.has(dataname)
+
+        # Different types of memories
+        if nodedesc.storage == dtypes.StorageType.IPU_Memory:
+            if not declared:
+                result_decl.write('%s %s;\n' % (ctypedef, dataname))  # Tensor p;
+                self.dispatcher.defined_vars.add(dataname, DefinedType.Object, ctypedef)
+
+            if nodedesc.pool:
+                raise NotImplementedError("Pool not implemented yet " + str(nodedesc.storage))
+            else:
+                shape_poplar_format = ', '.join([str(sh) for sh in shape])
+                result_alloc.write("__state.poplar_context->tensors[\"%s\"] = "
+                                   "__state.poplar_context->graph.addVariable(%s, {%s}, \"%s\");\n" %
+                                   (dataname, ipu_utils.TYPE_TO_IPU[nodedesc.dtype], shape_poplar_format, dataname))
+        else:
+            raise NotImplementedError("IPU: Unimplemented StorageType " + str(nodedesc.storage))
+
+        # declaration_stream.write(result_decl.getvalue(), cfg, state_id, node)
+        allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node)
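+
+    # For a 20-element float array "A", the write above emits roughly
+    # (the exact type name comes from ipu_utils.TYPE_TO_IPU):
+    #
+    #     __state.poplar_context->tensors["A"] =
+    #         __state.poplar_context->graph.addVariable(FLOAT, {20}, "A");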
+
+    def allocate_ipu_stream(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                            node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream,
+                            declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None:
+        # TODO: implement; for a reference, see the GPU stream allocation
+        # (dace::GPUStream and the __dace_alloc_* helpers) in dace/codegen/targets/cuda.py
+        raise NotImplementedError("IPU Stream not implemented yet")
+
+    def decidemapping(self, dataname, nodedesc, sdfg):
+        # Decide which elements of the data go onto which IPU tile.
+
+        # Shape and total size of the data descriptor
+        shape = nodedesc.shape
+        size = nodedesc.total_size
+
+        # Dictionary that maps each data slice to a tile
+        dataToTileMap = {}
+        numTiles = 10
+        numElements = size
+
+        if numElements < numTiles:  # special case
+            numTiles = numElements
+
+        # Elements per tile (the remainder is currently left unmapped)
+        numElementsPerTile = numElements // numTiles
+        numElementsLastTile = numElements % numTiles
+
+        # Loop over the number of tiles
+        for i in range(numTiles):
+            # Start/end indices of this tile's chunk
+            start = i * numElementsPerTile
+            end = start + numElementsPerTile
+            if end - start > 1:
+                # Slice expression for a multi-element chunk
+                data = dataname + ".slice([" + str(start) + ":" + str(end) + "])"
+            else:
+                data = dataname + "[" + str(start) + "]"
+
+            # Map this chunk onto tile i
+            dataToTileMap[data] = i
+
+        # # Remainder handling (disabled):
+        # start = numTiles * numElementsPerTile
+        # end = start + numElementsLastTile
+        # data = dataname + "[" + str(start) + ":" + str(end) + "]"
+        # dataToTileMap[data] = numTiles - 1
+
+        return dataToTileMap
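+
+    # Worked example for decidemapping: a 1-D array "A" with 25 elements and
+    # numTiles = 10 gives numElementsPerTile = 25 // 10 = 2, so it returns
+    #     {'A.slice([0:2])': 0, 'A.slice([2:4])': 1, ..., 'A.slice([18:20])': 9}
+    # and the 5 remainder elements A[20:25] stay unmapped while the
+    # remainder-handling block above is commented out.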
+
+    # TODO: similar mapVertexOnTile
+    def mapdataontile(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                      node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream,
+                      declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None:
+        if isinstance(nodedesc, dace.data.Array):
+            self.mapArrayOnTile(sdfg, cfg, state_id, node, nodedesc, allocation_stream)
+        elif isinstance(nodedesc, dace.data.Scalar):
+            self.mapScalarOnTile(sdfg, cfg, state_id, node, nodedesc, allocation_stream)
+        else:
+            raise NotImplementedError("Unimplemented mapping for this AccessNode: {}".format(type(nodedesc)))
+
+    def mapArrayOnTile(self, sdfg, cfg, state_id, node, nodedesc, allocation_stream):
+        dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame)
+        # Map the array onto tiles
+        setTileMappingCall = StringIO()
+        spreadOverTilesManually = False
+
+        if spreadOverTilesManually:
+            # Map the array over multiple tiles: one setTileMapping call per chunk
+            dataToTileMap = self.decidemapping(dataname, nodedesc, sdfg)
+            for data, tilenumber in dataToTileMap.items():
+                setTileMappingCall.write(f"_state->graph.setTileMapping({data}, {tilenumber});")
+        else:
+            # Map linearly over tiles, letting Poplar decide
+            setTileMappingCall.write(f"poputil::mapTensorLinearly(__state.poplar_context->graph, "
+                                     f"__state.poplar_context->tensors[\"{dataname}\"]);")
+
+        allocation_stream.write(setTileMappingCall.getvalue(), cfg, state_id, node)
+
+    def mapScalarOnTile(self, sdfg, cfg, state_id, node, nodedesc, allocation_stream):
+        dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame)
+        # A scalar has a single element; map it onto one tile
+        tilenumber = 0
+        setTileMappingCall = f"_state->graph.setTileMapping({dataname}, {tilenumber});"
+        allocation_stream.write(setTileMappingCall, cfg, state_id, node)
+
+    def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                       node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream,
+                       declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None:
+        allocation_stream.write("// Allocating array %s\n" % node.data, cfg, state_id, node)
+        if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External):
+            nodedesc = update_persistent_desc(nodedesc, sdfg)
+
+        dataname = cpp.ptr(node.data, nodedesc, sdfg, self.frame)
+
+        try:
+            self.dispatcher.defined_vars.get(dataname)
+            return
+        except KeyError:
+            pass  # The variable was not defined, we can continue
+
+        if isinstance(nodedesc, dace.data.Stream):
+            self.allocate_ipu_stream(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream,
+                                     allocation_stream)
+        elif isinstance(nodedesc, dace.data.View):
+            self.cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream,
+                                           allocation_stream)
+        elif isinstance(nodedesc, dace.data.Reference):
+            self.cpu_codegen.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream,
+                                                declaration_stream, allocation_stream)
+        elif isinstance(nodedesc, dace.data.Array):
+            # Allocate, then map the data onto tiles
+            self.allocate_ipu_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream,
+                                    allocation_stream)
+            self.mapdataontile(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream,
+                               allocation_stream)
+        elif isinstance(nodedesc, dace.data.Scalar):
+            self.allocate_ipu_scalar(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream,
+                                     allocation_stream)
+            self.mapdataontile(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream,
+                               allocation_stream)
+        else:
+            raise NotImplementedError("Unimplemented type: {}".format(type(nodedesc)))
+
+    def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                         node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream,
+                         callsite_stream: CodeIOStream) -> None:
+        if nodedesc.storage == dtypes.StorageType.IPU_Memory or \
+           nodedesc.storage == dtypes.StorageType.Register:
+            pass  # IPU variables are C++ objects and are automatically deallocated
+        else:
+            raise NotImplementedError("Unimplemented deallocate() for StorageType " + str(nodedesc.storage))
+
+    def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                    src_node: Union[nodes.Tasklet, nodes.AccessNode],
+                    dst_node: Union[nodes.CodeNode, nodes.AccessNode],
+                    edge: MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream,
+                    callsite_stream: CodeIOStream) -> None:
+        callsite_stream.write("// Copying from {} (name: {}) to {} (name: {}) with edge: {}\n".format(
+            src_node, src_node.label, dst_node, dst_node.label, edge), cfg, state_id)
+        state = cfg.state(state_id)
+        if isinstance(src_node, nodes.Tasklet):
+            src_storage = dtypes.StorageType.Register
+            src_parent = state.entry_node(src_node)
+            dst_schedule = None if src_parent is None else src_parent.map.schedule
+        else:
+            src_storage = src_node.desc(sdfg).storage
+
+        if isinstance(dst_node, nodes.Tasklet):
+            dst_storage = dtypes.StorageType.Register
+        else:
+            dst_storage = dst_node.desc(sdfg).storage
+
+        dst_parent = state.entry_node(dst_node)
+        dst_schedule = None if dst_parent is None else dst_parent.map.schedule
+
+        callsite_stream.write("poplar::copy calls")  # TODO: emit the actual copy
+        # self._emit_copy(state_id, src_node, src_storage, dst_node, dst_storage, dst_schedule, memlet, sdfg, cfg,
+        #                 dfg, callsite_stream)
+
+    def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                      node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+        # Dynamically obtain the node generator according to the class name
+        method_name = "_generate_" + type(node).__name__
+        callsite_stream.write("// Generating Node: " + str(node) + ", Type: " + type(node).__name__ +
+                              ", Details: " + repr(node) + "\n", sdfg, state_id)
+
+        try:
+            gen = getattr(self, method_name)
+        except AttributeError:
+            if isinstance(node, nodes.LibraryNode):
+                raise NodeNotExpandedError(sdfg, state_id, dfg.node_id(node))
+            raise
+        gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
+        # Mark node as "generated"
+        self._generated_nodes.add(node)
+        self._locals.clear_scope(self._ldepth + 1)
+
+        # Fallback: delegate to the CPU code generator
+        # old_codegen = self.cpu_codegen.calling_codegen
+        # self.cpu_codegen.calling_codegen = self
+        # self.cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
+        # self.cpu_codegen.calling_codegen = old_codegen
+
+    # def generate_node(self, sdfg: SDFG, cfg: state.ControlFlowRegion, state: SDFGState, state_id: int,
+    #                   node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream):
+    #     """(TASKLET only)
+    #     0. Declarations
+    #     1. Generate pre tasklet
+    #     2. Generate tasklet code
+    #     3. Generate post tasklet
+    #     4. Writes
+    #     """
+    #     callsite_stream.write(f"// Generating node {node.label}\n")
+    #     inner_stream, codegen = self.declarations(cfg, state_id, node, function_stream)
+    #     self.dispatcher.defined_vars.enter_scope(node)
+    #     ########################################################################
+    #     # self.pre_tasklet(sdfg, cfg, state, state_id, node, function_stream,
+    #     #                  callsite_stream, inner_stream, codegen)
+    #     for edge in state.in_edges(node):
+    #         self.generate_read(sdfg, state, edge, inner_stream)
+    #     self.tasklet(sdfg, cfg, state, state_id, node, function_stream, inner_stream)
+    #     after_memlets_stream = self.post_tasklet(sdfg, cfg, state, state_id, node, function_stream,
+    #                                              inner_stream, codegen)
+    #     ########################################################################
+    #     callsite_stream.write('{', cfg, state_id, node)
+    #     callsite_stream.write(inner_stream.getvalue(), cfg, state_id, node)
+    #     callsite_stream.write(after_memlets_stream.getvalue())
+    #     callsite_stream.write('}', cfg, state_id, node)
+    #     self._locals.clear_scope(self._ldepth + 1)
+    #     self.dispatcher.defined_vars.exit_scope(node)
+
+    def declarations(self, cfg, state_id, node, function_stream):
+        self.add_header(function_stream)
+        inner_stream = CodeIOStream()
+        state_dfg: SDFGState = cfg.nodes()[state_id]
+        codegen = self.cpu_codegen or self
+        return inner_stream, codegen
+
+    def post_tasklet(self, sdfg, cfg, state, state_id, node, function_stream, inner_stream, codegen):
+        after_memlets_stream = CodeIOStream()
+        codegen.generate_tasklet_postamble(sdfg, cfg, state, state_id, node, function_stream, inner_stream,
+                                           after_memlets_stream)
+        # Process outgoing memlets
+        codegen.process_out_memlets(sdfg, cfg, state_id, node, state, self.dispatcher, inner_stream, True,
+                                    function_stream)
+        return after_memlets_stream
+
+    def tasklet(self, sdfg, cfg, state, state_id, node, function_stream, inner_stream):
+        inner_stream.write("\n    ///////////////////\n", cfg, state_id, node)
+        # Currently reuses the CPU unparser
+        self.unparse_ipu_tasklet(sdfg, cfg, state_id, state, node, function_stream, inner_stream, self._locals,
+                                 self._ldepth, self._toplevel_schedule)
+        inner_stream.write("    ///////////////////\n\n", cfg, state_id, node)
+
+    def pre_tasklet(self, sdfg, cfg, state, state_id, node, function_stream, callsite_stream, inner_stream, codegen):
+        after_memlets_stream = CodeIOStream()
+        codegen.generate_tasklet_preamble(sdfg, cfg, state, state_id, node, function_stream, callsite_stream,
+                                          after_memlets_stream)
+        # Variable declarations and post-memlet tasklet-preamble code
+        callsite_stream.write(after_memlets_stream.getvalue())
+        self.add_pre_tasklet_declarations(sdfg, cfg, state_id, state, node, function_stream, inner_stream)
+
+    def unparse_ipu_tasklet(self, sdfg, cfg, state_id, dfg, node, function_stream, inner_stream, locals, ldepth,
+                            toplevel_schedule):
+        # TODO: change to IPU-specific unparsing; for now reuse the CPU unparser
+        self.cpu_codegen.unparse_tasklet(sdfg, cfg, state_id, dfg, node, function_stream, inner_stream, locals,
+                                         ldepth, toplevel_schedule)
+
+    def add_pre_tasklet_declarations(self, sdfg, cfg, state_id, state, node, function_stream, inner_stream):
+        arrays = set()
+        for edge in state.in_edges(node):
+            u = edge.src
+            memlet = edge.data
+            src_node = state.memlet_path(edge)[0].src
+
+            if edge.dst_conn:  # Not (None or "")
+                if edge.dst_conn in arrays:  # Disallow duplicates
+                    raise SyntaxError("Duplicates found in memlets")
+                ctype = node.in_connectors[edge.dst_conn].ctype
+                # Special case: code->code
+                if isinstance(src_node, nodes.CodeNode):
+                    shared_data_name = edge.data.data
+                    if not shared_data_name:
+                        # Very unique name. TODO: Make more intuitive
+                        shared_data_name = '__dace_%d_%d_%d_%d_%s' % (cfg.cfg_id, state_id, state.node_id(src_node),
+                                                                      state.node_id(node), edge.src_conn)
+
+                    # Read variable from shared storage
+                    defined_type, _ = self.dispatcher.defined_vars.get(shared_data_name)
+                    if defined_type in (DefinedType.Scalar, DefinedType.Pointer):
+                        assign_str = (f"const {ctype} {edge.dst_conn} = {shared_data_name};")
+                    else:
+                        assign_str = (f"const {ctype} &{edge.dst_conn} = {shared_data_name};")
+                    inner_stream.write(assign_str, cfg, state_id, [edge.src, edge.dst])
+                    self.dispatcher.defined_vars.add(edge.dst_conn, defined_type, f"const {ctype}")
+
+                else:
+                    self.dispatcher.dispatch_copy(
+                        src_node,
+                        node,
+                        edge,
+                        sdfg,
+                        cfg,
+                        state,
+                        state_id,
+                        function_stream,
+                        inner_stream,
+                    )
+
+                # Also define variables in the C++ unparser scope
+                self._locals.define(edge.dst_conn, -1, self._ldepth + 1, ctype)
+                arrays.add(edge.dst_conn)
+ """ + if edge.dst_conn is None: + return + src_node = state.memlet_path(edge)[0].src + dst_type = edge.dst.in_connectors[edge.dst_conn] + dst_name = edge.dst_conn + if isinstance(src_node, nodes.Tasklet): + ################## + # Code->Code edges + src_type = edge.src.out_connectors[edge.src_conn] + if util.is_vector(src_type) and util.is_vector(dst_type): + # Directly read from shared vector register + code.write(f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = {edge.data.data};') + elif util.is_scalar(src_type) and util.is_scalar(dst_type): + # Directly read from shared scalar register + code.write(f'{dst_type} {dst_name} = {edge.data.data};') + elif util.is_scalar(src_type) and util.is_vector(dst_type): + # Scalar broadcast from shared scalar register + code.write( + f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({edge.data.data});' + ) + else: + raise util.NotSupportedError('Unsupported Code->Code edge') + elif isinstance(src_node, nodes.AccessNode): + ################## + # Read from AccessNode + desc = src_node.desc(sdfg) + if isinstance(desc, data.Array): + # Copy from array + if util.is_pointer(dst_type): + ################## + # Pointer reference + code.write( + f'{dst_type} {dst_name} = {cpp.cpp_ptr_expr(sdfg, edge.data, None, codegen=self.frame)};') + elif util.is_vector(dst_type): + raise util.NotSupportedError('Unsupported read from array which is vector type, util.is_vector()') + else: + ################## + # Scalar read from array + code.write(f'{dst_type} {dst_name} = {cpp.cpp_array_expr(sdfg, edge.data, codegen=self.frame)};') + elif isinstance(desc, data.Scalar): + # Refer to shared variable + src_type = desc.dtype + if util.is_vector(src_type) and util.is_vector(dst_type): + # Directly read from shared vector register + code.write(f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = {edge.data.data};') + elif util.is_scalar(src_type) and util.is_scalar(dst_type): + # Directly read from shared scalar register + code.write(f'{dst_type} {dst_name} = {edge.data.data};') + elif util.is_scalar(src_type) and util.is_vector(dst_type): + # Scalar broadcast from shared scalar register + code.write( + f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({edge.data.data});' + ) + else: + raise util.NotSupportedError('Unsupported Scalar->Code edge') + else: + raise util.NotSupportedError('Only copy from Tasklets and AccessNodes is supported') + + def generate_state(self, + sdfg:SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + function_stream: CodeIOStream, + callsite_stream:CodeIOStream, + generate_state_footer:bool = True): + # disp = self.dispatcher.get_scope_dispatcher(dtypes.ScheduleType.Unrolled) + ipu_disp = self.dispatcher.get_state_dispatcher(sdfg, state=state) + cpu_disp = self.cpu_codegen + self.dispatcher._used_targets.add(ipu_disp) + self.dispatcher._used_targets.add(cpu_disp) + + state_id = state.block_id + subgraphs = dace.sdfg.concurrent_subgraphs(state) + + if IPUCodeGen._in_device_code: + print("device code") + + to_allocate = dace.sdfg.local_transients(sdfg, state, None) + allocated = set() + + + for node in state.data_nodes(): + data = node.desc(sdfg) + if node.data not in to_allocate or node.data in allocated: + continue + # Make sure there are no global transients in the nested state + # that are thus not gonna be allocated + if data.storage == dtypes.StorageType.IPU_Memory and not isinstance(data, data.View): + raise cgx.CodegenError("Cannot allocate global memory 
+
+    def generate_state(self,
+                       sdfg: SDFG,
+                       cfg: ControlFlowRegion,
+                       state: SDFGState,
+                       function_stream: CodeIOStream,
+                       callsite_stream: CodeIOStream,
+                       generate_state_footer: bool = True):
+        # disp = self.dispatcher.get_scope_dispatcher(dtypes.ScheduleType.Unrolled)
+        ipu_disp = self.dispatcher.get_state_dispatcher(sdfg, state=state)
+        cpu_disp = self.cpu_codegen
+        self.dispatcher._used_targets.add(ipu_disp)
+        self.dispatcher._used_targets.add(cpu_disp)
+
+        state_id = state.block_id
+        subgraphs = dace.sdfg.concurrent_subgraphs(state)
+
+        if IPUCodeGen._in_device_code:
+            to_allocate = dace.sdfg.local_transients(sdfg, state, None)
+            allocated = set()
+
+            for node in state.data_nodes():
+                desc = node.desc(sdfg)
+                if node.data not in to_allocate or node.data in allocated:
+                    continue
+                # Make sure there are no global transients in the nested state
+                # that are thus not going to be allocated
+                if desc.storage == dtypes.StorageType.IPU_Memory and not isinstance(desc, data.View):
+                    raise cgx.CodegenError("Cannot allocate global memory from device code.")
+                allocated.add(node.data)
+                # Allocate transients
+                self.dispatcher.dispatch_allocate(sdfg, cfg, state, state_id, node, desc, function_stream,
+                                                  callsite_stream)
+
+            self.generate_nested_state(sdfg, cfg, state, state.label, subgraphs, function_stream, callsite_stream)
+        else:
+            sdfg_state_name = cpp.mangle_dace_state_struct_name(self._global_sdfg)
+            formatted_string = """
+    // Hack to make the files compile: forward-declare the functions
+    extern "C" auto getIpuDevice(const unsigned int numIpus = 1) -> optional<Device>;
+    extern "C" void defineDataStreams({sdfg_state_name} &__state);
+    extern "C" void kernel_buildComputeGraph({sdfg_state_name} &__state);
+    """.format(sdfg_state_name=sdfg_state_name)
+
+            function_stream.write(formatted_string)
+
+            self.generate_nested_state(sdfg, cfg, state, state.label, subgraphs, function_stream, callsite_stream)
+
+            # self.frame.generate_ipu_state(sdfg, cfg, state, function_stream, callsite_stream,
+            #                               generate_state_footer=False)
+            self.generate_ipu_cpuside_state(sdfg, cfg, state, function_stream, callsite_stream,
+                                            generate_state_footer=False)
+
+    def _generate_MapEntry(
+        self,
+        sdfg: SDFG,
+        cfg: ControlFlowRegion,
+        dfg: StateSubgraphView,
+        state_id: int,
+        node: nodes.MapEntry,
+        function_stream: CodeIOStream,
+        callsite_stream: CodeIOStream,
+    ):
+        callsite_stream.write(f"// Generating MapEntry {node.label}\n")
+        state_dfg = cfg.state(state_id)
+        map_params = node.map.params
+
+        result = callsite_stream
+        map_header = ""
+
+        # Encapsulate map with a C scope
+        # TODO: Refactor out of MapEntry generation (generate_scope_header?)
+        callsite_stream.write('{', cfg, state_id, node)
+
+        # Define all input connectors of this map entry
+        for e in dynamic_map_inputs(state_dfg, node):
+            if e.data.data != e.dst_conn:
+                callsite_stream.write(
+                    self.cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn,
+                                                       e.dst.in_connectors[e.dst_conn]), cfg, state_id, node)
+
+        inner_stream = CodeIOStream()
+        self.cpu_codegen.generate_scope_preamble(sdfg, dfg, state_id, function_stream, callsite_stream, inner_stream)
+
+        # Instrumentation: Pre-scope
+        instr = self.dispatcher.instrumentation[node.map.instrument]
+        if instr is not None:
+            instr.on_scope_entry(sdfg, state_dfg, node, callsite_stream, inner_stream, function_stream)
+
+        # TODO: Refactor to generate_scope_preamble once a general code
+        # generator (that CPU inherits from) is implemented
+        if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent):
+            # OpenMP header
+            in_persistent = False
+            if node.map.schedule == dtypes.ScheduleType.CPU_Multicore:
+                in_persistent = is_in_scope(sdfg, state_dfg, node, [dtypes.ScheduleType.CPU_Persistent])
+                if in_persistent:
+                    # If already in a #pragma omp parallel, no need to use it twice
+                    map_header += "#pragma omp for"
+                    # TODO(later): barriers and map_header += " nowait"
+                else:
+                    map_header += "#pragma omp parallel for"
+
+            elif node.map.schedule == dtypes.ScheduleType.CPU_Persistent:
+                map_header += "#pragma omp parallel"
+
+            # OpenMP schedule properties
+            if not in_persistent:
+                if node.map.omp_schedule != dtypes.OMPScheduleType.Default:
+                    schedule = " schedule("
+                    if node.map.omp_schedule == dtypes.OMPScheduleType.Static:
+                        schedule += "static"
+                    elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic:
+                        schedule += "dynamic"
+                    elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided:
+                        schedule += "guided"
+                    else:
+                        raise ValueError("Unknown OpenMP schedule type")
+                    if node.map.omp_chunk_size > 0:
+                        schedule += f", {node.map.omp_chunk_size}"
+                    schedule += ")"
+                    map_header += schedule
+
+                if node.map.omp_num_threads > 0:
+                    map_header += f" num_threads({node.map.omp_num_threads})"
+
+            # OpenMP nested loop properties
+            if node.map.schedule == dtypes.ScheduleType.CPU_Multicore and node.map.collapse > 1:
+                map_header += ' collapse(%d)' % node.map.collapse
+
+        if node.map.unroll:
+            if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent):
+                raise ValueError("An OpenMP map cannot be unrolled (" + node.map.label + ")")
+
+        result.write(map_header, cfg, state_id, node)
+
+        if node.map.schedule == dtypes.ScheduleType.CPU_Persistent:
+            result.write('{\n', cfg, state_id, node)
+
+            # Find if bounds are used within the scope
+            scope = state_dfg.scope_subgraph(node, False, False)
+            fsyms = self.frame.free_symbols(scope)
+            # Include external edges
+            for n in scope.nodes():
+                for e in state_dfg.all_edges(n):
+                    fsyms |= e.data.used_symbols(False, e)
+            fsyms = set(map(str, fsyms))
+
+            ntid_is_used = '__omp_num_threads' in fsyms
+            tid_is_used = node.map.params[0] in fsyms
+            if tid_is_used or ntid_is_used:
+                function_stream.write('#include <omp.h>', cfg, state_id, node)
+            if tid_is_used:
+                result.write(f'auto {node.map.params[0]} = omp_get_thread_num();', cfg, state_id, node)
+            if ntid_is_used:
+                result.write(f'auto __omp_num_threads = omp_get_num_threads();', cfg, state_id, node)
+        else:
+            # Emit nested loops
+            for i, r in enumerate(node.map.range):
+                var = map_params[i]
+                begin, end, skip = r
+
+                if node.map.unroll:
+                    result.write("#pragma unroll", cfg, state_id, node)
+
+                result.write(
+                    "for (auto %s = %s; %s < %s; %s += %s) {\n" %
+                    (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)),
+                    cfg,
+                    state_id,
+                    node,
+                )
+
+        callsite_stream.write(inner_stream.getvalue())
+
+        # Emit internal transient array allocation
+        self.frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, result)
+
+    def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                          node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+        callsite_stream.write(f"// Generating MapExit {node.label}\n")
+        self.cpu_codegen._generate_MapExit(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
+
+    def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                          node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+        callsite_stream.write(f"// Generating node {node.label} using {inspect.currentframe().f_code.co_name}\n")
+        self.cpu_codegen._generate_Tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
+
+    def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                             node: nodes.Node, function_stream: CodeIOStream,
+                             callsite_stream: CodeIOStream) -> None:
+        callsite_stream.write(f"// Generating node {node.label} using {inspect.currentframe().f_code.co_name}\n")
+
+        state_dfg: SDFGState = cfg.nodes()[state_id]
+
+        sdict = state_dfg.scope_dict()
+        for edge in state_dfg.in_edges(node):
+            predecessor, _, _, _, memlet = edge
+            if memlet.data is None:
+                continue  # If the edge has to be skipped
+
+            # Determines if this path ends here or has a definite source (array) node
+            memlet_path = state_dfg.memlet_path(edge)
+            if memlet_path[-1].dst == node:
+                src_node = memlet_path[0].src
+                # Only generate code in case this is the innermost scope
+                # (copies are generated at the inner scope, where both arrays exist)
+                if (scope_contains_scope(sdict, src_node, node) and sdict[src_node] != sdict[node]):
+                    self.dispatcher.dispatch_copy(
+                        src_node,
+                        node,
+                        edge,
+                        sdfg,
+                        cfg,
+                        dfg,
+                        state_id,
+                        function_stream,
+                        callsite_stream,
+                    )
+
+        # Process outgoing memlets (array-to-array write should be emitted
+        # from the first leading edge out of the array)
+        self.process_out_memlets(
+            sdfg,
+            cfg,
+            state_id,
+            node,
+            dfg,
+            self.dispatcher,
+            callsite_stream,
+            False,
+            function_stream,
+        )
+
+############################################################################################################
+# Helpers
tasklet), then + # the tasklet will take care of the copy + if not isinstance(dst_node, nodes.AccessNode): + continue + # If the memlet is pointing into an array in an inner scope, then + # the inner scope (i.e., the output array) must handle it + if scope_dict[node] != scope_dict[dst_node] and scope_contains_scope(scope_dict, node, dst_node): + continue + + # Array to tasklet (path longer than 1, handled at tasklet entry) + if node == dst_node: + continue + + # Tasklet -> array + if isinstance(node, nodes.CodeNode): + if not uconn: + raise SyntaxError("Cannot copy memlet without a local connector: {} to {}".format( + str(edge.src), str(edge.dst))) + + conntype = node.out_connectors[uconn] + is_scalar = not isinstance(conntype, dtypes.pointer) + if isinstance(conntype, dtypes.pointer) and sdfg.arrays[memlet.data].dtype == conntype: + is_scalar = True # Pointer to pointer assignment + is_stream = isinstance(sdfg.arrays[memlet.data], data.Stream) + is_refset = isinstance(sdfg.arrays[memlet.data], data.Reference) and dst_edge.dst_conn == 'set' + + if (is_scalar and not memlet.dynamic and not is_stream) or is_refset: + out_local_name = " __" + uconn + in_local_name = uconn + if not locals_defined: + out_local_name = self.memlet_ctor(sdfg, memlet, node.out_connectors[uconn], True) + in_memlets = [d for _, _, _, _, d in dfg.in_edges(node)] + assert len(in_memlets) == 1 + in_local_name = self.memlet_ctor(sdfg, in_memlets[0], node.out_connectors[uconn], False) + + if memlet.wcr is not None: + nc = not cpp.is_write_conflicted(dfg, edge, sdfg_schedule=self._toplevel_schedule) + write_expr = codegen.write_and_resolve_expr( + sdfg, memlet, nc, out_local_name, in_local_name, dtype=node.out_connectors[uconn]) + ";" + else: + if isinstance(node, nodes.NestedSDFG): + # This case happens with nested SDFG outputs, + # which we skip since the memlets are references + continue + desc = sdfg.arrays[memlet.data] + ptrname = cpp.ptr(memlet.data, desc, sdfg, self._frame) + is_global = desc.lifetime in (dtypes.AllocationLifetime.Global, + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) + try: + defined_type, _ = self.dispatcher.declared_arrays.get(ptrname, is_global=is_global) + except KeyError: + defined_type, _ = self.dispatcher.defined_vars.get(ptrname, is_global=is_global) + + if defined_type == DefinedType.Scalar: + mname = cpp.ptr(memlet.data, desc, sdfg, self._frame) + write_expr = f"{mname} = {in_local_name};" + elif defined_type == DefinedType.Pointer and is_refset: + mname = cpp.ptr(memlet.data, desc, sdfg, self._frame) + write_expr = f"{mname} = {in_local_name};" + elif (defined_type == DefinedType.ArrayInterface and not isinstance(desc, data.View)): + # Special case: No need to write anything between + # array interfaces going out + try: + deftype, _ = self.dispatcher.defined_vars.get(in_local_name) + except KeyError: + deftype = None + if deftype == DefinedType.ArrayInterface: + continue + array_expr = cpp.cpp_array_expr(sdfg, memlet, with_brackets=False, codegen=self._frame) + decouple_array_interfaces = Config.get_bool("compiler", "xilinx", + "decouple_array_interfaces") + ptr_str = fpga.fpga_ptr( # we are on fpga, since this is array interface + memlet.data, + desc, + sdfg, + memlet.subset, + True, + None, + None, + True, + decouple_array_interfaces=decouple_array_interfaces) + write_expr = f"*({ptr_str} + {array_expr}) = {in_local_name};" + else: + desc_dtype = desc.dtype + expr = cpp.cpp_array_expr(sdfg, memlet, codegen=self._frame) + write_expr = 
codegen.make_ptr_assignment(in_local_name, conntype, expr, desc_dtype)
+
+                # Write out
+                result.write(write_expr, cfg, state_id, node)
+
+            # Dispatch array-to-array outgoing copies here
+            elif isinstance(node, nodes.AccessNode):
+                if dst_node != node and not isinstance(dst_node, nodes.Tasklet):
+                    dispatcher.dispatch_copy(
+                        node,
+                        dst_node,
+                        edge,
+                        sdfg,
+                        cfg,
+                        dfg,
+                        state_id,
+                        function_stream,
+                        result,
+                    )
+
+    def generate_ipu_cpuside_state(self,
+                                   sdfg: SDFG,
+                                   cfg: ControlFlowRegion,
+                                   state: SDFGState,
+                                   function_stream: CodeIOStream,
+                                   callsite_stream: CodeIOStream,
+                                   generate_state_footer: bool = True):
+        sid = state.block_id
+
+        callsite_stream.write(f'// Ipu pipeline \n', sdfg)
+        callsite_stream.write(f"""
+            // Data initialization
+            __state->poplar_context->hostData = vector<float>(NUM_DATA_ITEMS, 1);
+
+            // The real code pipeline starts here.
+            std::cout << "STEP 1: Connecting to an IPU device" << std::endl;
+            __state->poplar_context->device = getIpuDevice(1);
+            if (!__state->poplar_context->device.has_value()) {{
+                std::cerr << "Could not attach to an IPU device. Aborting" << std::endl;
+                return;
+            }}
+        """)
+        #####################
+        # Create the dataflow graph for the state's children.
+
+        # Start a new state code generation: reset previous dependencies, if any
+        self._kernels_dependencies.clear()
+        self._kernels_names_to_id.clear()
+
+        # For now, only one kernel.
+        kernels = [(state, 0)]
+
+        state_host_header_stream = CodeIOStream()
+        state_host_body_stream = CodeIOStream()
+        instrumentation_stream = CodeIOStream()
+
+        for kern, kern_id in kernels:
+            if sdfg.parent_nsdfg_node is not None:
+                kernel_name = f"{sdfg.parent_nsdfg_node.label}_{state.label}_{kern_id}_{cfg.cfg_id}"
+            else:
+                kernel_name = f"{state.label}_{kern_id}_{cfg.cfg_id}"
+            self._kernels_names_to_id[kernel_name] = kern_id
+
+            kernel_host_stream = CodeIOStream()
+            function_stream.write(f"// kernel_name = {kernel_name}\n")
+            self.generate_host_function(sdfg, cfg, state, sid, function_stream, callsite_stream,
+                                        state_host_header_stream, state_host_body_stream,
+                                        instrumentation_stream, kernel_host_stream)
+
+            # Store code strings to be passed to the compilation phase
+            self._host_codes.append((kernel_name, kernel_host_stream.getvalue()))
+
+        #####################
+        # Write state footer (after the kernel call)
+ callsite_stream.write(f""" + std::cout << "STEP 3: Define data streams" << std::endl; + defineDataStreams(*__state); // Pass the state directly + + std::cout << "STEP 4: Create engine and compile graph" << std::endl; + __state->poplar_context->engineOptions = OptionFlags{{ + {{"target.saveArchive", "archive.a"}}, + {{"debug.instrument", "true"}}, + {{"debug.instrumentCompute", "true"}}, + {{"debug.instrumentControlFlow", "true"}}, + {{"debug.computeInstrumentationLevel", "tile"}}, + {{"debug.outputAllSymbols", "true"}}, + {{"autoReport.all", "true"}}, + {{"autoReport.outputSerializedGraph", "true"}}, + {{"debug.retainDebugInformation", "true"}}, + }}; + + __state->poplar_context->programIds = map(); + __state->poplar_context->programsList = vector(__state->poplar_context->programs.size()); // Removing the size causes segfault + int index = 0; + for (auto &nameToProgram : __state->poplar_context->programs) {{ + __state->poplar_context->programIds[nameToProgram.first] = index; + __state->poplar_context->programsList[index] = nameToProgram.second; + index++; + }} + + // Now construct the Engine using the constructor + auto engine = Engine(__state->poplar_context->graph, __state->poplar_context->programsList, __state->poplar_context->engineOptions); + + std::cout << "STEP 5: Load compiled graph onto the IPU tiles" << std::endl; + engine.load(*__state->poplar_context->device); + // engine.enableExecutionProfiling(); + + std::cout << "STEP 6: Attach data streams" << std::endl; + + engine.connectStream("TO_IPU", __state->poplar_context->hostData.data()); + engine.connectStream("FROM_IPU", __state->poplar_context->hostData.data()); + + std::cout << "STEP 7: Run programs" << std::endl; + engine.run(__state->poplar_context->programIds["copy_to_ipu"]); // Copy to IPU + engine.run(__state->poplar_context->programIds["main"]); // Main program + engine.run(__state->poplar_context->programIds["copy_to_host"]); // Copy from IPU + """) + + ## Generate the global function here + + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[mmlt.Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + self.dispatcher.dispatch_copy(src_node, dst_node, edge, sdfg, cfg, state_dfg, state_id, function_stream, + callsite_stream) + + def generate_nested_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: dace.SDFGState, nest_name: str, + subgraphs: List[ScopeSubgraphView], function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: + + for sg in subgraphs: + self.dispatcher.dispatch_subgraph(sdfg, + cfg, + sg, + sdfg.node_id(state), + function_stream, + callsite_stream, + skip_entry_node=False) + + def generate_host_function(self, sdfg, cfg, state, state_id, function_stream, callsite_stream, state_host_header_stream, state_host_body_stream, instrumentation_stream, kernel_host_stream): + # Basic arguments setting + kernel_args_call_host = [] + kernel_args_opencl = [] + # Include state in args + kernel_args_opencl.append(f"{cpp.mangle_dace_state_struct_name(self._global_sdfg)} &__state") + kernel_args_call_host.append(f"*__state") + + # real code starts + host_function_name = f"kernel_buildComputeGraph" + + callsite_stream.write("////////////////////////////////////////KERNEL") + callsite_stream.write("std::cout << \"STEP 2: Building the compute graph\" << std::endl;") + callsite_stream.write("{}({});".format(host_function_name, ", 
".join(kernel_args_call_host))) + callsite_stream.write("////////////////////////////////////////") + + # function_stream.write("\n\nDACE_EXPORTED auto {}({});\n\n".format(host_function_name, + # ", ".join(kernel_args_opencl))) + + #/////////////////////////// + # add generated header information + kernel_host_stream.write(state_host_header_stream.getvalue()) + + kernel_host_stream.write(f"""\ + DACE_EXPORTED void {host_function_name}({', '.join(kernel_args_opencl)}) {{""") + + # BODY OF THE FUNCTION + # write the kernel_host_stream withe the commands I have copied + kernel_host_stream.write(f"""\ + std::cout << " STEP 2.1: Create graph and compile codelets" << std::endl; + + // Step 1: Create graph and add codelets + __state.poplar_context->graph = poplar::Graph(__state.poplar_context->device->getTarget()); + __state.poplar_context->graph.addCodelets({{"src/codelets/SkeletonCodelets.cpp"}}, "-O3 -I codelets"); + popops::addCodelets(__state.poplar_context->graph); + """) + + kernel_host_stream.write(""" + // Step 2: Add data to the graph + std::cout << " STEP 2.2: Add data to the graph" << std::endl;""") + # Emit internal transient array allocation + # __state.poplar_context->tensors["data"] = __state.poplar_context->graph.addVariable(poplar::FLOAT, {{NUM_DATA_ITEMS}}, "data"); + self.frame.allocate_arrays_in_scope(sdfg, cfg, state, function_stream, kernel_host_stream) + kernel_host_stream.write('\n') + + kernel_host_stream.write(""" + poputil::mapTensorLinearly(__state.poplar_context->graph, __state.poplar_context->tensors["data"]); + """) + kernel_host_stream.write(""" + const int numTiles = __state.poplar_context->device->getTarget().getNumTiles(); + // Add programs and wire up data + const auto NumElemsPerTile = NUM_DATA_ITEMS / numTiles; + //auto cs = __state.poplar_context->graph.addComputeSet("loopBody"); + // + //for (auto tileNum = 0; tileNum < numTiles; tileNum++) {{ + // const auto sliceEnd = std::min((tileNum + 1) * NumElemsPerTile, (int)NUM_DATA_ITEMS); + // const auto sliceStart = tileNum * NumElemsPerTile; + // auto v = __state.poplar_context->graph.addVertex(cs, "SkeletonVertex", {{"data", __state.poplar_context->tensors["data"].slice(sliceStart, sliceEnd)}}); + // __state.poplar_context->graph.setInitialValue(v["howMuchToAdd"], tileNum); + // __state.poplar_context->graph.setPerfEstimate(v, 100); + // __state.poplar_context->graph.setTileMapping(v, tileNum); + //}} + // + //__state.poplar_context->programs["main"] = Repeat(10, Execute(cs)); + // """) + + kernel_host_stream.write("}\n") + + self.frame.deallocate_arrays_in_scope(sdfg, cfg, state, function_stream, callsite_stream) + + def generate_kernel(self, + sdfg: dace.SDFG, + cfg: ControlFlowRegion, + state: dace.SDFGState, + kernel_name: str, + subgraphs: list, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + state_host_header_stream: CodeIOStream, + state_host_body_stream: CodeIOStream, + instrumentation_stream: CodeIOStream, + state_parameters: list, + kernel_id: int = None): + """ + Entry point for generating an FPGA Kernel out of the given subgraphs. + + :param sdfg: + :param state: + :param kernel_name: the generated kernel name. + :param subgraphs: the connected components that constitute this kernel. + :param function_stream: CPU code stream, contains global declarations. + :param callsite_stream: CPU code stream, contains code for invoking kernels, ... + :param state_host_header_stream: Device-specific host code stream: contains the host code + for the state global declarations. 
+        :param state_host_body_stream: Device-specific host code stream: contains all the code related
+                                       to this state, for creating transient buffers, spawning kernels, and synchronizing them.
+        :param instrumentation_stream: Code for profiling kernel execution time.
+        :param state_parameters: a list of parameters that must be passed to the state. It will get populated
+                                 considering all the parameters needed by the kernels in this state.
+        :param kernel_id: Unique ID of this kernel, as computed in the generate_state function
+        """
+        kernel_stream = CodeIOStream()
+        # # Actual kernel code generation
+        # self.generate_kernel_internal(sdfg, cfg, state, kernel_name, predecessors, subgraphs, kernel_stream,
+        #                               state_host_header_stream, state_host_body_stream, instrumentation_stream,
+        #                               function_stream, callsite_stream, state_parameters)
+        kernel_stream.write(f"// Kernel {kernel_name} called here", sdfg, state)
+        # Store code strings to be passed to the compilation phase
+        self._kernel_codes.append((kernel_name, kernel_stream.getvalue()))
+
+    def add_header(self, function_stream: CodeIOStream):
+        if self.has_generated_header:
+            return
+        self.has_generated_header = True
+
+        # Headers
+        function_stream.write("#include <iostream>\n")
+        function_stream.write("#include <fstream>\n")
+        function_stream.write("#include <algorithm>\n")
+        function_stream.write("#include <map>\n")
+        function_stream.write("#include <optional>\n")
+        function_stream.write("#include <string>\n")
+        function_stream.write("#include <vector>\n")
+        function_stream.write("#include <poplar/Device.hpp>\n")
+        function_stream.write("#include <poplar/DeviceManager.hpp>\n")
+        function_stream.write("#include <poplar/Engine.hpp>\n")
+        function_stream.write("#include <poplar/Graph.hpp>\n")
+        function_stream.write("#include <poplar/IPUModel.hpp>\n")
+        function_stream.write("#include <poplar/OptionFlags.hpp>\n")
+        function_stream.write("#include <popops/ElementWise.hpp>\n")
+        function_stream.write("#include <popops/codelets.hpp>\n")
+        function_stream.write("#include <poputil/TileMapping.hpp>\n")
+        function_stream.write("#include <poplin/codelets.hpp>\n")
+        # Namespaces
+        function_stream.write(f'using namespace poplar; \n')
+        function_stream.write(f'using namespace poplar::program; \n')
+
+    # def debug_print_self(self):
+    #     print("IN GENERATE_STATE")
+
+    #     # print below ones as well
+    #     print("TargetDispatcher:", self.dispatcher)
+    #     print("init_code", self.frame._initcode.getvalue())
+    #     print("exit_code", self.frame._exitcode.getvalue())
+    #     print("Len env:", len(self.frame.environments))
+    #     for _x in self.frame.statestruct:
+    #         print("statestruct:", _x)
+    #     print("environments:", self.frame.environments)
+    #     print("targets:", self.frame.targets)
+    #     print("to_allocate:", self.frame.to_allocate)
+    #     print("where_allocated:", self.frame.where_allocated)
+    #     print("fsyms:", self.frame.fsyms)
+    #     print("_symbols_and_constants:", self.frame._symbols_and_constants)
+    #     print("arglist:", self.frame.arglist)
+    #     print("DONE")
+    #     print("DISPATCHER Data")
+    #     print("used_env", self.dispatcher.used_environments)
+    #     print("used_targets", self.frame.dispatcher.used_targets)
+    #     print("DONE")
+    #     #######
+    #     print("TargetCodeGenerator:", self)
+    #     print("language", self.language)
+    #     print("TargetDispatcher:", self.dispatcher.used_targets)
diff --git a/dace/codegen/targets/ipu_files/ipu_utils.py b/dace/codegen/targets/ipu_files/ipu_utils.py
new file mode 100644
index 0000000000..3300ae1854
--- /dev/null
+++ b/dace/codegen/targets/ipu_files/ipu_utils.py
@@ -0,0 +1,25 @@
+
+"""
+Utils for the IPU target.
+""" + +import dace +import dace.codegen.targets + + +# Convert from DACE Types to IPU Types +TYPE_TO_IPU = { + dace.bool: 'BOOL', + dace.int8: 'CHAR', + dace.int16: 'SHORT', + dace.int32: 'INT', + dace.int64: 'LONGLONG', # LONG is not supported in IPU + dace.uint8: 'UNSIGNED_CHAR', + dace.uint16: 'UNSIGNED_SHORT', + dace.uint32: 'UNSINGED_INT', + dace.uint64: 'UNSINGNED_LONGLONG', + dace.float16: 'HALF', + dace.float32: 'FLOAT', + dace.float64: 'DOUBLE', + dace.string: 'char*', # Not sure if this is correct +} diff --git a/dace/config_schema.yml b/dace/config_schema.yml index da35e61997..d393d39258 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -264,6 +264,31 @@ required: If set to true, multiple connected components will generate "#pragma omp parallel sections" code around them. + ############################################# + # IPU compiler + ipu: + type: dict + title: IPU + description: IPU compiler preferences + required: + executable: + type: str + default: '' + title: Compiler executable override + description: File path or name of compiler executable + + args: + type: str + title: Arguments + description: Compiler argument flags + default: '-std=c++14 -fPIC -Wall -Wextra -O3 -march=native -ffast-math -Wno-unused-parameter -Wno-unused-label' + default_Windows: '/O2 /fp:fast /arch:AVX2 /D_USRDLL /D_WINDLL /D__restrict__=__restrict' + + libs: + type: str + title: Additional libraries + description: Additional linked libraries required by target + default: '-lpoplar' ############################################# # GPU (CUDA/HIP) compiler diff --git a/dace/dtypes.py b/dace/dtypes.py index f04200e63b..6d2ebed7e8 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -19,10 +19,12 @@ class DeviceType(aenum.AutoNumberEnum): CPU = () #: Multi-core CPU GPU = () #: GPU (AMD or NVIDIA) + IPU = () #: IPU (Graphcore) FPGA = () #: FPGA (Intel or Xilinx) Snitch = () #: Compute Cluster (RISC-V) + @undefined_safe_enum @extensible_enum class StorageType(aenum.AutoNumberEnum): @@ -43,6 +45,7 @@ class StorageType(aenum.AutoNumberEnum): Snitch_TCDM = () #: Cluster-private memory Snitch_L2 = () #: External memory Snitch_SSR = () #: Memory accessed by SSR streamer + IPU_Memory = () #: IPU Tile-local memory @undefined_safe_enum @@ -77,7 +80,11 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch = () Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping + IPU_SCHEDULE = () #: IPU (Graphcore) +IPU_SCHEDULES = [ + ScheduleType.IPU_SCHEDULE, +] # A subset of GPU schedule types GPU_SCHEDULES = [ @@ -198,7 +205,8 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register, ScheduleType.FPGA_Device: StorageType.FPGA_Global, ScheduleType.SVE_Map: StorageType.CPU_Heap, - ScheduleType.Snitch: StorageType.Snitch_TCDM + ScheduleType.Snitch: StorageType.Snitch_TCDM, + # ScheduleType.IPU_SCHEDULE: StorageType.IPU_Memory, } # Maps from ScheduleType to default ScheduleType for sub-scopes @@ -219,7 +227,8 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.FPGA_Multi_Pumped: ScheduleType.FPGA_Device, ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, - ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore + ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, + # ScheduleType.IPU_Map: ScheduleType.IPU_Map } # Maps from StorageType to a preferred ScheduleType for helping determine schedules. 
@@ -232,6 +241,7 @@ class TilingType(aenum.AutoNumberEnum): StorageType.GPU_Shared: ScheduleType.GPU_ThreadBlock, StorageType.FPGA_Global: ScheduleType.FPGA_Device, StorageType.SVE_Register: ScheduleType.SVE_Map, + # StorageType.IPU_Memory: ScheduleType.IPU_SCHEDULE, } # Translation of types to C types diff --git a/dace/libraries/poplar/__init__.py b/dace/libraries/poplar/__init__.py new file mode 100644 index 0000000000..728a102bac --- /dev/null +++ b/dace/libraries/poplar/__init__.py @@ -0,0 +1,7 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +from dace.library import register_library +from .nodes import * +from .environments import * + + +register_library(__name__, "poplar") diff --git a/dace/libraries/poplar/environments/__init__.py b/dace/libraries/poplar/environments/__init__.py new file mode 100644 index 0000000000..d0765769da --- /dev/null +++ b/dace/libraries/poplar/environments/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +from .poplar import * diff --git a/dace/libraries/poplar/environments/poplar.py b/dace/libraries/poplar/environments/poplar.py new file mode 100644 index 0000000000..cf2462ec7d --- /dev/null +++ b/dace/libraries/poplar/environments/poplar.py @@ -0,0 +1,46 @@ +import os +from dace.config import Config +import dace.library +import ctypes.util +import warnings +from typing import Union + +@dace.library.environment +class IPU: + + cmake_minimum_version = None + cmake_packages = ["poplar"] # Find = POPLARConfig.cmake | poplar-config.cmake + cmake_files = [] + cmake_variables = {} + cmake_includes = [] + cmake_libraries = ["poplar", "popops", "poplin", "poputil"] + cmake_compile_flags = [] + cmake_link_flags = [] #-L/software/graphcore/poplar_sdk/3.3.0/poplar-ubuntu_20_04-3.3.0+7857-b67b751185/lib + headers = [ "../include/poplar_dace_interface.h"] + state_fields = [ + "// IPUModel APIs", + "IPUModel ipuModel;", + "Device device;", + "Target target;", + "Graph graph;", + "Sequence prog;", + ] + init_code = """ + __state->device = __state->ipuModel.createDevice(); + __state->target = __state->device.getTarget(); + __state->graph = Graph(__state->target); + popops::addCodelets(__state->graph); + poplin::addCodelets(__state->graph); + """ + finalize_code = """ + auto engine = Engine{__state->graph, __state->prog, {{"debug.retainDebugInformation", "true"}}}; + engine.load(__state->device); + // Run the control program + std::cout << "Running program"; + engine.run(0); + std::cout << "Program complete"; + // engine.printProfileSummary(std::cout, {{"showExecutionSteps", "true"}}); + return 0; + """ + dependencies = [] + diff --git a/dace/libraries/poplar/include/poplar_dace_interface.h b/dace/libraries/poplar/include/poplar_dace_interface.h new file mode 100644 index 0000000000..e0ba919b7b --- /dev/null +++ b/dace/libraries/poplar/include/poplar_dace_interface.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace poplar; +using namespace poplar::program; \ No newline at end of file diff --git a/dace/libraries/poplar/nodes/__init__.py b/dace/libraries/poplar/nodes/__init__.py new file mode 100644 index 0000000000..89eb792be6 --- /dev/null +++ b/dace/libraries/poplar/nodes/__init__.py @@ -0,0 +1 @@ +from .popmm import IPUMatMul diff --git a/dace/libraries/poplar/nodes/popmm.py b/dace/libraries/poplar/nodes/popmm.py new file mode 100644 index 0000000000..8854c068b3 --- /dev/null +++ 
b/dace/libraries/poplar/nodes/popmm.py @@ -0,0 +1,78 @@ +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace import dtypes +from dace.symbolic import symstr +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.codegen.targets.ipu_files import ipu_utils as ipu_utils + + +@dace.library.expansion +class ExpandMMPopLib(ExpandTransformation): + + environments = [environments.poplar.IPU] + + @staticmethod + def expansion(node, parent_state, parent_sdfg): + (adesc, bdesc, cdesc) = node.validate(parent_sdfg, parent_state) + + A_poplar_type = ipu_utils.TYPE_TO_IPU[adesc.dtype] + B_poplar_type = ipu_utils.TYPE_TO_IPU[bdesc.dtype] + C_poplar_type = ipu_utils.TYPE_TO_IPU[cdesc.dtype] + + + init = f""" + // Add variables to the graph + Tensor m1 = __state->graph.addVariable(FLOAT, {{900, 600}}, "m1"); + Tensor m2 = __state->graph.addVariable(FLOAT, {{600, 300}}, "m2"); + Tensor m3 = __state->graph.addVariable(FLOAT, {{300, 200}}, "m3"); + poputil::mapTensorLinearly(__state->graph, m1); + poputil::mapTensorLinearly(__state->graph, m2); + poputil::mapTensorLinearly(__state->graph, m3); + Tensor m4 = poplin::matMul(__state->graph, m1, m2, __state->prog, "m4"); + """ + + code = f""" + {init} + """ + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dtypes.Language.CPP) + return tasklet + + +@dace.library.node +class IPUMatMul(dace.sdfg.nodes.LibraryNode): + """Executes poplin::matMul. + """ + # Global properties + implementations = { + "MM": ExpandMMPopLib, + } + default_implementation = None + + def __init__(self, name): + super().__init__(name, inputs={"_inbufferA", "_inbufferB"}, outputs={"_outbufferC"}) + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer) of the three data descriptors in the + parent SDFG. + """ + + inbufferA, inbufferB, outbufferC = None, None, None + for e in state.out_edges(self): + if e.src_conn == "_outbufferC": + outbufferC = sdfg.arrays[e.data.data] + for e in state.in_edges(self): + if e.dst_conn == "_inbufferA": + inbufferA = sdfg.arrays[e.data.data] + if e.dst_conn == "_inbufferB": + inbufferB = sdfg.arrays[e.data.data] + + + return (inbufferA, inbufferB, outbufferC) diff --git a/dace/runtime/include/dace/dace.h b/dace/runtime/include/dace/dace.h index 960aece94c..6f91a922e4 100644 --- a/dace/runtime/include/dace/dace.h +++ b/dace/runtime/include/dace/dace.h @@ -41,6 +41,9 @@ #include "intel_fpga/host.h" #endif +// // TODO:use conditional compilation later for now include poplar always +#include "poplar_host.h" + #include "fpga_common.h" #endif // __DACE_RUNTIME_H diff --git a/dace/runtime/include/dace/poplar/host.h b/dace/runtime/include/dace/poplar/host.h new file mode 100644 index 0000000000..9611281449 --- /dev/null +++ b/dace/runtime/include/dace/poplar/host.h @@ -0,0 +1,6 @@ +// Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+#pragma once
+
+#include <dace/poplar_common.h>
+#include <poplar/Engine.hpp>
+#include <poplar/Graph.hpp>
diff --git a/dace/runtime/include/dace/poplar_common.h b/dace/runtime/include/dace/poplar_common.h
new file mode 100644
index 0000000000..7ba9841d5d
--- /dev/null
+++ b/dace/runtime/include/dace/poplar_common.h
@@ -0,0 +1,5 @@
+#pragma once
+
+// Defined as a struct rather than a class for C compatibility
+// For definition, see poplar_host.h
+struct dace_poplar_context;
diff --git a/dace/runtime/include/dace/poplar_device.h b/dace/runtime/include/dace/poplar_device.h
new file mode 100644
index 0000000000..f3aba7b0b9
--- /dev/null
+++ b/dace/runtime/include/dace/poplar_device.h
@@ -0,0 +1,6 @@
+// Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+#pragma once
+
+// Defined as a struct rather than a class for C compatibility
+// For definition, see poplar_host.h
+struct dace_poplar_context;
diff --git a/dace/runtime/include/dace/poplar_host.h b/dace/runtime/include/dace/poplar_host.h
new file mode 100644
index 0000000000..2dd53ff8c5
--- /dev/null
+++ b/dace/runtime/include/dace/poplar_host.h
@@ -0,0 +1,58 @@
+// Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+#pragma once
+
+#include <iostream>
+#include <fstream>
+
+// dace headers
+#include <dace/types.h>
+#include <dace/poplar_common.h>
+
+// Poplar headers
+#include <poplar/Device.hpp>
+#include <poplar/DeviceManager.hpp>
+#include <poplar/Engine.hpp>
+#include <poplar/Graph.hpp>
+#include <poplar/OptionFlags.hpp>
+#include <poplar/Tensor.hpp>
+
+#include <algorithm>
+#include <map>
+#include <optional>
+#include <string>
+#include <vector>
+#include <popops/codelets.hpp>
+#include <poputil/TileMapping.hpp>
+
+using ::std::map;
+using ::std::optional;
+using ::std::string;
+using ::std::vector;
+
+using ::poplar::Device;
+using ::poplar::DeviceManager;
+using ::poplar::Engine;
+using ::poplar::FLOAT;
+using ::poplar::Graph;
+using ::poplar::OptionFlags;
+using ::poplar::TargetType;
+using ::poplar::Tensor;
+using ::poplar::program::Copy;
+using ::poplar::program::Program;
+using ::poplar::program::Execute;
+using ::poplar::program::Repeat;
+
+// Constants
+const auto NUM_DATA_ITEMS = 200000;
+
+// Struct
+struct dace_poplar_context {
+    optional<Device> device;
+    Graph graph;
+    map<string, Tensor> tensors;
+    map<string, Program> programs;
+    OptionFlags engineOptions;
+    map<string, int> programIds;
+    vector<Program> programsList;
+    vector<float> hostData;
+};
\ No newline at end of file
diff --git a/fpga.py b/fpga.py
new file mode 100644
index 0000000000..95188bae65
--- /dev/null
+++ b/fpga.py
@@ -0,0 +1,23 @@
+# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+import dace
+import numpy as np
+import pytest
+from dace.transformation.interstate import FPGATransformSDFG
+
+@dace.program
+def fpga_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]):
+    for i in dace.map[0:20]:  # parallelization construct
+        C[i] = A[i] + B[i]
+
+if __name__ == '__main__':
+    sdfg = fpga_vector_add.to_sdfg(simplify=False)  # compiled SDFG
+    sdfg.apply_transformations(FPGATransformSDFG)
+
+    # call with values
+    A = np.ones((20), dtype=np.int32)   # 1,1,1,1,...
+    B = np.ones((20), dtype=np.int32)   # 1,1,1,1,...
+    C = np.zeros((20), dtype=np.int32)  # 0,0,0,0,...
+    sdfg(A, B, C)
+
+    # ref = np.full(20, 2, dtype=np.int32)  # 2,2,2,2,...
+ # assert np.array_equal(ref, C) diff --git a/gpu.py b/gpu.py new file mode 100644 index 0000000000..d5066a4ba9 --- /dev/null +++ b/gpu.py @@ -0,0 +1,23 @@ + +import dace +import numpy as np +from dace.transformation.interstate import GPUTransformSDFG + + +@dace.program +def gpu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): + for i in dace.map[0:20]: # parallelization construct + C[i] = A[i] + B[i] + +if __name__ == '__main__': + sdfg = gpu_vector_add.to_sdfg(simplify=False) # compiled SDFG + sdfg.apply_transformations(GPUTransformSDFG) + + # call with values + A = np.ones((20), dtype=np.int32) # 1,1,1,1,... + B = np.ones((20), dtype=np.int32) # 1,1,1,1,... + C = np.zeros((20), dtype=np.int32) # 0,0,0,0,... + sdfg(A, B, C) + + # ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... + # assert np.array_equal(ref, C) diff --git a/graphcore.py b/graphcore.py new file mode 100644 index 0000000000..f3acd9628d --- /dev/null +++ b/graphcore.py @@ -0,0 +1,24 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import numpy as np + +# @dace.program +# def ipu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]): +# for i in dace.map[0:20]: # parallelization construct +# C[i] = A[i] + B[i] + +@dace.program +def ipu_vector_add(A: dace.int32, B: dace.int32, C: dace.int32): + C = A + B + +if __name__ == '__main__': + sdfg = ipu_vector_add.to_sdfg(simplify=False) # compiled SDFG + #sdfg.apply_transformations(IPUTransformSDFG) + # call with values + A = np.int32(1) # 1,1,1,1,... + B = np.int32(1) # 1,1,1,1,... + C = np.int32(0) # 0,0,0,0,... + sdfg(A, B, C) + + # ref = np.full(20, 2, dtype=np.int32) # 2,2,2,2,... + # assert np.array_equal(ref, C) diff --git a/graphcore_dace/SkeletonForIpu.cpp b/graphcore_dace/SkeletonForIpu.cpp new file mode 100644 index 0000000000..1848b3cf5e --- /dev/null +++ b/graphcore_dace/SkeletonForIpu.cpp @@ -0,0 +1,186 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +using ::std::map; +using ::std::vector; +using ::std::string; +using ::std::optional; + +using ::poplar::FLOAT; +using ::poplar::OptionFlags; +using ::poplar::Tensor; +using ::poplar::Graph; +using ::poplar::Engine; +using ::poplar::Device; +using ::poplar::DeviceManager; +using ::poplar::TargetType; +using ::poplar::program::Program; +using ::poplar::program::Sequence; +using ::poplar::program::Copy; +using ::poplar::program::Repeat; +using ::poplar::program::Execute; + + +const auto NUM_DATA_ITEMS = 10; +const auto HOW_MUCH_TO_ADD = 2.0f; +const auto NUM_TILES_IN_GC = 10; + + +auto getIpuDevice(const unsigned int numIpus = 1) -> optional { + DeviceManager manager = DeviceManager::createDeviceManager(); + optional device = std::nullopt; + for (auto &d : manager.getDevices(TargetType::IPU, numIpus)) { + std::cout << "Trying to attach to IPU " << d.getId(); + if (d.attach()) { + std::cout << " - attached" << std::endl; + device = {std::move(d)}; + break; + } else { + std::cout << std::endl << "Error attaching to device" << std::endl; + } + } + return device; +} + +auto createGraphAndAddCodelets(const optional &device) -> Graph { + auto graph = poplar::Graph(device->getTarget()); + + // Add our custom codelet, building from CPP source + // with the given popc compiler options + graph.addCodelets({"codelets/SkeletonCodelets.cpp"}, "-O3 -I codelets"); + + // Add the codelets for the popops librarys + popops::addCodelets(graph); + return graph; +} + +auto 
buildComputeGraph(Graph &graph, map &tensors, map &programs, const int numTiles) { + // Add tensors + tensors["data"] = graph.addVariable(poplar::FLOAT, {NUM_DATA_ITEMS}, "data"); + poputil::mapTensorLinearly(graph, tensors["data"]); + + + // Add programs and wire up data + const auto NumElemsPerTile = NUM_DATA_ITEMS / numTiles; + auto cs = graph.addComputeSet("loopBody"); + for (auto tileNum = 0; tileNum < numTiles; tileNum++) { + const auto sliceEnd = std::min((tileNum + 1) * NumElemsPerTile, (int) NUM_DATA_ITEMS); + const auto sliceStart = tileNum * NumElemsPerTile; + + auto v = graph.addVertex(cs, "SkeletonVertex", { + {"data", tensors["data"].slice(sliceStart, sliceEnd)} + }); + graph.setInitialValue(v["howMuchToAdd"], HOW_MUCH_TO_ADD); + // graph.setPerfEstimate(v, 100); // Ideally you'd get this as right as possible + graph.setTileMapping(v, tileNum); + } + auto executeIncrementVertex = Execute(cs); + + // auto mainProgram = Repeat(1, executeIncrementVertex, "repeat1x"); + programs["main"] = executeIncrementVertex; // Program 0 will be the main program +} + +auto defineDataStreams(Graph &graph, map &tensors, map &programs) { + auto toIpuStream = graph.addHostToDeviceFIFO("TO_IPU", FLOAT, NUM_DATA_ITEMS); + auto fromIpuStream = graph.addDeviceToHostFIFO("FROM_IPU", FLOAT, NUM_DATA_ITEMS); + + auto copyToIpuProgram = Copy(toIpuStream, tensors["data"]); + auto copyToHostProgram = Copy(tensors["data"], fromIpuStream); + + programs["copy_to_ipu"] = copyToIpuProgram; + programs["copy_to_host"] = copyToHostProgram; +} + +auto serializeGraph(const Graph &graph) { + std::ofstream graphSerOfs; + graphSerOfs.open("serialized_graph.capnp", std::ofstream::out | std::ofstream::trunc); + + graph.serialize(graphSerOfs, poplar::SerializationFormat::Binary); + graphSerOfs.close(); +} + +int main(int argc, char *argv[]) { + std::cout << "STEP 1: Connecting to an IPU device" << std::endl; + auto device = getIpuDevice(1); + if (!device.has_value()) { + std::cerr << "Could not attach to an IPU device. 
Aborting" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "STEP 2: Create graph and compile codelets" << std::endl; + auto graph = createGraphAndAddCodelets(device); + + + std::cout << "STEP 3: Building the compute graph" << std::endl; + auto tensors = map{}; + auto programs = map{}; + buildComputeGraph(graph, tensors, programs, NUM_TILES_IN_GC /* numTiles */); + + std::cout << "STEP 4: Define data streams" << std::endl; + defineDataStreams(graph, tensors, programs); + + std::cout << "STEP 5: Create engine and compile graph" << std::endl; + auto ENGINE_OPTIONS = OptionFlags{ + {"target.saveArchive", "archive.a"}, + {"autoReport.all", "true"}, + {"autoReport.outputSerializedGraph", "true"}, + }; + + auto programIds = map(); + auto programsList = vector(programs.size()); + int index = 0; + for (auto &nameToProgram: programs) { + programIds[nameToProgram.first] = index; + programsList[index] = nameToProgram.second; + index++; + } + auto engine = Engine(graph, programsList, ENGINE_OPTIONS); + + std::cout << "STEP 6: Load compiled graph onto the IPU tiles" << std::endl; + engine.load(*device); + engine.enableExecutionProfiling(); + + + std::cout << "STEP 7: Attach data streams" << std::endl; + auto hostData = vector(NUM_DATA_ITEMS, 1.0f); + // print before + std::cout << "\nBefore: "; + for (auto i = 0; i < NUM_DATA_ITEMS; i++) { + std::cout << hostData[i] << " "; + } + std::cout << "\nHow much to add: " << HOW_MUCH_TO_ADD << std::endl; + engine.connectStream("TO_IPU", hostData.data()); + engine.connectStream("FROM_IPU", hostData.data()); + + std::cout << "\nSTEP 8: Run programs" << std::endl; + engine.run(programIds["copy_to_ipu"]); // Copy to IPU + engine.run(programIds["main"]); // Main program + engine.run(programIds["copy_to_host"]); // Copy from IPU + + std::cout << "\nSTEP 9: Check results" << std::endl; + // print hostData to see the result + for (auto i = 0; i < NUM_DATA_ITEMS; i++) { + std::cout << hostData[i] << " "; + } + + + std::cout << "\nSTEP 10: Capture debug and profile info" << std::endl; + // serializeGraph(graph); + // engine.printProfileSummary(std::cout, + // OptionFlags{{"showExecutionSteps", "false"}}); + + return EXIT_SUCCESS; +} diff --git a/graphcore_dace/copy_a_b_skeletonIPU.cpp b/graphcore_dace/copy_a_b_skeletonIPU.cpp new file mode 100644 index 0000000000..9995547f33 --- /dev/null +++ b/graphcore_dace/copy_a_b_skeletonIPU.cpp @@ -0,0 +1,219 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +using namespace poplar; +using namespace poplar::program; + +using ::std::map; +using ::std::vector; +using ::std::string; +using ::std::optional; + +// using ::poplar::FLOAT; +// using ::poplar::OptionFlags; +// using ::poplar::Tensor; +// using ::poplar::Graph; +// using ::poplar::Engine; +// using ::poplar::Device; +// using ::poplar::DeviceManager; +// using ::poplar::TargetType; +// using ::poplar::program::Program; +// using ::poplar::program::Sequence; +// using ::poplar::program::Copy; +// using ::poplar::program::Repeat; +// using ::poplar::program::Execute; + + +const auto NUM_DATA_ITEMS = 1; + +auto getIpuDevice(const unsigned int numIpus = 1) -> optional { + DeviceManager manager = DeviceManager::createDeviceManager(); + optional device = std::nullopt; + for (auto &d : manager.getDevices(TargetType::IPU, numIpus)) { + std::cout << "Trying to attach to IPU " << d.getId(); + if (d.attach()) { + std::cout << " - attached" << std::endl; + device = {std::move(d)}; + break; 
+ } else { + std::cout << std::endl << "Error attaching to device" << std::endl; + } + } + return device; +} + +auto createGraphAndAddCodelets(const optional &device) -> Graph { + auto graph = poplar::Graph(device->getTarget()); + + // Add our custom codelet, building from CPP source + // with the given popc compiler options + // graph.addCodelets({"codelets/SkeletonCodelets.cpp"}, "-O3 -I codelets"); + + // Add the codelets for the popops librarys + // popops::addCodelets(graph); + return graph; +} + +auto buildComputeGraph(Graph &graph, map &tensors, map &programs) { + // Add tensors + tensors["v1"] = graph.addVariable(poplar::FLOAT, {NUM_DATA_ITEMS}, "v1"); + poputil::mapTensorLinearly(graph, tensors["v1"]); + + tensors["v2"] = graph.addVariable(poplar::FLOAT, {NUM_DATA_ITEMS}, "v2"); + poputil::mapTensorLinearly(graph, tensors["v2"]); // both v1 v2 will be on same tile + + // real magic happens here + auto copyprogram = Copy(tensors["v1"], tensors["v2"]); // tile to tile + programs["main"] = copyprogram; + + // print_before = program::PrintTensor("v1-debug", v1); + // programs["print_before"] = print_before; + + // print_after = program::PrintTensor("v2-debug", v2); + // programs["print_after"] = print_after; + +} + +auto defineDataStreams(Graph &graph, map &tensors, map &programs) { + auto toIpuStream = graph.addHostToDeviceFIFO("TO_IPU", FLOAT, NUM_DATA_ITEMS); + auto fromIpuStream = graph.addDeviceToHostFIFO("FROM_IPU", FLOAT, NUM_DATA_ITEMS); + + auto copyToIpuProgramv1 = Copy(toIpuStream, tensors["v1"]); // host->device + auto copyToIpuProgramv2 = Copy(toIpuStream, tensors["v2"]); + + // print these tensors + auto copyToHostProgramv1 = Copy(tensors["v1"], fromIpuStream); + auto copyToHostProgramv2 = Copy(tensors["v2"], fromIpuStream); // device->host + + // auto printit_v1 = PrintTensor("v1-debug", tensors["v1"]); + // auto printit_v2 = PrintTensor("v2-debug", tensors["v2"]); + // auto printit_v1_after = PrintTensor("v1-debug-after", tensors["v1"]); + // auto printit_v2_after = PrintTensor("v2-debug-after", tensors["v2"]); + // programs["print_v1_before"] = printit_v1; + // programs["print_v2_before"] = printit_v2; + // programs["print_v1_after"] = printit_v1_after; + // programs["print_v2_after"] = printit_v2_after; + + programs["copy_to_ipu_v1"] = copyToIpuProgramv1; + programs["copy_to_ipu_v2"] = copyToIpuProgramv2; + programs["copy_to_host_v1"] = copyToHostProgramv1; + programs["copy_to_host_v2"] = copyToHostProgramv2; + +} + +auto serializeGraph(const Graph &graph) { + std::ofstream graphSerOfs; + graphSerOfs.open("serialized_graph.capnp", std::ofstream::out | std::ofstream::trunc); + + graph.serialize(graphSerOfs, poplar::SerializationFormat::Binary); + graphSerOfs.close(); +} + +void print_data(std::vector &v1_host, std::vector& v2_host) { + std::cout << "v1: "; + for (auto i = 0; i < NUM_DATA_ITEMS; i++) { + std::cout << v1_host[i] << " "; + } + std::cout << std::endl; + std::cout << "v2: "; + for (auto i = 0; i < NUM_DATA_ITEMS; i++) { + std::cout << v2_host[i] << " "; + } + std::cout << std::endl; + +} + +int main(int argc, char *argv[]) { + std::cout << "STEP 1: Connecting to an IPU device" << std::endl; + auto device = getIpuDevice(1); + if (!device.has_value()) { + std::cerr << "Could not attach to an IPU device. 
Aborting" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "STEP 2: Create graph and compile codelets" << std::endl; + auto graph = createGraphAndAddCodelets(device); + + + std::cout << "STEP 3: Building the compute graph" << std::endl; + auto tensors = map{}; + auto programs = map{}; + buildComputeGraph(graph, tensors, programs); + + std::cout << "STEP 4: Define data streams" << std::endl; + defineDataStreams(graph, tensors, programs); + + std::cout << "STEP 5: Create engine and compile graph" << std::endl; + auto ENGINE_OPTIONS = OptionFlags{ + {"target.saveArchive", "archive.a"}, + {"autoReport.all", "true"}, + {"autoReport.outputSerializedGraph", "true"}, + }; + + auto programIds = map(); + auto programsList = vector(programs.size()); + int index = 0; + for (auto &nameToProgram: programs) { + programIds[nameToProgram.first] = index; + programsList[index] = nameToProgram.second; + index++; + } + auto engine = Engine(graph, programsList, ENGINE_OPTIONS); + + std::cout << "STEP 6: Load compiled graph onto the IPU tiles" << std::endl; + engine.load(*device); + engine.enableExecutionProfiling(); + + std::cout << "STEP 7: Attach data streams(host to device data)" << std::endl; + auto v1_host = vector(NUM_DATA_ITEMS, 100.0f); // v1 = 1 + auto v2_host = vector(NUM_DATA_ITEMS, 0.0f); // v2 = 0 + vector vector_stream_in; + vector_stream_in.insert(vector_stream_in.end(), v1_host.begin(), + v1_host.end()); + vector_stream_in.insert(vector_stream_in.end(), v2_host.begin(), v2_host.end()); + + auto v1_host_out = vector(NUM_DATA_ITEMS, 0.0f); // Output buffer for v1 + auto v2_host_out = vector(NUM_DATA_ITEMS, 0.0f); // Output buffer for v2 + vector vector_stream_out; + vector_stream_out.insert(vector_stream_out.end(), v1_host_out.begin(), v1_host_out.end()); + vector_stream_out.insert(vector_stream_out.end(), v2_host_out.begin(), v2_host_out.end()); + + // print before + std::cout << "\nBefore: \n"; + print_data(v1_host, v2_host); + + engine.connectStream("TO_IPU", vector_stream_in.data(), vector_stream_in.data() + vector_stream_in.size()); + engine.connectStream("FROM_IPU", vector_stream_out.data(), vector_stream_out.data() + vector_stream_out.size()); + + std::cout << "\nSTEP 8: Run programs" << std::endl; + engine.run(programIds["copy_to_ipu_v1"]); // Copy to IPU + engine.run(programIds["copy_to_ipu_v2"]); // Copy to IPU + // engine.run(programIds["print_v1_before"]); // Print v1 + // engine.run(programIds["print_v2_before"]); // Print v2 + engine.run(programIds["main"]); // Main program + // engine.run(programIds["print_v1_after"]); // Print v1 + // engine.run(programIds["print_v2_after"]); // Print v2 + engine.run(programIds["copy_to_host_v1"]); // Copy from IPU + engine.run(programIds["copy_to_host_v2"]); // Copy from IPU + + std::cout << "\nSTEP 9: Check results after\n" << std::endl; + v1_host_out.assign(vector_stream_out.begin(), vector_stream_out.begin() + NUM_DATA_ITEMS); + v2_host_out.assign(vector_stream_out.begin() + NUM_DATA_ITEMS, vector_stream_out.end()); + print_data(v1_host_out, v2_host_out); + + return EXIT_SUCCESS; +} + diff --git a/graphcore_dace/copy_a_to_b.py b/graphcore_dace/copy_a_to_b.py new file mode 100644 index 0000000000..294cb5f6eb --- /dev/null +++ b/graphcore_dace/copy_a_to_b.py @@ -0,0 +1,45 @@ +import dace +import numpy as np + + +def copy_a_to_b(): + # Define the SDFG + sdfg = dace.SDFG('copy_a_to_b') + + # Add arrays + sdfg.add_array('A', [1], dace.float64, storage=dace.StorageType.IPU_Memory) + sdfg.add_array('C', [1], dace.float64, 
storage=dace.StorageType.IPU_Memory) + + # Add state + state = sdfg.add_state('compute_state') + + # Add read and write nodes + A_read = state.add_read('A') + C_write = state.add_write('C') + + # add edge + state.add_edge(A_read, None, C_write, None, dace.Memlet('A[0] -> C[0]')) + + ############################################################### + # Runtime code + # Initialize data + A = np.ones(1, dtype=np.float64) + C = np.zeros(1, dtype=np.float64) + + # PRINT BEFORE + print("\nBefore") + print("A:", A) + print("C:", C) + + # Run the SDFG + sdfg(A=A, C=C) + + # Print the result + print ("\nAfter") + print("A:", A) + print("C:", C) + + ############################################################### +if __name__ == "__main__": + copy_a_to_b() + diff --git a/graphcore_dace/handcrafted_sdfg_scalar_add.py b/graphcore_dace/handcrafted_sdfg_scalar_add.py new file mode 100644 index 0000000000..5eb745c000 --- /dev/null +++ b/graphcore_dace/handcrafted_sdfg_scalar_add.py @@ -0,0 +1,573 @@ +import dace +import numpy as np +from dace.transformation.interstate.gpu_transform_sdfg import GPUTransformSDFG + + +# SDFG APIs +def handcrafted_sdfg_scalar_add(): + sdfg = dace.SDFG('handcrafted_sdfg') + + + ################OTHER################ + # other data nodes + sdfg.add_symbol('Symbol', dace.int64) # symbol can't be added in a state + sdfg.add_constant('constant_bool', True) # constant + + ################DEPRICATED################ + # data(scalar, symbol, array, constant, stream, transient) - everything is depricated + sdfg.add_array('Array_normal', [2, 2], dace.int64, storage=dace.StorageType.Default, + transient=False) # normal array + sdfg.add_array('Array_transient', [2, 2], dace.int64, storage=dace.StorageType.Default, + transient=True) #Transiant + sdfg.add_array('Array_onGPU', [2, 2], dace.int64, storage=dace.StorageType.GPU_Global, + transient=False) #on GPU + # sdfg.add_stream('stream', dace.float32, transient=True, buffer_size=10) # stream + # sdfg.add_transient('transient', [2, 2], dace.int64) # transient + # sdfg.add_scalar('a_scalar', dace.int32) + + + ############################################ + root = sdfg.add_state('top_level_state', is_start_block=True, is_start_state=True) + ################USE THIS################ + A = root.add_access("Array_normal") + B = root.add_access("Array_transient") + C = root.add_access("Array_onGPU") + # D = root.add_access("stream") + # E = root.add_access("transient") + # F = root.add_access("a_scalar") + + ################MEMLET################ + + + # # state + middle = sdfg.add_state('middle_level_state', is_start_block=False, is_start_state=False) + exit = sdfg.add_state('bottom_level_state', is_start_block=False, is_start_state=False) + + # cfg + sdfg.add_edge(root, middle, dace.InterstateEdge()) + sdfg.add_edge(middle, exit, dace.InterstateEdge()) + + # dfg edges/ Memlets + root.add_nedge(A, B, dace.Memlet("Array_normal[0]")) + root.add_edge(B, None, C, None, dace.Memlet("Array_transient[0]")) + print("Total edges", root.number_of_edges()) + # root.add_nedge(A, middle, dace.Memlet("Array_normal[0]")) + + # # tasklet + # tasklet = root.add_tasklet('add', {'tmp_A', 'tmp_B'}, {'tmp_C'}, 'tmp_C = tmp_A + tmp_B', language=dace.Language.Python) + + # # edges inside DFG/Memlet + # root.add_edge(A, None, tasklet, "tmp_A", dace.Memlet("A[0]")) + # root.add_edge(B, None, tasklet, "tmp_B", dace.Memlet("B[0]")) + # root.add_edge(tasklet, "tmp_C", C, None, dace.Memlet("C[0]")) + + sdfg() # uncomment for upstream dace to codegen + code = 
sdfg.generate_code()[0].clean_code + +def structure(): + sdfg = dace.SDFG('structure') + state = sdfg.add_state('state') + + sdfg() + code = sdfg.generate_code()[0].clean_code + +def vector_add(): + sdfg = dace.SDFG('vector_add') + #########GLOBAL VARIABLES######### + # # data(vector add) + sdfg.add_array('A', [10], dace.float64) + sdfg.add_array('B', [10], dace.float64) + sdfg.add_array('C', [10], dace.float64) + + ###########STATE, CFG, GLOBAL DATA################ + # # add state + state = sdfg.add_state('sum', is_start_block=True) + a = state.add_read('A') + b = state.add_read('B') + c = state.add_write('C') + + ###########DFG################ + # Add nodes + # # map + add_entry, add_exit = state.add_map('add_map', dict(i='0:10'), schedule=dace.ScheduleType.Sequential) + # # tasklet + t1 = state.add_tasklet('add_scalar', {'_a', '_b'}, {'_c'}, '_c = _a + _b') + + # Add add_edge_pair(map mostly) + state.add_edge_pair(add_entry, t1, a, dace.Memlet.simple(a, 'i'), internal_connector='_a') + state.add_edge_pair(add_entry, t1, b, dace.Memlet.simple(b, 'i'), internal_connector='_b') + state.add_edge_pair(add_exit, t1, c, dace.Memlet.simple(c, 'i'), internal_connector='_c') + + ###########CODEGEN################ + + A = np.random.rand(10) + B = np.random.rand(10) + C = np.zeros(10) + + print(A) + print(B) + print(C) + sdfg(A, B, C) + print(C) + +def gpu_accessnode_test(): + sdfg = dace.SDFG('gpu_accessnode_test') + #########GLOBAL VARIABLES######### + + # sdfg.add_scalar("scalarNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode1", dace.bool, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode2", dace.int32, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode3", dace.int64, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode4", dace.uint8, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode5", dace.uint64, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode6", dace.float16, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode7", dace.float32, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode8", dace.string, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_scalar("scalarNode9", dace.int8, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("write_to_scalar", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + + sdfg.add_array("arrayNode", [10], dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + # sdfg.add_stream("StreamNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + + # sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + # sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + # sdfg.add_constant('constant', 1) + + + # ###########STATE, CFG, GLOBAL DATA################ + # # # add state + state = sdfg.add_state('sum', is_start_block=True) + + # scalar_read = state.add_read('scalarNode') + # scalar_read1 = state.add_read('scalarNode1') + # scalar_read2 = state.add_read('scalarNode2') + # scalar_read3 = state.add_read('scalarNode3') + # scalar_read4 = state.add_read('scalarNode4') + # scalar_read5 = state.add_read('scalarNode5') + # scalar_read6 = state.add_read('scalarNode6') + # scalar_read7 = state.add_read('scalarNode7') + # 
scalar_read8 = state.add_read('scalarNode8') + # scalar_read9 = state.add_read('scalarNode9') + scalar_write = state.add_write('write_to_scalar') + array_ = state.add_read('arrayNode') + # stream_ = state.add_read('StreamNode') + + + + + # b = state.add_read('B_scalar') + # c = state.add_write('C_scalar') + # state.add_edge(scalar_read, None, scalar_write, None, dace.Memlet(f"scalarNode[0]")) + # state.add_edge(scalar_read1, None, scalar_write, None, dace.Memlet(f"scalarNode1[0]")) + # state.add_edge(scalar_read2, None, scalar_write, None, dace.Memlet(f"scalarNode2[0]")) + # state.add_edge(scalar_read3, None, scalar_write, None, dace.Memlet(f"scalarNode3[0]")) + # state.add_edge(scalar_read4, None, scalar_write, None, dace.Memlet(f"scalarNode4[0]")) + # state.add_edge(scalar_read5, None, scalar_write, None, dace.Memlet(f"scalarNode5[0]")) + # state.add_edge(scalar_read6, None, scalar_write, None, dace.Memlet(f"scalarNode6[0]")) + # state.add_edge(scalar_read7, None, scalar_write, None, dace.Memlet(f"scalarNode7[0]")) + # state.add_edge(scalar_read8, None, scalar_write, None, dace.Memlet(f"scalarNode8[0]")) + # state.add_edge(scalar_read9, None, scalar_write, None, dace.Memlet(f"scalarNode9[0]")) + state.add_edge(array_, None, scalar_write, None, dace.Memlet(f"arrayNode[0]")) + # state.add_edge(stream_, None, scalar_write, None, dace.Memlet(f"StreamNode[0]")) + + + ###########CODEGEN################ + A = np.random.rand(1) + B = np.random.rand(1) + C = np.zeros(1) + print(A) + print(B) + print("Before", C) + sdfg = sdfg(A) + sdfg.apply_transformations(GPUTransformSDFG) + print("After", C) + + +def gpu_scalar_add(): + sdfg = dace.SDFG('gpu_scalar_add') + #########GLOBAL VARIABLES######### + + sdfg.add_scalar("scalarNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode1", dace.bool, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode2", dace.int32, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode3", dace.int64, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode4", dace.uint8, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode5", dace.uint64, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode6", dace.float16, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode7", dace.float32, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode8", dace.string, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("scalarNode9", dace.int8, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_scalar("write_to_scalar", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + + sdfg.add_array("arrayNode", [10], dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + sdfg.add_stream("StreamNode", dace.float64, storage=dace.StorageType.IPU_Memory, transient=True) + + # sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + # sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.GPU_Global, transient=False) + # sdfg.add_constant('constant', 1) + + + # ###########STATE, CFG, GLOBAL DATA################ + # # # add state + state = sdfg.add_state('sum', is_start_block=True) + + scalar_read = state.add_read('scalarNode') + scalar_read1 = state.add_read('scalarNode1') + scalar_read2 = state.add_read('scalarNode2') + scalar_read3 = 
state.add_read('scalarNode3') + scalar_read4 = state.add_read('scalarNode4') + scalar_read5 = state.add_read('scalarNode5') + scalar_read6 = state.add_read('scalarNode6') + scalar_read7 = state.add_read('scalarNode7') + scalar_read8 = state.add_read('scalarNode8') + scalar_read9 = state.add_read('scalarNode9') + scalar_write = state.add_write('write_to_scalar') + array_ = state.add_read('arrayNode') + stream_ = state.add_read('StreamNode') + + + + + # b = state.add_read('B_scalar') + # c = state.add_write('C_scalar') + state.add_edge(scalar_read, None, scalar_write, None, dace.Memlet(f"scalarNode[0]")) + state.add_edge(scalar_read1, None, scalar_write, None, dace.Memlet(f"scalarNode1[0]")) + state.add_edge(scalar_read2, None, scalar_write, None, dace.Memlet(f"scalarNode2[0]")) + state.add_edge(scalar_read3, None, scalar_write, None, dace.Memlet(f"scalarNode3[0]")) + state.add_edge(scalar_read4, None, scalar_write, None, dace.Memlet(f"scalarNode4[0]")) + state.add_edge(scalar_read5, None, scalar_write, None, dace.Memlet(f"scalarNode5[0]")) + state.add_edge(scalar_read6, None, scalar_write, None, dace.Memlet(f"scalarNode6[0]")) + state.add_edge(scalar_read7, None, scalar_write, None, dace.Memlet(f"scalarNode7[0]")) + state.add_edge(scalar_read8, None, scalar_write, None, dace.Memlet(f"scalarNode8[0]")) + state.add_edge(scalar_read9, None, scalar_write, None, dace.Memlet(f"scalarNode9[0]")) + state.add_edge(array_, None, scalar_write, None, dace.Memlet(f"arrayNode[0]")) + state.add_edge(stream_, None, scalar_write, None, dace.Memlet(f"StreamNode[0]")) + + + # ###########DFG################ + # # Add nodes + # # # map + # # add_entry, add_exit = state.add_map('add_map', dict(i='0:31'), schedule=dace.ScheduleType.Default) + # # # tasklet + # t1 = state.add_tasklet('add_scalar', {'_a', '_b'}, {'_c'}, '_c = _a + _b') + + # Add add_edge_pair(map mostly) + # state.add_edge_pair(add_entry, t1, a, dace.Memlet.simple(a, 'i')) + # state.add_edge_pair(add_entry, t1, b, dace.Memlet.simple(b, 'i')) + # state.add_edge_pair(add_exit, t1, c, dace.Memlet.simple(c, 'i')) + + # # Add memlet_path + # state.add_memlet_path(a, t1, dst_conn='_a', memlet=dace.Memlet(f"A[i]")) + # state.add_memlet_path(b, t1, dst_conn='_b', memlet=dace.Memlet(f"B[i]")) + # state.add_memlet_path(t1, c, src_conn='_c', memlet=dace.Memlet(f"C[i]")) + + # # just add_edge + # state.add_edge(a, None, t1, '_a', dace.Memlet(f"A_scalar")) + # state.add_edge(b, None, t1, '_b', dace.Memlet(f"B_scalar")) + # state.add_edge(t1, '_c', c, None, dace.Memlet(f"C_scalar")) + + + # state.add_edge(a, None, t1, '_a', dace.Memlet(f"A[0]")) + # state.add_edge(b, None, t1, '_b', dace.Memlet(f"B[0]")) + # state.add_edge(t1, '_c', c, None, dace.Memlet(f"C[0]")) + + ###########CODEGEN################ + A = np.random.rand(1) + B = np.random.rand(1) + C = np.zeros(1) + print(A) + print(B) + print("Before", C) + sdfg = sdfg(A) + sdfg.apply_transformations(GPUTransformSDFG) + print("After", C) + +def cpu_scalar_add(): + sdfg = dace.SDFG('cpu_scalar_add') + #########GLOBAL VARIABLES######### + # # data(vector add) + + # sdfg.add_array('A', [1], dace.float64) + # sdfg.add_array('B', [1], dace.float64) + # sdfg.add_array('C', [1], dace.float64) + sdfg.add_scalar("A_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) + sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) + sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.Default, transient=False) + sdfg.add_constant('constant', 1) + + + 
+def cpu_scalar_add():
+    sdfg = dace.SDFG('cpu_scalar_add')
+    ######### GLOBAL VARIABLES #########
+    # # data (vector add)
+    # sdfg.add_array('A', [1], dace.float64)
+    # sdfg.add_array('B', [1], dace.float64)
+    # sdfg.add_array('C', [1], dace.float64)
+    sdfg.add_scalar("A_scalar", dace.float64, storage=dace.StorageType.Default, transient=False)
+    sdfg.add_scalar("B_scalar", dace.float64, storage=dace.StorageType.Default, transient=False)
+    sdfg.add_scalar("C_scalar", dace.float64, storage=dace.StorageType.Default, transient=False)
+    sdfg.add_constant('constant', 1)
+
+    ########### STATE, CFG, GLOBAL DATA ################
+    # add state
+    state = sdfg.add_state('sum', is_start_block=True)
+    a = state.add_read('A_scalar')
+    b = state.add_read('B_scalar')
+    c = state.add_write('C_scalar')
+
+    ########### DFG ################
+    # Add nodes
+    # # map
+    # add_entry, add_exit = state.add_map('add_map', dict(i='0:31'), schedule=dace.ScheduleType.Default)
+    # # tasklet
+    t1 = state.add_tasklet('add_scalar', {'_a', '_b'}, {'_c'}, '_c = _a + _b')
+
+    # Add add_edge_pair (mostly for maps)
+    # state.add_edge_pair(add_entry, t1, a, dace.Memlet.simple(a, 'i'))
+    # state.add_edge_pair(add_entry, t1, b, dace.Memlet.simple(b, 'i'))
+    # state.add_edge_pair(add_exit, t1, c, dace.Memlet.simple(c, 'i'))
+
+    # # Add memlet_path
+    # state.add_memlet_path(a, t1, dst_conn='_a', memlet=dace.Memlet("A[i]"))
+    # state.add_memlet_path(b, t1, dst_conn='_b', memlet=dace.Memlet("B[i]"))
+    # state.add_memlet_path(t1, c, src_conn='_c', memlet=dace.Memlet("C[i]"))
+
+    # just add_edge
+    state.add_edge(a, None, t1, '_a', dace.Memlet("A_scalar"))
+    state.add_edge(b, None, t1, '_b', dace.Memlet("B_scalar"))
+    state.add_edge(t1, '_c', c, None, dace.Memlet("C_scalar"))
+
+    # state.add_edge(a, None, t1, '_a', dace.Memlet("A[0]"))
+    # state.add_edge(b, None, t1, '_b', dace.Memlet("B[0]"))
+    # state.add_edge(t1, '_c', c, None, dace.Memlet("C[0]"))
+
+    ########### CODEGEN ################
+    A = np.float64(np.random.rand())
+    B = np.float64(np.random.rand())
+    C = np.float64(0)
+    print(A)
+    print(B)
+    print("Before", C)
+    # Raw SDFGs are called with keyword arguments named after their data
+    # containers, and the call executes the graph (it does not return a new
+    # SDFG). Scalar outputs are passed by value, so shape-[1] arrays (the
+    # commented variant above) are the more reliable output pattern.
+    sdfg(A_scalar=A, B_scalar=B, C_scalar=C)
+    print("After", C)
+
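+
+# For comparison, a hedged sketch of the same scalar addition through the
+# Python frontend; dace.program lowers this to an equivalent tasklet and
+# returns the result instead of writing through a scalar argument. The name
+# scalar_add_program is illustrative, not part of the original file.
+@dace.program
+def scalar_add_program(A: dace.float64, B: dace.float64):
+    return A + B
+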
+def only_state():
+    sdfg = dace.SDFG('only_state')
+    sdfg.add_constant('constant_variable', 1)
+    sdfg.add_symbol('symbol_variable', dace.int64)
+    sdfg.add_array('A_array', [1], dace.float64)  # , storage=dace.StorageType.IPU_Tile_Local, transient=False)
+    sdfg.add_array('B_array', [1], dace.float64)
+    sdfg.add_array('C_array', [1], dace.float64)
+    state1 = sdfg.add_state('state1', is_start_block=True)
+    a = state1.add_read('A_array')
+    b = state1.add_read('B_array')
+    c = state1.add_write('C_array')
+    t = state1.add_tasklet('add', {'a', 'b'}, {'c'}, 'c = a + b')
+    state1.add_edge(a, None, t, 'a', dace.Memlet('A_array[0]'))
+    state1.add_edge(b, None, t, 'b', dace.Memlet('B_array[0]'))
+    state1.add_edge(t, 'c', c, None, dace.Memlet('C_array[0]'))
+
+    # state2 = sdfg.add_state('state2')
+    # state3 = sdfg.add_state('state3')
+    # state4 = sdfg.add_state('state4')
+
+    # # cfg/program::sequential
+    # sdfg.add_edge(state1, state2, dace.InterstateEdge())
+    # sdfg.add_edge(state2, state3, dace.InterstateEdge())
+    # sdfg.add_edge(state3, state4, dace.InterstateEdge())
+
+    # Define the data locally; the original relied on globals from __main__.
+    A = np.random.rand(1)
+    B = np.random.rand(1)
+    C = np.zeros(1)
+    sdfg(A_array=A, B_array=B, C_array=C)
+
+
+#### Python
+def add(A, B, C):
+    # Write through the slice so the caller's array is updated; a plain
+    # `C = A + B` would only rebind the local name.
+    C[:] = A + B
+
+
+def allocate_data(sdfg):
+    # data
+    sdfg.add_array('A',
+                   shape=[20],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.CPU_Heap,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=20)
+    sdfg.add_array('B',
+                   shape=[20],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.CPU_Heap,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=20)
+    # Add a C array
+    sdfg.add_array('C',
+                   shape=[20],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.CPU_Heap,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=20)
+
+    # add a _tmp1 access node: transient, shape 1, dtype int32
+    sdfg.add_array('_tmp1',
+                   shape=[1],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.Register,
+                   location=None,
+                   transient=True,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=1)
+
+    # me, mx = state.add_map('outer', dict(i='0:2'))
+    # nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'a'}, {'b'})
+    # state.add_memlet_path(rnode, me, nsdfg_node, dst_conn='a', memlet=dace.Memlet.simple('A', 'i'))
+    # state.add_memlet_path(nsdfg_node, mx, wnode, src_conn='b', memlet=dace.Memlet.simple('A', 'i'))
+
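+
+# Hedged aside: the long-form add_array calls above spell out every default
+# explicitly. For this case the standard shorthand is equivalent (add_array
+# defaults and add_transient are regular DaCe API); the function name is
+# illustrative only.
+def allocate_data_short(sdfg: dace.SDFG):
+    for name in ('A', 'B', 'C'):
+        sdfg.add_array(name, [20], dace.int32, storage=dace.StorageType.CPU_Heap)
+    sdfg.add_transient('_tmp1', [1], dace.int32, storage=dace.StorageType.Register)
+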
+def gpu_vector_add_python_copy():
+
+    # # add a _tmp1_outer access node: transient, shape 1, dtype int32
+    # sdfg.add_array('_tmp1_outer',
+    #                shape=[1],
+    #                dtype=dace.int32,
+    #                storage=dace.StorageType.Register,
+    #                location=None,
+    #                transient=True,
+    #                strides=[1],
+    #                offset=[0],
+    #                lifetime=dace.AllocationLifetime.Scope,
+    #                debuginfo=None, total_size=1)
+    def nested() -> dace.SDFG:
+        # Inner SDFG
+        nsdfg = dace.SDFG('nested')
+        nsdfg.add_array('a', [1], dace.int32)
+        nsdfg.add_array('b', [1], dace.int32)
+        nsdfg.add_array('c', [1], dace.int32)
+        nsdfg.add_transient('t', [1], dace.int32)
+
+        # init state
+        ninitstate = nsdfg.add_state()
+        # a,b -> t state
+        nstate = nsdfg.add_state()
+        irnode = nstate.add_read('a')
+        irnodeb = nstate.add_read('b')
+        task = nstate.add_tasklet('t1', {'inp1', 'inp2'}, {'out'}, 'out = inp1 + inp2')
+        iwnode = nstate.add_write('t')
+        nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('a', '0'))
+        nstate.add_edge(irnodeb, None, task, 'inp2', dace.Memlet.simple('b', '0'))
+        nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('t', '0'))
+
+        # t -> c state
+        first_state = nstate
+        nstate = nsdfg.add_state()
+        irnode = nstate.add_read('t')
+        task = nstate.add_tasklet('t2', {'inp1'}, {'out1'}, 'out1 = inp1')
+        iwnode = nstate.add_write('c')
+        nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('t', '0'))
+        nstate.add_edge(task, 'out1', iwnode, None, dace.Memlet.simple('c', '0'))
+
+        nsdfg.add_edge(ninitstate, first_state, dace.InterstateEdge())
+        nsdfg.add_edge(first_state, nstate, dace.InterstateEdge())
+        return nsdfg
+
+    ###############################################################
+    # Outer SDFG
+    sdfg = dace.SDFG('gpu_vector_add_python_copy')
+    # data
+    sdfg.add_array('A_outer',
+                   shape=[20],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.CPU_Heap,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=20)
+    sdfg.add_array('B_outer',
+                   shape=[20],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.CPU_Heap,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=20)
+    # Add a C array
+    sdfg.add_array('C_outer',
+                   shape=[20],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.CPU_Heap,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=20)
+
+    sdfg.add_symbol('i', dace.int32)
+
+    # State machine
+    initstate = sdfg.add_state("init")
+    state = sdfg.add_state()
+    rnode = state.add_read('A_outer')
+    rnodeb = state.add_read('B_outer')
+    wnode = state.add_write('C_outer')
+    me, mx = state.add_map('map_parallelizn', dict(i='0:20'))
+    nsdfg_node = state.add_nested_sdfg(nested(), None, {'a', 'b'}, {'c'}, schedule=dace.ScheduleType.Sequential)
+    state.add_memlet_path(rnode, me, nsdfg_node, dst_conn='a', memlet=dace.Memlet.simple('A_outer', 'i'))
+    state.add_memlet_path(rnodeb, me, nsdfg_node, dst_conn='b', memlet=dace.Memlet.simple('B_outer', 'i'))
+    state.add_memlet_path(nsdfg_node, mx, wnode, src_conn='c', memlet=dace.Memlet.simple('C_outer', 'i'))
+
+    # add state edges
+    sdfg.add_edge(initstate, state, dace.InterstateEdge())
+
+    ########### CODEGEN ################
+    # Inputs must match the declared int32 dtype of the outer arrays (the
+    # original fed float64 from np.random.rand).
+    A = np.random.randint(0, 100, 20).astype(np.int32)
+    B = np.random.randint(0, 100, 20).astype(np.int32)
+    C = np.zeros(20, dtype=np.int32)
+    print("A Values:", A)
+    print("B Values:", B)
+    print("C Values:", C)
+
+    # Calling the SDFG executes it; it does not return a new SDFG.
+    sdfg(A_outer=A, B_outer=B, C_outer=C)
+    print("C after:", C)
+
+
+# def gpu_vec_add_python():
+
+#     @dace.program
+#     def gpu_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]):
+#         for i in dace.map[0:20]:  # parallelization construct
+#             C[i] = A[i] + B[i]
+
+#     sdfg = gpu_vector_add.to_sdfg(simplify=False)  # SDFG, compiled on first call
+#     sdfg.apply_transformations(GPUTransformSDFG)
+
+#     # call with values
+#     A = np.ones((20), dtype=np.int32)   # 1,1,1,1,...
+#     B = np.ones((20), dtype=np.int32)   # 1,1,1,1,...
+#     C = np.zeros((20), dtype=np.int32)  # 0,0,0,0,...
+#     sdfg(A, B, C)
+
+# main
+if __name__ == "__main__":
+    # handcrafted_sdfg_scalar_add()
+    # structure()
+    # add a, b, c values
+    A = np.random.rand(1)
+    B = np.random.rand(1)
+    C = np.zeros(1)
+    # print(A)
+    # print(B)
+    # add(A, B, C)
+    # print(C)
+    # only_state()
+    # print(C)
+    # vector_add()
+    # gpu_scalar_add()
+    gpu_accessnode_test()
+    # gpu_vector_add_python_copy()
diff --git a/graphcore_dace/ipu_test.py b/graphcore_dace/ipu_test.py
new file mode 100644
index 0000000000..29da3c9a41
--- /dev/null
+++ b/graphcore_dace/ipu_test.py
@@ -0,0 +1,192 @@
+import dace
+import numpy as np
+from dace.transformation.interstate.gpu_transform_sdfg import GPUTransformSDFG
+
+
+def nested() -> dace.SDFG:
+    # Inner SDFG
+    nsdfg = dace.SDFG('nested')
+    nsdfg.add_array('a', [1], dace.int32)
+    nsdfg.add_array('b', [1], dace.int32)
+    nsdfg.add_array('c', [1], dace.int32)
+    nsdfg.add_transient('t', [1], dace.int32)
+
+    # init state
+    ninitstate = nsdfg.add_state()
+    # a,b -> t state
+    nstate = nsdfg.add_state()
+    irnode = nstate.add_read('a')
+    irnodeb = nstate.add_read('b')
+    task = nstate.add_tasklet('t1', {'inp1', 'inp2'}, {'out'}, 'out = inp1 + inp2')
+    iwnode = nstate.add_write('t')
+    nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('a', '0'))
+    nstate.add_edge(irnodeb, None, task, 'inp2', dace.Memlet.simple('b', '0'))
+    nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('t', '0'))
+
+    # t -> c state
+    first_state = nstate
+    nstate = nsdfg.add_state()
+    irnode = nstate.add_read('t')
+    task = nstate.add_tasklet('t2', {'inp1'}, {'out1'}, 'out1 = inp1')
+    iwnode = nstate.add_write('c')
+    nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('t', '0'))
+    nstate.add_edge(task, 'out1', iwnode, None, dace.Memlet.simple('c', '0'))
+
+    nsdfg.add_edge(ninitstate, first_state, dace.InterstateEdge())
+    nsdfg.add_edge(first_state, nstate, dace.InterstateEdge())
+
+    return nsdfg
+
+def ipu_vector_add_python_copy():
+
+    ###############################################################
+    # Outer SDFG (renamed from the copied GPU variant)
+    sdfg = dace.SDFG('ipu_vector_add_python_copy')
+    # data
+    sdfg.add_array('A_outer',
+                   shape=[20],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.IPU_Memory,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=20)
+    sdfg.add_array('B_outer',
+                   shape=[20],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.IPU_Memory,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=20)
+    # Add a C array
+    sdfg.add_array('C_outer',
+                   shape=[20],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.IPU_Memory,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=20)
+
+    sdfg.add_symbol('i', dace.int32)
+
+    # State machine
+    initstate = sdfg.add_state("init")
+    state = sdfg.add_state()
+    rnode = state.add_read('A_outer')
+    rnodeb = state.add_read('B_outer')
+    wnode = state.add_write('C_outer')
+    me, mx = state.add_map('map_parallelizn', dict(i='0:20'))  # , schedule=dace.ScheduleType.IPU_SCHEDULE)
+    nsdfg_node = state.add_nested_sdfg(nested(), None, {'a', 'b'}, {'c'}, schedule=dace.ScheduleType.Sequential)
+    state.add_memlet_path(rnode, me, nsdfg_node, dst_conn='a', memlet=dace.Memlet.simple('A_outer', 'i'))
+    state.add_memlet_path(rnodeb, me, nsdfg_node, dst_conn='b', memlet=dace.Memlet.simple('B_outer', 'i'))
+    state.add_memlet_path(nsdfg_node, mx, wnode, src_conn='c', memlet=dace.Memlet.simple('C_outer', 'i'))
+
+    # add state edges
+    sdfg.add_edge(initstate, state, dace.InterstateEdge())
+
+    ########### CODEGEN ################
+    # Inputs must match the declared int32 dtype; note that running this end
+    # to end assumes the experimental IPU storage type is supported by the
+    # active DaCe build.
+    A = np.random.randint(0, 100, 20).astype(np.int32)
+    B = np.random.randint(0, 100, 20).astype(np.int32)
+    C = np.zeros(20, dtype=np.int32)
+    print("A Values:", A)
+    print("B Values:", B)
+    print("C Values:", C)
+
+    sdfg(A_outer=A, B_outer=B, C_outer=C)
+
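+
+# Hedged helper using only standard SDFG API (no IPU assumptions): save the
+# graph to disk so it can be inspected in the DaCe viewer before attempting
+# IPU-specific code generation. The helper name and path are illustrative.
+def save_for_inspection(sdfg: dace.SDFG, path: str = 'ipu_vector_add.sdfg'):
+    sdfg.save(path)
+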
+def ipu_test1():
+    nsdfg = dace.SDFG('ipu_test1')
+    # data
+    nsdfg.add_array('a',
+                    shape=[1],
+                    dtype=dace.int32,
+                    storage=dace.StorageType.IPU_Memory,
+                    location=None,
+                    transient=True,
+                    strides=[1],
+                    offset=[0],
+                    lifetime=dace.AllocationLifetime.Scope,
+                    debuginfo=None, total_size=1)
+    nsdfg.add_array('b',
+                    shape=[1],
+                    dtype=dace.int32,
+                    storage=dace.StorageType.IPU_Memory,
+                    location=None,
+                    transient=True,
+                    strides=[1],
+                    offset=[0],
+                    lifetime=dace.AllocationLifetime.Scope,
+                    debuginfo=None, total_size=1)
+    # # Add a C array
+    # nsdfg.add_array('c',
+    #                 shape=[1],
+    #                 dtype=dace.int32,
+    #                 storage=dace.StorageType.IPU_Memory,
+    #                 location=None,
+    #                 transient=True,
+    #                 strides=[1],
+    #                 offset=[0],
+    #                 lifetime=dace.AllocationLifetime.State,
+    #                 debuginfo=None, total_size=1)
+
+    nsdfg.add_symbol('i', dace.int32)
+    # nsdfg.add_transient('t', [1], dace.int32)
+    nsdfg.add_array('t',
+                    shape=[1],
+                    dtype=dace.int32,
+                    storage=dace.StorageType.IPU_Memory,
+                    location=None,
+                    transient=True,
+                    strides=[1],
+                    offset=[0],
+                    lifetime=dace.AllocationLifetime.Scope,
+                    debuginfo=None, total_size=1)
+
+    # init state
+    ninitstate = nsdfg.add_state()
+    # a,b -> t state
+    nstate = nsdfg.add_state()
+    irnode = nstate.add_read('a')
+    irnodeb = nstate.add_read('b')
+    task = nstate.add_tasklet('t1', {'inp1', 'inp2'}, {'out'}, 'out = inp1 + inp2')
+    iwnode = nstate.add_write('t')
+    nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('a', '0'))
+    nstate.add_edge(irnodeb, None, task, 'inp2', dace.Memlet.simple('b', '0'))
+    nstate.add_edge(task, 'out', iwnode, None, dace.Memlet.simple('t', '0'))
+
+    # t -> c state
+    first_state = nstate
+    # nstate = nsdfg.add_state()
+    # irnode = nstate.add_read('t')
+    # task = nstate.add_tasklet('t2', {'inp1'}, {'out1'}, 'out1 = inp1')
+    # iwnode = nstate.add_write('c')
+    # nstate.add_edge(irnode, None, task, 'inp1', dace.Memlet.simple('t', '0'))
+    # nstate.add_edge(task, 'out1', iwnode, None, dace.Memlet.simple('c', '0'))
+
+    nsdfg.add_edge(ninitstate, first_state, dace.InterstateEdge())
+    # nsdfg.add_edge(first_state, nstate, dace.InterstateEdge())
+
+    ########### CODEGEN ################
+    # All containers here are transient, so the SDFG takes no arguments; the
+    # original called nsdfg(A, B, C), which executes the graph and returns
+    # result values rather than an SDFG. generate_code() is called on the
+    # SDFG object itself and has no recompile parameter.
+    # codeobjects = nsdfg.generate_code()
+    code = nsdfg.generate_code()[0].clean_code
+    print(code)
+
+
+# main
+if __name__ == "__main__":
+    ipu_test1()
+    # nested()
+    # ipu_vector_add_python_copy()
+
diff --git a/graphcore_dace/scalar_1_add_constant.py b/graphcore_dace/scalar_1_add_constant.py
new file mode 100644
index 0000000000..2e9bf58788
--- /dev/null
+++ b/graphcore_dace/scalar_1_add_constant.py
@@ -0,0 +1,49 @@
+import dace
+import numpy as np
+
+
+def array_add_constant_sdfg():
+    # Define the SDFG
+    sdfg = dace.SDFG('array_add_constant_sdfg')
+
+    # Add arrays
+    sdfg.add_array('A', [1], dace.float64, storage=dace.StorageType.IPU_Memory, transient=False)
+    sdfg.add_array('B', [1], dace.float64, storage=dace.StorageType.IPU_Memory, transient=False)
+    sdfg.add_array('C', [1], dace.float64, storage=dace.StorageType.IPU_Memory, transient=False)
+
+    # Add state
+    state = sdfg.add_state('compute_state')
+
+    # Add read and write nodes
+    A_read = state.add_read('A')
+    B_read = state.add_read('B')
+    C_write = state.add_write('C')
+
+    # Add map
+    map_entry, map_exit = state.add_map('map', dict(i='0:1'), schedule=dace.ScheduleType.Sequential)
+
+    # Add tasklet (despite the file name, this adds two arrays elementwise)
+    tasklet = state.add_tasklet('add_constant', {'a_in', 'b_in'}, {'c_out'}, 'c_out = a_in + b_in')
+
+    # Connect nodes with memlets
+    state.add_memlet_path(A_read, map_entry, tasklet, dst_conn='a_in', memlet=dace.Memlet('A[i]'))
+    state.add_memlet_path(B_read, map_entry, tasklet, dst_conn='b_in', memlet=dace.Memlet('B[i]'))
+    state.add_memlet_path(tasklet, map_exit, C_write, src_conn='c_out', memlet=dace.Memlet('C[i]'))
+
+    # Runtime code
+    # Initialize data
+    A = np.ones(1, dtype=np.float64)
+    B = np.ones(1, dtype=np.float64)
+    C = np.zeros(1, dtype=np.float64)
+
+    # Run the SDFG
+    sdfg(A=A, B=B, C=C)
+
+    # Print the result
+    print("A:", A)
+    print("B:", B)
+    print("C:", C)
+
+if __name__ == "__main__":
+    array_add_constant_sdfg()
+
diff --git a/graphcore_dace/vector_add_constant.py b/graphcore_dace/vector_add_constant.py
new file mode 100644
index 0000000000..12c54a9360
--- /dev/null
+++ b/graphcore_dace/vector_add_constant.py
@@ -0,0 +1,48 @@
+import dace
+import numpy as np
+
+def vector_add_constant_sdfg():
+    # Define the SDFG
+    sdfg = dace.SDFG('vector_add_constant_sdfg')
+
+    # Add arrays
+    sdfg.add_array('A', [10], dace.float64)
+    sdfg.add_array('B', [10], dace.float64)
+    sdfg.add_array('C', [10], dace.float64)
+
+    # Add state
+    state = sdfg.add_state('compute_state')
+
+    # Add read and write nodes
+    A_read = state.add_read('A')
+    B_read = state.add_read('B')
+    C_write = state.add_write('C')
+
+    # Add map
+    map_entry, map_exit = state.add_map('add_map', dict(i='0:10'))
+
+    # Add tasklet
+    tasklet = state.add_tasklet('add_constant', {'a_in', 'b_in'}, {'c_out'}, 'c_out = a_in + b_in')
+
+    # Connect nodes with memlets
+    state.add_memlet_path(A_read, map_entry, tasklet, dst_conn='a_in', memlet=dace.Memlet('A[i]'))
+    state.add_memlet_path(B_read, map_entry, tasklet, dst_conn='b_in', memlet=dace.Memlet('B[i]'))
+    state.add_memlet_path(tasklet, map_exit, C_write, src_conn='c_out', memlet=dace.Memlet('C[i]'))
+
+    # Runtime code
+    # Initialize data
+    A = np.ones(10, dtype=np.float64)
+    B = np.ones(10, dtype=np.float64)
+    C = np.zeros(10, dtype=np.float64)
+
+    # Run the SDFG
+    sdfg(A=A, B=B, C=C)
+
+    # Print the result
+    print(A)
+    print(B)
+    print(C)
+
+if __name__ == "__main__":
+    vector_add_constant_sdfg()
\ No newline at end of file
diff --git a/graphcore_mapped_dace.cpp b/graphcore_mapped_dace.cpp
new file mode 100644
index 0000000000..176da32a73
--- /dev/null
+++ b/graphcore_mapped_dace.cpp
@@ -0,0 +1,96 @@
+// Copyright (c) 2018 Graphcore Ltd. All rights reserved.
+
+/* This file contains the completed version of Poplar tutorial 3.
+   See the Poplar user guide for details.
+*/
+
+// Headers restored from the Poplar tutorial this file is based on (the
+// original <...> targets were lost to formatting).
+#include <poplar/Engine.hpp>
+#include <poplar/Graph.hpp>
+#include <poplar/IPUModel.hpp>
+#include <iostream>
+using namespace poplar;
+using namespace poplar::program;
+
+// NOTE: kept as an exploratory sketch; the helpers below use function-local
+// variables (target, graph, computeSet, c1, v1, v2, prog, device) that would
+// have to be passed between them, or made global, for this file to compile.
+void func_device_pre() {
+
+  // Create the IPU model device
+  IPUModel ipuModel;
+  Device device = ipuModel.createDevice();
+  Target target = device.getTarget();
+  // Create the Graph object
+
+}
+// graph
+// codelets -> tasklets
+// data
+// connect input-output
+
+void func_graph() {
+  // init device and graph
+  Graph graph(target);
+
+  // Add codelets to the graph
+  graph.addCodelets("tut3_codelets.cpp");
+
+  // CONTAINERS -> ARRAYS/MAPS
+  // Add variables to the graph
+  Tensor v1 = graph.addVariable(FLOAT, {4}, "v1");
+  Tensor v2 = graph.addVariable(FLOAT, {4}, "v2");
+  for (unsigned i = 0; i < 4; ++i) {
+    graph.setTileMapping(v1[i], i);
+    graph.setTileMapping(v2[i], i);
+  }
+  // Add steps to initialize the variables
+  Tensor c1 = graph.addConstant(FLOAT, {4}, {1.0, 1.5, 2.0, 2.5});
+  graph.setTileMapping(c1, 0);
+
+  // parallel stuff -> MAP/CONSUME?
+  // Connect the codelets with data -> the MEMLET part
+  ComputeSet computeSet = graph.addComputeSet("computeSet");
+  for (unsigned i = 0; i < 4; ++i) {
+    VertexRef vtx = graph.addVertex(computeSet, "SumVertex");
+    graph.connect(vtx["in"], v1.slice(i, 4));
+    graph.connect(vtx["out"], v2[i]);
+    graph.setTileMapping(vtx, i);
+    graph.setPerfEstimate(vtx, 20);
+  }
+
+}
+
+// seems like the function run before calling __internal__
+void cfg() {
+  // Create a control program that is a sequence of steps
+  Sequence prog;
+
+  prog.add(Copy(c1, v1));
+
+  // Add step to execute the compute set
+  prog.add(Execute(computeSet));  // ------------->>>> graph()
+
+  // Add step to print out v2
+  prog.add(PrintTensor("v2", v2));
+}
+
+void func_engine_and_cleanup() {
+
+  // Create the engine
+  Engine engine(graph, prog);
+  engine.load(device);
+
+  // Run the control program
+  std::cout << "Running program\n";
+  engine.run(0);
+  std::cout << "Program complete\n";
+}
+
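+// Summary of the Poplar-to-DaCe mapping sketched in the notes above (my
+// reading of the inline comments, not an official correspondence):
+//   codelet / Vertex          -> tasklet
+//   graph.connect(in/out)     -> memlet
+//   ComputeSet + Execute      -> map scope (parallel region)
+//   Sequence control program  -> state machine / interstate edges
+//   setTileMapping            -> storage and schedule placement
+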
+int main() {
+  // where is the dataflow graph supposed to get built?
+
+  func_device_pre();           // no
+  func_graph();                // yes (__internal__)
+  cfg();                       // program(){__internal__}
+  func_engine_and_cleanup();   // no
+
+  return 0;
+}
\ No newline at end of file
diff --git a/mpi.py b/mpi.py
new file mode 100644
index 0000000000..593fec2208
--- /dev/null
+++ b/mpi.py
@@ -0,0 +1,22 @@
+
+import dace
+import numpy as np
+from dace.transformation.dataflow import MPITransformMap
+
+
+@dace.program
+def mpi_vector_add(A: dace.int32[20], B: dace.int32[20], C: dace.int32[20]):
+    for i in dace.map[0:20]:  # parallelization construct
+        C[i] = A[i] + B[i]
+
+if __name__ == '__main__':
+    sdfg = mpi_vector_add.to_sdfg(simplify=False)  # SDFG, compiled on first call
+    sdfg.apply_transformations(MPITransformMap)
+
+    # call with values; to actually distribute the map across ranks, launch
+    # under MPI, e.g. (hypothetical launch line): mpirun -np 4 python mpi.py
+    A = np.ones((20), dtype=np.int32)   # 1,1,1,1,...
+    B = np.ones((20), dtype=np.int32)   # 1,1,1,1,...
+    C = np.zeros((20), dtype=np.int32)  # 0,0,0,0,...
+    sdfg(A, B, C)
+    # ref = np.full(20, 2, dtype=np.int32)  # 2,2,2,2,...
+    # assert np.array_equal(ref, C)
diff --git a/mpi_scalar.py b/mpi_scalar.py
new file mode 100644
index 0000000000..7037908c31
--- /dev/null
+++ b/mpi_scalar.py
@@ -0,0 +1,17 @@
+
+import dace
+import numpy as np
+from dace.transformation.dataflow import MPITransformMap
+
+@dace.program
+def mpi_scalar_add(A: dace.int32, B: dace.int32, C: dace.int32):
+    # NOTE: assigning to a scalar parameter is not visible to the caller;
+    # returning A + B (or using shape-[1] arrays) is the reliable pattern.
+    C = A + B
+
+if __name__ == '__main__':
+    sdfg = mpi_scalar_add.to_sdfg(simplify=False)
+    # MPITransformMap matches maps; a scalar program has none, so this is a
+    # no-op. apply_transformations returns the number of applications.
+    applied = sdfg.apply_transformations(MPITransformMap)
+    print('MPITransformMap applied:', applied)
+
+    A = np.int32(1)
+    B = np.int32(1)
+    C = np.int32(0)
+    sdfg(A, B, C)
\ No newline at end of file
diff --git a/tests/library/poplar/poplar_matmul.py b/tests/library/poplar/poplar_matmul.py
new file mode 100644
index 0000000000..adbcdf2ee4
--- /dev/null
+++ b/tests/library/poplar/poplar_matmul.py
@@ -0,0 +1,116 @@
+# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+import dace
+from dace.memlet import Memlet
+import dace.libraries.poplar as poplar
+import numpy as np
+import pytest
+
+###############################################################################
+
+def make_sdfg(dtype):
+    # NOTE: dtype is currently unused; the arrays below are fixed to int32.
+    sdfg = dace.SDFG("poplar_matmul")
+    state = sdfg.add_state("matmul_state")
+    sdfg.add_array('A',
+                   shape=[1],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.IPU_Memory,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=1)
+    sdfg.add_array('B',
+                   shape=[1],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.IPU_Memory,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=1)
+    # Add a C array
+    sdfg.add_array('C',
+                   shape=[1],
+                   dtype=dace.int32,
+                   storage=dace.StorageType.IPU_Memory,
+                   location=None,
+                   transient=False,
+                   strides=[1],
+                   offset=[0],
+                   lifetime=dace.AllocationLifetime.Scope,
+                   debuginfo=None, total_size=1)
+
+    a = state.add_access("A")
+    b = state.add_access("B")
+    c = state.add_access("C")
+
+    poplar_mm_node = poplar.nodes.popmm.IPUMatMul("MATMUL")
+    poplar_mm_node.implementation = "MM"
+
+    state.add_memlet_path(a, poplar_mm_node, dst_conn="_inbufferA", memlet=dace.Memlet("A"))
+    state.add_memlet_path(b, poplar_mm_node, dst_conn="_inbufferB", memlet=dace.Memlet("B"))
+    state.add_memlet_path(poplar_mm_node, c, src_conn="_outbufferC", memlet=dace.Memlet("C"))
+
+    return sdfg
+
+
+###############################################################################
+
+# def _test_poplar(info, sdfg, dtype):
+#     poplar_sdfg = sdfg.compile()
+
+
+@pytest.mark.poplar
+def test_poplar():
+    sdfg = make_sdfg(np.float32)
+    sdfg.compile()
+    print("Success!")
+
+###############################################################################
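+
+# Hedged usage note: assuming a `poplar` marker is registered in pytest.ini
+# (unregistered markers merely warn), this test can be selected with:
+#   pytest -m poplar tests/library/poplar/poplar_matmul.py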
+
+# N = dace.symbol('N', dtype=dace.int64)
+
+
+# @dace.program
+# def dace_bcast(A: dace.float32[N]):
+#     dace.comm.Bcast(A, root=0)
+
+
+# @pytest.mark.mpi
+# def test_dace_bcast():
+#     from mpi4py import MPI as MPI4PY
+#     comm = MPI4PY.COMM_WORLD
+#     rank = comm.Get_rank()
+#     commsize = comm.Get_size()
+#     mpi_sdfg = None
+#     if commsize < 2:
+#         raise ValueError("This test is supposed to be run with at least two processes!")
+#     for r in range(0, commsize):
+#         if r == rank:
+#             mpi_sdfg = dace_bcast.compile()
+#         comm.Barrier()
+
+#     length = 128
+#     if rank == 0:
+#         A = np.full([length], np.pi, dtype=np.float32)
+#     else:
+#         A = np.random.randn(length).astype(np.float32)
+
+#     mpi_sdfg(A=A, N=length)
+
+#     assert (np.allclose(A, np.full([length], np.pi, dtype=np.float32)))
+
+
+###############################################################################
+
+if __name__ == "__main__":
+    test_poplar()
+    # test_dace_bcast()
+###############################################################################