From 93fb59fcbe0777b2754fe0b03df8a06af6de95f0 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 5 Apr 2024 16:49:55 +0200 Subject: [PATCH] experimental_offload: Add atax, correlation and covariance --- .../experiments/offload/data/atax.json.tpl | 8 ++ .../offload/data/correlation.json.tpl | 8 ++ .../offload/data/covariance.json.tpl | 8 ++ .../sim/experiments/offload/roi/atax.json.tpl | 109 ++++++++++++++ .../offload/roi/correlation.json.tpl | 106 ++++++++++++++ .../offload/roi/covariance.json.tpl | 106 ++++++++++++++ .../offload/roi/offload_minimal.json.tpl | 45 ++++++ target/sim/experiments/offload/run.py | 79 +++++----- .../apps/experimental_offload/src/axpy_job.h | 2 +- .../experimental_offload/src/kmeans_job.h | 2 +- .../apps/experimental_offload/src/offload.c | 34 +++-- .../host/apps/experimental_offload/Makefile | 12 ++ .../apps/experimental_offload/src/offload.c | 135 ++++++++++++++++++ .../apps/experimental_offload/src/offload.h | 54 ++++++- 14 files changed, 654 insertions(+), 54 deletions(-) create mode 100644 target/sim/experiments/offload/data/atax.json.tpl create mode 100644 target/sim/experiments/offload/data/correlation.json.tpl create mode 100644 target/sim/experiments/offload/data/covariance.json.tpl create mode 100644 target/sim/experiments/offload/roi/atax.json.tpl create mode 100644 target/sim/experiments/offload/roi/correlation.json.tpl create mode 100644 target/sim/experiments/offload/roi/covariance.json.tpl create mode 100644 target/sim/experiments/offload/roi/offload_minimal.json.tpl diff --git a/target/sim/experiments/offload/data/atax.json.tpl b/target/sim/experiments/offload/data/atax.json.tpl new file mode 100644 index 000000000..94f5e4eca --- /dev/null +++ b/target/sim/experiments/offload/data/atax.json.tpl @@ -0,0 +1,8 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +{ + M: 16, + N: ${N} +} diff --git a/target/sim/experiments/offload/data/correlation.json.tpl b/target/sim/experiments/offload/data/correlation.json.tpl new file mode 100644 index 000000000..9e89d9f85 --- /dev/null +++ b/target/sim/experiments/offload/data/correlation.json.tpl @@ -0,0 +1,8 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + M: 16, + N: 8 +} diff --git a/target/sim/experiments/offload/data/covariance.json.tpl b/target/sim/experiments/offload/data/covariance.json.tpl new file mode 100644 index 000000000..9e89d9f85 --- /dev/null +++ b/target/sim/experiments/offload/data/covariance.json.tpl @@ -0,0 +1,8 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + M: 16, + N: 8 +} diff --git a/target/sim/experiments/offload/roi/atax.json.tpl b/target/sim/experiments/offload/roi/atax.json.tpl new file mode 100644 index 000000000..fc7b44e65 --- /dev/null +++ b/target/sim/experiments/offload/roi/atax.json.tpl @@ -0,0 +1,109 @@ +[ + // CVA6 core + { + "thread": "hart_0", + "roi": [ + // Iteration 1 + {"idx": 1, "label": "prepare data"}, + {"idx": 2, "label": "send interrupt"}, + {"idx": 4, "label": "clr interrupt"}, + // Iteration 2 + {"idx": 5, "label": "prepare data"}, + {"idx": 6, "label": "send interrupt"}, + {"idx": 8, "label": "clr interrupt"} + ] + }, + +// Snitch clusters +% for i in range(0, nr_clusters): + + // Compute cores + % for j in range(0, 8): + { + "thread": "${f'hart_{1 + 9*i + j}'}", + "roi": [ + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + {"idx": 2, "label": "job setup"}, + {"idx": 3, "label": "barrier"}, + {"idx": 4, "label": "Ax"}, + {"idx": 5, "label": "barrier"}, + {"idx": 6, "label": "AtAx"}, + {"idx": 7, "label": "barrier"}, 
+ // Iteration 2 + {"idx": 10, "label": "clr interrupt"}, + {"idx": 11, "label": "job setup"}, + {"idx": 12, "label": "barrier"}, + {"idx": 13, "label": "Ax"}, + {"idx": 14, "label": "barrier"}, + {"idx": 15, "label": "AtAx"}, + {"idx": 16, "label": "barrier"}, + ] + }, + % endfor + + // DMA cores + { + "thread": "${f'hart_{1 + 9*i + 8}'}", + "roi": [ + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + {"idx": 2, "label": "get job ptr"}, + {"idx": 3, "label": "get job args"}, + {"idx": 4, "label": "copy data in"}, + {"idx": 5, "label": "barrier"}, + {"idx": 6, "label": "copy data out"}, + {"idx": 7, "label": "return"}, + // Iteration 2 + {"idx": 10, "label": "clr interrupt"}, + {"idx": 11, "label": "get job ptr"}, + {"idx": 12, "label": "get job args"}, + {"idx": 13, "label": "copy data in"}, + {"idx": 14, "label": "barrier"}, + {"idx": 15, "label": "copy data out"}, + {"idx": 16, "label": "return"}, + ] + }, + + // DMA engine proper + % if i == 0 or multicast: + { + "thread": "${f'dma_{1 + 9*i + 8}'}", + "roi": [ + // Iteration 1 + {"idx": -10, "label": "A in"}, + {"idx": -9, "label": "x in"}, + {"idx": -8, "label": "y tile in"}, + {"idx": -7, "label": "tmp in"}, + {"idx": -6, "label": "y out"}, + // Iteration 2 + {"idx": -5, "label": "A in"}, + {"idx": -4, "label": "x in"}, + {"idx": -3, "label": "y tile in"}, + {"idx": -2, "label": "tmp in"}, + {"idx": -1, "label": "y out"}, + ] + }, + % else: + { + "thread": "${f'dma_{1 + 9*i + 8}'}", + "roi": [ + // Iteration 1 + {"idx": -12, "label": "job info"}, + {"idx": -11, "label": "A in"}, + {"idx": -10, "label": "x in"}, + {"idx": -9, "label": "y tile in"}, + {"idx": -8, "label": "tmp in"}, + {"idx": -7, "label": "y out"}, + // Iteration 2 + {"idx": -6, "label": "job info"}, + {"idx": -5, "label": "A in"}, + {"idx": -4, "label": "x in"}, + {"idx": -3, "label": "y tile in"}, + {"idx": -2, "label": "tmp in"}, + {"idx": -1, "label": "y out"}, + ] + }, + % endif +% endfor +] \ No newline at end of file diff --git 
a/target/sim/experiments/offload/roi/correlation.json.tpl b/target/sim/experiments/offload/roi/correlation.json.tpl new file mode 100644 index 000000000..dbfbb8e24 --- /dev/null +++ b/target/sim/experiments/offload/roi/correlation.json.tpl @@ -0,0 +1,106 @@ +[ + // CVA6 core + { + "thread": "hart_0", + "roi": [ + // Iteration 1 + {"idx": 1, "label": "prepare data"}, + {"idx": 2, "label": "send interrupt"}, + {"idx": 4, "label": "clr interrupt"}, + // Iteration 2 + {"idx": 5, "label": "prepare data"}, + {"idx": 6, "label": "send interrupt"}, + {"idx": 8, "label": "clr interrupt"} + ] + }, + +// Snitch clusters +% for i in range(0, nr_clusters): + + // Compute cores + % for j in range(0, 8): + { + "thread": "${f'hart_{1 + 9*i + j}'}", + "roi": [ + ## Only cluster 0 computes step 2 + % if i == 0: + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + {"idx": 2, "label": "job setup"}, + {"idx": 3, "label": "barrier"}, + {"idx": 4, "label": "compute step 1"}, + {"idx": 5, "label": "barrier"}, + {"idx": 6, "label": "compute step 2"}, + {"idx": 7, "label": "barrier"}, + // Iteration 2 + {"idx": 10, "label": "clr interrupt"}, + {"idx": 11, "label": "job setup"}, + {"idx": 12, "label": "barrier"}, + {"idx": 13, "label": "compute step 1"}, + {"idx": 14, "label": "barrier"}, + {"idx": 15, "label": "compute step 2"}, + {"idx": 16, "label": "barrier"}, + % else: + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + {"idx": 2, "label": "job setup"}, + {"idx": 3, "label": "barrier"}, + {"idx": 4, "label": "compute step 1"}, + {"idx": 5, "label": "barrier"}, + // Iteration 2 + {"idx": 8, "label": "clr interrupt"}, + {"idx": 9, "label": "job setup"}, + {"idx": 10, "label": "barrier"}, + {"idx": 11, "label": "compute step 1"}, + {"idx": 12, "label": "barrier"}, + % endif + ] + }, + % endfor + + // DMA cores + { + "thread": "${f'hart_{1 + 9*i + 8}'}", + "roi": [ + ## Cluster 0's DMA core aggregates the data from step 1 + % if i == 0: + // Iteration 1 + {"idx": 1, "label": 
"clr interrupt"}, + {"idx": 2, "label": "get job ptr"}, + {"idx": 3, "label": "get job args"}, + {"idx": 4, "label": "copy data in"}, + {"idx": 5, "label": "barrier"}, + {"idx": 6, "label": "copy step1 data"}, + {"idx": 7, "label": "barrier"}, + {"idx": 8, "label": "copy data out"}, + {"idx": 9, "label": "return"}, + // Iteration 2 + {"idx": 12, "label": "clr interrupt"}, + {"idx": 13, "label": "get job ptr"}, + {"idx": 14, "label": "get job args"}, + {"idx": 15, "label": "copy data in"}, + {"idx": 16, "label": "barrier"}, + {"idx": 17, "label": "copy step1 data"}, + {"idx": 18, "label": "barrier"}, + {"idx": 19, "label": "copy data out"}, + {"idx": 20, "label": "return"}, + % else: + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + {"idx": 2, "label": "get job ptr"}, + {"idx": 3, "label": "get job args"}, + {"idx": 4, "label": "copy data in"}, + {"idx": 5, "label": "barrier"}, + {"idx": 6, "label": "return"}, + // Iteration 2 + {"idx": 9, "label": "clr interrupt"}, + {"idx": 10, "label": "get job ptr"}, + {"idx": 11, "label": "get job args"}, + {"idx": 12, "label": "copy data in"}, + {"idx": 13, "label": "barrier"}, + {"idx": 14, "label": "return"}, + % endif + ] + }, +% endfor +] \ No newline at end of file diff --git a/target/sim/experiments/offload/roi/covariance.json.tpl b/target/sim/experiments/offload/roi/covariance.json.tpl new file mode 100644 index 000000000..dbfbb8e24 --- /dev/null +++ b/target/sim/experiments/offload/roi/covariance.json.tpl @@ -0,0 +1,106 @@ +[ + // CVA6 core + { + "thread": "hart_0", + "roi": [ + // Iteration 1 + {"idx": 1, "label": "prepare data"}, + {"idx": 2, "label": "send interrupt"}, + {"idx": 4, "label": "clr interrupt"}, + // Iteration 2 + {"idx": 5, "label": "prepare data"}, + {"idx": 6, "label": "send interrupt"}, + {"idx": 8, "label": "clr interrupt"} + ] + }, + +// Snitch clusters +% for i in range(0, nr_clusters): + + // Compute cores + % for j in range(0, 8): + { + "thread": "${f'hart_{1 + 9*i + j}'}", + "roi": [ 
+ ## Only cluster 0 computes step 2 + % if i == 0: + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + {"idx": 2, "label": "job setup"}, + {"idx": 3, "label": "barrier"}, + {"idx": 4, "label": "compute step 1"}, + {"idx": 5, "label": "barrier"}, + {"idx": 6, "label": "compute step 2"}, + {"idx": 7, "label": "barrier"}, + // Iteration 2 + {"idx": 10, "label": "clr interrupt"}, + {"idx": 11, "label": "job setup"}, + {"idx": 12, "label": "barrier"}, + {"idx": 13, "label": "compute step 1"}, + {"idx": 14, "label": "barrier"}, + {"idx": 15, "label": "compute step 2"}, + {"idx": 16, "label": "barrier"}, + % else: + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + {"idx": 2, "label": "job setup"}, + {"idx": 3, "label": "barrier"}, + {"idx": 4, "label": "compute step 1"}, + {"idx": 5, "label": "barrier"}, + // Iteration 2 + {"idx": 8, "label": "clr interrupt"}, + {"idx": 9, "label": "job setup"}, + {"idx": 10, "label": "barrier"}, + {"idx": 11, "label": "compute step 1"}, + {"idx": 12, "label": "barrier"}, + % endif + ] + }, + % endfor + + // DMA cores + { + "thread": "${f'hart_{1 + 9*i + 8}'}", + "roi": [ + ## Cluster 0's DMA core aggregates the data from step 1 + % if i == 0: + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + {"idx": 2, "label": "get job ptr"}, + {"idx": 3, "label": "get job args"}, + {"idx": 4, "label": "copy data in"}, + {"idx": 5, "label": "barrier"}, + {"idx": 6, "label": "copy step1 data"}, + {"idx": 7, "label": "barrier"}, + {"idx": 8, "label": "copy data out"}, + {"idx": 9, "label": "return"}, + // Iteration 2 + {"idx": 12, "label": "clr interrupt"}, + {"idx": 13, "label": "get job ptr"}, + {"idx": 14, "label": "get job args"}, + {"idx": 15, "label": "copy data in"}, + {"idx": 16, "label": "barrier"}, + {"idx": 17, "label": "copy step1 data"}, + {"idx": 18, "label": "barrier"}, + {"idx": 19, "label": "copy data out"}, + {"idx": 20, "label": "return"}, + % else: + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + 
{"idx": 2, "label": "get job ptr"}, + {"idx": 3, "label": "get job args"}, + {"idx": 4, "label": "copy data in"}, + {"idx": 5, "label": "barrier"}, + {"idx": 6, "label": "return"}, + // Iteration 2 + {"idx": 9, "label": "clr interrupt"}, + {"idx": 10, "label": "get job ptr"}, + {"idx": 11, "label": "get job args"}, + {"idx": 12, "label": "copy data in"}, + {"idx": 13, "label": "barrier"}, + {"idx": 14, "label": "return"}, + % endif + ] + }, +% endfor +] \ No newline at end of file diff --git a/target/sim/experiments/offload/roi/offload_minimal.json.tpl b/target/sim/experiments/offload/roi/offload_minimal.json.tpl new file mode 100644 index 000000000..93855314b --- /dev/null +++ b/target/sim/experiments/offload/roi/offload_minimal.json.tpl @@ -0,0 +1,45 @@ +[ + { + "thread": "hart_0", + "roi": [ + // Iteration 1 + {"idx": 1, "label": "prepare data"}, + {"idx": 2, "label": "send interrupt"}, + {"idx": 4, "label": "clr interrupt"}, + // Iteration 2 + {"idx": 5, "label": "prepare data"}, + {"idx": 6, "label": "send interrupt"}, + {"idx": 8, "label": "clr interrupt"} + ] + }, +% for i in range(0, nr_clusters): + // Compute cores + % for j in range(0, 8): + { + "thread": "${f'hart_{1 + 9*i + j}'}", + "roi": [ + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + {"idx": 2, "label": "job"}, + // Iteration 2 + {"idx": 4, "label": "clr interrupt"}, + {"idx": 5, "label": "job"}, + ] + }, + % endfor + // DMA cores + { + "thread": "${f'hart_{1 + 9*i + 8}'}", + "roi": [ + // Iteration 1 + {"idx": 1, "label": "clr interrupt"}, + {"idx": 2, "label": "get job ptr and args"}, + {"idx": 3, "label": "job"}, + // Iteration 2 + {"idx": 5, "label": "clr interrupt"}, + {"idx": 6, "label": "get job ptr and args"}, + {"idx": 7, "label": "job"}, + ] + }, +% endfor +] \ No newline at end of file diff --git a/target/sim/experiments/offload/run.py b/target/sim/experiments/offload/run.py index 6f4bd9afb..e42734d7f 100755 --- a/target/sim/experiments/offload/run.py +++ 
b/target/sim/experiments/offload/run.py @@ -24,9 +24,6 @@ FILE_DIR = Path(__file__).parent.resolve() TARGET_DIR = FILE_DIR / '../../' SNITCH_DIR = TARGET_DIR / '../../deps/snitch_cluster' -AXPY_VERIFY_PY = SNITCH_DIR / 'sw/blas/axpy/scripts/verify.py' -GEMM_VERIFY_PY = SNITCH_DIR / 'sw/blas/gemm/scripts/verify.py' -KMEANS_VERIFY_PY = SNITCH_DIR / 'sw/apps/kmeans/scripts/verify.py' APP = 'experimental_offload' SOURCE_BUILD_DIR = TARGET_DIR / f'sw/host/apps/{APP}/build' TARGET_BUILD_DIR = FILE_DIR / 'build' @@ -35,11 +32,6 @@ BIN_DIR = Path('bin') VSIM_BUILDDIR = Path('work-vsim') -KMEANS_CFG_TEMPLATE = FILE_DIR / 'data' / 'kmeans.json.tpl' - -KMEANS_ROI_TEMPLATE = FILE_DIR / 'roi' / 'kmeans.json.tpl' -GEMM_ROI_TEMPLATE = FILE_DIR / 'roi' / 'gemm.json.tpl' - def run(cmd, env=None, dry_run=False): cmd = [str(arg) for arg in cmd] @@ -92,19 +84,19 @@ def build_hw(tests, dry_run=False): def post_process_traces(test, dry_run=False): n_clusters_to_use = test['n_clusters_to_use'] + multicast = test['multicast'] run_dir = test['run_dir'] logdir = run_dir / 'logs' device_elf = test['device_elf'] hw_cfg = test['hw_cfg'] roi_spec = logdir / 'roi_spec.json' + app = test['app'] # Read and render specification template JSON - if test['app'] == 'gemm': - roi_spec_tpl = GEMM_ROI_TEMPLATE - elif test['app'] == 'kmeans': - roi_spec_tpl = KMEANS_ROI_TEMPLATE + if app in ['gemm', 'kmeans', 'atax', 'correlation', 'covariance']: + roi_spec_tpl = FILE_DIR / 'roi' / f'{app}.json.tpl' with open(roi_spec_tpl, 'r') as f: spec_template = Template(f.read()) - rendered_spec = spec_template.render(nr_clusters=n_clusters_to_use) + rendered_spec = spec_template.render(nr_clusters=n_clusters_to_use, multicast=multicast) spec = json5.loads(rendered_spec) with open(roi_spec, 'w') as f: json.dump(spec, f, indent=4) @@ -113,21 +105,26 @@ def post_process_traces(test, dry_run=False): run(['make', '-C', TARGET_DIR, f'SIM_DIR={run_dir}', f'BINARY={device_elf}', 'annotate', '-j'], dry_run=dry_run) 
run(['make', '-C', TARGET_DIR, f'SIM_DIR={run_dir}', f'ROI_SPEC={roi_spec}', - f'CFG_OVERRIDE={hw_cfg}', 'visual-trace'], dry_run=dry_run) + f'CFG_OVERRIDE={hw_cfg}', f'BINARY={device_elf}', 'visual-trace'], dry_run=dry_run) -def get_gemm_cfg(n): - filled_template = Template(filename=str(GEMM_CFG_TEMPLATE)).render(N=n) +def get_data_cfg(test): + app = test['app'] + cfg_template = str(FILE_DIR / 'data' / f'{app}.json.tpl') + filled_template = Template(filename=cfg_template).render(**test) with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: temp_file.write(filled_template) return temp_file.name -def get_kmeans_cfg(**kwargs): - filled_template = Template(filename=str(KMEANS_CFG_TEMPLATE)).render(**kwargs) - with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: - temp_file.write(filled_template) - return temp_file.name +def get_data_cfg_prefix(test): + app = test['app'] + if app == 'kmeans': + return f'L{test["n_samples"]}' + elif app in ['atax']: + return f'L{test["N"]}' + elif app in ['correlation', 'covariance']: + return f'L{test["M"]}' # Get tests from a test list file @@ -142,14 +139,15 @@ def get_tests(testlist, run_dir, hw_cfg): for test in tests: # Alias test parameters - length = test['length'] + if 'length' in test: + length = test['length'] n_clusters_to_use = test['n_clusters_to_use'] multicast = test['multicast'] app = test['app'] # Resolve derived test parameters mcast_prefix = "M" if multicast else "U" - prefix = f'{app}/L{length}/{mcast_prefix}/N{n_clusters_to_use}' + prefix = f'{app}/{get_data_cfg_prefix(test)}/{mcast_prefix}/N{n_clusters_to_use}' full_hw_cfg = f'{mcast_prefix}-{hw_cfg}' hw_cfg_file = CFG_DIR / f'{full_hw_cfg}.hjson' vsim_builddir = VSIM_BUILDDIR / f'{full_hw_cfg}' @@ -161,37 +159,34 @@ def get_tests(testlist, run_dir, hw_cfg): cflags = f'-DN_CLUSTERS_TO_USE={n_clusters_to_use}' if multicast: cflags += ' -DUSE_MULTICAST' - if app == 'axpy': - cflags += ' -DOFFLOAD_AXPY' - elif app == 'gemm': - 
cflags += ' -DOFFLOAD_GEMM' - elif app == 'kmeans': - cflags += ' -DOFFLOAD_KMEANS' - elif app == 'mc': + if app == 'mc': cflags += f' -DOFFLOAD_MONTECARLO -DMC_LENGTH={length}' + else: + cflags += f' -DOFFLOAD_{app.upper()}' env = extend_environment( RISCV_CFLAGS=cflags, - LENGTH=f'{length}', SECTION=".wide_spm", OFFLOAD=app) - if app == 'gemm': - gemm_cfg_file = get_gemm_cfg(length) - env = extend_environment(env, DATA_CFG=gemm_cfg_file) + if app in ['axpy', 'gemm', 'atax', 'correlation', 'covariance']: + data_cfg = get_data_cfg(test) + env = extend_environment(env, DATA_CFG=data_cfg) elif app == 'kmeans': - kmeans_cfg_file = get_kmeans_cfg(n_samples=length) - env = extend_environment(env, KMEANS_DATA_CFG=kmeans_cfg_file) + data_cfg = get_data_cfg(test) + env = extend_environment(env, KMEANS_DATA_CFG=data_cfg) # Extend test with derived parameters test['sim_bin'] = sim_bin test['prefix'] = prefix test['elf'] = elf test['device_elf'] = device_elf - if app == 'axpy': - test['cmd'] = [str(AXPY_VERIFY_PY), str(sim_bin), str(elf)] - elif app == 'gemm': - test['cmd'] = [str(GEMM_VERIFY_PY), str(sim_bin), str(elf)] - elif app == 'kmeans': - test['cmd'] = [str(KMEANS_VERIFY_PY), str(sim_bin), str(elf), '--no-gui'] + if app in ['axpy', 'gemm']: + verify_py = str(SNITCH_DIR / f'sw/blas/{app}/scripts/verify.py') + test['cmd'] = [verify_py, str(sim_bin), str(elf)] + elif app in ['kmeans', 'atax', 'correlation', 'covariance']: + verify_py = str(SNITCH_DIR / f'sw/apps/{app}/scripts/verify.py') + test['cmd'] = [verify_py, str(sim_bin), str(elf)] + if app == 'kmeans': + test['cmd'].append('--no-gui') elif app == 'mc': test['sim_bin'] = sim_bin test['run_dir'] = unique_run_dir diff --git a/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h b/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h index 1dc19777e..b1b3c4c25 100644 --- a/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h +++ b/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h 
@@ -5,7 +5,7 @@ #define XSSR #include "axpy.h" -void axpy_job_unified(job_args_t* job_args) { +void axpy_job_unified(void* job_args) { double* local_x; double* local_y; double* local_z; diff --git a/target/sim/sw/device/apps/experimental_offload/src/kmeans_job.h b/target/sim/sw/device/apps/experimental_offload/src/kmeans_job.h index d0be98c63..b1b6bc331 100644 --- a/target/sim/sw/device/apps/experimental_offload/src/kmeans_job.h +++ b/target/sim/sw/device/apps/experimental_offload/src/kmeans_job.h @@ -5,7 +5,7 @@ __thread uint32_t n_samples_per_cluster, n_samples_per_core; __thread double *local_samples, *local_centroids, *final_centroids, *partial_centroids; __thread uint32_t *membership, *partial_membership_cnt; -void kmeans_iteration_job(job_args_t* job_args) { +void kmeans_iteration_job(void* job_args) { kmeans_args_t *args = (kmeans_args_t *)job_args; // Aliases diff --git a/target/sim/sw/device/apps/experimental_offload/src/offload.c b/target/sim/sw/device/apps/experimental_offload/src/offload.c index a55d34c19..a5e21410a 100644 --- a/target/sim/sw/device/apps/experimental_offload/src/offload.c +++ b/target/sim/sw/device/apps/experimental_offload/src/offload.c @@ -10,28 +10,44 @@ __thread usr_data_t* volatile usr_data_ptr; __thread uint32_t local_job_addr; __thread uint32_t remote_job_addr; +// Job arguments are already in TCDM, no need to load them with the DMA +#define JOB_ARGS_PRELOADED + #include "axpy_job.h" // #include "gemm_job.h" // #include "montecarlo_job.h" #include "kmeans_job.h" +#include "atax/src/atax.h" +#include "correlation/src/correlation.h" +#include "covariance/src/covariance.h" // Job function type -typedef void (*job_func_t)(job_args_t* args); +typedef void (*job_func_t)(void* args); // Job function array -__thread job_func_t jobs[N_JOB_TYPES] = {axpy_job_unified, NULL, NULL, kmeans_iteration_job}; +__thread job_func_t jobs[N_JOB_TYPES] = { + axpy_job_unified, + NULL, + NULL, + kmeans_iteration_job, + atax_job, + correlation_job, + 
covariance_job +}; static inline void run_job() { // Invoke job #if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST) job_t* job = (job_t *)local_job_addr; uint32_t job_id = job->id; - if (snrt_is_dm_core()) - snrt_mcycle(); - jobs[job_id](&job->args); + if (snrt_is_dm_core()) snrt_mcycle(); + if (snrt_is_dm_core()) snrt_mcycle(); + jobs[job_id]((void *)&job->args); snrt_cluster_hw_barrier(); - if (snrt_is_dm_core()) + if (snrt_is_dm_core()) { + snrt_mcycle(); return_to_cva6_accelerated(job->offload_id); + } #else job_t* remote_job = (job_t*)remote_job_addr; job_t* local_job = (job_t *)local_job_addr; @@ -44,9 +60,11 @@ static inline void run_job() { if (snrt_cluster_idx() != 0) snrt_dma_start_1d(&local_job->args, &remote_job->args, job_args_size(local_job->id)); snrt_dma_wait_all(); + snrt_mcycle(); } snrt_cluster_hw_barrier(); - jobs[local_job->id](&local_job->args); + jobs[local_job->id]((void *)&local_job->args); + if (snrt_is_dm_core()) snrt_mcycle(); return_to_cva6(SYNC_ALL); #endif } @@ -76,7 +94,7 @@ int main() { snrt_wfi(); #if !defined(SUPPORTS_MULTICAST) || !defined(USE_MULTICAST) - // Get pointer to remote job in last cluster's TCDM + // Get pointer to remote job in first cluster's TCDM remote_job_addr = usr_data_ptr->local_job_addr; #endif diff --git a/target/sim/sw/host/apps/experimental_offload/Makefile b/target/sim/sw/host/apps/experimental_offload/Makefile index ed7608378..9d70f8c99 100644 --- a/target/sim/sw/host/apps/experimental_offload/Makefile +++ b/target/sim/sw/host/apps/experimental_offload/Makefile @@ -31,6 +31,18 @@ ifeq ($(OFFLOAD),kmeans) include $(APPS_DIR)/kmeans/Makefile endif +ifeq ($(OFFLOAD),atax) +include $(APPS_DIR)/atax/Makefile +endif + +ifeq ($(OFFLOAD),correlation) +include $(APPS_DIR)/correlation/Makefile +endif + +ifeq ($(OFFLOAD),covariance) +include $(APPS_DIR)/covariance/Makefile +endif + include ../common.mk ifneq ($(OFFLOAD),mc) diff --git a/target/sim/sw/host/apps/experimental_offload/src/offload.c 
b/target/sim/sw/host/apps/experimental_offload/src/offload.c index c3609f42e..1a364c39f 100644 --- a/target/sim/sw/host/apps/experimental_offload/src/offload.c +++ b/target/sim/sw/host/apps/experimental_offload/src/offload.c @@ -20,6 +20,12 @@ const int n_clusters_to_use = N_CLUSTERS; #elif defined(OFFLOAD_KMEANS) #include "kmeans/data/data.h" #include "kmeans_job.h" +#elif defined(OFFLOAD_ATAX) +#include "atax/data/data.h" +#elif defined(OFFLOAD_CORRELATION) +#include "correlation/data/data.h" +#elif defined(OFFLOAD_COVARIANCE) +#include "covariance/data/data.h" #endif #ifdef OFFLOAD_KMEANS @@ -162,6 +168,92 @@ static inline void send_job_and_wakeup(job_t *job, uint64_t l1_job_ptr) { #endif break; } + case J_ATAX: { + atax_args_t args = job->args.atax; + +#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST) + uint64_t mask = ((n_clusters_to_use - 1) << 18); + enable_multicast(mask); +#endif + *((volatile uint64_t *)(l1_job_ptr)) = job->id; + *((volatile uint8_t *)(l1_job_ptr + offsetof(job_t, offload_id))) = + job->offload_id; + *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(atax_args_t, M))) = args.M; + *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(atax_args_t, N))) = args.N; + *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(atax_args_t, A_addr))) = args.A_addr; + *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(atax_args_t, x_addr))) = args.x_addr; + *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(atax_args_t, y_addr))) = args.y_addr; + + mcycle(); // Wakeup +#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST) + *((volatile uint32_t *)cluster_clint_set_addr(0)) = 511; + disable_multicast(); +#else + wakeup_snitches(); +#endif + break; + } + case J_CORRELATION: { + correlation_args_t args = job->args.correlation; + +#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST) + uint64_t mask = ((n_clusters_to_use - 1) << 18); + 
enable_multicast(mask); +#endif + *((volatile uint64_t *)(l1_job_ptr)) = job->id; + *((volatile uint8_t *)(l1_job_ptr + offsetof(job_t, offload_id))) = + job->offload_id; + *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(correlation_args_t, N))) = args.N; + *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(correlation_args_t, M))) = args.M; + *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(correlation_args_t, data_addr))) = args.data_addr; + *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(correlation_args_t, corr_addr))) = args.corr_addr; + + mcycle(); // Wakeup +#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST) + *((volatile uint32_t *)cluster_clint_set_addr(0)) = 511; + disable_multicast(); +#else + wakeup_snitches(); +#endif + break; + } + case J_COVARIANCE: { + covariance_args_t args = job->args.covariance; + +#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST) + uint64_t mask = ((n_clusters_to_use - 1) << 18); + enable_multicast(mask); +#endif + *((volatile uint64_t *)(l1_job_ptr)) = job->id; + *((volatile uint8_t *)(l1_job_ptr + offsetof(job_t, offload_id))) = + job->offload_id; + *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(covariance_args_t, N))) = args.N; + *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(covariance_args_t, M))) = args.M; + *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(covariance_args_t, data_addr))) = args.data_addr; + *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) + + offsetof(covariance_args_t, cov_addr))) = args.cov_addr; + + mcycle(); // Wakeup +#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST) + *((volatile uint32_t *)cluster_clint_set_addr(0)) = 511; + disable_multicast(); +#else + wakeup_snitches(); +#endif + break; + } } } @@ -215,6 +307,37 @@ int main() { job_t jobs[N_JOBS]; jobs[0] = first_iter_kmeans; for (uint32_t i = 1; i < 
N_JOBS; i++) jobs[i] = succ_iter_kmeans; +#elif defined(OFFLOAD_ATAX) + // TODO should we divide M and N by n_clusters_to_use? + atax_args_t atax_args = {M, + N, + WIDE_SPM_ADDR((uint64_t)A), + WIDE_SPM_ADDR((uint64_t)x), + WIDE_SPM_ADDR((uint64_t)y)}; + job_args_t job_args; + job_args.atax = atax_args; + job_t atax = {J_ATAX, 0, job_args}; + job_t jobs[N_JOBS] = {atax, atax}; +#elif defined(OFFLOAD_CORRELATION) + // TODO should we divide M and N by n_clusters_to_use? + correlation_args_t correlation_args = {N, + M, + WIDE_SPM_ADDR((uint64_t)data), + WIDE_SPM_ADDR((uint64_t)corr)}; + job_args_t job_args; + job_args.correlation = correlation_args; + job_t correlation = {J_CORRELATION, 0, job_args}; + job_t jobs[N_JOBS] = {correlation, correlation}; +#elif defined(OFFLOAD_COVARIANCE) + // TODO should we divide M and N by n_clusters_to_use? + covariance_args_t covariance_args = {N, + M, + WIDE_SPM_ADDR((uint64_t)data), + WIDE_SPM_ADDR((uint64_t)cov)}; + job_args_t job_args; + job_args.covariance = covariance_args; + job_t covariance = {J_COVARIANCE, 0, job_args}; + job_t jobs[N_JOBS] = {covariance, covariance}; #endif volatile uint32_t n_jobs = N_JOBS; @@ -275,6 +398,18 @@ int main() { double pi_estimate = *((double *)mc_args.result_ptr); double err = fabs(pi_estimate - 3.14); if (err > 0.5) return 1; +#elif defined(OFFLOAD_ATAX) + // Copy results from wide SPM to DRAM for verification + sys_dma_blk_memcpy((uint64_t)y, WIDE_SPM_ADDR((uint64_t)y), + N * sizeof(double)); +#elif defined(OFFLOAD_CORRELATION) + // Copy results from wide SPM to DRAM for verification + sys_dma_blk_memcpy((uint64_t)corr, WIDE_SPM_ADDR((uint64_t)corr), + M * M * sizeof(double)); +#elif defined(OFFLOAD_COVARIANCE) + // Copy results from wide SPM to DRAM for verification + sys_dma_blk_memcpy((uint64_t)cov, WIDE_SPM_ADDR((uint64_t)cov), + M * M * sizeof(double)); #endif // Exit routine diff --git a/target/sim/sw/host/apps/experimental_offload/src/offload.h
b/target/sim/sw/host/apps/experimental_offload/src/offload.h index 3bd14ad46..72da2899f 100644 --- a/target/sim/sw/host/apps/experimental_offload/src/offload.h +++ b/target/sim/sw/host/apps/experimental_offload/src/offload.h @@ -5,6 +5,9 @@ #include #include "axpy/src/args.h" #include "kmeans/src/args.h" +#include "atax/src/args.h" +#include "correlation/src/args.h" +#include "covariance/src/args.h" typedef struct { volatile uint32_t local_job_addr; @@ -82,6 +85,36 @@ typedef struct { mc_args_t args; } mc_job_t; +////////// +// ATAX // +////////// + +typedef struct { + uint32_t id; + uint8_t offload_id; + atax_args_t args; +} atax_job_t; + +///////////////// +// Correlation // +///////////////// + +typedef struct { + uint32_t id; + uint8_t offload_id; + correlation_args_t args; +} correlation_job_t; + +//////////////// +// Covariance // +//////////////// + +typedef struct { + uint32_t id; + uint8_t offload_id; + covariance_args_t args; +} covariance_job_t; + ///////////// // Generic // ///////////// @@ -95,6 +128,9 @@ typedef union { gemm_args_t gemm; mc_args_t mc; kmeans_args_t kmeans; + atax_args_t atax; + correlation_args_t correlation; + covariance_args_t covariance; } job_args_t; typedef struct { @@ -103,8 +139,16 @@ typedef struct { job_args_t args; } job_t; -#define N_JOB_TYPES 4 -typedef enum { J_AXPY = 0, J_GEMM = 1, J_MONTECARLO = 2, J_KMEANS = 3 } job_id_t; +#define N_JOB_TYPES 7 +typedef enum { + J_AXPY = 0, + J_GEMM = 1, + J_MONTECARLO = 2, + J_KMEANS = 3, + J_ATAX = 4, + J_CORRELATION = 5, + J_COVARIANCE = 6 +} job_id_t; static inline uint32_t job_args_size(job_id_t job_id) { switch (job_id) { @@ -116,6 +160,12 @@ static inline uint32_t job_args_size(job_id_t job_id) { return sizeof(mc_args_t); case J_KMEANS: return sizeof(kmeans_args_t); + case J_ATAX: + return sizeof(atax_args_t); + case J_CORRELATION: + return sizeof(correlation_args_t); + case J_COVARIANCE: + return sizeof(covariance_args_t); default: return 0; }