From 93fb59fcbe0777b2754fe0b03df8a06af6de95f0 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 5 Apr 2024 16:49:55 +0200
Subject: [PATCH] experimental_offload: Add atax, correlation and covariance

---
 .../experiments/offload/data/atax.json.tpl    |   8 ++
 .../offload/data/correlation.json.tpl         |   8 ++
 .../offload/data/covariance.json.tpl          |   8 ++
 .../sim/experiments/offload/roi/atax.json.tpl | 109 ++++++++++++++
 .../offload/roi/correlation.json.tpl          | 106 ++++++++++++++
 .../offload/roi/covariance.json.tpl           | 106 ++++++++++++++
 .../offload/roi/offload_minimal.json.tpl      |  45 ++++++
 target/sim/experiments/offload/run.py         |  79 +++++-----
 .../apps/experimental_offload/src/axpy_job.h  |   2 +-
 .../experimental_offload/src/kmeans_job.h     |   2 +-
 .../apps/experimental_offload/src/offload.c   |  34 +++--
 .../host/apps/experimental_offload/Makefile   |  12 ++
 .../apps/experimental_offload/src/offload.c   | 135 ++++++++++++++++++
 .../apps/experimental_offload/src/offload.h   |  54 ++++++-
 14 files changed, 654 insertions(+), 54 deletions(-)
 create mode 100644 target/sim/experiments/offload/data/atax.json.tpl
 create mode 100644 target/sim/experiments/offload/data/correlation.json.tpl
 create mode 100644 target/sim/experiments/offload/data/covariance.json.tpl
 create mode 100644 target/sim/experiments/offload/roi/atax.json.tpl
 create mode 100644 target/sim/experiments/offload/roi/correlation.json.tpl
 create mode 100644 target/sim/experiments/offload/roi/covariance.json.tpl
 create mode 100644 target/sim/experiments/offload/roi/offload_minimal.json.tpl

diff --git a/target/sim/experiments/offload/data/atax.json.tpl b/target/sim/experiments/offload/data/atax.json.tpl
new file mode 100644
index 000000000..94f5e4eca
--- /dev/null
+++ b/target/sim/experiments/offload/data/atax.json.tpl
@@ -0,0 +1,8 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    M: 16,
+    N: ${N}
+}
diff --git a/target/sim/experiments/offload/data/correlation.json.tpl b/target/sim/experiments/offload/data/correlation.json.tpl
new file mode 100644
index 000000000..9e89d9f85
--- /dev/null
+++ b/target/sim/experiments/offload/data/correlation.json.tpl
@@ -0,0 +1,8 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    M: ${M},
+    N: 8
+}
diff --git a/target/sim/experiments/offload/data/covariance.json.tpl b/target/sim/experiments/offload/data/covariance.json.tpl
new file mode 100644
index 000000000..9e89d9f85
--- /dev/null
+++ b/target/sim/experiments/offload/data/covariance.json.tpl
@@ -0,0 +1,8 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    M: ${M},
+    N: 8
+}
diff --git a/target/sim/experiments/offload/roi/atax.json.tpl b/target/sim/experiments/offload/roi/atax.json.tpl
new file mode 100644
index 000000000..fc7b44e65
--- /dev/null
+++ b/target/sim/experiments/offload/roi/atax.json.tpl
@@ -0,0 +1,109 @@
+[
+    // CVA6 core
+    {
+        "thread": "hart_0",
+        "roi": [
+            // Iteration 1
+            {"idx": 1, "label": "prepare data"},
+            {"idx": 2, "label": "send interrupt"},
+            {"idx": 4, "label": "clr interrupt"},
+            // Iteration 2
+            {"idx": 5, "label": "prepare data"},
+            {"idx": 6, "label": "send interrupt"},
+            {"idx": 8, "label": "clr interrupt"}
+        ]
+    },
+
+// Snitch clusters
+% for i in range(0, nr_clusters):
+
+    // Compute cores
+    % for j in range(0, 8):
+    {
+        "thread": "${f'hart_{1 + 9*i + j}'}",
+        "roi": [
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "job setup"},
+            {"idx": 3, "label": "barrier"},
+            {"idx": 4, "label": "Ax"},
+            {"idx": 5, "label": "barrier"},
+            {"idx": 6, "label": "AtAx"},
+            {"idx": 7, "label": "barrier"},
+            // Iteration 2
+            {"idx": 10, "label": "clr interrupt"},
+            {"idx": 11, "label": "job setup"},
+            {"idx": 12, "label": "barrier"},
+            {"idx": 13, "label": "Ax"},
+            {"idx": 14, "label": "barrier"},
+            {"idx": 15, "label": "AtAx"},
+            {"idx": 16, "label": "barrier"},
+        ]
+    },
+    % endfor
+
+    // DMA cores
+    {
+        "thread": "${f'hart_{1 + 9*i + 8}'}",
+        "roi": [
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "get job ptr"},
+            {"idx": 3, "label": "get job args"},
+            {"idx": 4, "label": "copy data in"},
+            {"idx": 5, "label": "barrier"},
+            {"idx": 6, "label": "copy data out"},
+            {"idx": 7, "label": "return"},
+            // Iteration 2
+            {"idx": 10, "label": "clr interrupt"},
+            {"idx": 11, "label": "get job ptr"},
+            {"idx": 12, "label": "get job args"},
+            {"idx": 13, "label": "copy data in"},
+            {"idx": 14, "label": "barrier"},
+            {"idx": 15, "label": "copy data out"},
+            {"idx": 16, "label": "return"},
+        ]
+    },
+
+    // DMA engine proper
+    % if i == 0 or multicast:
+    {
+        "thread": "${f'dma_{1 + 9*i + 8}'}",
+        "roi": [
+            // Iteration 1
+            {"idx": -10, "label": "A in"},
+            {"idx": -9, "label": "x in"},
+            {"idx": -8, "label": "y tile in"},
+            {"idx": -7, "label": "tmp in"},
+            {"idx": -6, "label": "y out"},
+            // Iteration 2
+            {"idx": -5, "label": "A in"},
+            {"idx": -4, "label": "x in"},
+            {"idx": -3, "label": "y tile in"},
+            {"idx": -2, "label": "tmp in"},
+            {"idx": -1, "label": "y out"},
+        ]
+    },
+    % else:
+    {
+        "thread": "${f'dma_{1 + 9*i + 8}'}",
+        "roi": [
+            // Iteration 1
+            {"idx": -12, "label": "job info"},
+            {"idx": -11, "label": "A in"},
+            {"idx": -10, "label": "x in"},
+            {"idx": -9, "label": "y tile in"},
+            {"idx": -8, "label": "tmp in"},
+            {"idx": -7, "label": "y out"},
+            // Iteration 2
+            {"idx": -6, "label": "job info"},
+            {"idx": -5, "label": "A in"},
+            {"idx": -4, "label": "x in"},
+            {"idx": -3, "label": "y tile in"},
+            {"idx": -2, "label": "tmp in"},
+            {"idx": -1, "label": "y out"},
+        ]
+    },
+    % endif
+% endfor
+]
\ No newline at end of file
diff --git a/target/sim/experiments/offload/roi/correlation.json.tpl b/target/sim/experiments/offload/roi/correlation.json.tpl
new file mode 100644
index 000000000..dbfbb8e24
--- /dev/null
+++ b/target/sim/experiments/offload/roi/correlation.json.tpl
@@ -0,0 +1,106 @@
+[
+    // CVA6 core
+    {
+        "thread": "hart_0",
+        "roi": [
+            // Iteration 1
+            {"idx": 1, "label": "prepare data"},
+            {"idx": 2, "label": "send interrupt"},
+            {"idx": 4, "label": "clr interrupt"},
+            // Iteration 2
+            {"idx": 5, "label": "prepare data"},
+            {"idx": 6, "label": "send interrupt"},
+            {"idx": 8, "label": "clr interrupt"}
+        ]
+    },
+
+// Snitch clusters
+% for i in range(0, nr_clusters):
+
+    // Compute cores
+    % for j in range(0, 8):
+    {
+        "thread": "${f'hart_{1 + 9*i + j}'}",
+        "roi": [
+        ## Only cluster 0 computes step 2
+        % if i == 0:
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "job setup"},
+            {"idx": 3, "label": "barrier"},
+            {"idx": 4, "label": "compute step 1"},
+            {"idx": 5, "label": "barrier"},
+            {"idx": 6, "label": "compute step 2"},
+            {"idx": 7, "label": "barrier"},
+            // Iteration 2
+            {"idx": 10, "label": "clr interrupt"},
+            {"idx": 11, "label": "job setup"},
+            {"idx": 12, "label": "barrier"},
+            {"idx": 13, "label": "compute step 1"},
+            {"idx": 14, "label": "barrier"},
+            {"idx": 15, "label": "compute step 2"},
+            {"idx": 16, "label": "barrier"},
+        % else:
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "job setup"},
+            {"idx": 3, "label": "barrier"},
+            {"idx": 4, "label": "compute step 1"},
+            {"idx": 5, "label": "barrier"},
+            // Iteration 2
+            {"idx": 8, "label": "clr interrupt"},
+            {"idx": 9, "label": "job setup"},
+            {"idx": 10, "label": "barrier"},
+            {"idx": 11, "label": "compute step 1"},
+            {"idx": 12, "label": "barrier"},
+        % endif
+        ]
+    },
+    % endfor
+
+    // DMA cores
+    {
+        "thread": "${f'hart_{1 + 9*i + 8}'}",
+        "roi": [
+    ## Cluster 0's DMA core aggregates the data from step 1
+    % if i == 0:
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "get job ptr"},
+            {"idx": 3, "label": "get job args"},
+            {"idx": 4, "label": "copy data in"},
+            {"idx": 5, "label": "barrier"},
+            {"idx": 6, "label": "copy step1 data"},
+            {"idx": 7, "label": "barrier"},
+            {"idx": 8, "label": "copy data out"},
+            {"idx": 9, "label": "return"},
+            // Iteration 2
+            {"idx": 12, "label": "clr interrupt"},
+            {"idx": 13, "label": "get job ptr"},
+            {"idx": 14, "label": "get job args"},
+            {"idx": 15, "label": "copy data in"},
+            {"idx": 16, "label": "barrier"},
+            {"idx": 17, "label": "copy step1 data"},
+            {"idx": 18, "label": "barrier"},
+            {"idx": 19, "label": "copy data out"},
+            {"idx": 20, "label": "return"},
+    % else:
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "get job ptr"},
+            {"idx": 3, "label": "get job args"},
+            {"idx": 4, "label": "copy data in"},
+            {"idx": 5, "label": "barrier"},
+            {"idx": 6, "label": "return"},
+            // Iteration 2
+            {"idx": 9, "label": "clr interrupt"},
+            {"idx": 10, "label": "get job ptr"},
+            {"idx": 11, "label": "get job args"},
+            {"idx": 12, "label": "copy data in"},
+            {"idx": 13, "label": "barrier"},
+            {"idx": 14, "label": "return"},
+    % endif
+        ]
+    },
+% endfor
+]
\ No newline at end of file
diff --git a/target/sim/experiments/offload/roi/covariance.json.tpl b/target/sim/experiments/offload/roi/covariance.json.tpl
new file mode 100644
index 000000000..dbfbb8e24
--- /dev/null
+++ b/target/sim/experiments/offload/roi/covariance.json.tpl
@@ -0,0 +1,106 @@
+[
+    // CVA6 core
+    {
+        "thread": "hart_0",
+        "roi": [
+            // Iteration 1
+            {"idx": 1, "label": "prepare data"},
+            {"idx": 2, "label": "send interrupt"},
+            {"idx": 4, "label": "clr interrupt"},
+            // Iteration 2
+            {"idx": 5, "label": "prepare data"},
+            {"idx": 6, "label": "send interrupt"},
+            {"idx": 8, "label": "clr interrupt"}
+        ]
+    },
+
+// Snitch clusters
+% for i in range(0, nr_clusters):
+
+    // Compute cores
+    % for j in range(0, 8):
+    {
+        "thread": "${f'hart_{1 + 9*i + j}'}",
+        "roi": [
+        ## Only cluster 0 computes step 2
+        % if i == 0:
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "job setup"},
+            {"idx": 3, "label": "barrier"},
+            {"idx": 4, "label": "compute step 1"},
+            {"idx": 5, "label": "barrier"},
+            {"idx": 6, "label": "compute step 2"},
+            {"idx": 7, "label": "barrier"},
+            // Iteration 2
+            {"idx": 10, "label": "clr interrupt"},
+            {"idx": 11, "label": "job setup"},
+            {"idx": 12, "label": "barrier"},
+            {"idx": 13, "label": "compute step 1"},
+            {"idx": 14, "label": "barrier"},
+            {"idx": 15, "label": "compute step 2"},
+            {"idx": 16, "label": "barrier"},
+        % else:
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "job setup"},
+            {"idx": 3, "label": "barrier"},
+            {"idx": 4, "label": "compute step 1"},
+            {"idx": 5, "label": "barrier"},
+            // Iteration 2
+            {"idx": 8, "label": "clr interrupt"},
+            {"idx": 9, "label": "job setup"},
+            {"idx": 10, "label": "barrier"},
+            {"idx": 11, "label": "compute step 1"},
+            {"idx": 12, "label": "barrier"},
+        % endif
+        ]
+    },
+    % endfor
+
+    // DMA cores
+    {
+        "thread": "${f'hart_{1 + 9*i + 8}'}",
+        "roi": [
+    ## Cluster 0's DMA core aggregates the data from step 1
+    % if i == 0:
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "get job ptr"},
+            {"idx": 3, "label": "get job args"},
+            {"idx": 4, "label": "copy data in"},
+            {"idx": 5, "label": "barrier"},
+            {"idx": 6, "label": "copy step1 data"},
+            {"idx": 7, "label": "barrier"},
+            {"idx": 8, "label": "copy data out"},
+            {"idx": 9, "label": "return"},
+            // Iteration 2
+            {"idx": 12, "label": "clr interrupt"},
+            {"idx": 13, "label": "get job ptr"},
+            {"idx": 14, "label": "get job args"},
+            {"idx": 15, "label": "copy data in"},
+            {"idx": 16, "label": "barrier"},
+            {"idx": 17, "label": "copy step1 data"},
+            {"idx": 18, "label": "barrier"},
+            {"idx": 19, "label": "copy data out"},
+            {"idx": 20, "label": "return"},
+    % else:
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "get job ptr"},
+            {"idx": 3, "label": "get job args"},
+            {"idx": 4, "label": "copy data in"},
+            {"idx": 5, "label": "barrier"},
+            {"idx": 6, "label": "return"},
+            // Iteration 2
+            {"idx": 9, "label": "clr interrupt"},
+            {"idx": 10, "label": "get job ptr"},
+            {"idx": 11, "label": "get job args"},
+            {"idx": 12, "label": "copy data in"},
+            {"idx": 13, "label": "barrier"},
+            {"idx": 14, "label": "return"},
+    % endif
+        ]
+    },
+% endfor
+]
\ No newline at end of file
diff --git a/target/sim/experiments/offload/roi/offload_minimal.json.tpl b/target/sim/experiments/offload/roi/offload_minimal.json.tpl
new file mode 100644
index 000000000..93855314b
--- /dev/null
+++ b/target/sim/experiments/offload/roi/offload_minimal.json.tpl
@@ -0,0 +1,45 @@
+[
+    {
+        "thread": "hart_0",
+        "roi": [
+            // Iteration 1
+            {"idx": 1, "label": "prepare data"},
+            {"idx": 2, "label": "send interrupt"},
+            {"idx": 4, "label": "clr interrupt"},
+            // Iteration 2
+            {"idx": 5, "label": "prepare data"},
+            {"idx": 6, "label": "send interrupt"},
+            {"idx": 8, "label": "clr interrupt"}
+        ]
+    },
+% for i in range(0, nr_clusters):
+    // Compute cores
+    % for j in range(0, 8):
+    {
+        "thread": "${f'hart_{1 + 9*i + j}'}",
+        "roi": [
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "job"},
+            // Iteration 2
+            {"idx": 4, "label": "clr interrupt"},
+            {"idx": 5, "label": "job"},
+        ]
+    },
+    % endfor
+    // DMA cores
+    {
+        "thread": "${f'hart_{1 + 9*i + 8}'}",
+        "roi": [
+            // Iteration 1
+            {"idx": 1, "label": "clr interrupt"},
+            {"idx": 2, "label": "get job ptr and args"},
+            {"idx": 3, "label": "job"},
+            // Iteration 2
+            {"idx": 5, "label": "clr interrupt"},
+            {"idx": 6, "label": "get job ptr and args"},
+            {"idx": 7, "label": "job"},
+        ]
+    },
+% endfor
+]
\ No newline at end of file
diff --git a/target/sim/experiments/offload/run.py b/target/sim/experiments/offload/run.py
index 6f4bd9afb..e42734d7f 100755
--- a/target/sim/experiments/offload/run.py
+++ b/target/sim/experiments/offload/run.py
@@ -24,9 +24,6 @@
 FILE_DIR = Path(__file__).parent.resolve()
 TARGET_DIR = FILE_DIR / '../../'
 SNITCH_DIR = TARGET_DIR / '../../deps/snitch_cluster'
-AXPY_VERIFY_PY = SNITCH_DIR / 'sw/blas/axpy/scripts/verify.py'
-GEMM_VERIFY_PY = SNITCH_DIR / 'sw/blas/gemm/scripts/verify.py'
-KMEANS_VERIFY_PY = SNITCH_DIR / 'sw/apps/kmeans/scripts/verify.py'
 APP = 'experimental_offload'
 SOURCE_BUILD_DIR = TARGET_DIR / f'sw/host/apps/{APP}/build'
 TARGET_BUILD_DIR = FILE_DIR / 'build'
@@ -35,11 +32,6 @@
 BIN_DIR = Path('bin')
 VSIM_BUILDDIR = Path('work-vsim')
 
-KMEANS_CFG_TEMPLATE = FILE_DIR / 'data' / 'kmeans.json.tpl'
-
-KMEANS_ROI_TEMPLATE = FILE_DIR / 'roi' / 'kmeans.json.tpl'
-GEMM_ROI_TEMPLATE = FILE_DIR / 'roi' / 'gemm.json.tpl'
-
 
 def run(cmd, env=None, dry_run=False):
     cmd = [str(arg) for arg in cmd]
@@ -92,19 +84,19 @@ def build_hw(tests, dry_run=False):
 
 def post_process_traces(test, dry_run=False):
     n_clusters_to_use = test['n_clusters_to_use']
+    multicast = test['multicast']
     run_dir = test['run_dir']
     logdir = run_dir / 'logs'
     device_elf = test['device_elf']
     hw_cfg = test['hw_cfg']
     roi_spec = logdir / 'roi_spec.json'
+    app = test['app']
     # Read and render specification template JSON
-    if test['app'] == 'gemm':
-        roi_spec_tpl = GEMM_ROI_TEMPLATE
-    elif test['app'] == 'kmeans':
-        roi_spec_tpl = KMEANS_ROI_TEMPLATE
+    if app in ['gemm', 'kmeans', 'atax', 'correlation', 'covariance']:
+        roi_spec_tpl = FILE_DIR / 'roi' / f'{app}.json.tpl'
     with open(roi_spec_tpl, 'r') as f:
         spec_template = Template(f.read())
-        rendered_spec = spec_template.render(nr_clusters=n_clusters_to_use)
+        rendered_spec = spec_template.render(nr_clusters=n_clusters_to_use, multicast=multicast)
         spec = json5.loads(rendered_spec)
     with open(roi_spec, 'w') as f:
         json.dump(spec, f, indent=4)
@@ -113,21 +105,26 @@ def post_process_traces(test, dry_run=False):
     run(['make', '-C', TARGET_DIR, f'SIM_DIR={run_dir}', f'BINARY={device_elf}', 'annotate', '-j'],
         dry_run=dry_run)
     run(['make', '-C', TARGET_DIR, f'SIM_DIR={run_dir}', f'ROI_SPEC={roi_spec}',
-        f'CFG_OVERRIDE={hw_cfg}', 'visual-trace'], dry_run=dry_run)
+        f'CFG_OVERRIDE={hw_cfg}', f'BINARY={device_elf}', 'visual-trace'], dry_run=dry_run)
 
 
-def get_gemm_cfg(n):
-    filled_template = Template(filename=str(GEMM_CFG_TEMPLATE)).render(N=n)
+def get_data_cfg(test):
+    app = test['app']
+    cfg_template = str(FILE_DIR / 'data' / f'{app}.json.tpl')
+    filled_template = Template(filename=cfg_template).render(**test)
     with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
         temp_file.write(filled_template)
         return temp_file.name
 
 
-def get_kmeans_cfg(**kwargs):
-    filled_template = Template(filename=str(KMEANS_CFG_TEMPLATE)).render(**kwargs)
-    with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
-        temp_file.write(filled_template)
-        return temp_file.name
+def get_data_cfg_prefix(test):
+    # Test parameter encoding the problem size, for apps without a 'length'
+    size_keys = {'kmeans': 'n_samples', 'atax': 'N',
+                 'correlation': 'M', 'covariance': 'M'}
+    app = test['app']
+    # Other apps (e.g. axpy, mc) keep the 'L<length>' prefix instead of None
+    size = test[size_keys[app]] if app in size_keys else test.get('length')
+    return f'L{size}'
 
 
 # Get tests from a test list file
@@ -142,14 +139,15 @@ def get_tests(testlist, run_dir, hw_cfg):
     for test in tests:
 
         # Alias test parameters
-        length = test['length']
+        if 'length' in test:
+            length = test['length']
         n_clusters_to_use = test['n_clusters_to_use']
         multicast = test['multicast']
         app = test['app']
 
         # Resolve derived test parameters
         mcast_prefix = "M" if multicast else "U"
-        prefix = f'{app}/L{length}/{mcast_prefix}/N{n_clusters_to_use}'
+        prefix = f'{app}/{get_data_cfg_prefix(test)}/{mcast_prefix}/N{n_clusters_to_use}'
         full_hw_cfg = f'{mcast_prefix}-{hw_cfg}'
         hw_cfg_file = CFG_DIR / f'{full_hw_cfg}.hjson'
         vsim_builddir = VSIM_BUILDDIR / f'{full_hw_cfg}'
@@ -161,37 +159,34 @@ def get_tests(testlist, run_dir, hw_cfg):
         cflags = f'-DN_CLUSTERS_TO_USE={n_clusters_to_use}'
         if multicast:
             cflags += ' -DUSE_MULTICAST'
-        if app == 'axpy':
-            cflags += ' -DOFFLOAD_AXPY'
-        elif app == 'gemm':
-            cflags += ' -DOFFLOAD_GEMM'
-        elif app == 'kmeans':
-            cflags += ' -DOFFLOAD_KMEANS'
-        elif app == 'mc':
+        if app == 'mc':
             cflags += f' -DOFFLOAD_MONTECARLO -DMC_LENGTH={length}'
+        else:
+            cflags += f' -DOFFLOAD_{app.upper()}'
         env = extend_environment(
             RISCV_CFLAGS=cflags,
-            LENGTH=f'{length}',
             SECTION=".wide_spm",
             OFFLOAD=app)
-        if app == 'gemm':
-            gemm_cfg_file = get_gemm_cfg(length)
-            env = extend_environment(env, DATA_CFG=gemm_cfg_file)
+        if app in ['axpy', 'gemm', 'atax', 'correlation', 'covariance']:
+            data_cfg = get_data_cfg(test)
+            env = extend_environment(env, DATA_CFG=data_cfg)
         elif app == 'kmeans':
-            kmeans_cfg_file = get_kmeans_cfg(n_samples=length)
-            env = extend_environment(env, KMEANS_DATA_CFG=kmeans_cfg_file)
+            data_cfg = get_data_cfg(test)
+            env = extend_environment(env, KMEANS_DATA_CFG=data_cfg)
 
         # Extend test with derived parameters
         test['sim_bin'] = sim_bin
         test['prefix'] = prefix
         test['elf'] = elf
         test['device_elf'] = device_elf
-        if app == 'axpy':
-            test['cmd'] = [str(AXPY_VERIFY_PY), str(sim_bin), str(elf)]
-        elif app == 'gemm':
-            test['cmd'] = [str(GEMM_VERIFY_PY), str(sim_bin), str(elf)]
-        elif app == 'kmeans':
-            test['cmd'] = [str(KMEANS_VERIFY_PY), str(sim_bin), str(elf), '--no-gui']
+        if app in ['axpy', 'gemm']:
+            verify_py = str(SNITCH_DIR / f'sw/blas/{app}/scripts/verify.py')
+            test['cmd'] = [verify_py, str(sim_bin), str(elf)]
+        elif app in ['kmeans', 'atax', 'correlation', 'covariance']:
+            verify_py = str(SNITCH_DIR / f'sw/apps/{app}/scripts/verify.py')
+            test['cmd'] = [verify_py, str(sim_bin), str(elf)]
+            if app == 'kmeans':
+                test['cmd'].append('--no-gui')
         elif app == 'mc':
             test['sim_bin'] = sim_bin
         test['run_dir'] = unique_run_dir
diff --git a/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h b/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h
index 1dc19777e..b1b3c4c25 100644
--- a/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h
+++ b/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h
@@ -5,7 +5,7 @@
 #define XSSR
 #include "axpy.h"
 
-void axpy_job_unified(job_args_t* job_args) {
+void axpy_job_unified(void* job_args) {
     double* local_x;
     double* local_y;
     double* local_z;
diff --git a/target/sim/sw/device/apps/experimental_offload/src/kmeans_job.h b/target/sim/sw/device/apps/experimental_offload/src/kmeans_job.h
index d0be98c63..b1b6bc331 100644
--- a/target/sim/sw/device/apps/experimental_offload/src/kmeans_job.h
+++ b/target/sim/sw/device/apps/experimental_offload/src/kmeans_job.h
@@ -5,7 +5,7 @@ __thread uint32_t n_samples_per_cluster, n_samples_per_core;
 __thread double *local_samples, *local_centroids, *final_centroids, *partial_centroids;
 __thread uint32_t *membership, *partial_membership_cnt;
 
-void kmeans_iteration_job(job_args_t* job_args) {
+void kmeans_iteration_job(void* job_args) {
     kmeans_args_t *args = (kmeans_args_t *)job_args;
 
     // Aliases
diff --git a/target/sim/sw/device/apps/experimental_offload/src/offload.c b/target/sim/sw/device/apps/experimental_offload/src/offload.c
index a55d34c19..a5e21410a 100644
--- a/target/sim/sw/device/apps/experimental_offload/src/offload.c
+++ b/target/sim/sw/device/apps/experimental_offload/src/offload.c
@@ -10,28 +10,44 @@ __thread usr_data_t* volatile usr_data_ptr;
 __thread uint32_t local_job_addr;
 __thread uint32_t remote_job_addr;
 
+// Job arguments are already in TCDM, no need to load them with the DMA
+#define JOB_ARGS_PRELOADED
+
 #include "axpy_job.h"
 // #include "gemm_job.h"
 // #include "montecarlo_job.h"
 #include "kmeans_job.h"
+#include "atax/src/atax.h"
+#include "correlation/src/correlation.h"
+#include "covariance/src/covariance.h"
 
 // Job function type
-typedef void (*job_func_t)(job_args_t* args);
+typedef void (*job_func_t)(void* args);
 
 // Job function array
-__thread job_func_t jobs[N_JOB_TYPES] = {axpy_job_unified, NULL, NULL, kmeans_iteration_job};
+__thread job_func_t jobs[N_JOB_TYPES] = {  // Indexed by job->id in run_job()
+    axpy_job_unified,
+    NULL,  // No handler registered; presumably the gemm slot (gemm_job.h include is disabled) - confirm against the job ID enum
+    NULL,  // No handler registered; presumably the montecarlo slot (montecarlo_job.h include is disabled) - confirm
+    kmeans_iteration_job,
+    atax_job,
+    correlation_job,
+    covariance_job
+};
 
 static inline void run_job() {
     // Invoke job
 #if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST)
     job_t* job = (job_t *)local_job_addr;
     uint32_t job_id = job->id;
-    if (snrt_is_dm_core())
-        snrt_mcycle();
-    jobs[job_id](&job->args);
+    if (snrt_is_dm_core()) snrt_mcycle();  // NOTE(review): two back-to-back trace markers on the DM core
+    if (snrt_is_dm_core()) snrt_mcycle();  // Presumably keeps region indices aligned with the non-multicast path (job ptr/args fetch) - confirm
+    jobs[job_id]((void *)&job->args);
     snrt_cluster_hw_barrier();
-    if (snrt_is_dm_core())
+    if (snrt_is_dm_core()) {
+        snrt_mcycle();
         return_to_cva6_accelerated(job->offload_id);
+    }
 #else
     job_t* remote_job = (job_t*)remote_job_addr;
     job_t* local_job = (job_t *)local_job_addr;
@@ -44,9 +60,11 @@ static inline void run_job() {
         if (snrt_cluster_idx() != 0)
             snrt_dma_start_1d(&local_job->args, &remote_job->args, job_args_size(local_job->id));
         snrt_dma_wait_all();
+        snrt_mcycle();
     }
     snrt_cluster_hw_barrier();
-    jobs[local_job->id](&local_job->args);
+    jobs[local_job->id]((void *)&local_job->args);
+    if (snrt_is_dm_core()) snrt_mcycle();
     return_to_cva6(SYNC_ALL);
 #endif
 }
@@ -76,7 +94,7 @@ int main() {
     snrt_wfi();
 
 #if !defined(SUPPORTS_MULTICAST) || !defined(USE_MULTICAST)
-    // Get pointer to remote job in last cluster's TCDM
+    // Get pointer to remote job in first cluster's TCDM
     remote_job_addr = usr_data_ptr->local_job_addr;
 #endif
 
diff --git a/target/sim/sw/host/apps/experimental_offload/Makefile b/target/sim/sw/host/apps/experimental_offload/Makefile
index ed7608378..9d70f8c99 100644
--- a/target/sim/sw/host/apps/experimental_offload/Makefile
+++ b/target/sim/sw/host/apps/experimental_offload/Makefile
@@ -31,6 +31,18 @@ ifeq ($(OFFLOAD),kmeans)
 include $(APPS_DIR)/kmeans/Makefile
 endif
 
+ifeq ($(OFFLOAD),atax)
+include $(APPS_DIR)/atax/Makefile
+endif
+
+ifeq ($(OFFLOAD),correlation)
+include $(APPS_DIR)/correlation/Makefile
+endif
+
+ifeq ($(OFFLOAD),covariance)
+include $(APPS_DIR)/covariance/Makefile
+endif
+
 include ../common.mk
 
 ifneq ($(OFFLOAD),mc)
diff --git a/target/sim/sw/host/apps/experimental_offload/src/offload.c b/target/sim/sw/host/apps/experimental_offload/src/offload.c
index c3609f42e..1a364c39f 100644
--- a/target/sim/sw/host/apps/experimental_offload/src/offload.c
+++ b/target/sim/sw/host/apps/experimental_offload/src/offload.c
@@ -20,6 +20,12 @@ const int n_clusters_to_use = N_CLUSTERS;
 #elif defined(OFFLOAD_KMEANS)
 #include "kmeans/data/data.h"
 #include "kmeans_job.h"
+#elif defined(OFFLOAD_ATAX)
+#include "atax/data/data.h"
+#elif defined(OFFLOAD_CORRELATION)
+#include "correlation/data/data.h"
+#elif defined(OFFLOAD_COVARIANCE)
+#include "covariance/data/data.h"
 #endif
 
 #ifdef OFFLOAD_KMEANS
@@ -162,6 +168,92 @@ static inline void send_job_and_wakeup(job_t *job, uint64_t l1_job_ptr) {
 #endif
             break;
         }
+        case J_ATAX: {
+            atax_args_t args = job->args.atax;
+
+#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST)
+            uint64_t mask = ((n_clusters_to_use - 1) << 18);
+            enable_multicast(mask);
+#endif
+            *((volatile uint64_t *)(l1_job_ptr)) = job->id;
+            *((volatile uint8_t *)(l1_job_ptr + offsetof(job_t, offload_id))) =
+                job->offload_id;
+            *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(atax_args_t, M))) = args.M;
+            *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(atax_args_t, N))) = args.N;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(atax_args_t, A_addr))) = args.A_addr;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(atax_args_t, x_addr))) = args.x_addr;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(atax_args_t, y_addr))) = args.y_addr;
+
+            mcycle();  // Wakeup
+#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST)
+            *((volatile uint32_t *)cluster_clint_set_addr(0)) = 511;
+            disable_multicast();
+#else
+            wakeup_snitches();
+#endif
+            break;
+        }
+        case J_CORRELATION: {
+            correlation_args_t args = job->args.correlation;
+
+#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST)
+            uint64_t mask = ((n_clusters_to_use - 1) << 18);
+            enable_multicast(mask);
+#endif
+            *((volatile uint64_t *)(l1_job_ptr)) = job->id;
+            *((volatile uint8_t *)(l1_job_ptr + offsetof(job_t, offload_id))) =
+                job->offload_id;
+            *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(correlation_args_t, N))) = args.N;
+            *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(correlation_args_t, M))) = args.M;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(correlation_args_t, data_addr))) = args.data_addr;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(correlation_args_t, corr_addr))) = args.corr_addr;
+
+            mcycle();  // Wakeup
+#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST)
+            *((volatile uint32_t *)cluster_clint_set_addr(0)) = 511;
+            disable_multicast();
+#else
+            wakeup_snitches();
+#endif
+            break;
+        }
+        case J_COVARIANCE: {
+            covariance_args_t args = job->args.covariance;
+
+#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST)
+            uint64_t mask = ((n_clusters_to_use - 1) << 18);
+            enable_multicast(mask);
+#endif
+            *((volatile uint64_t *)(l1_job_ptr)) = job->id;
+            *((volatile uint8_t *)(l1_job_ptr + offsetof(job_t, offload_id))) =
+                job->offload_id;
+            *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(covariance_args_t, N))) = args.N;
+            *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(covariance_args_t, M))) = args.M;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(covariance_args_t, data_addr))) = args.data_addr;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(covariance_args_t, cov_addr))) = args.cov_addr;
+
+            mcycle();  // Wakeup
+#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST)
+            *((volatile uint32_t *)cluster_clint_set_addr(0)) = 511;
+            disable_multicast();
+#else
+            wakeup_snitches();
+#endif
+            break;
+        }
     }
 }
 
@@ -215,6 +307,37 @@ int main() {
     job_t jobs[N_JOBS];
     jobs[0] = first_iter_kmeans;
     for (uint32_t i = 1; i < N_JOBS; i++) jobs[i] = succ_iter_kmeans;
+#elif defined(OFFLOAD_ATAX)
+    // TODO should we divide M and N by n_clusters_to_use?
+    atax_args_t atax_args = {M,
+                             N,
+                             WIDE_SPM_ADDR((uint64_t)A),
+                             WIDE_SPM_ADDR((uint64_t)x),
+                             WIDE_SPM_ADDR((uint64_t)y)};
+    job_args_t job_args;
+    job_args.atax = atax_args;
+    job_t atax = {J_ATAX, 0, job_args};
+    job_t jobs[N_JOBS] = {atax, atax};
+#elif defined(OFFLOAD_CORRELATION)
+    // TODO should we divide M and N by n_clusters_to_use?
+    correlation_args_t correlation_args = {N,
+                                           M,
+                                           WIDE_SPM_ADDR((uint64_t)data),
+                                           WIDE_SPM_ADDR((uint64_t)corr)};
+    job_args_t job_args;
+    job_args.correlation = correlation_args;
+    job_t correlation = {J_CORRELATION, 0, job_args};
+    job_t jobs[N_JOBS] = {correlation, correlation};
+#elif defined(OFFLOAD_COVARIANCE)
+    // TODO should we divide M and N by n_clusters_to_use?
+    covariance_args_t covariance_args = {N,
+                                         M,
+                                         WIDE_SPM_ADDR((uint64_t)data),
+                                         WIDE_SPM_ADDR((uint64_t)cov)};
+    job_args_t job_args;
+    job_args.covariance = covariance_args;
+    job_t covariance = {J_COVARIANCE, 0, job_args};
+    job_t jobs[N_JOBS] = {covariance, covariance};
 #endif
 
     volatile uint32_t n_jobs = N_JOBS;
@@ -275,6 +398,18 @@ int main() {
     double pi_estimate = *((double *)mc_args.result_ptr);
     double err = fabs(pi_estimate - 3.14);
     if (err > 0.5) return 1;
+#elif defined(OFFLOAD_ATAX)
+    // Copy results from wide SPM to DRAM for verification
+    sys_dma_blk_memcpy((uint64_t)y, WIDE_SPM_ADDR((uint64_t)y),
+                       N * sizeof(double));
+#elif defined(OFFLOAD_CORRELATION)
+    // Copy results from wide SPM to DRAM for verification
+    sys_dma_blk_memcpy((uint64_t)corr, WIDE_SPM_ADDR((uint64_t)corr),
+                       M * M * sizeof(double));
+#elif defined(OFFLOAD_COVARIANCE)
+    // Copy results from wide SPM to DRAM for verification
+    sys_dma_blk_memcpy((uint64_t)cov, WIDE_SPM_ADDR((uint64_t)cov),
+                       M * M * sizeof(double));
 #endif
 
     // Exit routine
diff --git a/target/sim/sw/host/apps/experimental_offload/src/offload.h b/target/sim/sw/host/apps/experimental_offload/src/offload.h
index 3bd14ad46..72da2899f 100644
--- a/target/sim/sw/host/apps/experimental_offload/src/offload.h
+++ b/target/sim/sw/host/apps/experimental_offload/src/offload.h
@@ -5,6 +5,9 @@
 #include <stdint.h>
 #include "axpy/src/args.h"
 #include "kmeans/src/args.h"
+#include "atax/src/args.h"
+#include "correlation/src/args.h"
+#include "covariance/src/args.h"
 
 typedef struct {
     volatile uint32_t local_job_addr;
@@ -82,6 +85,36 @@ typedef struct {
     mc_args_t args;
 } mc_job_t;
 
+//////////
+// ATAX //
+//////////
+
+typedef struct {  // ATAX-specific job descriptor
+    uint32_t id;  // presumably a job_id_t value (J_ATAX) -- TODO confirm
+    uint8_t offload_id;  // NOTE(review): semantics not visible here; verify
+    atax_args_t args;  // presumably declared in atax/src/args.h -- confirm
+} atax_job_t;
+
+/////////////////
+// Correlation //
+/////////////////
+
+typedef struct {  // Correlation-specific job descriptor
+    uint32_t id;  // presumably a job_id_t value (J_CORRELATION) -- confirm
+    uint8_t offload_id;  // NOTE(review): semantics not visible here; verify
+    correlation_args_t args;  // presumably from correlation/src/args.h
+} correlation_job_t;
+
+////////////////
+// Covariance //
+////////////////
+
+typedef struct {  // Covariance-specific job descriptor
+    uint32_t id;  // presumably a job_id_t value (J_COVARIANCE) -- confirm
+    uint8_t offload_id;  // NOTE(review): semantics not visible here; verify
+    covariance_args_t args;  // presumably from covariance/src/args.h
+} covariance_job_t;
+
 /////////////
 // Generic //
 /////////////
@@ -95,6 +128,9 @@ typedef union {
     gemm_args_t gemm;
     mc_args_t mc;
     kmeans_args_t kmeans;
+    atax_args_t atax;
+    correlation_args_t correlation;
+    covariance_args_t covariance;
 } job_args_t;
 
 typedef struct {
@@ -103,8 +139,16 @@ typedef struct {
     job_args_t args;
 } job_t;
 
-#define N_JOB_TYPES 4
-typedef enum { J_AXPY = 0, J_GEMM = 1, J_MONTECARLO = 2, J_KMEANS = 3 } job_id_t;
+#define N_JOB_TYPES 7  // Count of job_id_t enumerators below (values 0..6)
+typedef enum {  // Job kinds; job_args_size() switches on these values
+    J_AXPY = 0,
+    J_GEMM = 1,
+    J_MONTECARLO = 2,
+    J_KMEANS = 3,  // args sized as kmeans_args_t in job_args_size()
+    J_ATAX = 4,  // args sized as atax_args_t in job_args_size()
+    J_CORRELATION = 5,  // args sized as correlation_args_t
+    J_COVARIANCE = 6  // args sized as covariance_args_t
+} job_id_t;
 
 static inline uint32_t job_args_size(job_id_t job_id) {
     switch (job_id) {
@@ -116,6 +160,12 @@ static inline uint32_t job_args_size(job_id_t job_id) {
         return sizeof(mc_args_t);
     case J_KMEANS:
         return sizeof(kmeans_args_t);
+    case J_ATAX:
+        return sizeof(atax_args_t);
+    case J_CORRELATION:
+        return sizeof(correlation_args_t);
+    case J_COVARIANCE:
+        return sizeof(covariance_args_t);
     default:
         return 0;
     }