Commit

[software] Run single iteration of 2dconv

SamuelRiedel committed Sep 17, 2024
1 parent 56f08f6 commit ecc141e
Showing 3 changed files with 16 additions and 143 deletions.
4 changes: 2 additions & 2 deletions config/terapool.mk
@@ -32,7 +32,7 @@ banking_factor ?= 4

# Access latency between remote groups
# Options: "7", "9" or "11":
remote_group_latency_cycles ?= 7
remote_group_latency_cycles ?= 11

# Radix for hierarchical AXI interconnect
axi_hier_radix ?= 9
@@ -45,4 +45,4 @@ dmas_per_group ?= 4 # Brust Length = 16

# L2 Banks/Channels
l2_banks = 16
l2_size ?= 16777216 # 1000000
l2_size ?= 16777216 # 1000000
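For reference, l2_size = 16777216 is 0x1000000 bytes (the trailing "# 1000000" is presumably that hex value), i.e. 16 MiB of L2 in total; split across l2_banks = 16 that is 1 MiB per bank. A quick C sanity check of the arithmetic (illustration only, not part of the build):

#include <assert.h>
#include <stdio.h>

int main(void) {
  const unsigned long l2_size = 16777216UL; // bytes, as set in config/terapool.mk
  const unsigned long l2_banks = 16UL;
  assert(l2_size == 0x1000000UL);            // "# 1000000" read as hex
  assert(l2_size == 16UL * 1024UL * 1024UL); // 16 MiB in total
  printf("L2 per bank: %lu KiB\n", l2_size / l2_banks / 1024UL); // prints 1024
  return 0;
}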
2 changes: 1 addition & 1 deletion hardware/Makefile
@@ -46,7 +46,7 @@ verilator_top ?= mempool_tb_verilator
# Python
python ?= python3
# Enable tracing
snitch_trace ?= 1
snitch_trace ?= 0

# Path to DRAMsys
dramsys_resouces_path ?= $(MEMPOOL_DIR)/hardware/deps/dram_rtl_sim/dramsys_lib/DRAMSys/configs
153 changes: 13 additions & 140 deletions software/apps/baremetal/conv2d_db/main.c
@@ -26,75 +26,13 @@ dump(time, 0);

volatile int32_t in[M * N] __attribute__((section(".l1_prio")));
volatile int32_t out[M * N] __attribute__((section(".l1_prio")));
volatile int32_t round_barrier[NUM_CORES*BANKING_FACTOR] __attribute__((section(".l1_prio")));
volatile int32_t out_l2[M * N] __attribute__((section(".l2")))
    __attribute__((aligned(NUM_CORES * BANKING_FACTOR * 4)));

uint32_t final_log_barrier(uint32_t* round_barrier, uint32_t step, uint32_t log2_radix,
                           uint32_t core_id) {
  uint32_t next_step = step << log2_radix;
  uint32_t barrier_idx = (core_id / next_step) * next_step + step - 1;
  uint32_t *log_barrier = &round_barrier[barrier_idx*BANKING_FACTOR];

  uint32_t val = __atomic_fetch_add(log_barrier, step, __ATOMIC_RELAXED);
  if (val == NUM_CORES - step) {
    // Last core of last stage
    dump_time(2);
    return (uint32_t)log_barrier;
  } else if (val == (uint32_t)(next_step - step)) {
    // Last core of this stage
    __atomic_store_n(log_barrier, 0, __ATOMIC_RELAXED);
    return final_log_barrier(round_barrier, step << log2_radix, log2_radix, core_id);
  } else {
    if (val == 0 && log_barrier == &round_barrier[0]) {
      dump_time(1);
    }
    // Middle cores, sleep
    mempool_wfi();
  }
  return 0;
}

uint32_t dma_log_barrier(uint32_t* round_barrier, uint32_t step, uint32_t log2_radix, uint32_t core_id) {
  uint32_t next_step = step << log2_radix;
  uint32_t barrier_idx = (core_id / next_step) * next_step + step - 1;
  uint32_t *log_barrier = &round_barrier[barrier_idx*BANKING_FACTOR];

  uint32_t val = __atomic_fetch_add(log_barrier, step, __ATOMIC_RELAXED);
  if (val == NUM_CORES - step) {
    // Last core of last stage
    dump_time(2);
    // Clear wfi that was triggered by the first core
    mempool_wfi();
    return (uint32_t)log_barrier;
  } else if (val == (uint32_t)(next_step - step)) {
    // Last core of this stage
    __atomic_store_n(log_barrier, 0, __ATOMIC_RELAXED);
    return dma_log_barrier(round_barrier, step << log2_radix, log2_radix, core_id);
  } else if (val == 0 && log_barrier == &round_barrier[0]) {
    // First core of first barrier in first stage
    dump_time(1);
    // Check that the DMA from the previous iteration is done
    dma_wait();
    // Wake up all cores to get to work
    wake_up_all();
    mempool_wfi();
    dump_time(0);
  } else {
    // Middle cores, sleep
    mempool_wfi();
  }
  return 0;
}
volatile int32_t out_l2[M * N] __attribute__((section(".l2")));

int main() {
  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();
  mempool_barrier_init(core_id);

  // Initial setup
  round_barrier[core_id*BANKING_FACTOR] = 0;

  int32_t kernel[KERNEL_N * KERNEL_N];

  kernel[0] = 1;
@@ -109,89 +47,24 @@ int main() {
  kernel[7] = 2;
  kernel[8] = 1;

  // Double-buffered convolution
  const int last_round = 4;
  const uint32_t log2_radix = LOG_RADIX;

  const int32_t *in_comp;
  const int32_t *in_dma;
  int32_t *out_comp;
  int32_t *out_dma;
  uint32_t bar;

  // Wait at barrier until everyone is ready
  mempool_barrier(num_cores);
  mempool_start_benchmark();

  // Initialize img
  // Initialize Matrices
  if (core_id == 0) {
    dma_memcpy_nonblocking((void *)in, (void *)in_l2, M * N / 2 * sizeof(int32_t));
    // Initial launch, Core 0 launched the data transfer
    wake_up_all();
    dump_time(0);
    dma_memcpy_blocking((void *)in, (void *)in_l2, M * N * sizeof(int32_t));
  }
  mempool_barrier(num_cores);

  mempool_start_benchmark();

  for (int round = 0; round < last_round; ++round) {
    if (round % 2 == 0) {
      in_comp = (const int32_t *)&in[0];
      out_comp = (int32_t *)&out[0];
      in_dma = (const int32_t *)&in[N * M / 2];
      out_dma = (int32_t *)&out[N * M / 2];
    } else {
      in_dma = (const int32_t *)&in[0];
      out_dma = (int32_t *)&out[0];
      in_comp = (const int32_t *)&in[N * M / 2];
      out_comp = (int32_t *)&out[N * M / 2];
    }
    mempool_wfi();
    // Barrier, launch DMA for next iteration
    bar = dma_log_barrier(round_barrier, 1, log2_radix, core_id);
    mempool_start_benchmark();
    if (bar) {
      // We are the last one, reset the barrier
      // The old data can now be overwritten with a new DMA request
      if (round != last_round - 1) {
        dma_memcpy_nonblocking((void *)in_dma, (void *)in_l2,
                               M * N / 2 * sizeof(int32_t));
      }
      if (round != 0) {
        dma_memcpy_nonblocking((void *)out_l2, (void *)out_dma,
                               M * N / 2 * sizeof(int32_t));
      }
      // We are the last one, reset the barrier
      __atomic_store_n((uint32_t *)bar, 0, __ATOMIC_RELAXED);
      if (round != last_round - 1) {
        wake_up_all();
      }
    } else {
      // Wait until the core checking the DMA gives the signal
      // mempool_wfi();
    }
    mempool_start_benchmark();
    conv2d_3x3_crazy_parallel((const int32_t *)in_comp, N, M / 2,
                              (const int32_t *)kernel, (int32_t *)out_comp,
                              core_id, NUM_CORES);
    mempool_start_benchmark();
  }

  // Last write back
  bar = final_log_barrier(round_barrier, 1, log2_radix, core_id);
  conv2d_3x3_crazy_parallel((const int32_t *)in, N, M,
                            (const int32_t *)kernel, (int32_t *)out,
                            core_id, NUM_CORES);
  mempool_start_benchmark();
  if (bar) {
    // We are the last one, reset the barrier
    // The old data can now be overwritten with a new DMA request
    dma_memcpy_blocking((void *)out_l2, (void *)out_dma,
                        M * N / 2 * sizeof(int32_t));
    // We are the last one, reset the barrier
    __atomic_store_n((uint32_t *)bar, 0, __ATOMIC_RELAXED);
    wake_up_all();
    mempool_wfi();
  }

  mempool_log_barrier(8, core_id);
  mempool_start_benchmark();
  mempool_barrier(num_cores);
  conv2d_3x3_crazy_parallel((const int32_t *)in, N, M,
                            (const int32_t *)kernel, (int32_t *)out,
                            core_id, NUM_CORES);
  mempool_start_benchmark();
  mempool_log_barrier(8, core_id);
  mempool_stop_benchmark();

  return 0;
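The two helpers this commit deletes, final_log_barrier and dma_log_barrier, implement a radix-(2^log2_radix) tree barrier over the round_barrier array: at stage `step`, each core of a 2^log2_radix-wide group adds `step` to a shared counter, the last arrival zeroes that counter and climbs to the next stage, and the last core of the final stage returns the counter's address so the caller can reset it; dma_log_barrier additionally lets the first core that reaches the first counter wait for the previous round's DMA (dma_wait) and wake the sleeping cores. A small host-side sketch of the slot indexing only (not MemPool code; 16 cores and LOG_RADIX = 2 are assumed here, and the BANKING_FACTOR scaling that spreads counters across L1 banks is dropped):

#include <stdio.h>

#define NUM_CORES 16 // assumed for illustration
#define LOG_RADIX 2  // radix-4 tree, assumed for illustration

// Same slot arithmetic as the deleted helpers (without the BANKING_FACTOR
// scaling): at a given step, the 1 << LOG_RADIX cores of a group share the
// counter at index barrier_idx.
static unsigned barrier_slot(unsigned core_id, unsigned step) {
  unsigned next_step = step << LOG_RADIX;
  return (core_id / next_step) * next_step + step - 1;
}

int main(void) {
  for (unsigned step = 1; step < NUM_CORES; step <<= LOG_RADIX) {
    printf("step %2u:", step);
    for (unsigned core = 0; core < NUM_CORES; ++core)
      printf(" %2u", barrier_slot(core, step));
    printf("\n");
  }
  // step  1: cores 0-3 share slot 0, 4-7 slot 4, 8-11 slot 8, 12-15 slot 12
  // step  4: all 16 cores meet at slot 3, the final stage for 16 cores
  return 0;
}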
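The deleted loop overlapped compute and data movement in ping-pong fashion: in even rounds the cores convolve the first half of `in` while the DMA refills the second half and drains the previously computed half of `out`, in odd rounds the halves swap, and dma_log_barrier selects the core that issues the transfers; the commit replaces this with a single non-overlapped iteration (a blocking copy-in of the full image, then the full-size convolution between barriers). A minimal host-side sketch of the buffer-swap schedule the removed loop followed (plain C; memcpy stands in for dma_memcpy_nonblocking and all MemPool barriers and wake-ups are left out):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Names mirror the deleted code; sizes are placeholders for illustration.
#define N 8
#define M 8
#define LAST_ROUND 4

static int32_t in[M * N], out[M * N];       // L1 ping-pong buffers (two halves)
static int32_t in_l2[M * N], out_l2[M * N]; // L2 source and destination

static void dma_copy(void *dst, const void *src, size_t bytes) {
  memcpy(dst, src, bytes); // stand-in for the non-blocking DMA engine
}

static void compute_half(const int32_t *src, int32_t *dst) {
  for (int i = 0; i < M * N / 2; ++i)
    dst[i] = src[i]; // stand-in for conv2d_3x3_crazy_parallel on half the rows
}

int main(void) {
  // Preload the first half, as core 0 did before entering the removed loop.
  dma_copy(in, in_l2, M * N / 2 * sizeof(int32_t));

  for (int round = 0; round < LAST_ROUND; ++round) {
    // Even rounds compute on the first half while the DMA owns the second
    // half; odd rounds swap the roles (in_comp/in_dma in the deleted code).
    int comp = (round % 2 == 0) ? 0 : M * N / 2;
    int dma = (round % 2 == 0) ? M * N / 2 : 0;

    if (round != LAST_ROUND - 1) // refill the half we are not computing on
      dma_copy(&in[dma], in_l2, M * N / 2 * sizeof(int32_t));
    if (round != 0) // drain the half computed in the previous round
      dma_copy(out_l2, &out[dma], M * N / 2 * sizeof(int32_t));

    compute_half(&in[comp], &out[comp]);
  }

  // Write back the half computed in the last round.
  dma_copy(out_l2, &out[(M * N / 2) * ((LAST_ROUND - 1) % 2)],
           M * N / 2 * sizeof(int32_t));
  printf("done\n");
  return 0;
}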
