diff --git a/CHANGELOG.md b/CHANGELOG.md
index a571ef4b0..e8e986d04 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,9 +16,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Update default Questasim version to 2022.3
 - Decrease stack size to 128 words
 - Add CFFT radix-4 and radix-2 kernels
+- Parametrize the performance counters
 
 ### Fixed
 - Fix type issue in `snitch_addr_demux`
+- Properly disable the debugging CSRs in ASIC implementations
 
 ## 0.6.0 - 2023-01-09
 
diff --git a/hardware/deps/snitch/Bender.yml b/hardware/deps/snitch/Bender.yml
index 48a702145..24427e910 100644
--- a/hardware/deps/snitch/Bender.yml
+++ b/hardware/deps/snitch/Bender.yml
@@ -13,6 +13,7 @@ dependencies:
 sources:
   - defines:
       SNITCH_ENABLE_PERF: 1
+      SNITCH_ENABLE_STALL_COUNTER: 1
     files:
       # packages
       - src/riscv_instr.sv
diff --git a/hardware/deps/snitch/src/snitch.sv b/hardware/deps/snitch/src/snitch.sv
index 01d422aa9..3b5fdb356 100644
--- a/hardware/deps/snitch/src/snitch.sv
+++ b/hardware/deps/snitch/src/snitch.sv
@@ -10,6 +10,7 @@
 `include "common_cells/assertions.svh"
 
 // `SNITCH_ENABLE_PERF Enables mcycle, minstret performance counters (read only)
+// `SNITCH_ENABLE_STALL_COUNTER Enables stall_ins, stall_raw, stall_lsu performance counters (read only)
 
 module snitch
   import snitch_pkg::meta_id_t;
@@ -291,11 +292,14 @@ module snitch
   `ifdef SNITCH_ENABLE_PERF
   logic [63:0] cycle_q;
   logic [63:0] instret_q;
+  `FFAR(cycle_q, cycle_q + 1, '0, clk_i, rst_i);
+  `FFLAR(instret_q, instret_q + 1, !stall, '0, clk_i, rst_i);
+  `endif
+
+  `ifdef SNITCH_ENABLE_STALL_COUNTER
   logic [31:0] stall_ins_q;
   logic [31:0] stall_raw_q;
   logic [31:0] stall_lsu_q;
-  `FFAR(cycle_q, cycle_q + 1, '0, clk_i, rst_i);
-  `FFLAR(instret_q, instret_q + 1, !stall, '0, clk_i, rst_i);
   `FFLAR(stall_ins_q, stall_ins_q + 1, stall && (!inst_ready_i) && inst_valid_o, '0, clk_i, rst_i)
   `FFLAR(stall_raw_q, stall_raw_q + 1, (!operands_ready) || (!dst_ready), '0, clk_i, rst_i)
   `FFLAR(stall_lsu_q, stall_lsu_q + 1, lsu_stall, '0, clk_i, rst_i)
@@ -1417,6 +1421,8 @@ module snitch
       riscv_instr::CSR_MINSTRETH: begin
         csr_rvalue = instret_q[63:32];
       end
+      `endif
+      `ifdef SNITCH_ENABLE_STALL_COUNTER
       riscv_instr::CSR_MHPMCOUNTER3: begin
         csr_rvalue = stall_ins_q[31:0];
       end
@@ -1436,8 +1442,13 @@ module snitch
   end
 
   // CSR registers
-  `FFLAR(csr_trace_q, alu_result, csr_trace_en, '0, clk_i, rst_i);
-  `FFLAR(csr_stack_limit_q, alu_result, csr_stack_limit_en, 32'hFFFF_FFFF, clk_i, rst_i);
+  `ifdef TARGET_ASIC
+    assign csr_trace_q = '0;
+    assign csr_stack_limit_q = '0; // NOTE(review): non-ASIC reset value is 32'hFFFF_FFFF; confirm '0 is intended
+  `else
+    `FFLAR(csr_trace_q, alu_result, csr_trace_en, '0, clk_i, rst_i);
+    `FFLAR(csr_stack_limit_q, alu_result, csr_stack_limit_en, 32'hFFFF_FFFF, clk_i, rst_i);
+  `endif
 
   // pragma translate_off
   always_ff @(posedge clk_i or posedge rst_i) begin
diff --git a/hardware/scripts/questa/wave.tcl b/hardware/scripts/questa/wave.tcl
index d66163071..970e59cfb 100644
--- a/hardware/scripts/questa/wave.tcl
+++ b/hardware/scripts/questa/wave.tcl
@@ -5,6 +5,26 @@
 onerror {resume}
 quietly WaveActivateNextPane {} 0
 
+# Add the cores' utilization signals to quickly get an overview of the system's activity
+set num_cores [examine -radix dec mempool_pkg::NumCores]
+
+add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $num_cores -radix unsigned /mempool_tb/snitch_utilization
+add wave -noupdate -group Utilization /mempool_tb/instruction_handshake
+add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $num_cores -radix unsigned /mempool_tb/lsu_utilization
+add wave -noupdate -group Utilization /mempool_tb/lsu_handshake
+add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $num_cores -radix unsigned /mempool_tb/lsu_pressure
+add wave -noupdate -group Utilization /mempool_tb/lsu_request
+if {[examine -radix dec /snitch_pkg::XPULPIMG]} {
+  add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $num_cores -radix unsigned /mempool_tb/gen_utilization/dspu_utilization
+  add wave -noupdate -group Utilization /mempool_tb/gen_utilization/dspu_handshake
+  add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $num_cores -radix unsigned /mempool_tb/gen_utilization/mac_utilization
+  add wave -noupdate -group Utilization /mempool_tb/gen_utilization/dspu_mac
+}
+set axi_channels [expr {[examine -radix dec mempool_pkg::NumGroups] * [examine -radix dec mempool_pkg::NumAXIMastersPerGroup]}]
+add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $axi_channels -radix unsigned /mempool_tb/axi_w_utilization
+add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $axi_channels -radix unsigned /mempool_tb/axi_r_utilization
+
+
 # Add a vector of the core's wfi signal to quickly see which cores are active
 add wave /mempool_tb/wfi
 
diff --git a/hardware/tb/mempool_tb.sv b/hardware/tb/mempool_tb.sv
index c8dd12e9b..dcefc5365 100644
--- a/hardware/tb/mempool_tb.sv
+++ b/hardware/tb/mempool_tb.sv
@@ -330,4 +330,67 @@ module mempool_tb;
     end : l2_init
   end : gen_l2_banks_init
 
+  /**************************************
+   *  Utilization                       *
+   **************************************/
+`ifndef TARGET_SYNTHESIS
+`ifndef TARGET_VERILATOR
+
+  // Cores: one bit per core for each event, summed with $countones into utilization counts
+  logic [NumCores-1:0] instruction_handshake, lsu_request, lsu_handshake;
+  int unsigned snitch_utilization, lsu_pressure, lsu_utilization;
+  assign snitch_utilization = $countones(instruction_handshake);
+  assign lsu_utilization = $countones(lsu_handshake);
+  assign lsu_pressure = $countones(lsu_request);
+  for (genvar g = 0; g < NumGroups; g++) begin
+    for (genvar t = 0; t < NumTilesPerGroup; t++) begin
+      for (genvar c = 0; c < NumCoresPerTile; c++) begin
+        logic valid_instr, stall;
+        logic lsu_valid, lsu_ready;
+        // Snitch: flagged when the instruction is valid and the core is not stalled
+        assign valid_instr = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.valid_instr;
+        assign stall = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.stall;
+        assign instruction_handshake[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = valid_instr & !stall;
+        // Interconnect: lsu_request is valid-but-not-ready, lsu_handshake is valid-and-ready
+        assign lsu_valid = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.data_qvalid_o;
+        assign lsu_ready = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.data_qready_i;
+        assign lsu_request[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = lsu_valid & !lsu_ready;
+        assign lsu_handshake[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = lsu_valid & lsu_ready;
+      end
+    end
+  end
+  // DSPU: input handshakes, and the subset whose operator matches riscv_instr::P_MAC
+  if (snitch_pkg::XPULPIMG) begin: gen_utilization
+    logic [NumCores-1:0] dspu_handshake, dspu_mac;
+    int unsigned dspu_utilization, mac_utilization;
+    assign dspu_utilization = $countones(dspu_handshake);
+    assign mac_utilization = $countones(dspu_mac);
+    for (genvar g = 0; g < NumGroups; g++) begin
+      for (genvar t = 0; t < NumTilesPerGroup; t++) begin
+        for (genvar c = 0; c < NumCoresPerTile; c++) begin
+          logic dsp_valid, dsp_ready, mac;
+          assign dsp_valid = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch_ipu.gen_xpulpimg.i_dspu.in_valid_i;
+          assign dsp_ready = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch_ipu.gen_xpulpimg.i_dspu.in_ready_o;
+          assign mac = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch_ipu.gen_xpulpimg.i_dspu.operator_i ==? riscv_instr::P_MAC;
+          assign dspu_handshake[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = dsp_valid & dsp_ready;
+          assign dspu_mac[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = dsp_valid & dsp_ready & mac;
+        end
+      end
+    end
+  end
+  // AXI: W and R channel handshakes (valid & ready), one bit per AXI master port
+  logic [NumGroups*NumAXIMastersPerGroup-1:0] w_valid, w_ready, r_ready, r_valid;
+  int unsigned axi_w_utilization, axi_r_utilization;
+  assign axi_w_utilization = $countones(w_valid & w_ready);
+  assign axi_r_utilization = $countones(r_ready & r_valid);
+  for (genvar a = 0; a < NumGroups*NumAXIMastersPerGroup; a++) begin
+    assign w_valid[a] = dut.i_mempool_cluster.axi_mst_req_o[a].w_valid;
+    assign w_ready[a] = dut.i_mempool_cluster.axi_mst_resp_i[a].w_ready;
+    assign r_ready[a] = dut.i_mempool_cluster.axi_mst_req_o[a].r_ready;
+    assign r_valid[a] = dut.i_mempool_cluster.axi_mst_resp_i[a].r_valid;
+  end
+
+`endif
+`endif
+
 endmodule : mempool_tb