Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance metrics #89

Merged
merged 3 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Update default Questasim version to 2022.3
- Decrease stack size to 128 words
- Add CFFT radix-4 and radix-2 kernels
- Parametrize the performance counters

### Fixed
- Fix type issue in `snitch_addr_demux`
- Properly disable the debugging CSRs in ASIC implementations

## 0.6.0 - 2023-01-09

Expand Down
1 change: 1 addition & 0 deletions hardware/deps/snitch/Bender.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies:
sources:
- defines:
# Enables the mcycle and minstret performance counters in snitch.sv
SNITCH_ENABLE_PERF: 1
# Enables the stall_ins, stall_raw and stall_lsu performance counters in snitch.sv
SNITCH_ENABLE_STALL_COUNTER: 1
files:
# packages
- src/riscv_instr.sv
Expand Down
19 changes: 15 additions & 4 deletions hardware/deps/snitch/src/snitch.sv
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
`include "common_cells/assertions.svh"

// `SNITCH_ENABLE_PERF Enables mcycle, minstret performance counters (read only)
// `SNITCH_ENABLE_STALL_COUNTER Enables stall_ins, stall_raw, stall_lsu performance counters (read only)

module snitch
import snitch_pkg::meta_id_t;
Expand Down Expand Up @@ -291,11 +292,14 @@ module snitch
`ifdef SNITCH_ENABLE_PERF
  // Base performance counters (cycle and retired-instruction counts).
  logic [63:0] cycle_q;
  logic [63:0] instret_q;
  // cycle_q has no load enable: its next value is always cycle_q + 1.
  `FFAR(cycle_q, cycle_q + 1, '0, clk_i, rst_i);
  // instret_q only takes its incremented value while the core is not stalled.
  `FFLAR(instret_q, instret_q + 1, !stall, '0, clk_i, rst_i);
`endif

`ifdef SNITCH_ENABLE_STALL_COUNTER
  // Stall counters. cycle_q and instret_q are deliberately registered only in
  // the SNITCH_ENABLE_PERF region above: registering them here as well would
  // create a second driver when both macros are defined, and would reference
  // undeclared signals when only SNITCH_ENABLE_STALL_COUNTER is defined.
  logic [31:0] stall_ins_q;
  logic [31:0] stall_raw_q;
  logic [31:0] stall_lsu_q;
  // Stalled while an instruction request is pending but not yet accepted.
  `FFLAR(stall_ins_q, stall_ins_q + 1, stall && (!inst_ready_i) && inst_valid_o, '0, clk_i, rst_i)
  // Operands or the destination register are not ready.
  `FFLAR(stall_raw_q, stall_raw_q + 1, (!operands_ready) || (!dst_ready), '0, clk_i, rst_i)
  // The load/store unit reports a stall.
  `FFLAR(stall_lsu_q, stall_lsu_q + 1, lsu_stall, '0, clk_i, rst_i)
Expand Down Expand Up @@ -1417,6 +1421,8 @@ module snitch
// Reads return the upper 32 bits of the 64-bit instret_q counter.
riscv_instr::CSR_MINSTRETH: begin
csr_rvalue = instret_q[63:32];
end
`endif
// The stall counters are only readable when SNITCH_ENABLE_STALL_COUNTER is
// defined; this guard is independent of the conditional region closed above.
`ifdef SNITCH_ENABLE_STALL_COUNTER
// Reads return the 32-bit stall_ins_q counter.
riscv_instr::CSR_MHPMCOUNTER3: begin
csr_rvalue = stall_ins_q[31:0];
end
Expand All @@ -1436,8 +1442,13 @@ module snitch
end

// CSR registers
// The trace and stack-limit debugging CSRs are tied to zero when TARGET_ASIC
// is defined; otherwise they are registers loaded from alu_result whenever the
// matching enable is asserted. Each signal must have exactly one driver, so no
// unconditional register is instantiated outside this `ifdef.
`ifdef TARGET_ASIC
  assign csr_trace_q       = '0;
  assign csr_stack_limit_q = '0;
`else
  `FFLAR(csr_trace_q, alu_result, csr_trace_en, '0, clk_i, rst_i);
  // The stack limit resets to all ones rather than zero.
  `FFLAR(csr_stack_limit_q, alu_result, csr_stack_limit_en, 32'hFFFF_FFFF, clk_i, rst_i);
`endif

// pragma translate_off
always_ff @(posedge clk_i or posedge rst_i) begin
Expand Down
20 changes: 20 additions & 0 deletions hardware/scripts/questa/wave.tcl
SamuelRiedel marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,26 @@
onerror {resume}
quietly WaveActivateNextPane {} 0

# Add the testbench's utilization signals to quickly get an overview of the system's activity.
# Each counter is shown as an analog step trace, followed by the per-unit bit vector it is
# derived from.

# Number of cores, used as the full-scale value of the per-core analog traces
set num_cores [examine -radix dec mempool_pkg::NumCores]

add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $num_cores -radix unsigned /mempool_tb/snitch_utilization
add wave -noupdate -group Utilization /mempool_tb/instruction_handshake
add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $num_cores -radix unsigned /mempool_tb/lsu_utilization
add wave -noupdate -group Utilization /mempool_tb/lsu_handshake
add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $num_cores -radix unsigned /mempool_tb/lsu_pressure
add wave -noupdate -group Utilization /mempool_tb/lsu_request
# The DSPU signals are only added when snitch_pkg::XPULPIMG evaluates to true
if {[examine -radix dec /snitch_pkg::XPULPIMG]} {
add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $num_cores -radix unsigned /mempool_tb/gen_utilization/dspu_utilization
add wave -noupdate -group Utilization /mempool_tb/gen_utilization/dspu_handshake
add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $num_cores -radix unsigned /mempool_tb/gen_utilization/mac_utilization
add wave -noupdate -group Utilization /mempool_tb/gen_utilization/dspu_mac
}
# Total number of AXI master channels; the expression is braced so that the values returned by
# 'examine' are substituted exactly once and the expression can be byte-compiled
set axi_channels [expr {[examine -radix dec mempool_pkg::NumGroups] * [examine -radix dec mempool_pkg::NumAXIMastersPerGroup]}]
add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $axi_channels -radix unsigned /mempool_tb/axi_w_utilization
add wave -noupdate -group Utilization -color {Cornflower Blue} -format Analog-Step -height 84 -max $axi_channels -radix unsigned /mempool_tb/axi_r_utilization


# Add a vector of the core's wfi signal to quickly see which cores are active
add wave /mempool_tb/wfi

Expand Down
63 changes: 63 additions & 0 deletions hardware/tb/mempool_tb.sv
SamuelRiedel marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -330,4 +330,67 @@ module mempool_tb;
end : l2_init
end : gen_l2_banks_init

/**************************************
 *  Utilization Probes                *
 **************************************/
// Activity monitors for the cores, their LSUs, the DSPUs and the AXI master
// ports. Every monitor is a bit vector with one flag per unit plus an integer
// holding the number of flags currently set. The whole section is left out
// when TARGET_SYNTHESIS or TARGET_VERILATOR is defined.
`ifndef TARGET_SYNTHESIS
`ifndef TARGET_VERILATOR

  // ------------------------------------------------------------------
  // Cores: instruction issue and LSU request interface
  // ------------------------------------------------------------------
  logic [NumCores-1:0] instruction_handshake;
  logic [NumCores-1:0] lsu_request;
  logic [NumCores-1:0] lsu_handshake;
  int unsigned snitch_utilization;
  int unsigned lsu_pressure;
  int unsigned lsu_utilization;

  assign snitch_utilization = $countones(instruction_handshake);
  assign lsu_pressure       = $countones(lsu_request);
  assign lsu_utilization    = $countones(lsu_handshake);

  for (genvar g = 0; g < NumGroups; g++) begin
    for (genvar t = 0; t < NumTilesPerGroup; t++) begin
      for (genvar c = 0; c < NumCoresPerTile; c++) begin
        // Flat position of core (g, t, c) in the per-core vectors
        localparam int unsigned CoreIdx = g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c;

        logic instr_valid, instr_stall;
        logic req_valid, req_ready;

        // Snitch: a valid instruction that is not stalled
        assign instr_valid = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.valid_instr;
        assign instr_stall = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.stall;
        assign instruction_handshake[CoreIdx] = instr_valid & !instr_stall;

        // Interconnect: data request valid/ready pair of the core
        assign req_valid = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.data_qvalid_o;
        assign req_ready = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.data_qready_i;
        // Request offered but not accepted
        assign lsu_request[CoreIdx]   = req_valid & !req_ready;
        // Request offered and accepted
        assign lsu_handshake[CoreIdx] = req_valid & req_ready;
      end
    end
  end

  // ------------------------------------------------------------------
  // DSPU: only probed when snitch_pkg::XPULPIMG is set
  // ------------------------------------------------------------------
  if (snitch_pkg::XPULPIMG) begin: gen_utilization
    logic [NumCores-1:0] dspu_handshake;
    logic [NumCores-1:0] dspu_mac;
    int unsigned dspu_utilization;
    int unsigned mac_utilization;

    assign dspu_utilization = $countones(dspu_handshake);
    assign mac_utilization  = $countones(dspu_mac);

    for (genvar g = 0; g < NumGroups; g++) begin
      for (genvar t = 0; t < NumTilesPerGroup; t++) begin
        for (genvar c = 0; c < NumCoresPerTile; c++) begin
          // Flat position of core (g, t, c) in the per-core vectors
          localparam int unsigned CoreIdx = g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c;

          logic in_valid, in_ready, is_mac;

          assign in_valid = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch_ipu.gen_xpulpimg.i_dspu.in_valid_i;
          assign in_ready = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch_ipu.gen_xpulpimg.i_dspu.in_ready_o;
          // Wildcard match of the DSPU operator against the P_MAC encoding
          assign is_mac   = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch_ipu.gen_xpulpimg.i_dspu.operator_i ==? riscv_instr::P_MAC;

          // Any accepted DSPU input
          assign dspu_handshake[CoreIdx] = in_valid & in_ready;
          // Accepted DSPU input whose operator matches P_MAC
          assign dspu_mac[CoreIdx]       = in_valid & in_ready & is_mac;
        end
      end
    end
  end

  // ------------------------------------------------------------------
  // AXI: W and R channel handshakes of every AXI master port
  // ------------------------------------------------------------------
  logic [NumGroups*NumAXIMastersPerGroup-1:0] w_valid, w_ready, r_ready, r_valid;
  int unsigned axi_w_utilization;
  int unsigned axi_r_utilization;

  assign axi_w_utilization = $countones(w_valid & w_ready);
  assign axi_r_utilization = $countones(r_ready & r_valid);

  for (genvar a = 0; a < NumGroups*NumAXIMastersPerGroup; a++) begin
    // Write data channel
    assign w_valid[a] = dut.i_mempool_cluster.axi_mst_req_o[a].w_valid;
    assign w_ready[a] = dut.i_mempool_cluster.axi_mst_resp_i[a].w_ready;
    // Read data channel
    assign r_ready[a] = dut.i_mempool_cluster.axi_mst_req_o[a].r_ready;
    assign r_valid[a] = dut.i_mempool_cluster.axi_mst_resp_i[a].r_valid;
  end

`endif
`endif

endmodule : mempool_tb