From cdd7a8a5af9db98ac6e5eaa9ac680009eb68c134 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 22 Jun 2023 13:59:31 +0200 Subject: [PATCH] occamy: Extend wide interconnect with multicast --- docs/rm/2_addrmap.md | 82 +---- hw/occamy/occamy_soc.sv.tpl | 1 + target/sim/cfg/Q1C2.hjson | 409 +++++++++++++++++++++++ target/sim/cfg/Q1C4.hjson | 409 +++++++++++++++++++++++ target/sim/cfg/Q2C2.hjson | 409 +++++++++++++++++++++++ target/sim/cfg/Q4C1.hjson | 409 +++++++++++++++++++++++ target/sim/sw/device/Makefile | 1 + target/sim/sw/device/apps/mcast/Makefile | 10 + target/sim/sw/device/apps/mcast/main.c | 44 +++ target/sim/sw/host/apps/offload/Makefile | 1 + util/occamygen/occamygen.py | 96 ++++-- util/solder/solder.py | 44 +-- 12 files changed, 1784 insertions(+), 131 deletions(-) create mode 100644 target/sim/cfg/Q1C2.hjson create mode 100644 target/sim/cfg/Q1C4.hjson create mode 100644 target/sim/cfg/Q2C2.hjson create mode 100644 target/sim/cfg/Q4C1.hjson create mode 100644 target/sim/sw/device/apps/mcast/Makefile create mode 100644 target/sim/sw/device/apps/mcast/main.c diff --git a/docs/rm/2_addrmap.md b/docs/rm/2_addrmap.md index 63f6ff01c..acd208f8b 100644 --- a/docs/rm/2_addrmap.md +++ b/docs/rm/2_addrmap.md @@ -41,15 +41,10 @@ This is the current address map of occamy. 
Note that the Quadrants address map h | HBM\_CFG\_CTRL | 64.0 KB | used | 0x0a80\_0000 | 0x0a80\_ffff | | - | 7.9 MB | free | 0x0a81\_0000 | 0x0aff\_ffff | | QUAD\_0\_CFG | 64.0 KB | used | 0x0b00\_0000 | 0x0b00\_ffff | -| QUAD\_1\_CFG | 64.0 KB | used | 0x0b01\_0000 | 0x0b01\_ffff | -| QUAD\_2\_CFG | 64.0 KB | used | 0x0b02\_0000 | 0x0b02\_ffff | -| QUAD\_3\_CFG | 64.0 KB | used | 0x0b03\_0000 | 0x0b03\_ffff | -| QUAD\_4\_CFG | 64.0 KB | used | 0x0b04\_0000 | 0x0b04\_ffff | -| QUAD\_5\_CFG | 64.0 KB | used | 0x0b05\_0000 | 0x0b05\_ffff | -| - | 15.6 MB | free | 0x0b06\_0000 | 0x0bff\_ffff | +| - | 15.9 MB | free | 0x0b01\_0000 | 0x0bff\_ffff | | PLIC | 64.0 MB | used | 0x0c00\_0000 | 0x0fff\_ffff | -| QUADRANTS | 6.0 MB | used | 0x1000\_0000 | 0x105f\_ffff | -| - | 10.0 MB | free | 0x1060\_0000 | 0x10ff\_ffff | +| QUADRANTS | 256.0 KB | used | 0x1000\_0000 | 0x1003\_ffff | +| - | 15.7 MB | free | 0x1004\_0000 | 0x10ff\_ffff | | SYS\_IDMA\_CFG | 64.0 KB | used | 0x1100\_0000 | 0x1100\_ffff | | - | 239.9 MB | free | 0x1101\_0000 | 0x1fff\_ffff | | PCIE | 640.0 MB | used | 0x2000\_0000 | 0x47ff\_ffff | @@ -81,74 +76,5 @@ This is the current address map of occamy. 
Note that the Quadrants address map h | 0 | 0 | CLUSTER\_TCDM | 128.0 KB | 0x1000\_0000 | 0x1001\_ffff | | 0 | 0 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1002\_0000 | 0x1002\_ffff | | 0 | 0 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1003\_0000 | 0x1003\_ffff | -| 0 | 1 | CLUSTER\_TCDM | 128.0 KB | 0x1004\_0000 | 0x1005\_ffff | -| 0 | 1 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1006\_0000 | 0x1006\_ffff | -| 0 | 1 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1007\_0000 | 0x1007\_ffff | -| 0 | 2 | CLUSTER\_TCDM | 128.0 KB | 0x1008\_0000 | 0x1009\_ffff | -| 0 | 2 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x100a\_0000 | 0x100a\_ffff | -| 0 | 2 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x100b\_0000 | 0x100b\_ffff | -| 0 | 3 | CLUSTER\_TCDM | 128.0 KB | 0x100c\_0000 | 0x100d\_ffff | -| 0 | 3 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x100e\_0000 | 0x100e\_ffff | -| 0 | 3 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x100f\_0000 | 0x100f\_ffff | -| 1 | 0 | CLUSTER\_TCDM | 128.0 KB | 0x1010\_0000 | 0x1011\_ffff | -| 1 | 0 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1012\_0000 | 0x1012\_ffff | -| 1 | 0 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1013\_0000 | 0x1013\_ffff | -| 1 | 1 | CLUSTER\_TCDM | 128.0 KB | 0x1014\_0000 | 0x1015\_ffff | -| 1 | 1 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1016\_0000 | 0x1016\_ffff | -| 1 | 1 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1017\_0000 | 0x1017\_ffff | -| 1 | 2 | CLUSTER\_TCDM | 128.0 KB | 0x1018\_0000 | 0x1019\_ffff | -| 1 | 2 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x101a\_0000 | 0x101a\_ffff | -| 1 | 2 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x101b\_0000 | 0x101b\_ffff | -| 1 | 3 | CLUSTER\_TCDM | 128.0 KB | 0x101c\_0000 | 0x101d\_ffff | -| 1 | 3 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x101e\_0000 | 0x101e\_ffff | -| 1 | 3 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x101f\_0000 | 0x101f\_ffff | -| 2 | 0 | CLUSTER\_TCDM | 128.0 KB | 0x1020\_0000 | 0x1021\_ffff | -| 2 | 0 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1022\_0000 | 0x1022\_ffff | -| 2 | 0 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1023\_0000 | 0x1023\_ffff | -| 2 | 1 | CLUSTER\_TCDM | 128.0 KB | 
0x1024\_0000 | 0x1025\_ffff | -| 2 | 1 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1026\_0000 | 0x1026\_ffff | -| 2 | 1 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1027\_0000 | 0x1027\_ffff | -| 2 | 2 | CLUSTER\_TCDM | 128.0 KB | 0x1028\_0000 | 0x1029\_ffff | -| 2 | 2 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x102a\_0000 | 0x102a\_ffff | -| 2 | 2 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x102b\_0000 | 0x102b\_ffff | -| 2 | 3 | CLUSTER\_TCDM | 128.0 KB | 0x102c\_0000 | 0x102d\_ffff | -| 2 | 3 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x102e\_0000 | 0x102e\_ffff | -| 2 | 3 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x102f\_0000 | 0x102f\_ffff | -| 3 | 0 | CLUSTER\_TCDM | 128.0 KB | 0x1030\_0000 | 0x1031\_ffff | -| 3 | 0 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1032\_0000 | 0x1032\_ffff | -| 3 | 0 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1033\_0000 | 0x1033\_ffff | -| 3 | 1 | CLUSTER\_TCDM | 128.0 KB | 0x1034\_0000 | 0x1035\_ffff | -| 3 | 1 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1036\_0000 | 0x1036\_ffff | -| 3 | 1 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1037\_0000 | 0x1037\_ffff | -| 3 | 2 | CLUSTER\_TCDM | 128.0 KB | 0x1038\_0000 | 0x1039\_ffff | -| 3 | 2 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x103a\_0000 | 0x103a\_ffff | -| 3 | 2 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x103b\_0000 | 0x103b\_ffff | -| 3 | 3 | CLUSTER\_TCDM | 128.0 KB | 0x103c\_0000 | 0x103d\_ffff | -| 3 | 3 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x103e\_0000 | 0x103e\_ffff | -| 3 | 3 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x103f\_0000 | 0x103f\_ffff | -| 4 | 0 | CLUSTER\_TCDM | 128.0 KB | 0x1040\_0000 | 0x1041\_ffff | -| 4 | 0 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1042\_0000 | 0x1042\_ffff | -| 4 | 0 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1043\_0000 | 0x1043\_ffff | -| 4 | 1 | CLUSTER\_TCDM | 128.0 KB | 0x1044\_0000 | 0x1045\_ffff | -| 4 | 1 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1046\_0000 | 0x1046\_ffff | -| 4 | 1 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1047\_0000 | 0x1047\_ffff | -| 4 | 2 | CLUSTER\_TCDM | 128.0 KB | 0x1048\_0000 | 0x1049\_ffff | -| 4 | 2 | CLUSTER\_PERIPHERAL | 64.0 KB | 
0x104a\_0000 | 0x104a\_ffff | -| 4 | 2 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x104b\_0000 | 0x104b\_ffff | -| 4 | 3 | CLUSTER\_TCDM | 128.0 KB | 0x104c\_0000 | 0x104d\_ffff | -| 4 | 3 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x104e\_0000 | 0x104e\_ffff | -| 4 | 3 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x104f\_0000 | 0x104f\_ffff | -| 5 | 0 | CLUSTER\_TCDM | 128.0 KB | 0x1050\_0000 | 0x1051\_ffff | -| 5 | 0 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1052\_0000 | 0x1052\_ffff | -| 5 | 0 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1053\_0000 | 0x1053\_ffff | -| 5 | 1 | CLUSTER\_TCDM | 128.0 KB | 0x1054\_0000 | 0x1055\_ffff | -| 5 | 1 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x1056\_0000 | 0x1056\_ffff | -| 5 | 1 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x1057\_0000 | 0x1057\_ffff | -| 5 | 2 | CLUSTER\_TCDM | 128.0 KB | 0x1058\_0000 | 0x1059\_ffff | -| 5 | 2 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x105a\_0000 | 0x105a\_ffff | -| 5 | 2 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x105b\_0000 | 0x105b\_ffff | -| 5 | 3 | CLUSTER\_TCDM | 128.0 KB | 0x105c\_0000 | 0x105d\_ffff | -| 5 | 3 | CLUSTER\_PERIPHERAL | 64.0 KB | 0x105e\_0000 | 0x105e\_ffff | -| 5 | 3 | CLUSTER\_ZERO\_MEM | 64.0 KB | 0x105f\_0000 | 0x105f\_ffff | -| - | - | EMPTY | 10.0 MB | 0x1060\_0000 | 0x10ff\_ffff | +| - | - | EMPTY | 15.7 MB | 0x1004\_0000 | 0x10ff\_ffff | diff --git a/hw/occamy/occamy_soc.sv.tpl b/hw/occamy/occamy_soc.sv.tpl index 4d25d5333..9ad7a78aa 100644 --- a/hw/occamy/occamy_soc.sv.tpl +++ b/hw/occamy/occamy_soc.sv.tpl @@ -419,6 +419,7 @@ module ${name}_soc logic [${wide_in.iw-1}:0] id; logic [${wide_in.aw-1}:0] src, dst; logic [${wide_in.aw-1}:0] num_bytes; + logic user_src, user_dst; axi_pkg::cache_t cache_src, cache_dst; axi_pkg::burst_t burst_src, burst_dst; logic decouple_rw; diff --git a/target/sim/cfg/Q1C2.hjson b/target/sim/cfg/Q1C2.hjson new file mode 100644 index 000000000..f7bc9b2bf --- /dev/null +++ b/target/sim/cfg/Q1C2.hjson @@ -0,0 +1,409 @@ +// Copyright 2020 ETH Zurich and University of Bologna. 
+// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Cluster configuration for Occamy. +{ + is_remote_quadrant: false, + remote_quadrants: [], + enable_multicast: true, + quadrant_pre_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + pre_xbar_slv_id_width_no_rocache: 3, + wide_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + quadrant_inter_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + hbm_xbar: { + max_slv_trans: 128, + max_mst_trans: 128, + fall_through: false, + }, + narrow_xbar: { + max_slv_trans: 32, + max_mst_trans: 32, + fall_through: false, + }, + cuts: { + narrow_to_quad: 3, + quad_to_narrow: 3, + quad_to_pre: 1, + pre_to_inter: 1, + inter_to_quad: 3, + narrow_to_cva6: 2, + narrow_conv_to_spm_narrow_pre: 2, + narrow_conv_to_spm_narrow: 1, + narrow_and_pcie: 3, + narrow_and_wide: 1, + wide_conv_to_spm_wide: 3, + wide_to_wide_zero_mem: 0, + wide_to_hbm: 3, + wide_and_inter: 3, + wide_and_hbi: 3, + narrow_and_hbi: 3, + pre_to_hbmx: 3, + hbmx_to_hbm: 3, + atomic_adapter_narrow: 1, + atomic_adapter_narrow_wide: 1, + // Give some flexibility in peripheral xbar placement + periph_axi_lite_narrow: 2, + periph_axi_lite: 2, + periph_axi_lite_narrow_hbm_xbar_cfg: 2, + // Non-right-side chip peripherals + periph_axi_lite_narrow_hbm_cfg: 3, + periph_axi_lite_narrow_pcie_cfg: 3, + periph_axi_lite_narrow_chip_ctrl_cfg: 3, + periph_axi_lite_narrow_hbi_narrow_cfg: 3, + periph_axi_lite_narrow_hbi_wide_cfg: 3, + periph_axi_lite_narrow_bootrom_cfg: 3, + periph_axi_lite_narrow_fll_system_cfg: 3, + periph_axi_lite_narrow_fll_periph_cfg: 3, + periph_axi_lite_narrow_fll_hbm2e_cfg: 3, + // Right-side or latency-invariant chip peripherals + periph_axi_lite_narrow_soc_ctrl_cfg: 1, + periph_axi_lite_narrow_uart_cfg: 1, + periph_axi_lite_narrow_i2c_cfg: 1, + periph_axi_lite_narrow_gpio_cfg: 1, + periph_axi_lite_narrow_clint_cfg: 1, 
+ periph_axi_lite_narrow_plic_cfg: 1, + periph_axi_lite_narrow_spim_cfg: 1, + periph_axi_lite_narrow_timer_cfg: 1, + } + txns: { + wide_and_inter: 128, + wide_to_hbm: 128, + narrow_and_wide: 16, + rmq: 4, + } + narrow_xbar_slv_id_width: 4, + narrow_xbar_user_width: 5, // clog2(total number of clusters) + nr_s1_quadrant: 1, + s1_quadrant: { + nr_clusters: 2, + // number of pending transactions on the narrow/wide network + narrow_trans: 32, + wide_trans: 32, + // Disable for easier flow trials. + ro_cache_cfg: { + width: 1024, + count: 128, + sets: 2, + max_trans: 32, + address_regions: 4, + } + narrow_tlb_cfg: { + max_trans: 32, + l1_num_entries: 8, + l1_cut_ax: true, + } + wide_tlb_cfg: { + max_trans: 32, + l1_num_entries: 8, + l1_cut_ax: true, + } + wide_xbar: { + max_slv_trans: 32, + max_mst_trans: 32, + fall_through: false, + }, + wide_xbar_slv_id_width: 3 + narrow_xbar: { + max_slv_trans: 8, + max_mst_trans: 8, + fall_through: false, + }, + narrow_xbar_slv_id_width: 4, + narrow_xbar_user_width: 5, // clog2(total number of clusters) + cfg_base_addr: 184549376, // 0x0b000000 + cfg_base_offset: 65536 // 0x10000 + }, + cluster: { + name: "occamy_cluster" + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x10000000 + cluster_base_offset: 262144 // 0x40000 + cluster_base_hartid: 1, + addr_width: 48, + data_width: 64, + user_width: 5, // clog2(total number of clusters) + tcdm: { + size: 128, // 128 kiB + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + dma_data_width: 512, + dma_user_width: 48, // same as addr_width + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, + // Timing parameters + timing: { + lat_comp_fp32: 2, 
+ lat_comp_fp64: 3, + lat_comp_fp16: 1, + lat_comp_fp16_alt: 1, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 2, + lat_sdotp: 3, + fpu_pipe_config: "BEFORE" + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. + register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ], + } + // Templates. 
+ compute_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + ssr_nr_credits: 4, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // SSSR configuration below + ssr_intersection: true, + ssr_intersection_triple: [0, 1, 2], + ssrs: [ + {indirection: true}, // Master 0 + {indirection: true}, // Master 1 + {}, // Slave + ], + }, + dma_core_template: { + isa: "rv32imafd", + // Xdiv_sqrt: true, + # isa: "rv32ema", + xdma: true + xssr: false + xfrep: false + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } + // peripherals + peripherals: { + rom: { + address: 16777216, // 0x0100_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + clint: { + address: 67108864, // 0x0400_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + axi_lite_peripherals: [ + { + name: "debug", + address: 0, // 0x0000_0000 + length: 4096, // 4 kiB 0x1000 + } + ], + axi_lite_narrow_peripherals: [ + { + name: "soc_ctrl", + address: 33554432, // 0x0200_0000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "fll_system", + address: 33558528, // 0x0200_1000 + length: 1024, // 1 kiB 0x400 + }, + { + name: "fll_periph", + address: 33559552, // 0x0200_1400 + length: 1024, // 1 kiB 0x400 + }, + { + name: "fll_hbm2e", + address: 33560576, // 0x0200_1800 + length: 1024, // 1 kiB 0x400 + }, + { + name: "uart", + address: 33562624, // 0x0200_2000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "gpio", + address: 33566720, // 0x0200_3000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "i2c", + 
address: 33570816, // 0x0200_4000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "chip_ctrl", + address: 33574912, // 0x0200_5000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "timer", + address: 33579008, // 0x0200_6000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "hbm_xbar_cfg", + address: 33583104, // 0x0200_7000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "spim", + address: 50331648, // 0x0300_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + { + name: "pcie_cfg", + address: 83886080, // 0x0500_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + { + name: "hbi_wide_cfg", + address: 100663296, // 0x0600_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + { + name: "hbi_narrow_cfg", + address: 117440512, // 0x0700_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + { + name: "plic", + address: 201326592, // 0x0C00_0000 + length: 67108864, // 64 MiB 0x400_0000 + }, + ], + }, + // non-peripheral IPs + pcie: { + address_io: 536870912, // 0x2000_0000 + address_mm: 1207959552, // 0x4800_0000 + length: 671088640, // 640 MiB 0x2800_0000 + }, + spm_narrow: { + address: 1879048192, // 0x7000_0000 + length: 524288, // 512 kiB 0x8_0000 + # An uncached alias address space of the same length + uncached_alias: 1879572480, // 0x7008_0000 + }, + spm_wide: { + address: 1895825408, // 0x7100_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + wide_zero_mem: { + address: 4294967296, // 0x1_0000_0000 + length: 8589934592, // 8 GiB 0x2_0000_0000 + }, + sys_idma_cfg: { + address: 285212672, // 0x1100_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + hbi: { + address: 1099511627776, // 0x100_0000_0000 + length: 1099511627776, // 1 TiB 0x100_0000_0000 + } + hbm: { + address_0: 2147483648, // 0x8000_0000 + address_1: 68719476736, // 0x10_0000_0000 + channel_size: 1073741824, // 1 GiB 0x4000_0000 + nr_channels_total: 8, + nr_channels_address_0: 2, + cfg_regions: { + top: { + address: 134217728, // 0x0800_0000 + length: 4194304, // 4 MiB 0x40_0000 + }, + phy: { + address: 150994944 //
0x0900_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + seq: { + address: 167772160, // 0x0A00_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + ctrl: { + address: 176160768, // 0x0A80_0000 + length: 65536, // 64 kiB 0x1_0000 + } + } + }, + // dram corresponds to 'hbm address_0' and 'nr_channels_address_0' + dram: { + address: 2147483648, // 0x8000_0000 + length: 2147483648, // 2 GiB 0x8000_0000 + }, +} diff --git a/target/sim/cfg/Q1C4.hjson b/target/sim/cfg/Q1C4.hjson new file mode 100644 index 000000000..e51394324 --- /dev/null +++ b/target/sim/cfg/Q1C4.hjson @@ -0,0 +1,409 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Cluster configuration for Occamy. +{ + is_remote_quadrant: false, + remote_quadrants: [], + enable_multicast: true, + quadrant_pre_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + pre_xbar_slv_id_width_no_rocache: 3, + wide_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + quadrant_inter_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + hbm_xbar: { + max_slv_trans: 128, + max_mst_trans: 128, + fall_through: false, + }, + narrow_xbar: { + max_slv_trans: 32, + max_mst_trans: 32, + fall_through: false, + }, + cuts: { + narrow_to_quad: 3, + quad_to_narrow: 3, + quad_to_pre: 1, + pre_to_inter: 1, + inter_to_quad: 3, + narrow_to_cva6: 2, + narrow_conv_to_spm_narrow_pre: 2, + narrow_conv_to_spm_narrow: 1, + narrow_and_pcie: 3, + narrow_and_wide: 1, + wide_conv_to_spm_wide: 3, + wide_to_wide_zero_mem: 0, + wide_to_hbm: 3, + wide_and_inter: 3, + wide_and_hbi: 3, + narrow_and_hbi: 3, + pre_to_hbmx: 3, + hbmx_to_hbm: 3, + atomic_adapter_narrow: 1, + atomic_adapter_narrow_wide: 1, + // Give some flexibility in peripheral xbar placement + periph_axi_lite_narrow: 2, + periph_axi_lite: 2, + periph_axi_lite_narrow_hbm_xbar_cfg: 2, + // 
Non-right-side chip peripherals + periph_axi_lite_narrow_hbm_cfg: 3, + periph_axi_lite_narrow_pcie_cfg: 3, + periph_axi_lite_narrow_chip_ctrl_cfg: 3, + periph_axi_lite_narrow_hbi_narrow_cfg: 3, + periph_axi_lite_narrow_hbi_wide_cfg: 3, + periph_axi_lite_narrow_bootrom_cfg: 3, + periph_axi_lite_narrow_fll_system_cfg: 3, + periph_axi_lite_narrow_fll_periph_cfg: 3, + periph_axi_lite_narrow_fll_hbm2e_cfg: 3, + // Right-side or latency-invariant chip peripherals + periph_axi_lite_narrow_soc_ctrl_cfg: 1, + periph_axi_lite_narrow_uart_cfg: 1, + periph_axi_lite_narrow_i2c_cfg: 1, + periph_axi_lite_narrow_gpio_cfg: 1, + periph_axi_lite_narrow_clint_cfg: 1, + periph_axi_lite_narrow_plic_cfg: 1, + periph_axi_lite_narrow_spim_cfg: 1, + periph_axi_lite_narrow_timer_cfg: 1, + } + txns: { + wide_and_inter: 128, + wide_to_hbm: 128, + narrow_and_wide: 16, + rmq: 4, + } + narrow_xbar_slv_id_width: 4, + narrow_xbar_user_width: 5, // clog2(total number of clusters) + nr_s1_quadrant: 1, + s1_quadrant: { + nr_clusters: 4, + // number of pending transactions on the narrow/wide network + narrow_trans: 32, + wide_trans: 32, + // Disable for easier flow trials. 
+ ro_cache_cfg: { + width: 1024, + count: 128, + sets: 2, + max_trans: 32, + address_regions: 4, + } + narrow_tlb_cfg: { + max_trans: 32, + l1_num_entries: 8, + l1_cut_ax: true, + } + wide_tlb_cfg: { + max_trans: 32, + l1_num_entries: 8, + l1_cut_ax: true, + } + wide_xbar: { + max_slv_trans: 32, + max_mst_trans: 32, + fall_through: false, + }, + wide_xbar_slv_id_width: 3 + narrow_xbar: { + max_slv_trans: 8, + max_mst_trans: 8, + fall_through: false, + }, + narrow_xbar_slv_id_width: 4, + narrow_xbar_user_width: 5, // clog2(total number of clusters) + cfg_base_addr: 184549376, // 0x0b000000 + cfg_base_offset: 65536 // 0x10000 + }, + cluster: { + name: "occamy_cluster" + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x10000000 + cluster_base_offset: 262144 // 0x40000 + cluster_base_hartid: 1, + addr_width: 48, + data_width: 64, + user_width: 5, // clog2(total number of clusters) + tcdm: { + size: 128, // 128 kiB + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + dma_data_width: 512, + dma_user_width: 48, // same as addr_width + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, + // Timing parameters + timing: { + lat_comp_fp32: 2, + lat_comp_fp64: 3, + lat_comp_fp16: 1, + lat_comp_fp16_alt: 1, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 2, + lat_sdotp: 3, + fpu_pipe_config: "BEFORE" + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. 
+ register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ], + } + // Templates. + compute_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + ssr_nr_credits: 4, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // SSSR configuration below + ssr_intersection: true, + ssr_intersection_triple: [0, 1, 2], + ssrs: [ + {indirection: true}, // Master 0 + {indirection: true}, // Master 1 + {}, // Slave + ], + }, + dma_core_template: { + isa: "rv32imafd", + // Xdiv_sqrt: true, + # isa: "rv32ema", + xdma: true + xssr: false + xfrep: false + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } + // peripherals + peripherals: { + rom: { + address: 16777216, // 0x0100_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + clint: { + address: 67108864, // 0x0400_0000 + length: 1048576, // 1 MiB 0x10_0000 + 
}, + axi_lite_peripherals: [ + { + name: "debug", + address: 0, // 0x0000_0000 + length: 4096, // 4 kiB 0x1000 + } + ], + axi_lite_narrow_peripherals: [ + { + name: "soc_ctrl", + address: 33554432, // 0x0200_0000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "fll_system", + address: 33558528, // 0x0200_1000 + length: 1024, // 1 kiB 0x400 + }, + { + name: "fll_periph", + address: 33559552, // 0x0200_1400 + length: 1024, // 1 kiB 0x400 + }, + { + name: "fll_hbm2e", + address: 33560576, // 0x0200_1800 + length: 1024, // 1 kiB 0x400 + }, + { + name: "uart", + address: 33562624, // 0x0200_2000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "gpio", + address: 33566720, // 0x0200_3000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "i2c", + address: 33570816, // 0x0200_4000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "chip_ctrl", + address: 33574912, // 0x0200_5000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "timer", + address: 33579008, // 0x0200_6000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "hbm_xbar_cfg", + address: 33583104, // 0x0200_7000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "spim", + address: 50331648, // 0x0300_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + { + name: "pcie_cfg", + address: 83886080, // 0x0500_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + { + name: "hbi_wide_cfg", + address: 100663296, // 0x0600_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + { + name: "hbi_narrow_cfg", + address: 117440512, // 0x0700_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + { + name: "plic", + address: 201326592, // 0x0C00_0000 + length: 67108864, // 64 MiB 0x400_0000 + }, + ], + }, + // non-peripheral IPs + pcie: { + address_io: 536870912, // 0x2000_0000 + address_mm: 1207959552, // 0x4800_0000 + length: 671088640, // 640 MiB 0x2800_0000 + }, + spm_narrow: { + address: 1879048192, // 0x7000_0000 + length: 524288, // 512 kiB 0x8_0000 + # An uncached alias address space of the same length + uncached_alias: 1879572480, // 0x7008_0000 +
}, + spm_wide: { + address: 1895825408, // 0x7100_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + wide_zero_mem: { + address: 4294967296, // 0x1_0000_0000 + length: 8589934592, // 8 GiB 0x2_0000_0000 + }, + sys_idma_cfg: { + address: 285212672, // 0x1100_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + hbi: { + address: 1099511627776, // 0x100_0000_0000 + length: 1099511627776, // 1 TiB 0x100_0000_0000 + } + hbm: { + address_0: 2147483648, // 0x8000_0000 + address_1: 68719476736, // 0x10_0000_0000 + channel_size: 1073741824, // 1 GiB 0x4000_0000 + nr_channels_total: 8, + nr_channels_address_0: 2, + cfg_regions: { + top: { + address: 134217728, // 0x0800_0000 + length: 4194304, // 4 MiB 0x40_0000 + }, + phy: { + address: 150994944 // 0x0900_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + seq: { + address: 167772160, // 0x0A00_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + ctrl: { + address: 176160768, // 0x0A80_0000 + length: 65536, // 64 kiB 0x1_0000 + } + } + }, + // dram corresponds to 'hbm address_0' and 'nr_channels_address_0' + dram: { + address: 2147483648, // 0x8000_0000 + length: 2147483648, // 2 GiB 0x8000_0000 + }, +} diff --git a/target/sim/cfg/Q2C2.hjson b/target/sim/cfg/Q2C2.hjson new file mode 100644 index 000000000..d6337b3df --- /dev/null +++ b/target/sim/cfg/Q2C2.hjson @@ -0,0 +1,409 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Cluster configuration for Occamy. 
+{ + is_remote_quadrant: false, + remote_quadrants: [], + enable_multicast: true, + quadrant_pre_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + pre_xbar_slv_id_width_no_rocache: 3, + wide_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + quadrant_inter_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + hbm_xbar: { + max_slv_trans: 128, + max_mst_trans: 128, + fall_through: false, + }, + narrow_xbar: { + max_slv_trans: 32, + max_mst_trans: 32, + fall_through: false, + }, + cuts: { + narrow_to_quad: 3, + quad_to_narrow: 3, + quad_to_pre: 1, + pre_to_inter: 1, + inter_to_quad: 3, + narrow_to_cva6: 2, + narrow_conv_to_spm_narrow_pre: 2, + narrow_conv_to_spm_narrow: 1, + narrow_and_pcie: 3, + narrow_and_wide: 1, + wide_conv_to_spm_wide: 3, + wide_to_wide_zero_mem: 0, + wide_to_hbm: 3, + wide_and_inter: 3, + wide_and_hbi: 3, + narrow_and_hbi: 3, + pre_to_hbmx: 3, + hbmx_to_hbm: 3, + atomic_adapter_narrow: 1, + atomic_adapter_narrow_wide: 1, + // Give some flexibility in peripheral xbar placement + periph_axi_lite_narrow: 2, + periph_axi_lite: 2, + periph_axi_lite_narrow_hbm_xbar_cfg: 2, + // Non-right-side chip peripherals + periph_axi_lite_narrow_hbm_cfg: 3, + periph_axi_lite_narrow_pcie_cfg: 3, + periph_axi_lite_narrow_chip_ctrl_cfg: 3, + periph_axi_lite_narrow_hbi_narrow_cfg: 3, + periph_axi_lite_narrow_hbi_wide_cfg: 3, + periph_axi_lite_narrow_bootrom_cfg: 3, + periph_axi_lite_narrow_fll_system_cfg: 3, + periph_axi_lite_narrow_fll_periph_cfg: 3, + periph_axi_lite_narrow_fll_hbm2e_cfg: 3, + // Right-side or latency-invariant chip peripherals + periph_axi_lite_narrow_soc_ctrl_cfg: 1, + periph_axi_lite_narrow_uart_cfg: 1, + periph_axi_lite_narrow_i2c_cfg: 1, + periph_axi_lite_narrow_gpio_cfg: 1, + periph_axi_lite_narrow_clint_cfg: 1, + periph_axi_lite_narrow_plic_cfg: 1, + periph_axi_lite_narrow_spim_cfg: 1, + periph_axi_lite_narrow_timer_cfg: 1, + } + txns: { + wide_and_inter: 
128, + wide_to_hbm: 128, + narrow_and_wide: 16, + rmq: 4, + } + narrow_xbar_slv_id_width: 4, + narrow_xbar_user_width: 5, // clog2(total number of clusters) + nr_s1_quadrant: 2, + s1_quadrant: { + nr_clusters: 2, + // number of pending transactions on the narrow/wide network + narrow_trans: 32, + wide_trans: 32, + // Disable for easier flow trials. + ro_cache_cfg: { + width: 1024, + count: 128, + sets: 2, + max_trans: 32, + address_regions: 4, + } + narrow_tlb_cfg: { + max_trans: 32, + l1_num_entries: 8, + l1_cut_ax: true, + } + wide_tlb_cfg: { + max_trans: 32, + l1_num_entries: 8, + l1_cut_ax: true, + } + wide_xbar: { + max_slv_trans: 32, + max_mst_trans: 32, + fall_through: false, + }, + wide_xbar_slv_id_width: 3 + narrow_xbar: { + max_slv_trans: 8, + max_mst_trans: 8, + fall_through: false, + }, + narrow_xbar_slv_id_width: 4, + narrow_xbar_user_width: 5, // clog2(total number of clusters) + cfg_base_addr: 184549376, // 0x0b000000 + cfg_base_offset: 65536 // 0x10000 + }, + cluster: { + name: "occamy_cluster" + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x10000000 + cluster_base_offset: 262144 // 0x40000 + cluster_base_hartid: 1, + addr_width: 48, + data_width: 64, + user_width: 5, // clog2(total number of clusters) + tcdm: { + size: 128, // 128 kiB + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + dma_data_width: 512, + dma_user_width: 48, // same as addr_width + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, + // Timing parameters + timing: { + lat_comp_fp32: 2, + lat_comp_fp64: 3, + lat_comp_fp16: 1, + lat_comp_fp16_alt: 1, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 2, + 
lat_sdotp: 3, + fpu_pipe_config: "BEFORE" + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. + register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ], + } + // Templates. + compute_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + ssr_nr_credits: 4, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // SSSR configuration below + ssr_intersection: true, + ssr_intersection_triple: [0, 1, 2], + ssrs: [ + {indirection: true}, // Master 0 + {indirection: true}, // Master 1 + {}, // Slave + ], + }, + dma_core_template: { + isa: "rv32imafd", + // Xdiv_sqrt: true, + # isa: "rv32ema", + xdma: true + xssr: false + xfrep: false + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } + // peripherals + peripherals: { + rom: { + address: 16777216, // 
0x0100_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + clint: { + address: 67108864, // 0x0400_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + axi_lite_peripherals: [ + { + name: "debug", + address: 0, // 0x0000_0000 + length: 4096, // 4 kiB 0x1000 + } + ], + axi_lite_narrow_peripherals: [ + { + name: "soc_ctrl", + address: 33554432, // 0x0200_0000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "fll_system", + address: 33558528, // 0x0200_1000 + length: 1024, // 1 kiB 0x400 + }, + { + name: "fll_periph", + address: 33559552, // 0x0200_1400 + length: 1024, // 1 kiB 0x400 + }, + { + name: "fll_hbm2e", + address: 33560576, // 0x0200_1800 + length: 1024, // 1 kiB 0x400 + }, + { + name: "uart", + address: 33562624, // 0x0200_2000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "gpio", + address: 33566720, // 0x0200_3000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "i2c", + address: 33570816, // 0x0200_4000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "chip_ctrl", + address: 33574912, // 0x0200_5000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "timer", + address: 33579008, // 0x0200_6000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "hbm_xbar_cfg", + address: 33583104, // 0x0200_7000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "spim", + address: 50331648, // 0x0300_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + { + name: "pcie_cfg", + address: 83886080, // 0x0500_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + { + name: "hbi_wide_cfg", + address: 100663296, // 0x0600_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + { + name: "hbi_narrow_cfg", + address: 117440512, // 0x0700_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + { + name: "plic", + address: 201326592, // 0x0C00_0000 + length: 67108864, // 64 MiB 0x400_0000 + }, + ], + }, + // non-peripheral IPs + pcie: { + address_io: 536870912, // 0x2000_0000 + address_mm: 1207959552, // 0x4800_0000 + length: 671088640, // 640 MiB 0x2800_0000 + }, + spm_narrow: { + address: 1879048192, // 
0x7000_0000 + length: 524288, // 512 kiB 0x8_0000 + # An uncached alias address space of the same length + uncached_alias: 1879572480, // 0x7008_0000 + }, + spm_wide: { + address: 1895825408, // 0x7100_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + wide_zero_mem: { + address: 4294967296, // 0x1_0000_0000 + length: 8589934592, // 8 GiB 0x2_0000_0000 + }, + sys_idma_cfg: { + address: 285212672, // 0x1100_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + hbi: { + address: 1099511627776, // 0x100_0000_0000 + length: 1099511627776, // 1 TiB 0x100_0000_0000 + } + hbm: { + address_0: 2147483648, // 0x8000_0000 + address_1: 68719476736, // 0x10_0000_0000 + channel_size: 1073741824, // 1 GiB 0x4000_0000 + nr_channels_total: 8, + nr_channels_address_0: 2, + cfg_regions: { + top: { + address: 134217728, // 0x0800_0000 + length: 4194304, // 4 MiB 0x40_0000 + }, + phy: { + address: 150994944 // 0x0900_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + seq: { + address: 167772160, // 0x0A00_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + ctrl: { + address: 176160768, // 0x0A80_0000 + length: 65536, // 64 kiB 0x1_0000 + } + } + }, + // dram corresponds to 'hbm address_0' and 'nr_channels_address_0' + dram: { + address: 2147483648, // 0x8000_0000 + length: 2147483648, // 2 GiB 0x8000_0000 + }, +} diff --git a/target/sim/cfg/Q4C1.hjson b/target/sim/cfg/Q4C1.hjson new file mode 100644 index 000000000..e92330cf8 --- /dev/null +++ b/target/sim/cfg/Q4C1.hjson @@ -0,0 +1,409 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Cluster configuration for Occamy. 
+{ + is_remote_quadrant: false, + remote_quadrants: [], + enable_multicast: true, + quadrant_pre_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + pre_xbar_slv_id_width_no_rocache: 3, + wide_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + quadrant_inter_xbar: { + max_slv_trans: 64, + max_mst_trans: 64, + fall_through: false, + }, + hbm_xbar: { + max_slv_trans: 128, + max_mst_trans: 128, + fall_through: false, + }, + narrow_xbar: { + max_slv_trans: 32, + max_mst_trans: 32, + fall_through: false, + }, + cuts: { + narrow_to_quad: 3, + quad_to_narrow: 3, + quad_to_pre: 1, + pre_to_inter: 1, + inter_to_quad: 3, + narrow_to_cva6: 2, + narrow_conv_to_spm_narrow_pre: 2, + narrow_conv_to_spm_narrow: 1, + narrow_and_pcie: 3, + narrow_and_wide: 1, + wide_conv_to_spm_wide: 3, + wide_to_wide_zero_mem: 0, + wide_to_hbm: 3, + wide_and_inter: 3, + wide_and_hbi: 3, + narrow_and_hbi: 3, + pre_to_hbmx: 3, + hbmx_to_hbm: 3, + atomic_adapter_narrow: 1, + atomic_adapter_narrow_wide: 1, + // Give some flexibility in peripheral xbar placement + periph_axi_lite_narrow: 2, + periph_axi_lite: 2, + periph_axi_lite_narrow_hbm_xbar_cfg: 2, + // Non-right-side chip peripherals + periph_axi_lite_narrow_hbm_cfg: 3, + periph_axi_lite_narrow_pcie_cfg: 3, + periph_axi_lite_narrow_chip_ctrl_cfg: 3, + periph_axi_lite_narrow_hbi_narrow_cfg: 3, + periph_axi_lite_narrow_hbi_wide_cfg: 3, + periph_axi_lite_narrow_bootrom_cfg: 3, + periph_axi_lite_narrow_fll_system_cfg: 3, + periph_axi_lite_narrow_fll_periph_cfg: 3, + periph_axi_lite_narrow_fll_hbm2e_cfg: 3, + // Right-side or latency-invariant chip peripherals + periph_axi_lite_narrow_soc_ctrl_cfg: 1, + periph_axi_lite_narrow_uart_cfg: 1, + periph_axi_lite_narrow_i2c_cfg: 1, + periph_axi_lite_narrow_gpio_cfg: 1, + periph_axi_lite_narrow_clint_cfg: 1, + periph_axi_lite_narrow_plic_cfg: 1, + periph_axi_lite_narrow_spim_cfg: 1, + periph_axi_lite_narrow_timer_cfg: 1, + } + txns: { + wide_and_inter: 
128, + wide_to_hbm: 128, + narrow_and_wide: 16, + rmq: 4, + } + narrow_xbar_slv_id_width: 4, + narrow_xbar_user_width: 5, // clog2(total number of clusters) + nr_s1_quadrant: 4, + s1_quadrant: { + nr_clusters: 1, + // number of pending transactions on the narrow/wide network + narrow_trans: 32, + wide_trans: 32, + // Disable for easier flow trials. + ro_cache_cfg: { + width: 1024, + count: 128, + sets: 2, + max_trans: 32, + address_regions: 4, + } + narrow_tlb_cfg: { + max_trans: 32, + l1_num_entries: 8, + l1_cut_ax: true, + } + wide_tlb_cfg: { + max_trans: 32, + l1_num_entries: 8, + l1_cut_ax: true, + } + wide_xbar: { + max_slv_trans: 32, + max_mst_trans: 32, + fall_through: false, + }, + wide_xbar_slv_id_width: 3 + narrow_xbar: { + max_slv_trans: 8, + max_mst_trans: 8, + fall_through: false, + }, + narrow_xbar_slv_id_width: 4, + narrow_xbar_user_width: 5, // clog2(total number of clusters) + cfg_base_addr: 184549376, // 0x0b000000 + cfg_base_offset: 65536 // 0x10000 + }, + cluster: { + name: "occamy_cluster" + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x10000000 + cluster_base_offset: 262144 // 0x40000 + cluster_base_hartid: 1, + addr_width: 48, + data_width: 64, + user_width: 5, // clog2(total number of clusters) + tcdm: { + size: 128, // 128 kiB + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + dma_data_width: 512, + dma_user_width: 48, // same as addr_width + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, + // Timing parameters + timing: { + lat_comp_fp32: 2, + lat_comp_fp64: 3, + lat_comp_fp16: 1, + lat_comp_fp16_alt: 1, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 2, + 
lat_sdotp: 3, + fpu_pipe_config: "BEFORE" + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. + register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ], + } + // Templates. + compute_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + ssr_nr_credits: 4, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // SSSR configuration below + ssr_intersection: true, + ssr_intersection_triple: [0, 1, 2], + ssrs: [ + {indirection: true}, // Master 0 + {indirection: true}, // Master 1 + {}, // Slave + ], + }, + dma_core_template: { + isa: "rv32imafd", + // Xdiv_sqrt: true, + # isa: "rv32ema", + xdma: true + xssr: false + xfrep: false + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } + // peripherals + peripherals: { + rom: { + address: 16777216, // 
0x0100_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + clint: { + address: 67108864, // 0x0400_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + axi_lite_peripherals: [ + { + name: "debug", + address: 0, // 0x0000_0000 + length: 4096, // 4 kiB 0x1000 + } + ], + axi_lite_narrow_peripherals: [ + { + name: "soc_ctrl", + address: 33554432, // 0x0200_0000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "fll_system", + address: 33558528, // 0x0200_1000 + length: 1024, // 1 kiB 0x400 + }, + { + name: "fll_periph", + address: 33559552, // 0x0200_1400 + length: 1024, // 1 kiB 0x400 + }, + { + name: "fll_hbm2e", + address: 33560576, // 0x0200_1800 + length: 1024, // 1 kiB 0x400 + }, + { + name: "uart", + address: 33562624, // 0x0200_2000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "gpio", + address: 33566720, // 0x0200_3000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "i2c", + address: 33570816, // 0x0200_4000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "chip_ctrl", + address: 33574912, // 0x0200_5000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "timer", + address: 33579008, // 0x0200_6000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "hbm_xbar_cfg", + address: 33583104, // 0x0200_7000 + length: 4096, // 4 kiB 0x1000 + }, + { + name: "spim", + address: 50331648, // 0x0300_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + { + name: "pcie_cfg", + address: 83886080, // 0x0500_0000 + length: 131072, // 128 kiB 0x2_0000 + }, + { + name: "hbi_wide_cfg", + address: 100663296, // 0x0600_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + { + name: "hbi_narrow_cfg", + address: 117440512, // 0x0700_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + { + name: "plic", + address: 201326592, // 0x0C00_0000 + length: 67108864, // 64 MiB 0x400_0000 + }, + ], + }, + // non-peripheral IPs + pcie: { + address_io: 536870912, // 0x2000_0000 + address_mm: 1207959552, // 0x4800_0000 + length: 671088640, // 640 MiB 0x2800_0000 + }, + spm_narrow: { + address: 1879048192, // 
0x7000_0000 + length: 524288, // 512 kiB 0x8_0000 + # An uncached alias address space of the same length + uncached_alias: 1879572480, // 0x7008_0000 + }, + spm_wide: { + address: 1895825408, // 0x7100_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + wide_zero_mem: { + address: 4294967296, // 0x1_0000_0000 + length: 8589934592, // 8 GiB 0x2_0000_0000 + }, + sys_idma_cfg: { + address: 285212672, // 0x1100_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + hbi: { + address: 1099511627776, // 0x100_0000_0000 + length: 1099511627776, // 1 TiB 0x100_0000_0000 + } + hbm: { + address_0: 2147483648, // 0x8000_0000 + address_1: 68719476736, // 0x10_0000_0000 + channel_size: 1073741824, // 1 GiB 0x4000_0000 + nr_channels_total: 8, + nr_channels_address_0: 2, + cfg_regions: { + top: { + address: 134217728, // 0x0800_0000 + length: 4194304, // 4 MiB 0x40_0000 + }, + phy: { + address: 150994944 // 0x0900_0000 + length: 1048576, // 1 MiB 0x10_0000 + }, + seq: { + address: 167772160, // 0x0A00_0000 + length: 65536, // 64 kiB 0x1_0000 + }, + ctrl: { + address: 176160768, // 0x0A80_0000 + length: 65536, // 64 kiB 0x1_0000 + } + } + }, + // dram corresponds to 'hbm address_0' and 'nr_channels_address_0' + dram: { + address: 2147483648, // 0x8000_0000 + length: 2147483648, // 2 GiB 0x8000_0000 + }, +} diff --git a/target/sim/sw/device/Makefile b/target/sim/sw/device/Makefile index 5e3b27ee5..8fc34cec8 100644 --- a/target/sim/sw/device/Makefile +++ b/target/sim/sw/device/Makefile @@ -7,6 +7,7 @@ # Add user applications to APPS variable APPS = blas/axpy APPS += blas/gemm +APPS += mcast TARGET ?= all diff --git a/target/sim/sw/device/apps/mcast/Makefile b/target/sim/sw/device/apps/mcast/Makefile new file mode 100644 index 000000000..3ed6ea44f --- /dev/null +++ b/target/sim/sw/device/apps/mcast/Makefile @@ -0,0 +1,10 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP = mcast +SRCS = main.c + +include ../common.mk diff --git a/target/sim/sw/device/apps/mcast/main.c b/target/sim/sw/device/apps/mcast/main.c new file mode 100644 index 000000000..c3d6b8fe2 --- /dev/null +++ b/target/sim/sw/device/apps/mcast/main.c @@ -0,0 +1,44 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "snrt.h" + +#define LENGTH 32 + +#define CLUSTER_BCAST_MASK ((N_CLUSTERS - 1) << 18) + +// Allocate a buffer in the main memory which we will use to copy data around +// with the DMA. +uint32_t buffer_src[LENGTH]; + +int main() { + + uint32_t snrt_is_first_cluster = snrt_cluster_idx() == 0; + + // Allocate destination buffer + uint32_t *buffer_dst = snrt_l1_next(); + + // First cluster initializes the source buffer in DRAM and multicast- + // copies it to the destination buffer in every cluster's TCDM. + // All other clusters wait for an interrupt to signal the transfer + // completion. 
+ if (snrt_is_dm_core()) { + if (snrt_is_first_cluster) { + + uint32_t snrt_is_first_cluster = snrt_cluster_idx() == 0; + // Initialize source buffer + for (uint32_t i = 0; i < LENGTH; i++) { + buffer_src[i] = 0xAAAAAAAA; + } + + // Initiate DMA transfer + snrt_dma_start_1d_mcast(buffer_dst, buffer_src, CLUSTER_BCAST_MASK, sizeof(buffer_src)); + snrt_dma_wait_all(); + } + } + + snrt_cluster_hw_barrier(); +} diff --git a/target/sim/sw/host/apps/offload/Makefile b/target/sim/sw/host/apps/offload/Makefile index 990cc1579..d6db6c718 100644 --- a/target/sim/sw/host/apps/offload/Makefile +++ b/target/sim/sw/host/apps/offload/Makefile @@ -18,6 +18,7 @@ APP = offload SRCS = src/offload.c DEVICE_APPS = blas/axpy DEVICE_APPS += blas/gemm +DEVICE_APPS += mcast # Compiler toolchain RISCV_CC = riscv64-unknown-elf-gcc diff --git a/util/occamygen/occamygen.py b/util/occamygen/occamygen.py index 5cccba744..92aa37fde 100755 --- a/util/occamygen/occamygen.py +++ b/util/occamygen/occamygen.py @@ -143,7 +143,8 @@ def main(): nr_s1_quadrants = occamy.cfg["nr_s1_quadrant"] nr_s1_clusters = occamy.cfg["s1_quadrant"]["nr_clusters"] is_remote_quadrant = occamy.cfg["is_remote_quadrant"] - enable_multicast = occamy.cfg["enable_multicast"] + enable_narrow_multicast = occamy.cfg["enable_multicast"] + enable_wide_multicast = occamy.cfg["enable_multicast"] # Iterate over Hives to get the number of cores. 
nr_cluster_cores = len([ core for hive in occamy.cfg["cluster"]["hives"] @@ -572,17 +573,30 @@ def main(): no_loopback=True, atop_support=False, context="soc", - node=am_quadrant_pre_xbar[i]) + node=am_quadrant_pre_xbar[i], + forward_mcast=enable_wide_multicast + ) # Default port: - quadrant_pre_xbar.add_output_entry("quadrant_inter_xbar", am_quadrant_inter_xbar) + quadrant_pre_xbar.add_output_entry("quadrant_inter_xbar", + am_quadrant_inter_xbar, + forward_mcast=enable_wide_multicast) quadrant_pre_xbar.add_output_entry("hbm_xbar", am_hbm_xbar) - quadrant_pre_xbar.add_input("quadrant") + quadrant_pre_xbar.add_input("quadrant", is_mcast_master=enable_wide_multicast) quadrant_pre_xbars.append(quadrant_pre_xbar) # Quadrant inter xbar # Connects all quadrant pre xbars to all quadrants, with additional wide xbar M/S pair + + # Default port: soc wide xbar (last port) + num_slave_ports = nr_s1_quadrants + len(occamy.cfg["remote_quadrants"]) + \ + is_remote_quadrant + 1 + default_mst_port_idx_bits = clog2(num_slave_ports) + default_mst_port_idx = "{0:b}".format(num_slave_ports - 1) + default_mst_port_idx = "{}'b{}".format(default_mst_port_idx_bits*num_slave_ports, + default_mst_port_idx*num_slave_ports) + quadrant_inter_xbar = solder.AxiXbar( 48, 512, @@ -596,17 +610,18 @@ def main(): no_loopback=True, atop_support=False, context="soc", - node=am_quadrant_inter_xbar) + node=am_quadrant_inter_xbar, + enable_multicast=enable_wide_multicast, + default_mst_port_idx=default_mst_port_idx) - # Default port: soc wide xbar - quadrant_inter_xbar.add_output_entry("wide_xbar", am_soc_wide_xbar) - quadrant_inter_xbar.add_input("wide_xbar") for i in range(nr_s1_quadrants): # Default route passes HBI through quadrant 0 # --> mask this route, forcing it through default wide xbar quadrant_inter_xbar.add_output_entry("quadrant_{}".format(i), - am_wide_xbar_quadrant_s1[i]) - quadrant_inter_xbar.add_input("quadrant_{}".format(i)) + am_wide_xbar_quadrant_s1[i], + 
is_mcast_target=enable_wide_multicast, + forward_mcast=enable_wide_multicast) + quadrant_inter_xbar.add_input("quadrant_{}".format(i), is_mcast_master=enable_wide_multicast) for i, rq in enumerate(occamy.cfg["remote_quadrants"]): quadrant_inter_xbar.add_input("rmq_{}".format(i)) quadrant_inter_xbar.add_output_entry("rmq_{}".format(i), am_remote_quadrants[i]) @@ -615,6 +630,9 @@ def main(): quadrant_inter_xbar.add_output("remote", []) quadrant_inter_xbar.add_input("remote") + quadrant_inter_xbar.add_output_entry("wide_xbar", am_soc_wide_xbar) + quadrant_inter_xbar.add_input("wide_xbar", is_mcast_master=enable_wide_multicast) + hbm_xbar = solder.AxiXbar( 48, 512, @@ -652,18 +670,21 @@ def main(): no_loopback=True, atop_support=False, context="soc", - node=am_soc_wide_xbar) + node=am_soc_wide_xbar, + enable_multicast=False, + forward_mcast=enable_wide_multicast) # Default port: HBI (always escalate "upwards" in hierarchy -> off-chip) if not is_remote_quadrant: soc_wide_xbar.add_output_entry("hbi", am_hbi) soc_wide_xbar.add_output_entry("hbm_xbar", am_hbm_xbar) - soc_wide_xbar.add_output_entry("quadrant_inter_xbar", am_quadrant_inter_xbar) + soc_wide_xbar.add_output_entry("quadrant_inter_xbar", am_quadrant_inter_xbar, + is_mcast_target=False, forward_mcast=enable_wide_multicast) soc_wide_xbar.add_output_entry("soc_narrow", am_soc_narrow_xbar) soc_wide_xbar.add_input("hbi") soc_wide_xbar.add_input("quadrant_inter_xbar") soc_wide_xbar.add_input("soc_narrow") - soc_wide_xbar.add_input("sys_idma_mst") + soc_wide_xbar.add_input("sys_idma_mst", is_mcast_master=enable_wide_multicast) soc_wide_xbar.add_output_entry("spm_wide", am_spm_wide) soc_wide_xbar.add_output_entry("wide_zero_mem", am_wide_zero_mem) @@ -684,7 +705,7 @@ def main(): no_loopback=True, context="soc", node=am_soc_narrow_xbar, - enable_multicast=enable_multicast) + enable_multicast=enable_narrow_multicast) for i in range(nr_s1_quadrants): soc_narrow_xbar.add_output_symbolic_multi("s1_quadrant_{}".format(i), @@ 
-692,11 +713,11 @@ def main(): "S1QuadrantAddressSpace"), (f"s1_quadrant_cfg_base_addr[{i}]", "S1QuadrantCfgAddressSpace")], - is_mcast_target=enable_multicast, - forward_mcast=enable_multicast) + is_mcast_target=enable_narrow_multicast, + forward_mcast=enable_narrow_multicast) soc_narrow_xbar.add_input("s1_quadrant_{}".format(i)) - soc_narrow_xbar.add_input("cva6", is_mcast_master=enable_multicast) + soc_narrow_xbar.add_input("cva6", is_mcast_master=enable_narrow_multicast) soc_narrow_xbar.add_input("soc_wide") soc_narrow_xbar.add_input("periph") soc_narrow_xbar.add_input("pcie") @@ -728,7 +749,8 @@ def main(): # We need 3 "crossbars", which are really simple muxes and demuxes quadrant_s1_ctrl_xbars = dict() for name, (iw, lm, forward_mcast) in { - 'soc_to_quad': (soc_narrow_xbar.iw_out(), "axi_pkg::CUT_SLV_PORTS", enable_multicast), + 'soc_to_quad': (soc_narrow_xbar.iw_out(), "axi_pkg::CUT_SLV_PORTS", + enable_narrow_multicast), 'quad_to_soc': (soc_narrow_xbar.iw, "axi_pkg::CUT_MST_PORTS", False), }.items(): # Reuse (preserve) narrow Xbar IDs and max transactions @@ -773,6 +795,13 @@ def main(): # S1 Quadrants # ################ # Dummy entries to generate associated types. 
+ + num_slave_ports = nr_s1_clusters + 1 + default_mst_port_idx_bits = clog2(num_slave_ports) + default_mst_port_idx = "{0:b}".format(nr_s1_clusters) + default_mst_port_idx = "{}'b{}".format(default_mst_port_idx_bits*num_slave_ports, + default_mst_port_idx*num_slave_ports) + wide_xbar_quadrant_s1 = solder.AxiXbar( 48, 512, @@ -786,13 +815,11 @@ def main(): no_loopback=True, atop_support=False, context="quadrant_s1", - node=am_wide_xbar_quadrant_s1[0]) - - num_slave_ports = nr_s1_clusters + 1 - default_mst_port_idx_bits = clog2(nr_s1_clusters + 1) - default_mst_port_idx = "{0:b}".format(nr_s1_clusters) - default_mst_port_idx = "{}'b{}".format(default_mst_port_idx_bits*num_slave_ports, - default_mst_port_idx*num_slave_ports) + node=am_wide_xbar_quadrant_s1[0], + enable_multicast=enable_wide_multicast, + default_mst_port_idx=default_mst_port_idx, + mcast_to_default_slave=enable_wide_multicast + ) narrow_xbar_quadrant_s1 = solder.AxiXbar( 48, @@ -809,27 +836,28 @@ def main(): fall_through=occamy.cfg["s1_quadrant"]["narrow_xbar"]["fall_through"], no_loopback=True, context="quadrant_s1", - enable_multicast=enable_multicast, + enable_multicast=enable_narrow_multicast, default_mst_port_idx=default_mst_port_idx) - wide_xbar_quadrant_s1.add_output("top", []) - wide_xbar_quadrant_s1.add_input("top") - - narrow_xbar_quadrant_s1.add_input("top", is_mcast_master=enable_multicast) - for i in range(nr_s1_clusters): wide_xbar_quadrant_s1.add_output_symbolic("cluster_{}".format(i), f"cluster_base_addr[{i}]", - "ClusterAddressSpace") + "ClusterAddressSpace", + is_mcast_target=enable_wide_multicast, + forward_mcast=True) + wide_xbar_quadrant_s1.add_input("cluster_{}".format(i), is_mcast_master=enable_wide_multicast) - wide_xbar_quadrant_s1.add_input("cluster_{}".format(i)) narrow_xbar_quadrant_s1.add_output_symbolic("cluster_{}".format(i), f"cluster_base_addr[{i}]", "ClusterAddressSpace", - is_mcast_target=enable_multicast, + is_mcast_target=enable_narrow_multicast, 
forward_mcast=False) narrow_xbar_quadrant_s1.add_input("cluster_{}".format(i)) + wide_xbar_quadrant_s1.add_output("top", [], is_mcast_target=enable_wide_multicast, forward_mcast=enable_wide_multicast) + wide_xbar_quadrant_s1.add_input("top", is_mcast_master=enable_wide_multicast) + + narrow_xbar_quadrant_s1.add_input("top", is_mcast_master=enable_narrow_multicast) narrow_xbar_quadrant_s1.add_output("top", []) # remote downstream mux diff --git a/util/solder/solder.py b/util/solder/solder.py index 2eef2d291..3e38130e2 100644 --- a/util/solder/solder.py +++ b/util/solder/solder.py @@ -342,8 +342,10 @@ def emit(aw, dw, iw, uw, enable_multicast=False): code += f"logic [{(dw + 7) // 8 - 1}:0], " if enable_multicast: user_t = "struct packed {" - user_t += f"logic [{max(0, aw - 1)}:0] mcast; " - user_t += f"logic [{max(0, uw - aw - 1)}:0] atomics_id;}}" + user_t += f"logic [{max(0, aw - 1)}:0] mcast;" + if uw > aw: + user_t += f" logic [{max(0, uw - aw - 1)}:0] atomics_id;" + user_t += "}" else: user_t = f"logic [{max(0, uw - 1)}:0]" code += f"{user_t})\n" @@ -1476,6 +1478,7 @@ def __init__(self, enable_multicast=False, forward_mcast=False, default_mst_port_idx="'0", + mcast_to_default_slave=False, **kwargs): super().__init__(**kwargs) self.aw = aw @@ -1494,6 +1497,7 @@ def __init__(self, else: self.forward_mcast = forward_mcast self.default_mst_port_idx = default_mst_port_idx + self.mcast_to_default_slave = mcast_to_default_slave self.addrmap = list() self.connections = dict() self.latency_mode = latency_mode or "axi_pkg::CUT_ALL_PORTS" @@ -1588,7 +1592,8 @@ def emit(self): if not self.outputs[i]['is_mcast_target']: if self.outputs[i+1]['is_mcast_target']: violations.append(True) - assert (not violations), 'Multicast-targetable slaves must be at lower indices' + assert (not violations), \ + f'{self.name}: multicast-targetable slaves must be at lower indices' # Sort address map rules by `is_multicast_rule` to ensure that # multicast rules are at lower indices 
self.addrmap.sort(key=operator.itemgetter('is_mcast_rule')) @@ -1627,24 +1632,25 @@ def emit(self): cfg = "/// Configuration of the `{}` crossbar.\n".format(self.name) cfg += "localparam axi_pkg::xbar_cfg_t {} = '{{\n".format( self.cfg_name) - cfg += " NoSlvPorts: {}_NUM_INPUTS,\n".format( + cfg += " NoSlvPorts: {}_NUM_INPUTS,\n".format( self.name.upper()) - cfg += " NoMstPorts: {}_NUM_OUTPUTS,\n".format( + cfg += " NoMstPorts: {}_NUM_OUTPUTS,\n".format( self.name.upper()) - cfg += " MaxSlvTrans: {},\n".format(self.max_slv_trans) - cfg += " MaxMstTrans: {},\n".format(self.max_mst_trans) - cfg += " FallThrough: {},\n".format(int(self.fall_through)) - cfg += " LatencyMode: {},\n".format(self.latency_mode) - cfg += " PipelineStages: {},\n".format(0) - cfg += " AxiIdWidthSlvPorts: {},\n".format(self.iw) - cfg += " AxiIdUsedSlvPorts: {},\n".format(self.iw) - cfg += " UniqueIds: {},\n".format(0) - cfg += " AxiAddrWidth: {},\n".format(self.aw) - cfg += " AxiDataWidth: {},\n".format(self.dw) - cfg += " NoAddrRules: {},\n".format(self.addr_map_len()) - cfg += " NoMulticastRules: {},\n".format(self.num_mcast_rules()) - cfg += " NoMulticastPorts: {},\n".format(self.num_mcast_ports()) - cfg += " default: '0\n" + cfg += " MaxSlvTrans: {},\n".format(self.max_slv_trans) + cfg += " MaxMstTrans: {},\n".format(self.max_mst_trans) + cfg += " FallThrough: {},\n".format(int(self.fall_through)) + cfg += " LatencyMode: {},\n".format(self.latency_mode) + cfg += " PipelineStages: {},\n".format(0) + cfg += " AxiIdWidthSlvPorts: {},\n".format(self.iw) + cfg += " AxiIdUsedSlvPorts: {},\n".format(self.iw) + cfg += " UniqueIds: {},\n".format(0) + cfg += " AxiAddrWidth: {},\n".format(self.aw) + cfg += " AxiDataWidth: {},\n".format(self.dw) + cfg += " NoAddrRules: {},\n".format(self.addr_map_len()) + cfg += " NoMulticastRules: {},\n".format(self.num_mcast_rules()) + cfg += " NoMulticastPorts: {},\n".format(self.num_mcast_ports()) + cfg += " McastToDefaultSlave: 
{},\n".format(int(self.mcast_to_default_slave)) + cfg += " default: '0\n" cfg += "};\n" code_package += "\n" + cfg