Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added documentation to retrans_buffer and changed RDMA_sw for easier debugging #81

Merged
merged 2 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion examples_sw/apps/rdma_service/client/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,9 @@ int main(int argc, char *argv[])
sg.rdma.len = min_size;
sg.rdma.local_stream = strmHost;

// Get a pointer (hMem) to the local buffer of the queue pair, used to write values into the payload of the RDMA packets
uint64_t *hMem = (uint64_t*)(cthread.getQpair()->local.vaddr);

// Set the Coyote Operation, which can either be a REMOTE_WRITE or a REMOTE_READ, depending on the settings for the experiment
CoyoteOper coper = oper ? CoyoteOper::REMOTE_RDMA_WRITE : CoyoteOper::REMOTE_RDMA_READ;;

Expand Down Expand Up @@ -219,6 +222,9 @@ int main(int argc, char *argv[])
# endif
cthread.invoke(coper, &sg);

// Increment the hMem-value
// hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1;

// Check the number of completed RDMA-transactions, wait until all operations have been completed. Check for stalling in-between.
while(cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) < n_reps_thr) {
# ifdef VERBOSE
Expand Down Expand Up @@ -256,10 +262,16 @@ int main(int argc, char *argv[])
std::cout << "rdma_client: invoke the operation " << std::endl;
# endif
cthread.invoke(coper, &sg);

// Increment the hMem-value
hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1;

bool message_written = false;
while(cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) {
# ifdef VERBOSE
std::cout << "rdma_client: Current number of completed operations: " << cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) << std::endl;
# endif
# endif

// As long as the completion is not yet received, check for a possible stall-event
if( stalled.load() ) throw std::runtime_error("Stalled, SIGINT caught");
}
Expand Down
16 changes: 15 additions & 1 deletion examples_sw/apps/rdma_service/server/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ int main(int argc, char *argv[])
memset(&sg, 0, sizeof(rdmaSg));
sg.rdma.len = min_size; sg.rdma.local_stream = strmHost;

// Get a memory handle to manipulate values in the RDMA payloads
uint64_t *hMem = (uint64_t*)(cthread->getQpair()->local.vaddr);

while(sg.rdma.len <= max_size) {
// Sync via the cThread that is part of the cService-daemon that was just started in the background
# ifdef VERBOSE
Expand Down Expand Up @@ -161,7 +164,18 @@ int main(int argc, char *argv[])
// LAT - iterate over the number of ping-pong-exchanges according to the desired experiment setting
for(int i = 0; i < n_reps_lat; i++) {
// Wait for the next incoming WRITE
while(cthread->checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) { }
bool message_written = false;
while(cthread->checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) {
if(!message_written) {
std::cout << "RDMA-Server: Waiting for an incoming RDMA-WRITE at currently " << i << "." << std::endl;
message_written = true;
}
}

// Increment the number in the payload before writing back
hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1;

std::cout << "RDMA-Server: Invoking a RDMA-WRITE from the Server to the Client at currently " << (i+1) << "." << std::endl;
cthread->invoke(CoyoteOper::REMOTE_RDMA_WRITE, &sg);
}
} else {
Expand Down
64 changes: 42 additions & 22 deletions hw/hdl/network/rdma/rdma_mux_retrans.sv
Original file line number Diff line number Diff line change
Expand Up @@ -31,51 +31,60 @@ import lynxTypes::*;

/**
* @brief RDMA retrans multiplexer
* Used for split-up of the interfaces: 1 Interface towards the HLS stack, 2 interfaces exposed to the roce_stack
*
*/
module rdma_mux_retrans (
input logic aclk,
input logic aresetn,

metaIntf.s s_req_net,
metaIntf.m m_req_user,
AXI4S.s s_axis_user_req,
AXI4S.s s_axis_user_rsp,
AXI4S.m m_axis_net,

metaIntf.m m_req_ddr_rd,
metaIntf.m m_req_ddr_wr,
AXI4S.s s_axis_ddr,
AXI4S.m m_axis_ddr
metaIntf.s s_req_net, // Incoming read requests from the HLS-stack
metaIntf.m m_req_user, // Outgoing read requests to the roce_stack
AXI4S.s s_axis_user_req, // Incoming data (rd_req) from the roce_stack
AXI4S.s s_axis_user_rsp, // Incoming data (rd_rsp) from the roce_stack
AXI4S.m m_axis_net, // Outgoing data to the HLS-stack

metaIntf.m m_req_ddr_rd, // Outgoing read commands to the roce_stack
metaIntf.m m_req_ddr_wr, // Outgoing write commands to the roce_stack
AXI4S.s s_axis_ddr, // Incoming data (mem_rd) from the roce_stack
AXI4S.m m_axis_ddr // Outgoing data (mem_wr) to the roce_stack

// Write data from the HLS-stack to the roce_stack is forwarded directly, as are WRITE requests / commands
);

// Number of outstanding RDMA write requests, and the number of bits needed to count them
localparam integer RDMA_N_OST = RDMA_N_WR_OUTSTANDING;
localparam integer RDMA_OST_BITS = $clog2(RDMA_N_OST);

// sink and source signals for requests and commands
logic seq_snk_valid;
logic seq_snk_ready;
logic seq_src_valid;
logic seq_src_ready;

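// Signals for request length, active flag and read flag: *_snk is taken from s_req_net on the sink side, *_next is presumably the value read back on the source side of the request queue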
logic [LEN_BITS-1:0] len_snk;
logic [LEN_BITS-1:0] len_next;
logic actv_snk;
logic actv_next;
logic rd_snk;
logic rd_next;

// Signals to connect to the queues that lead to the control signals toward the top-level module
metaIntf #(.STYPE(req_t)) req_user ();
metaIntf #(.STYPE(logic[MEM_CMD_BITS-1:0])) req_ddr_rd ();
metaIntf #(.STYPE(logic[MEM_CMD_BITS-1:0])) req_ddr_wr ();

// --------------------------------------------------------------------------------
// I/O !!! interface
// --------------------------------------------------------------------------------

// Queues for all control interfaces to / from the top-level-design
meta_queue #(.DATA_BITS($bits(req_t))) inst_meta_user_q (.aclk(aclk), .aresetn(aresetn), .s_meta(req_user), .m_meta(m_req_user));
meta_queue #(.DATA_BITS(MEM_CMD_BITS)) inst_meta_ddr_rd_q (.aclk(aclk), .aresetn(aresetn), .s_meta(req_ddr_rd), .m_meta(m_req_ddr_rd));
meta_queue #(.DATA_BITS(MEM_CMD_BITS)) inst_meta_ddr_wr_q (.aclk(aclk), .aresetn(aresetn), .s_meta(req_ddr_wr), .m_meta(m_req_ddr_wr));


// Get the sink-values from incoming mem-read-command from the HLS-networking stack
assign len_snk = s_req_net.data.len[LEN_BITS-1:0];
assign actv_snk = s_req_net.data.actv;
assign rd_snk = is_opcode_rd_resp(s_req_net.data.opcode);
Expand All @@ -85,8 +94,9 @@ assign rd_snk = is_opcode_rd_resp(s_req_net.data.opcode);
// --------------------------------------------------------------------------------
always_comb begin
if(actv_snk) begin
// User
// User - action initiated by the active signals set in the s_req_net port, which is connected to the HLS-networking-stack
if(rd_snk) begin
// Case: READ RESPONSE
seq_snk_valid = seq_snk_ready & req_user.ready & s_req_net.valid;
req_user.valid = seq_snk_valid;
req_ddr_rd.valid = 1'b0;
Expand All @@ -95,6 +105,7 @@ always_comb begin
s_req_net.ready = seq_snk_ready & req_user.ready;
end
else begin
// Case: not a READ RESPONSE (presumably a WRITE). NOTE(review): unclear why data has to be requested here - shouldn't it be delivered to the stack automatically?
seq_snk_valid = seq_snk_ready & req_ddr_wr.ready & s_req_net.valid;
req_user.valid = 1'b0;
req_ddr_rd.valid = 1'b0;
Expand All @@ -104,7 +115,7 @@ always_comb begin
end
end
else begin
// Retrans
// Retrans - no active signal set in the s_req_net port, indicates a required retransmission
seq_snk_valid = seq_snk_ready & req_ddr_rd.ready & s_req_net.valid;
req_user.valid = 1'b0;
req_ddr_rd.valid = seq_snk_valid;
Expand All @@ -114,6 +125,7 @@ always_comb begin
end
end

// Construct the required control-signals towards the top-level-module from the s_req_net-port that is fed by the HLS-stack
always_comb begin
req_ddr_rd.data = 0;
req_ddr_rd.data[0+:64] = (64'b0 |
Expand All @@ -132,6 +144,7 @@ always_comb begin
req_user.data = s_req_net.data;
end

// Queue for requests with sink and source
queue_stream #(
.QTYPE(logic [1+1+LEN_BITS-1:0]),
.QDEPTH(N_OUTSTANDING)
Expand Down Expand Up @@ -167,6 +180,7 @@ AXI4S #(.AXI4S_DATA_BITS(AXI_NET_BITS)) axis_ddr_wr ();
// I/O !!! interface
// --------------------------------------------------------------------------------

// Queue for data towards the HLS-stack
axis_data_fifo_512 inst_data_que_net (
.s_axis_aresetn(aresetn),
.s_axis_aclk(aclk),
Expand All @@ -182,6 +196,7 @@ axis_data_fifo_512 inst_data_que_net (
.m_axis_tlast (m_axis_net.tlast)
);

// Queue for data towards the top-level module
axis_data_fifo_512 inst_data_que_ddr (
.s_axis_aresetn(aresetn),
.s_axis_aclk(aclk),
Expand All @@ -197,7 +212,7 @@ axis_data_fifo_512 inst_data_que_ddr (
.m_axis_tlast (m_axis_ddr.tlast)
);

// REG
// REG - sequential update of the FSM registers on the rising clock edge (state resets to ST_IDLE)
always_ff @(posedge aclk) begin: PROC_REG
if (aresetn == 1'b0) begin
state_C <= ST_IDLE;
Expand All @@ -214,14 +229,16 @@ always_ff @(posedge aclk) begin: PROC_REG
end
end

// NSL
// NSL - state transition function
always_comb begin: NSL
state_N = state_C;

case(state_C)
// If there's a valid request coming from the source, switch to MUX-state
ST_IDLE:
state_N = (seq_src_valid) ? ST_MUX : ST_IDLE;

// If done, switch back to IDLE
ST_MUX:
state_N = tr_done ? (seq_src_valid ? ST_MUX : ST_IDLE) : ST_MUX;

Expand All @@ -234,7 +251,7 @@ always_comb begin: DP
actv_N = actv_C;
rd_N = rd_C;

// Transfer done
// Transfer done if the counter-value is at 0 and interfaces are ready
tr_done = (cnt_C == 0) &&
(actv_C ?
(rd_C ? (s_axis_user_rsp.tvalid & s_axis_user_rsp.tready) :
Expand All @@ -245,6 +262,7 @@ always_comb begin: DP

case(state_C)
ST_IDLE: begin
// Get the values for the counter etc. from the sink/source-queue
if(seq_src_valid) begin
seq_src_ready = 1'b1;
rd_N = rd_next;
Expand All @@ -255,7 +273,9 @@ always_comb begin: DP

ST_MUX: begin
if(tr_done) begin
// If done, set the counter next to 0
cnt_N = 0;
// Get the next values from the sink/source-queue
if(seq_src_valid) begin
seq_src_ready = 1'b1;
rd_N = rd_next;
Expand All @@ -264,6 +284,7 @@ always_comb begin: DP
end
end
else begin
// If not done, decrement the counter according to transmission state on the data-ports
cnt_N = actv_C ?
(rd_C ? ( (s_axis_user_rsp.tvalid & s_axis_user_rsp.tready ? cnt_C - 1 : cnt_C) ) :
( (s_axis_user_req.tvalid & s_axis_user_req.tready ? cnt_C - 1 : cnt_C) ) ) :
Expand Down Expand Up @@ -314,10 +335,12 @@ always_comb begin
end
end

// MUX: Decide which data is forwarded towards the HLS-networking-stack
assign axis_net.tdata = actv_C ? (rd_C ? s_axis_user_rsp.tdata : s_axis_user_req.tdata) : s_axis_ddr.tdata;
assign axis_net.tkeep = actv_C ? (rd_C ? s_axis_user_rsp.tkeep : s_axis_user_req.tkeep) : s_axis_ddr.tkeep;
assign axis_net.tlast = actv_C ? (rd_C ? s_axis_user_rsp.tlast : s_axis_user_req.tlast) : s_axis_ddr.tlast;

// Data loop-back: the s_axis_user_req data is forwarded unchanged to axis_ddr_wr. NOTE(review): exact purpose unclear - it seems to loop data from the top-level module back to the top-level module
assign axis_ddr_wr.tdata = s_axis_user_req.tdata;
assign axis_ddr_wr.tkeep = s_axis_user_req.tkeep;
assign axis_ddr_wr.tlast = s_axis_user_req.tlast;
Expand All @@ -326,12 +349,10 @@ assign axis_ddr_wr.tlast = s_axis_user_req.tlast;
// DEBUG
//

/*
create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_retrans
set_property -dict [list CONFIG.C_PROBE29_WIDTH {22} CONFIG.C_PROBE23_WIDTH {28} CONFIG.C_NUM_OF_PROBES {35} CONFIG.Component_Name {ila_retrans} CONFIG.C_EN_STRG_QUAL {1} CONFIG.C_PROBE34_MU_CNT {2} CONFIG.C_PROBE33_MU_CNT {2} CONFIG.C_PROBE32_MU_CNT {2} CONFIG.C_PROBE31_MU_CNT {2} CONFIG.C_PROBE30_MU_CNT {2} CONFIG.C_PROBE29_MU_CNT {2} CONFIG.C_PROBE28_MU_CNT {2} CONFIG.C_PROBE27_MU_CNT {2} CONFIG.C_PROBE26_MU_CNT {2} CONFIG.C_PROBE25_MU_CNT {2} CONFIG.C_PROBE24_MU_CNT {2} CONFIG.C_PROBE23_MU_CNT {2} CONFIG.C_PROBE22_MU_CNT {2} CONFIG.C_PROBE21_MU_CNT {2} CONFIG.C_PROBE20_MU_CNT {2} CONFIG.C_PROBE19_MU_CNT {2} CONFIG.C_PROBE18_MU_CNT {2} CONFIG.C_PROBE17_MU_CNT {2} CONFIG.C_PROBE16_MU_CNT {2} CONFIG.C_PROBE15_MU_CNT {2} CONFIG.C_PROBE14_MU_CNT {2} CONFIG.C_PROBE13_MU_CNT {2} CONFIG.C_PROBE12_MU_CNT {2} CONFIG.C_PROBE11_MU_CNT {2} CONFIG.C_PROBE10_MU_CNT {2} CONFIG.C_PROBE9_MU_CNT {2} CONFIG.C_PROBE8_MU_CNT {2} CONFIG.C_PROBE7_MU_CNT {2} CONFIG.C_PROBE6_MU_CNT {2} CONFIG.C_PROBE5_MU_CNT {2} CONFIG.C_PROBE4_MU_CNT {2} CONFIG.C_PROBE3_MU_CNT {2} CONFIG.C_PROBE2_MU_CNT {2} CONFIG.C_PROBE1_MU_CNT {2} CONFIG.C_PROBE0_MU_CNT {2} CONFIG.ALL_PROBE_SAME_MU_CNT {2}] [get_ips ila_retrans]
*/

/*
// create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_retrans
// set_property -dict [list CONFIG.C_DATA_DEPTH {8192} CONFIG.C_PROBE29_WIDTH {22} CONFIG.C_PROBE23_WIDTH {28} CONFIG.C_NUM_OF_PROBES {35} CONFIG.Component_Name {ila_retrans} CONFIG.C_EN_STRG_QUAL {1} CONFIG.C_PROBE34_MU_CNT {2} CONFIG.C_PROBE33_MU_CNT {2} CONFIG.C_PROBE32_MU_CNT {2} CONFIG.C_PROBE31_MU_CNT {2} CONFIG.C_PROBE30_MU_CNT {2} CONFIG.C_PROBE29_MU_CNT {2} CONFIG.C_PROBE28_MU_CNT {2} CONFIG.C_PROBE27_MU_CNT {2} CONFIG.C_PROBE26_MU_CNT {2} CONFIG.C_PROBE25_MU_CNT {2} CONFIG.C_PROBE24_MU_CNT {2} CONFIG.C_PROBE23_MU_CNT {2} CONFIG.C_PROBE22_MU_CNT {2} CONFIG.C_PROBE21_MU_CNT {2} CONFIG.C_PROBE20_MU_CNT {2} CONFIG.C_PROBE19_MU_CNT {2} CONFIG.C_PROBE18_MU_CNT {2} CONFIG.C_PROBE17_MU_CNT {2} CONFIG.C_PROBE16_MU_CNT {2} CONFIG.C_PROBE15_MU_CNT {2} CONFIG.C_PROBE14_MU_CNT {2} CONFIG.C_PROBE13_MU_CNT {2} CONFIG.C_PROBE12_MU_CNT {2} CONFIG.C_PROBE11_MU_CNT {2} CONFIG.C_PROBE10_MU_CNT {2} CONFIG.C_PROBE9_MU_CNT {2} CONFIG.C_PROBE8_MU_CNT {2} CONFIG.C_PROBE7_MU_CNT {2} CONFIG.C_PROBE6_MU_CNT {2} CONFIG.C_PROBE5_MU_CNT {2} CONFIG.C_PROBE4_MU_CNT {2} CONFIG.C_PROBE3_MU_CNT {2} CONFIG.C_PROBE2_MU_CNT {2} CONFIG.C_PROBE1_MU_CNT {2} CONFIG.C_PROBE0_MU_CNT {2} CONFIG.ALL_PROBE_SAME_MU_CNT {2}] [get_ips ila_retrans]

ila_retrans inst_ila_retrans (
.clk(aclk),

Expand Down Expand Up @@ -379,6 +400,5 @@ ila_retrans inst_ila_retrans (
.probe33(req_user.ready),
.probe34(req_user.valid)
);
*/

endmodule
19 changes: 15 additions & 4 deletions hw/hdl/network/rdma/roce_stack.sv
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,9 @@ assign rdma_wr_req.ready = m_rdma_wr_req.ready;
//


/* ila_rdma inst_ila_rdma (
/*
ila_rdma inst_ila_rdma (
.clk(nclk),

.probe0(s_rdma_sq.valid),
.probe1(s_rdma_sq.ready),
.probe2(rdma_sq.valid),
Expand Down Expand Up @@ -261,8 +261,19 @@ assign rdma_wr_req.ready = m_rdma_wr_req.ready;
.probe34(s_rdma_conn_interface.valid),
.probe35(s_rdma_conn_interface.ready),
.probe36(rdma_rd_req.data), // 128
); */

.probe37(rdma_wr_req.data), // 128
.probe38(s_axis_rx.tvalid),
.probe39(s_axis_rx.tready),
.probe40(s_axis_rx.tdata), // 512
.probe41(s_axis_rx.tkeep), // 64
.probe42(s_axis_rx.tlast),
.probe43(m_axis_tx.tvalid),
.probe44(m_axis_tx.tready),
.probe45(m_axis_tx.tdata), // 512
.probe46(m_axis_tx.tkeep), // 64
.probe47(m_axis_tx.tlast)
);
*/

metaIntf #(.STYPE(logic[103:0])) m_axis_dbg_0 ();
metaIntf #(.STYPE(logic[103:0])) m_axis_dbg_1 ();
Expand Down
Loading