Skip to content

Commit

Permalink
Update of the software + comments to retrans + roce_stack
Browse files Browse the repository at this point in the history
  • Loading branch information
Maximilian committed Oct 9, 2024
1 parent 1ba3880 commit 1a4cb53
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 25 deletions.
14 changes: 13 additions & 1 deletion examples_sw/apps/rdma_service/client/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,9 @@ int main(int argc, char *argv[])
sg.rdma.len = min_size;
sg.rdma.local_stream = strmHost;

// Get a pointer (hMem) to the local virtual address of the queue pair (local.vaddr), used to write values into the payload of the RDMA-packets
uint64_t *hMem = (uint64_t*)(cthread.getQpair()->local.vaddr);

// Set the Coyote Operation, which can either be a REMOTE_WRITE or a REMOTE_READ, depending on the settings for the experiment
CoyoteOper coper = oper ? CoyoteOper::REMOTE_RDMA_WRITE : CoyoteOper::REMOTE_RDMA_READ;;

Expand Down Expand Up @@ -219,6 +222,9 @@ int main(int argc, char *argv[])
# endif
cthread.invoke(coper, &sg);

// Increment the hMem-value
// hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1;

// Check the number of completed RDMA-transactions, wait until all operations have been completed. Check for stalling in-between.
while(cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) < n_reps_thr) {
# ifdef VERBOSE
Expand Down Expand Up @@ -256,10 +262,16 @@ int main(int argc, char *argv[])
std::cout << "rdma_client: invoke the operation " << std::endl;
# endif
cthread.invoke(coper, &sg);

// Increment the last 64-bit word of the payload (hMem index sg.rdma.len/8-1)
hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1;

bool message_written = false;
while(cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) {
# ifdef VERBOSE
std::cout << "rdma_client: Current number of completed operations: " << cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) << std::endl;
# endif
# endif

// As long as the completion is not yet received, check for a possible stall-event
if( stalled.load() ) throw std::runtime_error("Stalled, SIGINT caught");
}
Expand Down
16 changes: 15 additions & 1 deletion examples_sw/apps/rdma_service/server/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ int main(int argc, char *argv[])
memset(&sg, 0, sizeof(rdmaSg));
sg.rdma.len = min_size; sg.rdma.local_stream = strmHost;

// Get a memory handle to manipulate values in the RDMA payloads
uint64_t *hMem = (uint64_t*)(cthread->getQpair()->local.vaddr);

while(sg.rdma.len <= max_size) {
// Sync via the cThread that is part of the cService-daemon that was just started in the background
# ifdef VERBOSE
Expand Down Expand Up @@ -161,7 +164,18 @@ int main(int argc, char *argv[])
// LAT - iterate over the number of ping-pong-exchanges according to the desired experiment setting
for(int i = 0; i < n_reps_lat; i++) {
// Wait for the next incoming WRITE
while(cthread->checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) { }
bool message_written = false;
while(cthread->checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) {
if(!message_written) {
std::cout << "RDMA-Server: Waiting for an incoming RDMA-WRITE at currently " << i << "." << std::endl;
message_written = true;
}
}

// Increment the number in the payload before writing back
hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1;

std::cout << "RDMA-Server: Invoking a RDMA-WRITE from the Server to the Client at currently " << (i+1) << "." << std::endl;
cthread->invoke(CoyoteOper::REMOTE_RDMA_WRITE, &sg);
}
} else {
Expand Down
64 changes: 42 additions & 22 deletions hw/hdl/network/rdma/rdma_mux_retrans.sv
Original file line number Diff line number Diff line change
Expand Up @@ -31,51 +31,60 @@ import lynxTypes::*;

/**
* @brief RDMA retrans multiplexer
* Used for split-up of the interfaces: 1 Interface towards the HLS stack, 2 interfaces exposed to the roce_stack
*
*/
module rdma_mux_retrans (
input logic aclk,
input logic aresetn,

metaIntf.s s_req_net,
metaIntf.m m_req_user,
AXI4S.s s_axis_user_req,
AXI4S.s s_axis_user_rsp,
AXI4S.m m_axis_net,

metaIntf.m m_req_ddr_rd,
metaIntf.m m_req_ddr_wr,
AXI4S.s s_axis_ddr,
AXI4S.m m_axis_ddr
metaIntf.s s_req_net, // Incoming read requests from the HLS-stack
metaIntf.m m_req_user, // Outgoing read requests to the roce_stack
AXI4S.s s_axis_user_req, // Incoming data (rd_req) from the roce_stack
AXI4S.s s_axis_user_rsp, // Incoming data (rd_rsp) from the roce_stack
AXI4S.m m_axis_net, // Outgoing data to the HLS-stack

metaIntf.m m_req_ddr_rd, // Outgoing read commands to the roce_stack
metaIntf.m m_req_ddr_wr, // Outgoing write commands to the roce_stack
AXI4S.s s_axis_ddr, // Incoming data (mem_rd) from the roce_stack
AXI4S.m m_axis_ddr // Outgoing data (mem_wr) to the roce_stack

// Write data from the HLS-stack to the roce_stack is forwarded directly, as are WRITE-requests / commands
);

// Number of outstanding RDMA write requests, and the number of bits ($clog2) needed to count them
localparam integer RDMA_N_OST = RDMA_N_WR_OUTSTANDING;
localparam integer RDMA_OST_BITS = $clog2(RDMA_N_OST);

// sink and source signals for requests and commands
logic seq_snk_valid;
logic seq_snk_ready;
logic seq_src_valid;
logic seq_src_ready;

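// Length, active and read-response flags: *_snk are taken from the incoming s_req_net request, *_next are presumably the matching values at the output of the request queue (TODO confirm)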
logic [LEN_BITS-1:0] len_snk;
logic [LEN_BITS-1:0] len_next;
logic actv_snk;
logic actv_next;
logic rd_snk;
logic rd_next;

// Signals to connect to the queues that lead to the control signals toward the top-level module
metaIntf #(.STYPE(req_t)) req_user ();
metaIntf #(.STYPE(logic[MEM_CMD_BITS-1:0])) req_ddr_rd ();
metaIntf #(.STYPE(logic[MEM_CMD_BITS-1:0])) req_ddr_wr ();

// --------------------------------------------------------------------------------
// I/O !!! interface
// --------------------------------------------------------------------------------

// Queues for all control interfaces to / from the top-level-design
meta_queue #(.DATA_BITS($bits(req_t))) inst_meta_user_q (.aclk(aclk), .aresetn(aresetn), .s_meta(req_user), .m_meta(m_req_user));
meta_queue #(.DATA_BITS(MEM_CMD_BITS)) inst_meta_ddr_rd_q (.aclk(aclk), .aresetn(aresetn), .s_meta(req_ddr_rd), .m_meta(m_req_ddr_rd));
meta_queue #(.DATA_BITS(MEM_CMD_BITS)) inst_meta_ddr_wr_q (.aclk(aclk), .aresetn(aresetn), .s_meta(req_ddr_wr), .m_meta(m_req_ddr_wr));


// Get the sink-values from incoming mem-read-command from the HLS-networking stack
assign len_snk = s_req_net.data.len[LEN_BITS-1:0];
assign actv_snk = s_req_net.data.actv;
assign rd_snk = is_opcode_rd_resp(s_req_net.data.opcode);
Expand All @@ -85,8 +94,9 @@ assign rd_snk = is_opcode_rd_resp(s_req_net.data.opcode);
// --------------------------------------------------------------------------------
always_comb begin
if(actv_snk) begin
// User
// User - action initiated by the active signals set in the s_req_net port, which is connected to the HLS-networking-stack
if(rd_snk) begin
// Case: READ RESPONSE
seq_snk_valid = seq_snk_ready & req_user.ready & s_req_net.valid;
req_user.valid = seq_snk_valid;
req_ddr_rd.valid = 1'b0;
Expand All @@ -95,6 +105,7 @@ always_comb begin
s_req_net.ready = seq_snk_ready & req_user.ready;
end
else begin
// Case: WRITE (probably?). But why do you need to request data for this? Shouldn't it be delivered to the stack automatically?
seq_snk_valid = seq_snk_ready & req_ddr_wr.ready & s_req_net.valid;
req_user.valid = 1'b0;
req_ddr_rd.valid = 1'b0;
Expand All @@ -104,7 +115,7 @@ always_comb begin
end
end
else begin
// Retrans
// Retrans - no active signal set in the s_req_net port, indicates a required retransmission
seq_snk_valid = seq_snk_ready & req_ddr_rd.ready & s_req_net.valid;
req_user.valid = 1'b0;
req_ddr_rd.valid = seq_snk_valid;
Expand All @@ -114,6 +125,7 @@ always_comb begin
end
end

// Construct the required control-signals towards the top-level-module from the s_req_net-port that is fed by the HLS-stack
always_comb begin
req_ddr_rd.data = 0;
req_ddr_rd.data[0+:64] = (64'b0 |
Expand All @@ -132,6 +144,7 @@ always_comb begin
req_user.data = s_req_net.data;
end

// Queue for requests with sink and source
queue_stream #(
.QTYPE(logic [1+1+LEN_BITS-1:0]),
.QDEPTH(N_OUTSTANDING)
Expand Down Expand Up @@ -167,6 +180,7 @@ AXI4S #(.AXI4S_DATA_BITS(AXI_NET_BITS)) axis_ddr_wr ();
// I/O !!! interface
// --------------------------------------------------------------------------------

// Queue for data towards the HLS-stack
axis_data_fifo_512 inst_data_que_net (
.s_axis_aresetn(aresetn),
.s_axis_aclk(aclk),
Expand All @@ -182,6 +196,7 @@ axis_data_fifo_512 inst_data_que_net (
.m_axis_tlast (m_axis_net.tlast)
);

// Queue for data towards the top-level module
axis_data_fifo_512 inst_data_que_ddr (
.s_axis_aresetn(aresetn),
.s_axis_aclk(aclk),
Expand All @@ -197,7 +212,7 @@ axis_data_fifo_512 inst_data_que_ddr (
.m_axis_tlast (m_axis_ddr.tlast)
);

// REG
// REG - register update: on every rising clock edge, reset the FSM state to ST_IDLE or advance the registered values
always_ff @(posedge aclk) begin: PROC_REG
if (aresetn == 1'b0) begin
state_C <= ST_IDLE;
Expand All @@ -214,14 +229,16 @@ always_ff @(posedge aclk) begin: PROC_REG
end
end

// NSL
// NSL - state transition function
always_comb begin: NSL
state_N = state_C;

case(state_C)
// If there's a valid request coming from the source, switch to MUX-state
ST_IDLE:
state_N = (seq_src_valid) ? ST_MUX : ST_IDLE;

// If the transfer is done, switch back to IDLE - unless another request is already valid, in which case stay in MUX
ST_MUX:
state_N = tr_done ? (seq_src_valid ? ST_MUX : ST_IDLE) : ST_MUX;

Expand All @@ -234,7 +251,7 @@ always_comb begin: DP
actv_N = actv_C;
rd_N = rd_C;

// Transfer done
// Transfer done if the counter-value is at 0 and interfaces are ready
tr_done = (cnt_C == 0) &&
(actv_C ?
(rd_C ? (s_axis_user_rsp.tvalid & s_axis_user_rsp.tready) :
Expand All @@ -245,6 +262,7 @@ always_comb begin: DP

case(state_C)
ST_IDLE: begin
// Get the values for the counter etc. from the sink/source-queue
if(seq_src_valid) begin
seq_src_ready = 1'b1;
rd_N = rd_next;
Expand All @@ -255,7 +273,9 @@ always_comb begin: DP

ST_MUX: begin
if(tr_done) begin
// If done, set the counter next to 0
cnt_N = 0;
// Get the next values from the sink/source-queue
if(seq_src_valid) begin
seq_src_ready = 1'b1;
rd_N = rd_next;
Expand All @@ -264,6 +284,7 @@ always_comb begin: DP
end
end
else begin
// If not done, decrement the counter according to transmission state on the data-ports
cnt_N = actv_C ?
(rd_C ? ( (s_axis_user_rsp.tvalid & s_axis_user_rsp.tready ? cnt_C - 1 : cnt_C) ) :
( (s_axis_user_req.tvalid & s_axis_user_req.tready ? cnt_C - 1 : cnt_C) ) ) :
Expand Down Expand Up @@ -314,10 +335,12 @@ always_comb begin
end
end

// MUX: Decide which data is forwarded towards the HLS-networking-stack
assign axis_net.tdata = actv_C ? (rd_C ? s_axis_user_rsp.tdata : s_axis_user_req.tdata) : s_axis_ddr.tdata;
assign axis_net.tkeep = actv_C ? (rd_C ? s_axis_user_rsp.tkeep : s_axis_user_req.tkeep) : s_axis_ddr.tkeep;
assign axis_net.tlast = actv_C ? (rd_C ? s_axis_user_rsp.tlast : s_axis_user_req.tlast) : s_axis_ddr.tlast;

// Data-loop? Not entirely clear what this is for. It seems to loop data from the top-level module (s_axis_user_req) back towards the top-level module
assign axis_ddr_wr.tdata = s_axis_user_req.tdata;
assign axis_ddr_wr.tkeep = s_axis_user_req.tkeep;
assign axis_ddr_wr.tlast = s_axis_user_req.tlast;
Expand All @@ -326,12 +349,10 @@ assign axis_ddr_wr.tlast = s_axis_user_req.tlast;
// DEBUG
//

/*
create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_retrans
set_property -dict [list CONFIG.C_PROBE29_WIDTH {22} CONFIG.C_PROBE23_WIDTH {28} CONFIG.C_NUM_OF_PROBES {35} CONFIG.Component_Name {ila_retrans} CONFIG.C_EN_STRG_QUAL {1} CONFIG.C_PROBE34_MU_CNT {2} CONFIG.C_PROBE33_MU_CNT {2} CONFIG.C_PROBE32_MU_CNT {2} CONFIG.C_PROBE31_MU_CNT {2} CONFIG.C_PROBE30_MU_CNT {2} CONFIG.C_PROBE29_MU_CNT {2} CONFIG.C_PROBE28_MU_CNT {2} CONFIG.C_PROBE27_MU_CNT {2} CONFIG.C_PROBE26_MU_CNT {2} CONFIG.C_PROBE25_MU_CNT {2} CONFIG.C_PROBE24_MU_CNT {2} CONFIG.C_PROBE23_MU_CNT {2} CONFIG.C_PROBE22_MU_CNT {2} CONFIG.C_PROBE21_MU_CNT {2} CONFIG.C_PROBE20_MU_CNT {2} CONFIG.C_PROBE19_MU_CNT {2} CONFIG.C_PROBE18_MU_CNT {2} CONFIG.C_PROBE17_MU_CNT {2} CONFIG.C_PROBE16_MU_CNT {2} CONFIG.C_PROBE15_MU_CNT {2} CONFIG.C_PROBE14_MU_CNT {2} CONFIG.C_PROBE13_MU_CNT {2} CONFIG.C_PROBE12_MU_CNT {2} CONFIG.C_PROBE11_MU_CNT {2} CONFIG.C_PROBE10_MU_CNT {2} CONFIG.C_PROBE9_MU_CNT {2} CONFIG.C_PROBE8_MU_CNT {2} CONFIG.C_PROBE7_MU_CNT {2} CONFIG.C_PROBE6_MU_CNT {2} CONFIG.C_PROBE5_MU_CNT {2} CONFIG.C_PROBE4_MU_CNT {2} CONFIG.C_PROBE3_MU_CNT {2} CONFIG.C_PROBE2_MU_CNT {2} CONFIG.C_PROBE1_MU_CNT {2} CONFIG.C_PROBE0_MU_CNT {2} CONFIG.ALL_PROBE_SAME_MU_CNT {2}] [get_ips ila_retrans]
*/

/*
// create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_retrans
// set_property -dict [list CONFIG.C_DATA_DEPTH {8192} CONFIG.C_PROBE29_WIDTH {22} CONFIG.C_PROBE23_WIDTH {28} CONFIG.C_NUM_OF_PROBES {35} CONFIG.Component_Name {ila_retrans} CONFIG.C_EN_STRG_QUAL {1} CONFIG.C_PROBE34_MU_CNT {2} CONFIG.C_PROBE33_MU_CNT {2} CONFIG.C_PROBE32_MU_CNT {2} CONFIG.C_PROBE31_MU_CNT {2} CONFIG.C_PROBE30_MU_CNT {2} CONFIG.C_PROBE29_MU_CNT {2} CONFIG.C_PROBE28_MU_CNT {2} CONFIG.C_PROBE27_MU_CNT {2} CONFIG.C_PROBE26_MU_CNT {2} CONFIG.C_PROBE25_MU_CNT {2} CONFIG.C_PROBE24_MU_CNT {2} CONFIG.C_PROBE23_MU_CNT {2} CONFIG.C_PROBE22_MU_CNT {2} CONFIG.C_PROBE21_MU_CNT {2} CONFIG.C_PROBE20_MU_CNT {2} CONFIG.C_PROBE19_MU_CNT {2} CONFIG.C_PROBE18_MU_CNT {2} CONFIG.C_PROBE17_MU_CNT {2} CONFIG.C_PROBE16_MU_CNT {2} CONFIG.C_PROBE15_MU_CNT {2} CONFIG.C_PROBE14_MU_CNT {2} CONFIG.C_PROBE13_MU_CNT {2} CONFIG.C_PROBE12_MU_CNT {2} CONFIG.C_PROBE11_MU_CNT {2} CONFIG.C_PROBE10_MU_CNT {2} CONFIG.C_PROBE9_MU_CNT {2} CONFIG.C_PROBE8_MU_CNT {2} CONFIG.C_PROBE7_MU_CNT {2} CONFIG.C_PROBE6_MU_CNT {2} CONFIG.C_PROBE5_MU_CNT {2} CONFIG.C_PROBE4_MU_CNT {2} CONFIG.C_PROBE3_MU_CNT {2} CONFIG.C_PROBE2_MU_CNT {2} CONFIG.C_PROBE1_MU_CNT {2} CONFIG.C_PROBE0_MU_CNT {2} CONFIG.ALL_PROBE_SAME_MU_CNT {2}] [get_ips ila_retrans]

ila_retrans inst_ila_retrans (
.clk(aclk),

Expand Down Expand Up @@ -379,6 +400,5 @@ ila_retrans inst_ila_retrans (
.probe33(req_user.ready),
.probe34(req_user.valid)
);
*/

endmodule
12 changes: 11 additions & 1 deletion hw/hdl/network/rdma/roce_stack.sv
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,17 @@ ila_rdma inst_ila_rdma (
.probe34(s_rdma_conn_interface.valid),
.probe35(s_rdma_conn_interface.ready),
.probe36(rdma_rd_req.data), // 128
.probe37(rdma_wr_req.data) // 128
.probe37(rdma_wr_req.data), // 128
.probe38(s_axis_rx.tvalid),
.probe39(s_axis_rx.tready),
.probe40(s_axis_rx.tdata), // 512
.probe41(s_axis_rx.tkeep), // 64
.probe42(s_axis_rx.tlast),
.probe43(m_axis_tx.tvalid),
.probe44(m_axis_tx.tready),
.probe45(m_axis_tx.tdata), // 512
.probe46(m_axis_tx.tkeep), // 64
.probe47(m_axis_tx.tlast)
);


Expand Down

0 comments on commit 1a4cb53

Please sign in to comment.