From 1ba38805103f6b1678cdd750bb9a5c4f8dc908d2 Mon Sep 17 00:00:00 2001
From: Maximilian <hmaximili@ethz.ch>
Date: Tue, 17 Sep 2024 14:22:36 +0200
Subject: [PATCH] Final update for programming-script, added
 debugging-instructions to the Readme

---
 README.md                                     | 29 +++++++++++++++++++
 examples_hw/apps/rdma_perf/init_ip.tcl        |  2 +-
 examples_sw/apps/rdma_service/client/main.cpp |  2 +-
 hw/hdl/network/rdma/roce_stack.sv             |  4 +--
 program_coyote.sh                             |  2 +-
 5 files changed, 34 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 382f3371..85f99f0f 100644
--- a/README.md
+++ b/README.md
@@ -164,6 +164,35 @@ It is important to know that latency-measurements are conducted using a ping-pon
 
 #### tcp_iperf
 
+## Coyote v2 Hardware-Debugging
+Coyote can be debugged at the hardware level using the AMD ILA / ChipScope cores. This requires interaction with the Vivado GUI, so it is important to know how to access the different project files, include ILA cores and trigger a rebuild of the bitstream:
+
+#### Shell (Static and Dynamic Layer)
+Open the Vivado GUI and click `Open Project`. The required file is located within the previously generated hardware-build directory, at `.../<Name of HW-build folder>/test_shell/test.xpr` and should now be selected for opening the shell-project. 
+
+###### Creating a new ILA
+The `Sources` tab in the GUI can now be used to navigate to any file that is part of the shell, e.g. the networking stacks. There, a new ILA can be placed by including the following module template in the source code:
+~~~~
+ila_<name> inst_ila_<name> (
+  .clk(nclk),
+  .probe0(<Signal #1>), 
+  .probe1(<Signal #2>), 
+  ...
+); 
+~~~~
+It makes sense to annotate (in comments) the bitwidth of each signal, since this information is required for the instantiation of the ILA-IP.
+In the next step, select the tab `IP Catalog` from the section `PROJECT MANAGER` on the left side of the GUI, search for `ILA` and select the first found item ("ILA (Integrated Logic Analyzer)"). Then, enter the "Component Name" that was previously used for the instantiation of the module in hardware ("ila_<name>"), and select the right number of probes and the desired sample data depth. Afterwards, assign the right bitwidth to all probes in the different tabs of the interface. Finally, you can start an `Out of context per IP` run by clicking `Generate` in the next interface. Once this run has finished, the bitstream generation can be restarted via
+~~~~
+$ make bitgen
+~~~~
+in the original build-directory as described before. This build-process is expected to be considerably faster than the original run. Once it's finished, the new ILA should be accessible for testing: 
+
+###### Using an ILA for debugging
+In the project-interface of the GUI click on `Open Hardware Manager` and select "Open target" in the top-dialogue. If you're logged into a machine with a locally attached FPGA, select `Auto Connect`, otherwise choose `Open New Target` to connect to a remote machine with an FPGA via the network. Once the connection is established, you'll be able to select the specific ILA from the `Hardware` tab on the left side of the hardware manager. This opens a waveform display, where the capture settings and the trigger setup can be selected. This allows you to set up a data capture customized to the desired experiment or debugging purpose.
+
+#### Application Layer
+The application layer Vivado-project can be opened via `.../<Name of HW-build folder>/test_config_0/user_c0_0/test.xpr`. The subsequent steps for creating and using new ILAs are then identical to what's described above. 
+
 ## Deploying on the ETHZ HACC-cluster 
 The ETHZ HACC is a premiere cluster for research in systems, architecture and applications (https://github.com/fpgasystems/hacc/tree/main). Its hardware equipment provides the ideal environment to run Coyote-based experiments, since users can book up to 10 servers with U55C-accelerator cards connected via a fully switched 100G-network. User accounts for this platform can be obtained following the explanation on the previously cited homepage. 
 
diff --git a/examples_hw/apps/rdma_perf/init_ip.tcl b/examples_hw/apps/rdma_perf/init_ip.tcl
index a4d1815f..fdcb14e1 100644
--- a/examples_hw/apps/rdma_perf/init_ip.tcl
+++ b/examples_hw/apps/rdma_perf/init_ip.tcl
@@ -1,2 +1,2 @@
 create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_0
-set_property -dict [list CONFIG.C_PROBE17_WIDTH {128} CONFIG.C_PROBE14_WIDTH {128} CONFIG.C_NUM_OF_PROBES {20} CONFIG.C_EN_STRG_QUAL {1} CONFIG.C_PROBE19_MU_CNT {2} CONFIG.C_PROBE18_MU_CNT {2} CONFIG.C_PROBE17_MU_CNT {2} CONFIG.C_PROBE16_MU_CNT {2} CONFIG.C_PROBE15_MU_CNT {2} CONFIG.C_PROBE14_MU_CNT {2} CONFIG.C_PROBE13_MU_CNT {2} CONFIG.C_PROBE12_MU_CNT {2} CONFIG.C_PROBE11_MU_CNT {2} CONFIG.C_PROBE10_MU_CNT {2} CONFIG.C_PROBE9_MU_CNT {2} CONFIG.C_PROBE8_MU_CNT {2} CONFIG.C_PROBE7_MU_CNT {2} CONFIG.C_PROBE6_MU_CNT {2} CONFIG.C_PROBE5_MU_CNT {2} CONFIG.C_PROBE4_MU_CNT {2} CONFIG.C_PROBE3_MU_CNT {2} CONFIG.C_PROBE2_MU_CNT {2} CONFIG.C_PROBE1_MU_CNT {2} CONFIG.C_PROBE0_MU_CNT {2} CONFIG.ALL_PROBE_SAME_MU_CNT {2}] [get_ips ila_0]
\ No newline at end of file
+set_property -dict [list CONFIG.C_PROBE17_WIDTH {128} CONFIG.C_PROBE14_WIDTH {128} CONFIG.C_NUM_OF_PROBES {20} CONFIG.C_EN_STRG_QUAL {1} CONFIG.C_PROBE19_MU_CNT {2} CONFIG.C_PROBE18_MU_CNT {2} CONFIG.C_PROBE17_MU_CNT {2} CONFIG.C_PROBE16_MU_CNT {2} CONFIG.C_PROBE15_MU_CNT {2} CONFIG.C_PROBE14_MU_CNT {2} CONFIG.C_PROBE13_MU_CNT {2} CONFIG.C_PROBE12_MU_CNT {2} CONFIG.C_PROBE11_MU_CNT {2} CONFIG.C_PROBE10_MU_CNT {2} CONFIG.C_PROBE9_MU_CNT {2} CONFIG.C_PROBE8_MU_CNT {2} CONFIG.C_PROBE7_MU_CNT {2} CONFIG.C_PROBE6_MU_CNT {2} CONFIG.C_PROBE5_MU_CNT {2} CONFIG.C_PROBE4_MU_CNT {2} CONFIG.C_PROBE3_MU_CNT {2} CONFIG.C_PROBE2_MU_CNT {2} CONFIG.C_PROBE1_MU_CNT {2} CONFIG.C_PROBE0_MU_CNT {2} CONFIG.ALL_PROBE_SAME_MU_CNT {2} CONFIG.C_DATA_DEPTH {4096}] [get_ips ila_0] ;# C_DATA_DEPTH: sample depth of the ILA IP (CONTROL.DATA_DEPTH is a hw_ila runtime property, not an IP parameter)
\ No newline at end of file
diff --git a/examples_sw/apps/rdma_service/client/main.cpp b/examples_sw/apps/rdma_service/client/main.cpp
index 26163641..37a194dd 100644
--- a/examples_sw/apps/rdma_service/client/main.cpp
+++ b/examples_sw/apps/rdma_service/client/main.cpp
@@ -234,7 +234,7 @@ int main(int argc, char *argv[])
 
         // Generate the required output based on the statistical data from the benchmarking tool 
         std::cout << std::fixed << std::setprecision(2);
-        std::cout << std::setw(8) << sg.rdma.len << " [bytes], thoughput: " 
+        std::cout << std::setw(8) << sg.rdma.len << " [bytes], throughput: " 
                     << std::setw(8) << ((1 + oper) * ((1000 * sg.rdma.len ))) / ((bench.getAvg()) / n_reps_thr) << " [MB/s], latency: ";
 
         // Sync - reset the completion counter from the thread, sync-up via ACK-handshakes 
diff --git a/hw/hdl/network/rdma/roce_stack.sv b/hw/hdl/network/rdma/roce_stack.sv
index de11b543..69d45858 100644
--- a/hw/hdl/network/rdma/roce_stack.sv
+++ b/hw/hdl/network/rdma/roce_stack.sv
@@ -220,7 +220,7 @@ assign rdma_wr_req.ready = m_rdma_wr_req.ready;
 // RoCE stack
 //
 
-/*
+
 ila_rdma inst_ila_rdma (
   .clk(nclk),
 
@@ -263,7 +263,7 @@ ila_rdma inst_ila_rdma (
   .probe36(rdma_rd_req.data), // 128
   .probe37(rdma_wr_req.data) // 128
 );
-*/
+
 
 metaIntf #(.STYPE(logic[103:0])) m_axis_dbg_0 ();
 metaIntf #(.STYPE(logic[103:0])) m_axis_dbg_1 ();
diff --git a/program_coyote.sh b/program_coyote.sh
index b0e72de1..b28b6081 100644
--- a/program_coyote.sh
+++ b/program_coyote.sh
@@ -56,7 +56,7 @@ if [ $DRV_INSERT -eq 1 ]; then
     echo "***"
     echo "** IP_ADDRESS: $DEVICE_1_IP_ADDRESS_HEX_0"
     echo "** MAC_ADDRESS: $DEVICE_1_MAC_ADDRESS_0"
-    sgutil program driver -m $DRV_PATH -p ip_addr=$DEVICE_1_IP_ADDRESS_HEX_0,mac_addr=$DEVICE_1_MAC_ADDRESS_0
+    sgutil program driver -i $DRV_PATH -p ip_addr=$DEVICE_1_IP_ADDRESS_HEX_0,mac_addr=$DEVICE_1_MAC_ADDRESS_0
     # sgutil program driver -m $DRV_PATH
     echo "***"
     echo "** Driver loaded "