From 94ed6c3872d2784f1d40dfdfaf334f12a4501dd2 Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Tue, 30 Jul 2024 13:00:02 +0200 Subject: [PATCH 1/3] Fixed perf_fpga --- examples_hw/apps/perf_fpga/vfpga_top.svh | 8 ++++---- examples_sw/apps/perf_fpga/main.cpp | 6 +++--- scripts/wr_hdl/template_gen/lynx_pkg_tmplt.txt | 9 ++++++--- sw/include/cDefs.hpp | 10 +++++----- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/examples_hw/apps/perf_fpga/vfpga_top.svh b/examples_hw/apps/perf_fpga/vfpga_top.svh index c27727e7..d136c62d 100644 --- a/examples_hw/apps/perf_fpga/vfpga_top.svh +++ b/examples_hw/apps/perf_fpga/vfpga_top.svh @@ -160,8 +160,8 @@ end always_comb begin // Requests sq_rd.data = 0; - sq_rd.data.opcode = 5'd1; - sq_rd.data.strm = 2'b0; + sq_rd.data.opcode = LOCAL_READ; + sq_rd.data.strm = STRM_HOST; sq_rd.data.mode = 0; sq_rd.data.rdma = 0; sq_rd.data.remote = 0; @@ -173,8 +173,8 @@ always_comb begin sq_rd.valid = (state_C == ST_READ) && ~done_req; sq_wr.data = 0; - sq_wr.data.opcode = 5'd2; - sq_wr.data.strm = 2'b0; + sq_wr.data.opcode = LOCAL_WRITE; + sq_wr.data.strm = STRM_HOST; sq_wr.data.mode = 0; sq_wr.data.rdma = 0; sq_wr.data.remote = 0; diff --git a/examples_sw/apps/perf_fpga/main.cpp b/examples_sw/apps/perf_fpga/main.cpp index f4978325..d546fb92 100644 --- a/examples_sw/apps/perf_fpga/main.cpp +++ b/examples_sw/apps/perf_fpga/main.cpp @@ -115,12 +115,12 @@ int main(int argc, char *argv[]) while(curr_size <= maxSize) { for(int j = 0; j < nBenchRuns; j++) { time_bench_rd.emplace_back(benchmark_run(cthread, hMem, BenchOper::START_RD)); - time_bench_wr.emplace_back(benchmark_run(cthread, hMem, BenchOper::START_WR)); + time_bench_wr.emplace_back(benchmark_run(cthread, hMem, BenchOper::START_WR)); // TODO Check correctness of results } std::cout << std::fixed << std::setprecision(2); std::cout << std::setw(8) << curr_size << " [bytes], RD: " - << std::setw(8) << ((n_reps * 1024 * curr_size) / vctr_avg(time_bench_rd)) << " [MB/s], WR: " - << 
std::setw(8) << ((n_reps * 1024 * curr_size) / vctr_avg(time_bench_wr)) << " [MB/s]" << std::endl; + << std::setw(8) << (((double) n_reps * 1024 * curr_size) / vctr_avg(time_bench_rd)) << " [MB/s], WR: " + << std::setw(8) << (((double) n_reps * 1024 * curr_size) / vctr_avg(time_bench_wr)) << " [MB/s]" << std::endl; time_bench_rd.clear(); time_bench_wr.clear(); diff --git a/scripts/wr_hdl/template_gen/lynx_pkg_tmplt.txt b/scripts/wr_hdl/template_gen/lynx_pkg_tmplt.txt index 74e5644b..1671e35c 100644 --- a/scripts/wr_hdl/template_gen/lynx_pkg_tmplt.txt +++ b/scripts/wr_hdl/template_gen/lynx_pkg_tmplt.txt @@ -189,6 +189,9 @@ package lynxTypes; parameter integer OPCODE_BITS = 5; parameter integer STRM_BITS = 2; + parameter integer LOCAL_READ = 1; + parameter integer LOCAL_WRITE = 2; + parameter integer STRM_CARD = 0; parameter integer STRM_HOST = 1; parameter integer STRM_TCP = 2; @@ -390,8 +393,8 @@ package lynxTypes; typedef struct packed { // Opcode - logic [OPCODE_BITS-1:0] opcode; - logic [STRM_BITS-1:0] strm; + logic [OPCODE_BITS-1:0] opcode; // One of the values of fpga::CoyoteOper + logic [STRM_BITS-1:0] strm; // One of STRM_CARD, STRM_HOST, STRM_TCP, or STRM_RDMA logic mode; logic rdma; logic remote; @@ -399,7 +402,7 @@ package lynxTypes; // ID logic [DEST_BITS-1:0] vfid; // rsrvd logic [PID_BITS-1:0] pid; - logic [DEST_BITS-1:0] dest; + logic [DEST_BITS-1:0] dest; // The index of the AXI stream that data arrives at/departs from // FLAGS logic last; diff --git a/sw/include/cDefs.hpp b/sw/include/cDefs.hpp index 7321daf0..b52858c6 100644 --- a/sw/include/cDefs.hpp +++ b/sw/include/cDefs.hpp @@ -170,11 +170,11 @@ enum class CoyoteOperNew { enum class CoyoteOper { NOOP = 0, - LOCAL_READ = 1, - LOCAL_WRITE = 2, - LOCAL_TRANSFER = 3, - LOCAL_OFFLOAD = 4, - LOCAL_SYNC = 5, + LOCAL_READ = 1, // Transfer data from CPU or FPGA memory to FPGA stream (depending on sgEntry.local.src_stream) + LOCAL_WRITE = 2, // Transfer data from FPGA stream to CPU or FPGA memory 
(depending on sgEntry.local.dst_stream) + LOCAL_TRANSFER = 3, // LOCAL_READ and LOCAL_WRITE in parallel + LOCAL_OFFLOAD = 4, // Transfer data from CPU memory to FPGA memory + LOCAL_SYNC = 5, // Transfer data from FPGA memory to CPU memory REMOTE_RDMA_READ = 6, REMOTE_RDMA_WRITE = 7, REMOTE_RDMA_SEND = 8, From b9c3c98bc23d945188a43c3e91ab4a729f882a81 Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Tue, 30 Jul 2024 15:00:34 +0200 Subject: [PATCH 2/3] Fixed perf_local benchmark --- examples_sw/apps/perf_local/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_sw/apps/perf_local/main.cpp b/examples_sw/apps/perf_local/main.cpp index 36c1354f..89063e5f 100644 --- a/examples_sw/apps/perf_local/main.cpp +++ b/examples_sw/apps/perf_local/main.cpp @@ -207,7 +207,7 @@ int main(int argc, char *argv[]) }; bench.runtime(benchmark_thr); std::cout << std::fixed << std::setprecision(2); - std::cout << "Size: " << std::setw(8) << curr_size << ", thr: " << std::setw(8) << (n_regions * 1000 * curr_size) / (bench.getAvg() / n_reps_thr) << " MB/s"; + std::cout << "Size: " << std::setw(8) << curr_size << ", thr: " << std::setw(8) << ((double) n_regions * 1000 * curr_size) / (bench.getAvg() / n_reps_thr) << " MB/s"; #ifndef EN_LAT_TESTS std::cout << std::endl; #endif From fa3e64565931129ea8d7422a7a2d75b6d8140254 Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Wed, 31 Jul 2024 16:06:07 +0200 Subject: [PATCH 3/3] Fixed clock frequency and added write verification --- examples_sw/apps/perf_fpga/main.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples_sw/apps/perf_fpga/main.cpp b/examples_sw/apps/perf_fpga/main.cpp index d546fb92..cec19cdb 100644 --- a/examples_sw/apps/perf_fpga/main.cpp +++ b/examples_sw/apps/perf_fpga/main.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -28,7 +29,7 @@ constexpr auto const defTargetVfid = 0; constexpr auto const nReps = 1; constexpr auto const defSize = 128; // 2^7 
constexpr auto const maxSize = 16 * 1024; -constexpr auto const clkNs = 1000.0 / 300.0; +constexpr auto const clkNs = 1000.0 / 250.0; constexpr auto const nBenchRuns = 100; /** @@ -115,7 +116,14 @@ int main(int argc, char *argv[]) while(curr_size <= maxSize) { for(int j = 0; j < nBenchRuns; j++) { time_bench_rd.emplace_back(benchmark_run(cthread, hMem, BenchOper::START_RD)); - time_bench_wr.emplace_back(benchmark_run(cthread, hMem, BenchOper::START_WR)); // TODO Check correctness of results + memset(hMem, 0xEA, maxSize); + time_bench_wr.emplace_back(benchmark_run(cthread, hMem, BenchOper::START_WR)); + for (size_t i = 0; i < curr_size; i++) { + uint8_t value = (i / 64 + 1) >> std::min(((i % 64) * 8), (size_t) 63); + if (((int8_t *) hMem)[i] != value) { + std::cout << "hMem[" << i << "] value " << (uint32_t) ((uint8_t *) hMem)[i] << " should be " << (uint8_t) value << std::endl; + } + } } std::cout << std::fixed << std::setprecision(2); std::cout << std::setw(8) << curr_size << " [bytes], RD: "