Further documentation + cmake-adaptation for DDR in U250

fpgasystems · Sep 10, 2024 · 22e59c1 · 22e59c1
1 parent 9174c7b
commit 22e59c1
Show file tree

Hide file tree

Showing 11 changed files with 231 additions and 93 deletions.
diff --git a/README.md b/README.md
@@ -36,7 +36,7 @@ Some of **Coyote's** features:
 
 Full `Vivado/Vitis` suite is needed to build the hardware side of things. Hardware server will be enough for deployment only scenarios. Coyote runs with `Vivado 2022.1`. Previous versions can be used at one's own peril.  
 
-Following AMD platforms are supported: `vcu118`, `Alveo u50`, `Alveo u55c`, `Alveo u200`, `Alveo u250` and `Alveo u280`. Coyote is currently being developed on the HACC cluster at ETH Zurich. For more information and possible external access check out the following link: https://systems.ethz.ch/research/data-processing-on-modern-hardware/hacc.html
+We are currently only actively supporting the AMD `Alveo u55c` accelerator card. Our codebase offers some legacy-support for the following platforms: `vcu118`, `Alveo u50`, `Alveo u200`, `Alveo u250` and `Alveo u280`, but we are not actively working with these cards anymore. Coyote is currently being developed on the HACC cluster at ETH Zurich. For more information and possible external access check out the following link: https://systems.ethz.ch/research/data-processing-on-modern-hardware/hacc.html
 
 
 `CMake` is used for project creation. Additionally `Jinja2` template engine for Python is used for some of the code generation. The API is writen in `C++`, 17 should suffice (for now).

diff --git a/cmake/FindCoyoteHW.cmake b/cmake/FindCoyoteHW.cmake
@@ -220,6 +220,7 @@ macro(validation_checks_hw)
             set(FPGA_PART xcu250-figd2104-2L-e CACHE STRING "FPGA device.")
             set(DDR_SIZE 34)
             set(HBM_SIZE 0)
+            set(N_DDR_CHAN 1)
         elseif(FDEV_NAME STREQUAL "u280")
             set(FPGA_PART xcu280-fsvh2892-2L-e CACHE STRING "FPGA device.")
             set(DDR_SIZE 34)

diff --git a/sw/include/cDefs.hpp b/sw/include/cDefs.hpp
@@ -188,11 +188,11 @@ enum class CoyoteOper {
 
 // What do these classes mean? - it's probably classes of memory allocation (regular, huge page, GPU etc.)
 enum class CoyoteAlloc {
-    REG = 0,
-    THP = 1,
-    HPF = 2,
-    PRM = 3,
-    GPU = 4
+    REG = 0, // Regular
+    THP = 1, // Presumably Transparent Huge Pages (as opposed to the explicitly allocated huge pages of HPF) - TODO confirm
+    HPF = 2, // Huge Page
+    PRM = 3, // Programmable Region Memory
+    GPU = 4  // GPU-memory (required for the FPGA-GPU-DMA)
 };
 
 /* AVX regs */

diff --git a/sw/include/cFunc.hpp b/sw/include/cFunc.hpp
@@ -165,30 +165,39 @@ class cFunc : public bFunc {
 
                 // Request to execute a function 
                 case defOpTask: {
-                    // Expansion
+                    // Tuple that can hold multiple arguments 
                     std::tuple<Args...> msg;
 
+                    // Lambda function to read data from the socket to the receive buffer (most likely arguments for execution)
                     auto f_rd = [&](auto& x){
                         using U = decltype(x);
                         int size_arg = sizeof(U);
 
+                        // Try to accept the incoming messages from the socket. If not possible, log an error. 
                         if(n = read(connfd, recv_buf, size_arg) == size_arg) {
                             memcpy(&x, recv_buf, size_arg);
                         } else {
                             syslog(LOG_ERR, "Request invalid, connfd: %d", connfd);
                         }
                     };
+
+                    // Apply f_rd to each element of msg in turn, so that every argument is read from the socket into the tuple
                     std::apply([=](auto&&... args) {(f_rd(args), ...);}, msg);
 
+                    // Schedule the task for execution in the thread that it belongs to, based on the arguments that were received for it 
                     clients[connfd]->scheduleTask(std::unique_ptr<bTask<Cmpl>>(new auto(std::make_from_tuple<cTask<Cmpl, std::function<Cmpl(cThread<Cmpl>*, Args...)>, Args...>>(std::tuple_cat(
                         std::make_tuple(tid), 
                         std::make_tuple(oid), 
                         std::make_tuple(priority),
                         std::make_tuple(f),
                         msg)))));
 
+                    // While not completed, check for task completion in the associated thread 
                     while(!cmpltd) {
+                        // Check the thread for completion of the scheduled task 
                         cmpltd = clients[connfd]->getTaskCompletedNext(cmpl_tid, cmpl_ev);
+
+                        // If task has been completed, send both the completion tid and completion ev back to the caller, which is cLib through the iTask
                         if(cmpltd) {
                             if(write(connfd, &cmpl_tid, sizeof(int32_t)) != sizeof(int32_t)) {
                                 syslog(LOG_ERR, "Completion tid could not be sent, connfd: %d", connfd);
@@ -198,6 +207,7 @@ class cFunc : public bFunc {
                                 syslog(LOG_ERR, "Completion could not be sent, connfd: %d", connfd);
                             }
                         } else {
+                            // If task has not yet been completed, wait for a certain amount of time before checking again 
                             std::this_thread::sleep_for(std::chrono::nanoseconds(sleepIntervalCompletion));
                         }
                     }
@@ -214,29 +224,39 @@ class cFunc : public bFunc {
         }
 
         syslog(LOG_NOTICE, "Connection %d closing ...", connfd);
-        // Send cleanup
+
+        // Send cleanup - push this connection onto the cleanup queue (cln_q) so that it gets cleaned up later
         mtx_q.lock();
         cln_q.push(connfd);
         mtx_q.unlock();
 
     }
 
+    // Function that cleans up the threads that have finished processing 
     void cleanConns() {
         run_cln = true;
         int connfd;
 
+        // As long as the clean-up runs, get threads to be cleaned from the FIFO and continue cleaning them up 
         while(run_cln) {
+            // Acquire the lock before accessing the cleaning-queue 
             mtx_q.lock();
             if(!cln_q.empty()) {
+                // Get socket from the cleaning-queue
                 connfd = cln_q.front(); cln_q.pop();
+
+                // Wait for the request-thread of this connection to finish (join)
                 reqs[connfd].second.join();
 
+                // Delete the request-thread from the reqs-structure
                 reqs.erase(connfd);
+
+                // Erase the thread from the clients-structure
                 clients.erase(connfd);
             }
             mtx_q.unlock();
 
-
+            // Wait for some time 
             std::this_thread::sleep_for(std::chrono::nanoseconds(sleepIntervalRequests));
         }
     }

diff --git a/sw/include/cLib.hpp b/sw/include/cLib.hpp
@@ -50,6 +50,7 @@ class cLib {
         }
 
         // Set sun-family and sun-path in the server-socket address struct
+        // Which means: This is a local socket for Inter-Process Communication and not a network socket for network communication 
         server.sun_family = AF_UNIX;
         strcpy(server.sun_path, sock_name);
 
@@ -191,6 +192,12 @@ class cLib {
         close(sockfd);
     }
 
+    /**
+     * task, iTask, iCmpl are used for interaction with cFunc: They send a task to cFunc, which then 
+     * places this task in the execution queue of the thread for scheduled execution, waits for the 
+     * completion event and sends back the completion ID and the completion event here to the iCmpl. 
+     */
+
     // Task blocking: Variadic function that takes a priority and an arbitrary number of arguments for further processing 
     // Function is basically the same as iTask, but with a blocking completion-handshake at the end 
     Cmpl task(int32_t priority, Args... msg) {

diff --git a/sw/include/cRnfg.hpp b/sw/include/cRnfg.hpp
@@ -41,21 +41,22 @@ using namespace boost::interprocess;
 namespace fpga {
 
 /* Alias */
+// A bitstream consists of a pointer to memory and its length in combination 
 using bStream = std::pair<void*, uint32_t>; // vaddr*, length
 
 /**
  * @brief Coyote reconfiguration loader
  * 
- * Partial bitstream loader
+ * Partial bitstream loader, required for loading partial bitstreams into the vFPGAs 
  * 
  */
 class cRnfg {
 protected: 
 	/* Fpga device */
-	int32_t fd = { 0 };
-    pid_t pid;
-    uint32_t crid;
-    static std::atomic_uint32_t crid_gen;
+	int32_t fd = { 0 }; // File descriptor
+    pid_t pid; // Process ID
+    uint32_t crid; // Configuration ID (I guess?)
+    static std::atomic_uint32_t crid_gen; // Atomic for Configuration ID, not sure what this is used for 
 
     /* Locks */
     named_mutex mlock; // Internal memory lock
@@ -64,25 +65,25 @@ class cRnfg {
 	std::unordered_map<void*, csAlloc> mapped_pages;
 
 	/* PR */
-	uint8_t readByte(ifstream& fb);
-	bStream readBitstream(ifstream& fb);
-    void reconfigureBase(void* vaddr, uint32_t len, uint32_t vfid = -1);
+	uint8_t readByte(ifstream& fb); // Function to read a byte from an input stream 
+	bStream readBitstream(ifstream& fb); // Function to read a bitstream from an input stream 
+    void reconfigureBase(void* vaddr, uint32_t len, uint32_t vfid = -1); // Function to reconfigure the base of the FPGA via the bitstream (pointer to it), length of the bitstream and vFPGA-ID
 
 	/* Memory alloc */
-	void* getMem(csAlloc&& cs_alloc);
-	void freeMem(void* vaddr);
+	void* getMem(csAlloc&& cs_alloc); // Function to allocate memory via a csAlloc-object as defined in cDefs
+	void freeMem(void* vaddr); // Function to free memory via its start-address
 
 public:
 
 	/**
-	 * @brief Ctor, Dtor
+	 * @brief Ctor, Dtor - Constructor and Destructor 
 	 * 
 	 */
 	cRnfg(uint32_t dev);
 	~cRnfg();
 
 	/**
-	 * @brief Shell reconfiguration
+	 * @brief Shell reconfiguration - function to call for reconfiguration of the shell 
 	*/
 	void shellReconfigure(std::string name);
 

diff --git a/sw/include/cSched.hpp b/sw/include/cSched.hpp
@@ -36,14 +36,17 @@
 #include <queue>
 #include <syslog.h>
 
+// Has the cRnfg for handling bitstreams - might be interesting for further checks 
 #include "cRnfg.hpp"
 
 using namespace std;
 using namespace boost::interprocess;
 
 namespace fpga {
 
-/* Struct */
+/* Struct 
+ * Consists of ctid, oid and priority for scheduling 
+*/
 struct cLoad {
     int32_t ctid;
     int32_t oid;
@@ -53,15 +56,20 @@ struct cLoad {
 /* Schedule reordering */
 class taskCmprSched {
 private:
+
+    // State variables: Priority and bool for reordering
     bool priority;
     bool reorder;
 
 public: 
+
+    // Constructor: Set state variables 
     taskCmprSched(const bool& priority, const bool& reorder) {
         this->priority = priority;
         this->reorder = reorder;
     }
 
+    // Takes pointers to two cLoads as scheduling requests and decides which one has the higher priority 
     bool operator()(const std::unique_ptr<cLoad>& req1, const std::unique_ptr<cLoad>& req2) {
         // Comparison
         if(priority) {
@@ -84,65 +92,75 @@ class taskCmprSched {
  * 
  * This is the main vFPGA scheduler. It schedules submitted user tasks.
  * These tasks trickle down: cTask -> cThread -> cProcess -> cSched -> vFPGA
+ * NOTE: The chain above is outdated - there is no cProcess in Coyote v2
  * 
  */
 class cSched : public cRnfg {
 protected: 
 	/* vFPGA */
+    // vfid as vFPGA-identifier, fcnfg as the configuration of this vFPGA
 	int32_t vfid = { -1 };
 	fCnfg fcnfg;
 
 	/* Locks */
+    // Lock for thread-safe operations 
     named_mutex plock; // Internal vFPGA lock
 
     /* Scheduling */
     const bool priority;
     const bool reorder;
 
     /* Thread */
+    // Thread used for scheduling tasks
     bool run;
     thread scheduler_thread;
 
     /* Scheduler queue */
+    // Queue that stores pointers to load-objects. The order of the queue is calculated using the comparator-operator specified in taskCmprSched
     condition_variable cv_queue;
     mutex mtx_queue;
     priority_queue<std::unique_ptr<cLoad>, vector<std::unique_ptr<cLoad>>, taskCmprSched> request_queue;
 
     /* Scheduling and completion */
     condition_variable cv_rcnfg;
     mutex mtx_rcnfg;
-    int curr_ctid = { -1 };
+    int curr_ctid = { -1 }; // current completion thread ID 
 
     condition_variable cv_cmplt;
     mutex mtx_cmplt;
-    bool curr_run = { false };
+    bool curr_run = { false }; // flag (not an ID); presumably true while a task is currently running - TODO confirm
 
 	/* Partial bitstreams */
+    // Map with all bitstreams 
 	std::unordered_map<int32_t, bStream> bstreams;
 
 	/* PR */
+    // Function for FPGA-reconfiguration based on the operator ID 
 	void reconfigure(int32_t oid);
 
     /* (Thread) Process requests */
+    // Function for processing Requests 
     void processRequests();
 
 public:
 
 	/**
-	 * @brief Ctor, Dtor
+	 * @brief Ctor, Dtor - constructor and destructor
+     * 
+     * Seems like scheduler gets created per vfid and device  
 	 * 
 	 */
 	cSched(int32_t vfid, uint32_t dev, bool priority = true, bool reorder = true);
 	~cSched();
 
     /**
-     * @brief Run
+     * @brief Run - run the scheduler 
      * 
      */
     void runSched();
 
 	/**
-	 * @brief Getters
+	 * @brief Getters - return the vFPGA-ID 
 	 * 
 	 */
 	inline auto getVfid() const { return vfid; }
@@ -152,10 +170,10 @@ class cSched : public cRnfg {
 	 * 
 	 * @param oid : operator ID
 	 */
-	auto isReconfigurable() const { return fcnfg.en_pr; }
-	void addBitstream(std::string name, int32_t oid);
-	void removeBitstream(int32_t oid);	
-	bool checkBitstream(int32_t oid); 
+	auto isReconfigurable() const { return fcnfg.en_pr; } // Checks if a certain vFPGA is reconfigurable 
+	void addBitstream(std::string name, int32_t oid); // Add a new bitstream to the map 
+	void removeBitstream(int32_t oid); // Remove a bitstream based on the operator 
+	bool checkBitstream(int32_t oid); // Check a bitstream (for what?)
 
     /**
      * @brief Schedule operation