From 22e59c1faf22394cf983963f8c1abcf795b06304 Mon Sep 17 00:00:00 2001
From: hmaximili <maximilian.heer@inf.ethz.ch>
Date: Wed, 11 Sep 2024 00:17:13 +0200
Subject: [PATCH] Further documentation + cmake-adaptation for DDR in U250

---
 README.md                |  2 +-
 cmake/FindCoyoteHW.cmake |  1 +
 sw/include/cDefs.hpp     | 10 ++---
 sw/include/cFunc.hpp     | 26 +++++++++++--
 sw/include/cLib.hpp      |  7 ++++
 sw/include/cRnfg.hpp     | 25 +++++++------
 sw/include/cSched.hpp    | 38 ++++++++++++++-----
 sw/include/cService.hpp  | 24 ++++++------
 sw/src/cRnfg.cpp         | 79 +++++++++++++++++++++++++++++++---------
 sw/src/cSched.cpp        | 48 +++++++++++++++++-------
 sw/src/cService.cpp      | 64 ++++++++++++++++++++++----------
 11 files changed, 231 insertions(+), 93 deletions(-)

diff --git a/README.md b/README.md
index 095213e0..896fdbd0 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Some of **Coyote's** features:
 
 Full `Vivado/Vitis` suite is needed to build the hardware side of things. Hardware server will be enough for deployment only scenarios. Coyote runs with `Vivado 2022.1`. Previous versions can be used at one's own peril.  
 
-Following AMD platforms are supported: `vcu118`, `Alveo u50`, `Alveo u55c`, `Alveo u200`, `Alveo u250` and `Alveo u280`. Coyote is currently being developed on the HACC cluster at ETH Zurich. For more information and possible external access check out the following link: https://systems.ethz.ch/research/data-processing-on-modern-hardware/hacc.html
+We are currently only actively supporting the AMD `Alveo u55c` accelerator card. Our codebase offers some legacy-support for the following platforms: `vcu118`, `Alveo u50`, `Alveo u200`, `Alveo u250` and `Alveo u280`, but we are not actively working with these cards anymore. Coyote is currently being developed on the HACC cluster at ETH Zurich. For more information and possible external access check out the following link: https://systems.ethz.ch/research/data-processing-on-modern-hardware/hacc.html
 
 
 `CMake` is used for project creation. Additionally `Jinja2` template engine for Python is used for some of the code generation. The API is writen in `C++`, 17 should suffice (for now).
diff --git a/cmake/FindCoyoteHW.cmake b/cmake/FindCoyoteHW.cmake
index 564cb425..bafd9f2e 100644
--- a/cmake/FindCoyoteHW.cmake
+++ b/cmake/FindCoyoteHW.cmake
@@ -220,6 +220,7 @@ macro(validation_checks_hw)
             set(FPGA_PART xcu250-figd2104-2L-e CACHE STRING "FPGA device.")
             set(DDR_SIZE 34)
             set(HBM_SIZE 0)
+            set(N_DDR_CHAN 1)
         elseif(FDEV_NAME STREQUAL "u280")
             set(FPGA_PART xcu280-fsvh2892-2L-e CACHE STRING "FPGA device.")
             set(DDR_SIZE 34)
diff --git a/sw/include/cDefs.hpp b/sw/include/cDefs.hpp
index 480bad35..2db70fc2 100644
--- a/sw/include/cDefs.hpp
+++ b/sw/include/cDefs.hpp
@@ -188,11 +188,11 @@ enum class CoyoteOper {
 
 // What do these classes mean? - it's probably classes of memory allocation (regular, huge page, GPU etc.)
 enum class CoyoteAlloc {
-    REG = 0,
-    THP = 1,
-    HPF = 2,
-    PRM = 3,
-    GPU = 4
+    REG = 0, // Regular
+    THP = 1, // Presumably Transparent Huge Pages (as opposed to HPF) - TODO confirm
+    HPF = 2, // Huge Page
+    PRM = 3, // Programmable Region Memory
+    GPU = 4  // GPU-memory (required for the FPGA-GPU-DMA)
 };
 
 /* AVX regs */
diff --git a/sw/include/cFunc.hpp b/sw/include/cFunc.hpp
index 55aa7bce..ac1c18a3 100644
--- a/sw/include/cFunc.hpp
+++ b/sw/include/cFunc.hpp
@@ -165,21 +165,26 @@ class cFunc : public bFunc {
 
                 // Request to execute a function 
                 case defOpTask: {
-                    // Expansion
+                    // Tuple that can hold multiple arguments 
                     std::tuple<Args...> msg;
 
+                    // Lambda function to read data from the socket to the receive buffer (most likely arguments for execution)
                     auto f_rd = [&](auto& x){
                         using U = decltype(x);
                         int size_arg = sizeof(U);
 
+                        // Try to accept the incoming messages from the socket. If not possible, log an error. 
                         if(n = read(connfd, recv_buf, size_arg) == size_arg) {
                             memcpy(&x, recv_buf, size_arg);
                         } else {
                             syslog(LOG_ERR, "Request invalid, connfd: %d", connfd);
                         }
                     };
+
+                    // Apply f_rd to every element of the msg tuple, so that each argument is read from the socket into the tuple
                     std::apply([=](auto&&... args) {(f_rd(args), ...);}, msg);
                 
+                    // Schedule the task for execution in the thread that it belongs to, based on the arguments that were received for it 
                     clients[connfd]->scheduleTask(std::unique_ptr<bTask<Cmpl>>(new auto(std::make_from_tuple<cTask<Cmpl, std::function<Cmpl(cThread<Cmpl>*, Args...)>, Args...>>(std::tuple_cat(
                         std::make_tuple(tid), 
                         std::make_tuple(oid), 
@@ -187,8 +192,12 @@ class cFunc : public bFunc {
                         std::make_tuple(f),
                         msg)))));
 
+                    // While not completed, check for task completion in the associated thread 
                     while(!cmpltd) {
+                        // Check the thread for completion of the scheduled task 
                         cmpltd = clients[connfd]->getTaskCompletedNext(cmpl_tid, cmpl_ev);
+
+                        // If task has been completed, send both the completion tid and completion ev back to the caller, which is cLib through the iTask
                         if(cmpltd) {
                             if(write(connfd, &cmpl_tid, sizeof(int32_t)) != sizeof(int32_t)) {
                                 syslog(LOG_ERR, "Completion tid could not be sent, connfd: %d", connfd);
@@ -198,6 +207,7 @@ class cFunc : public bFunc {
                                 syslog(LOG_ERR, "Completion could not be sent, connfd: %d", connfd);
                             }
                         } else {
+                            // If task has not yet been completed, wait for a certain amount of time before checking again 
                             std::this_thread::sleep_for(std::chrono::nanoseconds(sleepIntervalCompletion));
                         }
                     }
@@ -214,29 +224,39 @@ class cFunc : public bFunc {
         }
 
         syslog(LOG_NOTICE, "Connection %d closing ...", connfd);
-        // Send cleanup
+
+        // Send cleanup - enqueue the connection that should be processed in the queue for cleanup
         mtx_q.lock();
         cln_q.push(connfd);
         mtx_q.unlock();
 
     }
 
+    // Function that cleans up the threads that have finished processing 
     void cleanConns() {
         run_cln = true;
         int connfd;
 
+        // As long as the clean-up runs, get threads to be cleaned from the FIFO and continue cleaning them up 
         while(run_cln) {
+            // Acquire the lock before accessing the cleaning-queue
             mtx_q.lock();
             if(!cln_q.empty()) {
+                // Get socket from the cleaning-queue
                 connfd = cln_q.front(); cln_q.pop();
+
+                // Join the request-thread stored in the reqs-structure (waits until it has finished)
                 reqs[connfd].second.join();
 
+                // Delete the request-thread from the reqs-structure
                 reqs.erase(connfd);
+
+                // Erase the thread from the clients-structure
                 clients.erase(connfd);
             }
             mtx_q.unlock();
 
-
+            // Wait for some time 
             std::this_thread::sleep_for(std::chrono::nanoseconds(sleepIntervalRequests));
         }
     }
diff --git a/sw/include/cLib.hpp b/sw/include/cLib.hpp
index 2d04e1a5..afdc66fa 100644
--- a/sw/include/cLib.hpp
+++ b/sw/include/cLib.hpp
@@ -50,6 +50,7 @@ class cLib {
         }
 
         // Set sun-family and sun-path in the server-socket address struct
+        // Which means: This is a local socket for Inter-Process Communication and not a network socket for network communication 
         server.sun_family = AF_UNIX;
         strcpy(server.sun_path, sock_name);
 
@@ -191,6 +192,12 @@ class cLib {
         close(sockfd);
     }
 
+    /**
+     * task, iTask, iCmpl are used for interaction with cFunc: They send a task to cFunc, which then
+     * places this task in the execution queue of the thread for scheduled execution, waits for the
+     * completion event and sends the completion ID and the completion event back here to iCmpl.
+     */
+
     // Task blocking: Variadic function that takes a priority and an arbitrary number of arguments for further processing 
     // Function is basically the same as iTask, but with a blocking completion-handshake at the end 
     Cmpl task(int32_t priority, Args... msg) {
diff --git a/sw/include/cRnfg.hpp b/sw/include/cRnfg.hpp
index b42af18d..e0620ebb 100644
--- a/sw/include/cRnfg.hpp
+++ b/sw/include/cRnfg.hpp
@@ -41,21 +41,22 @@ using namespace boost::interprocess;
 namespace fpga {
 
 /* Alias */
+// A bitstream consists of a pointer to memory and its length in combination 
 using bStream = std::pair<void*, uint32_t>; // vaddr*, length
 
 /**
  * @brief Coyote reconfiguration loader
  * 
- * Partial bitstream loader
+ * Partial bitstream loader, required for loading partial bitstreams into the vFPGAs 
  * 
  */
 class cRnfg {
 protected: 
 	/* Fpga device */
-	int32_t fd = { 0 };
-    pid_t pid;
-    uint32_t crid;
-    static std::atomic_uint32_t crid_gen;
+	int32_t fd = { 0 }; // File descriptor
+    pid_t pid; // Process ID
+    uint32_t crid; // Configuration ID (I guess?)
+    static std::atomic_uint32_t crid_gen; // Atomic counter from which each cRnfg instance draws its crid in the constructor
 
     /* Locks */
     named_mutex mlock; // Internal memory lock
@@ -64,25 +65,25 @@ class cRnfg {
 	std::unordered_map<void*, csAlloc> mapped_pages;
 
 	/* PR */
-	uint8_t readByte(ifstream& fb);
-	bStream readBitstream(ifstream& fb);
-    void reconfigureBase(void* vaddr, uint32_t len, uint32_t vfid = -1);
+	uint8_t readByte(ifstream& fb); // Function to read a byte from an input stream 
+	bStream readBitstream(ifstream& fb); // Function to read a bitstream from an input stream 
+    void reconfigureBase(void* vaddr, uint32_t len, uint32_t vfid = -1); // Function to reconfigure the base of the FPGA via the bitstream (pointer to it), length of the bitstream and vFPGA-ID
 
 	/* Memory alloc */
-	void* getMem(csAlloc&& cs_alloc);
-	void freeMem(void* vaddr);
+	void* getMem(csAlloc&& cs_alloc); // Function to allocate memory via a csAlloc-object as defined in cDefs
+	void freeMem(void* vaddr); // Function to free memory via its start-address
 
 public:
 
 	/**
-	 * @brief Ctor, Dtor
+	 * @brief Ctor, Dtor - Constructor and Destructor 
 	 * 
 	 */
 	cRnfg(uint32_t dev);
 	~cRnfg();
 
 	/**
-	 * @brief Shell reconfiguration
+	 * @brief Shell reconfiguration - function to call for reconfiguration of the shell 
 	*/
 	void shellReconfigure(std::string name);
 
diff --git a/sw/include/cSched.hpp b/sw/include/cSched.hpp
index afaaa596..52af8738 100644
--- a/sw/include/cSched.hpp
+++ b/sw/include/cSched.hpp
@@ -36,6 +36,7 @@
 #include <queue>
 #include <syslog.h>
 
+// Has the cRnfg for handling bitstreams - might be interesting for further checks
 #include "cRnfg.hpp"
 
 using namespace std;
@@ -43,7 +44,9 @@ using namespace boost::interprocess;
 
 namespace fpga {
 
-/* Struct */
+/* Struct 
+ * Consists of ctid, oid and priority for scheduling 
+*/
 struct cLoad {
     int32_t ctid;
     int32_t oid;
@@ -53,15 +56,20 @@ struct cLoad {
 /* Schedule reordering */
 class taskCmprSched {
 private:
+
+    // State variables: Priority and bool for reordering
     bool priority;
     bool reorder;
 
 public: 
+
+    // Constructor: Set state variables 
     taskCmprSched(const bool& priority, const bool& reorder) {
         this->priority = priority;
         this->reorder = reorder;
     }
 
+    // Takes pointers to two cLoads as scheduling requests and decides which one has the higher priority 
     bool operator()(const std::unique_ptr<cLoad>& req1, const std::unique_ptr<cLoad>& req2) {
         // Comparison
         if(priority) {
@@ -84,15 +92,18 @@ class taskCmprSched {
  * 
  * This is the main vFPGA scheduler. It schedules submitted user tasks.
  * These tasks trickle down: cTask -> cThread -> cProcess -> cSched -> vFPGA
+ * NOTE: The chain above is outdated - there is no cProcess in Coyote v2
  * 
  */
 class cSched : public cRnfg {
 protected: 
 	/* vFPGA */
+    // vfid as vFPGA-identifier, fcnfg as the configuration of this vFPGA
 	int32_t vfid = { -1 };
 	fCnfg fcnfg;
 
 	/* Locks */
+    // Lock for thread-safe operations 
     named_mutex plock; // Internal vFPGA lock
 
     /* Scheduling */
@@ -100,10 +111,12 @@ class cSched : public cRnfg {
     const bool reorder;
 
     /* Thread */
+    // Thread used for scheduling tasks
     bool run;
     thread scheduler_thread;
 
     /* Scheduler queue */
+    // Queue that stores pointers to load-objects. The order of the queue is calculated using the comparator-operator specified in taskCmprSched
     condition_variable cv_queue;
     mutex mtx_queue;
     priority_queue<std::unique_ptr<cLoad>, vector<std::unique_ptr<cLoad>>, taskCmprSched> request_queue;
@@ -111,38 +124,43 @@ class cSched : public cRnfg {
     /* Scheduling and completion */
     condition_variable cv_rcnfg;
     mutex mtx_rcnfg;
-    int curr_ctid = { -1 };
+    int curr_ctid = { -1 }; // current completion thread ID 
 
     condition_variable cv_cmplt;
     mutex mtx_cmplt;
-    bool curr_run = { false };
+    bool curr_run = { false }; // current run flag (bool)
 
 	/* Partial bitstreams */
+    // Map with all bitstreams 
 	std::unordered_map<int32_t, bStream> bstreams;
 
 	/* PR */
+    // Function for FPGA-reconfiguration based on the operator ID 
 	void reconfigure(int32_t oid);
 
     /* (Thread) Process requests */
+    // Function for processing Requests 
     void processRequests();
 
 public:
 
 	/**
-	 * @brief Ctor, Dtor
+	 * @brief Ctor, Dtor - constructor and destructor
+     * 
+     * Seems like scheduler gets created per vfid and device  
 	 * 
 	 */
 	cSched(int32_t vfid, uint32_t dev, bool priority = true, bool reorder = true);
 	~cSched();
 
     /**
-     * @brief Run
+     * @brief Run - run the scheduler 
      * 
      */
     void runSched();
 
 	/**
-	 * @brief Getters
+	 * @brief Getters - return the vFPGA-ID
 	 * 
 	 */
 	inline auto getVfid() const { return vfid; }
@@ -152,10 +170,10 @@ class cSched : public cRnfg {
 	 * 
 	 * @param oid : operator ID
 	 */
-	auto isReconfigurable() const { return fcnfg.en_pr; }
-	void addBitstream(std::string name, int32_t oid);
-	void removeBitstream(int32_t oid);	
-	bool checkBitstream(int32_t oid); 
+	auto isReconfigurable() const { return fcnfg.en_pr; } // Checks if a certain vFPGA is reconfigurable 
+	void addBitstream(std::string name, int32_t oid); // Add a new bitstream to the map 
+	void removeBitstream(int32_t oid); // Remove a bitstream based on the operator 
+	bool checkBitstream(int32_t oid); // Check whether a bitstream for this operator ID is present in the map
 
     /**
      * @brief Schedule operation
diff --git a/sw/include/cService.hpp b/sw/include/cService.hpp
index 175acde0..d9b289e0 100644
--- a/sw/include/cService.hpp
+++ b/sw/include/cService.hpp
@@ -31,6 +31,7 @@
 #include <condition_variable>
 #include <any>
 
+// Can use Scheduler, cFunc and cThread
 #include "cSched.hpp"
 #include "bFunc.hpp"
 #include "cThread.hpp"
@@ -43,34 +44,35 @@ namespace fpga {
  * @brief Coyote service
  * 
  * Coyote daemon, provides background scheduling service.
+ * Inherits from cSched (not sure why though)
  * 
  */
 class cService : public cSched {
 protected:
-    // Singleton
+    // Singleton: Important that there's only a single instance of cService per vFPGA that controls all threads registered for this vFPGA
     static cService *cservice;
 
-    // Function map
+    // Function map - can store client threads for the calling function-IDs
     std::unordered_map<int32_t, std::unique_ptr<bFunc>> functions;
 
     // Forks
     pid_t pid;
 
-    // ID
+    // ID - targets a single vFPGA & dev, thus these are global variables / identifiers
     int32_t vfid = { -1 };
     uint32_t dev;
     string service_id;
 
-    // Type
+    // Type - remote connection 
     bool remote = { false };
     uint16_t port;
 
-    // Conn
+    // Conn - connection via a socket, uniquely identified via curr_id
     string socket_name;
     int sockfd;
     int curr_id = { 0 };
 
-    // Notify 
+    // Notify - pointer to user interrupt service routine 
     void (*uisr)(int);
 
     /**
@@ -80,14 +82,14 @@ class cService : public cSched {
    void myHandler(int signum);
 
     /**
-     * @brief Initialize
+     * @brief Initializer for daemon and socket (for connection)
      * 
      */
     void daemonInit();
     void socketInit();
 
     /**
-     * @brief Accept connections
+     * @brief Accept connections - methods for (QP?) exchange with local and remote 
     */
     void acceptConnectionLocal();
     void acceptConnectionRemote();
@@ -100,9 +102,9 @@ class cService : public cSched {
 public:
 
     /**
-     * @brief Creates a service for a single vFPGA
+     * @brief Creates a service for a single vFPGA - execute the protected constructor internally to keep the singleton-property 
      * 
-     * @param vfid - vVFPGA id
+     * @param vfid - vFPGA id
      * @param dev - PCIe device
      * @param priority - priority ordering
      * @param reorder - reordeing of tasks
@@ -127,7 +129,7 @@ class cService : public cSched {
     void addFunction(int32_t fid, std::unique_ptr<bFunc> f);
 
     /**
-     * @brief QP exchange util (blocking)
+     * @brief QP exchange util (blocking) - used on the server side, while the client side forces active exchange via the constructor in cLib
      * 
      */
     static void exchangeQpClient() {}
diff --git a/sw/src/cRnfg.cpp b/sw/src/cRnfg.cpp
index 77b96777..b496a695 100644
--- a/sw/src/cRnfg.cpp
+++ b/sw/src/cRnfg.cpp
@@ -28,13 +28,19 @@ namespace fpga
 	 */
 	cRnfg::cRnfg(uint32_t dev) : mlock(open_or_create, "pr_mtx") {
 		DBG3("cRnfg:  ctor called");
-		// Open
+
+		// Generate a string for the file-descriptor of the programmable region / vFPGA of this FPGA
 		std::string region = "/dev/fpga_" + std::to_string(dev) + "_pr";
+
+		// Open the file-descriptor for the vFPGA
 		fd = open(region.c_str(), O_RDWR | O_SYNC);
 		if (fd == -1)
 			throw std::runtime_error("cRcnfg could not be obtained");
 
+		// Get the Process ID of the calling process 
         pid = getpid();
+
+		// Use the atomic crid_gen to generate a new ID 
         crid = crid_gen++;
 	}
 
@@ -45,7 +51,7 @@ namespace fpga
 	cRnfg::~cRnfg() {
 		DBG3("cRnfg:  dtor called");
 
-		// Mapped
+		// Mapped: Free all obtained memory pages 
 		for (auto &it : mapped_pages)
 		{
 			freeMem(it.first);
@@ -53,6 +59,7 @@ namespace fpga
 
         named_mutex::remove("pr_mtx");
 
+		// Close the file-descriptor
 		close(fd);
 	}
 
@@ -61,45 +68,60 @@ namespace fpga
 	// ======-------------------------------------------------------------------------------
 
 	/**
-	 * @brief Bitstream memory allocation
+	 * @brief Bitstream memory allocation - get memory 
 	 *
-	 * @param cs_alloc - allocatin config
+	 * @param cs_alloc - allocation config as defined in cDefs
 	 * @return void* - pointer to allocated mem
 	 */
 	void* cRnfg::getMem(csAlloc&& cs_alloc)
 	{
+		// Pre-initialize memory that needs to be allocated 
 		void *mem = nullptr;
 		void *memNonAligned = nullptr;
+
+		// 64-Bit Array for temporary storage 
 		uint64_t tmp[maxUserCopyVals];
 		uint32_t size;
 
+		// For a valid requested memory size, proceed with memory allocation 
 		if (cs_alloc.size > 0)
 		{
+			// Store requested size, process ID and cr ID in the temporary array 
 			tmp[0] = static_cast<uint64_t>(cs_alloc.size); // n_pages
             tmp[1] = static_cast<uint64_t>(pid);
             tmp[2] = static_cast<uint64_t>(crid);
 
+			// Switch in this case used to check that the requested memory type is PRM (programmable region memory)
 			switch (cs_alloc.alloc)
 			{
                 case CoyoteAlloc::PRM: { // m lock
 
+					// Close lock for thread-safe memory allocation 
                     mlock.lock();
 
+					// IO-Call to the driver for allocating memory for the programmable region 
                     if (ioctl(fd, IOCTL_ALLOC_PR_MEM, &tmp))
                     {
                         throw std::runtime_error("ioctl_alloc_host_pr_mem mapping failed");
                     }
 
+					// Map the PR memory; one extra huge page is requested, presumably to leave room for the alignment below
                     memNonAligned = mmap(NULL, (cs_alloc.size + 1) * hugePageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, mmapPr);
-                    if (memNonAligned == MAP_FAILED)
+                    
+					// Check if memory-mapping worked 
+					if (memNonAligned == MAP_FAILED)
                     {
                         throw std::runtime_error("get_pr_mem mmap failed");
                     }
 
+					// Open the lock after the critical memory-operation 
                     mlock.unlock();
 
+					// Align the previously obtained memory for usage for vFPGA-reconfiguration 
                     mem = (void *)((((reinterpret_cast<uint64_t>(memNonAligned) + hugePageSize - 1) >> hugePageShift)) << hugePageShift);
-                    cs_alloc.mem = memNonAligned;
+                    
+					// Store the non-aligned pointer in the csAlloc-struct: freeMem() passes it to munmap, which needs the address returned by mmap
+					cs_alloc.mem = memNonAligned;
 
                     break;
                 }
@@ -107,20 +129,23 @@ namespace fpga
                     throw std::runtime_error("unauthorized memory allocation");
 			}
 
+			// Register the allocation in mapped_pages (member of cRnfg), keyed by the aligned address, so that freeMem() can look it up later
 			mapped_pages.emplace(mem, cs_alloc);
 			DBG3("Mapped mem at: " << std::hex << reinterpret_cast<uint64_t>(mem) << std::dec);
 		}
 
+		// Return the pointer to the obtained memory
 		return mem;
 	}
 
 	/**
-	 * @brief Bitstream memory deallocation
+	 * @brief Bitstream memory deallocation. Opposite to previous function: Free the obtained memory again. 
 	 *
 	 * @param vaddr - mapped al
 	 */
 	void cRnfg::freeMem(void *vaddr)
 	{
+		// Save vaddr, process ID and cr ID in the temporary array 
 		uint64_t tmp[maxUserCopyVals];
 		uint32_t size;
 
@@ -128,21 +153,27 @@ namespace fpga
         tmp[1] = static_cast<uint64_t>(pid);
         tmp[2] = static_cast<uint64_t>(crid);
 
+		// Check if the current vaddr is actually part of the existing memory mapping
 		if (mapped_pages.find(vaddr) != mapped_pages.end())
 		{
+			// Get the memory-mapping entry for the given vaddr 
 			auto mapped = mapped_pages[vaddr];
 
+			// Check the alloc-struct to find more information on the allocation 
 			switch (mapped.alloc)
 			{
+				// Only operate if the allocation is actually for PR memory 
                 case CoyoteAlloc::PRM : {
 
                     mlock.lock();
 
+					// Unmap the mapped memory 
                     if (munmap(mapped.mem, (mapped.size + 1) * hugePageSize) != 0)
                     {
                         throw std::runtime_error("free_pr_mem munmap failed");
                     }
 
+					// Send IO-call for freeing the PR-memory 
                     if (ioctl(fd, IOCTL_FREE_PR_MEM, &tmp))
                     {
                         throw std::runtime_error("ioctl_free_host_pr_mem failed");
@@ -172,19 +203,26 @@ namespace fpga
 	 */
 	void cRnfg::reconfigureBase(void *vaddr, uint32_t len, uint32_t vfid)
 	{
+		// Create a tmp-array that holds address of the bitstream, length of the bitstream, process ID and CR ID 
 		uint64_t tmp[maxUserCopyVals];
 		tmp[0] = reinterpret_cast<uint64_t>(vaddr);
 		tmp[1] = static_cast<uint64_t>(len);
         tmp[2] = static_cast<uint64_t>(pid);
         tmp[3] = static_cast<uint64_t>(crid);
+
+		// Check the vFPGA-ID (regular number is vFPGA, -1 indicates that the shell needs reconfiguration)
         if(vfid != -1) {
+			// Get the vFPGA-ID as last argument in the tmp-array
             tmp[4] = static_cast<uint64_t>(vfid);
 
+			// Issue a ioctl call to the driver for reconfiguration of the PR 
             if (ioctl(fd, IOCTL_RECONFIGURE_APP, &tmp)) // Blocking
 			    throw std::runtime_error("ioctl_reconfig_app failed");
 
             DBG3("App reconfiguration completed");
         } else {
+
+			// Issue a ioctl call to the driver for reconfiguration of the base shell 
             if (ioctl(fd, IOCTL_RECONFIGURE_SHELL, &tmp)) // Blocking
 			    throw std::runtime_error("ioctl_reconfig_shell failed");
 
@@ -192,7 +230,7 @@ namespace fpga
         }
 	}
 
-	// Util
+	// Util: Read a byte from the input stream and return it 
 	uint8_t cRnfg::readByte(ifstream &fb)
 	{
 		char temp;
@@ -201,19 +239,19 @@ namespace fpga
 	}
 
 	/**
-	 * @brief Read in a bitstream
+	 * @brief Read in a bitstream from the input stream 
 	*/
 	bStream cRnfg::readBitstream(ifstream& fb) {
 		// Size
-		uint32_t len = fb.tellg();
-		fb.seekg(0);
-		uint32_t n_pages = (len + hugePageSize - 1) / hugePageSize;
+		uint32_t len = fb.tellg(); // Current read position; the callers open the stream with ios::ate, so this is the length of the bitstream file
+		fb.seekg(0); // Set read position back to beginning of the input stream 
+		uint32_t n_pages = (len + hugePageSize - 1) / hugePageSize; // Calculate the number of required memory pages to store the bitstream 
 
 		// Get mem
-		void *vaddr = getMem({CoyoteAlloc::PRM, n_pages});
-		uint32_t *vaddr_32 = reinterpret_cast<uint32_t *>(vaddr);
+		void *vaddr = getMem({CoyoteAlloc::PRM, n_pages}); // Get memory to store the bitstream that is read from the input stream 
+		uint32_t *vaddr_32 = reinterpret_cast<uint32_t *>(vaddr); 
 
-		// Read in
+		// Read in: Read the input-stream bytewise and store it bytewise in the mapped memory 
 		for (uint32_t i = 0; i < len / 4; i++)
 		{
 			vaddr_32[i] = 0;
@@ -224,6 +262,8 @@ namespace fpga
 		}
 
 		DBG3("Shell bitstream loaded");
+
+		// Return the bitstream object 
 		return std::make_pair(vaddr, len);
 	}
 	
@@ -233,22 +273,25 @@ namespace fpga
 	// ======-------------------------------------------------------------------------------
 	
 	/**
-	 * @brief Add a bitstream to the map
+	 * @brief Reconfigure the shell - read in the shell bitstream from the given path and load it onto the FPGA
 	 *
 	 * @param name - path
 	 * @param oid - operator ID
 	 */
 	void cRnfg::shellReconfigure(std::string name)
 	{
-		// Stream
+		// Create a new input stream for the bitstream which is defined via its name as argument 
 		ifstream f_bit(name, ios::ate | ios::binary);
 		if (!f_bit)
 			throw std::runtime_error("Shell bitstream could not be opened");
 
-		
+		// Read the bitstream in (call of the previously defined function)
 		bStream bstream = readBitstream(f_bit);
+
+		// Close the input stream 
 		f_bit.close();
 
+		// Reconfigure the FPGA with the new loaded bitstream 
 		reconfigureBase(std::get<0>(bstream), std::get<1>(bstream));
 	}
 
diff --git a/sw/src/cSched.cpp b/sw/src/cSched.cpp
index ffda1cf6..1733f248 100644
--- a/sw/src/cSched.cpp
+++ b/sw/src/cSched.cpp
@@ -24,6 +24,8 @@ namespace fpga
 
 	/**
 	 * @brief Construct a new cSched, bitstream handler
+	 * 
+	 * Constructor of the scheduler. Directly creates a new bitstream handler and a request-queue and sets vfid, priority and reorder
 	 *
 	 * @param vfid - vFPGA id
 	 */
@@ -37,15 +39,19 @@ namespace fpga
 		// Cnfg
 		uint64_t tmp[2];
 
+		// System call to driver to enable configuration of programmable region 
 		if (ioctl(fd, IOCTL_PR_CNFG, &tmp))
 			throw std::runtime_error("ioctl_pr_cnfg() failed, vfid: " + to_string(vfid));
 
+		// Set configuration programmability based on return value from the ioctl-call 
 		fcnfg.en_pr = tmp[0];
 	}
 
 	/**
 	 * @brief Destructor cSched
 	 *
+	 * Set run to false, end the scheduler thread, remove all bitstreams from the list 
+	 * 
 	 */
 	cSched::~cSched()
 	{
@@ -53,19 +59,20 @@ namespace fpga
 		run = false;
 
 		DBG3("cSched:  joining");
-		scheduler_thread.join();
+		scheduler_thread.join(); // Wait for the scheduling thread to finish
 
-		// Mapped
+		// Iterate over all bitstreams and remove them one after each other 
 		for (auto &it : bstreams)
 		{
 			removeBitstream(it.first);
 		}
 
+		// Remove the mutex that has been created previously in the constructor 
 		named_mutex::remove(("vpga_mtx_user_" + std::to_string(vfid)).c_str());
 	}
 
 	/**
-	 * @brief Run the thread
+	 * @brief Run the thread: Obtain an initial lock, create a scheduler_thread with the function to process requests and wait
 	 *
 	 */
 	void cSched::runSched()
@@ -75,6 +82,7 @@ namespace fpga
 		// Thread
 		DBG3("cSched:  initial lock");
 
+		// Create the scheduler-thread to execute the processRequests-function 
 		scheduler_thread = thread(&cSched::processRequests, this);
 		DBG3("cSched:  thread started, vfid: " << vfid);
 
@@ -85,11 +93,13 @@ namespace fpga
 	// ======-------------------------------------------------------------------------------
 	// (Thread) Process requests
 	// ======-------------------------------------------------------------------------------
+	
+	// Function to run in the scheduler_thread to process the scheduled tasks 
 	void cSched::processRequests()
 	{
 		unique_lock<mutex> lck_q(mtx_queue);
 		unique_lock<mutex> lck_r(mtx_rcnfg);
-		run = true;
+		run = true; // Set run to true 
 		bool recIssued = false;
 		int32_t curr_oid = -1;
 		cv_queue.notify_one();
@@ -97,28 +107,32 @@ namespace fpga
 		lck_r.unlock();
 		;
 
+		// Busy-loop: Keep processing while run is true or the request_queue still has elements that need to be processed 
 		while (run || !request_queue.empty())
 		{
 			lck_q.lock();
+
+			// Check if there are still requests in the queue 
 			if (!request_queue.empty())
 			{
 
-				// Grab next reconfig request
+				// Grab next reconfig request from the top of the queue 
 				auto curr_req = std::move(const_cast<std::unique_ptr<cLoad> &>(request_queue.top()));
 				request_queue.pop();
 				lck_q.unlock();
 
-				// Obtain vFPGA
+				// Obtain vFPGA-lock
 				plock.lock();
 
-				// Check whether reconfiguration is needed
+				// Check whether reconfiguration is possible
 				if (isReconfigurable())
 				{
+					// Check if the current operation ID is different to the one pulled from the request queue. Only then a reconfiguration is actually required. 
 					if (curr_oid != curr_req->oid)
 					{
-						reconfigure(curr_req->oid);
+						reconfigure(curr_req->oid); // If reconfiguration is possible and oid has changed, start a reconfiguration 
 						recIssued = true;
-						curr_oid = curr_req->oid;
+						curr_oid = curr_req->oid; // Update current operator ID 
 					}
 					else
 					{
@@ -158,6 +172,7 @@ namespace fpga
 		}
 	}
 
+	// Place a new load in the request_queue
 	void cSched::pLock(int32_t ctid, int32_t oid, uint32_t priority)
 	{
 		unique_lock<std::mutex> lck_q(mtx_queue);
@@ -184,7 +199,7 @@ namespace fpga
 	// ======-------------------------------------------------------------------------------
 
 	/**
-	 * @brief Reconfiguration IO
+	 * @brief Reconfiguration IO - calls the bitstream handler to trigger a reconfiguration of the FPGA
 	 *
 	 * @param oid - operator id
 	 */
@@ -205,22 +220,25 @@ namespace fpga
 	 */
 	void cSched::addBitstream(std::string name, int32_t oid)
 	{
+		// Check that the new bitstream (identified by the operator ID) is not yet stored in the bitstream-map
 		if (bstreams.find(oid) == bstreams.end())
 		{
-			// Stream
+			// Create an input-stream of the bitstream from its original file
 			ifstream f_bit(name, ios::ate | ios::binary);
 			if (!f_bit)
 				throw std::runtime_error("Shell bitstream could not be opened");
 			
+			// Read bitstream from the input-stream 
 			bStream bstream = readBitstream(f_bit);
 			f_bit.close();
 			DBG3("Bitstream loaded, oid: " << oid);
 			
-
+			// Insert the new bitstream with the operator ID in the bitstream-map 
 			bstreams.insert({oid, bstream});
 			return;
 		}
 
+		// Error if the bitstream with this operator ID is already present 
 		throw std::runtime_error("bitstream with same operation ID already present");
 	}
 
@@ -231,11 +249,12 @@ namespace fpga
 	 */
 	void cSched::removeBitstream(int32_t oid)
 	{
+		// Check if the operator ID of the bitstream to be removed can actually be found in the Bitstream-Map
 		if (bstreams.find(oid) != bstreams.end())
 		{
 			auto bstream = bstreams[oid];
-			freeMem(bstream.first);
-			bstreams.erase(oid);
+			freeMem(bstream.first);	// memory for the bitstream is freed
+			bstreams.erase(oid); // entry is erased from the bitstream-map
 		}
 	}
 
@@ -246,6 +265,7 @@ namespace fpga
 	 */
 	bool cSched::checkBitstream(int32_t oid)
 	{
+		// Check bitstream-map with the operator-ID 
 		if (bstreams.find(oid) != bstreams.end())
 		{
 			return true;
diff --git a/sw/src/cService.cpp b/sw/src/cService.cpp
index 771d7f46..8d81f240 100644
--- a/sw/src/cService.cpp
+++ b/sw/src/cService.cpp
@@ -9,13 +9,13 @@ cService* cService::cservice = nullptr;
 // ======-------------------------------------------------------------------------------
 
 /**
- * @brief Constructor
+ * @brief Constructor for the cService object, protected to keep singleton-status 
  * 
  * @param vfid
  */
 cService::cService(string name, bool remote, int32_t vfid, uint32_t dev, void (*uisr)(int), uint16_t port, bool priority, bool reorder) 
     : remote(remote), vfid(vfid), dev(dev), uisr(uisr), port(port), cSched(vfid, dev, priority, reorder)  {
-    // ID
+    // ID - create both a service-ID and a socket-name for communication 
     service_id = ("coyote-daemon-vfid-" + std::to_string(vfid) + "-" + name).c_str();
     socket_name = ("/tmp/coyote-daemon-vfid-" + std::to_string(vfid) + "-" + name).c_str();
 }
@@ -24,18 +24,25 @@ cService::cService(string name, bool remote, int32_t vfid, uint32_t dev, void (*
 // Sig handler
 // ======-------------------------------------------------------------------------------
 
+// Static signal-handler entry point: forwards the received signal number to myHandler() of the singleton instance
 void cService::sigHandler(int signum) {
     cservice->myHandler(signum);
 }
 
+// Handles a received signal (only SIGTERM is supported as of now; any other signal is just logged)
 void cService::myHandler(int signum) {
+    // Handle termination signals 
     if(signum == SIGTERM) {
         syslog(LOG_NOTICE, "SIGTERM received\n");//cService::getPid());
+
+        // Unlink the socket 
         unlink(socket_name.c_str());
 
         //kill(getpid(), SIGTERM);
         syslog(LOG_NOTICE, "Exiting");
         closelog();
+
+        // Exit the daemon with success-code
         exit(EXIT_SUCCESS);
     } else {
         syslog(LOG_NOTICE, "Signal %d not handled", signum);
@@ -51,7 +58,7 @@ void cService::myHandler(int signum) {
  * 
  */
 void cService::daemonInit() {
-    // Fork
+    // Fork: Create a new child-process, check success by the created PID 
     DBG3("Forking...");
     pid = fork();
     if(pid < 0 ) 
@@ -59,26 +66,26 @@ void cService::daemonInit() {
     if(pid > 0 ) 
         exit(EXIT_SUCCESS);
 
-    // Sid
+    // Sid - create a new session, of which the process is now the session-leader
     if(setsid() < 0) 
         exit(EXIT_FAILURE);
 
     // Signal handler
-    signal(SIGTERM, cService::sigHandler);
-    signal(SIGCHLD, SIG_IGN);
-    signal(SIGHUP, SIG_IGN);
+    signal(SIGTERM, cService::sigHandler); // set up the custom handler for a SIGTERM signal 
+    signal(SIGCHLD, SIG_IGN); // ignore the SIGCHLD signal to prevent the creation of zombie processes 
+    signal(SIGHUP, SIG_IGN); // ignore the SIGHUP signal so that the process keeps running even if the terminal is killed
 
-    // Fork
+    // Fork again. The new process is not a session leader and has no controlling terminal 
     pid = fork();
     if(pid < 0 ) 
         exit(EXIT_FAILURE);
     if(pid > 0 ) 
         exit(EXIT_SUCCESS);
 
-    // Permissions
+    // Permissions - the daemon can create files with any required permission 
     umask(0);
 
-    // Cd
+    // Cd: Change the working directory to the root directory ("/") so the daemon does not keep any other directory in use
     if((chdir("/")) < 0) {
         exit(EXIT_FAILURE);
     }
@@ -102,31 +109,38 @@ void cService::socketInit() {
 
     sockfd = -1;
 
+    // In case remote is set to true, initialize a network socket for network communication 
     if(remote) {
         struct sockaddr_in server;
 
+        // Create the socket and check if it's successful
         sockfd = ::socket(AF_INET, SOCK_STREAM, 0);
         if (sockfd == -1) 
             throw std::runtime_error("Could not create a socket");
 
+        // Set the address family, address and port the socket will be bound to
         server.sin_family = AF_INET;
         server.sin_addr.s_addr = INADDR_ANY;
         server.sin_port = htons(port);
 
+        // Try to bind the socket to the selected address and port
         if (::bind(sockfd, (struct sockaddr*)&server, sizeof(server)) < 0)
             throw std::runtime_error("Could not bind a socket");
 
         if (sockfd < 0 )
             throw std::runtime_error("Could not listen to a port: " + to_string(port));
     } else {
+        // Create a local socket for Inter-Process Communication 
         struct sockaddr_un server;
         socklen_t len;
 
+        // Check for successful creation of the IPC-socket 
         if((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
             syslog(LOG_ERR, "Error creating a server socket");
             exit(EXIT_FAILURE);
         }
 
+        // Set up the local (AF_UNIX) socket address from socket_name and unlink any stale socket file at that path
         server.sun_family = AF_UNIX;
         strcpy(server.sun_path, socket_name.c_str());
         unlink(server.sun_path);
@@ -138,6 +152,7 @@ void cService::socketInit() {
         }
     }
 
+    // Try to listen on the socket (local or remote) for incoming connections
     if(listen(sockfd, maxNumClients) == -1) {
         syslog(LOG_ERR, "Error listen()");
         exit(EXIT_FAILURE);
@@ -148,6 +163,8 @@ void cService::socketInit() {
  * @brief Accept connections
  * 
  */
+
+// Accept a local connection over the UNIX-domain socket (IPC - inter-process communication)
 void cService::acceptConnectionLocal() {
     sockaddr_un client;
     socklen_t len = sizeof(client); 
@@ -157,10 +174,11 @@ void cService::acceptConnectionLocal() {
     pid_t rpid;
     int fid;
 
+    // Try to accept an incoming connection 
     if((connfd = accept(sockfd, (struct sockaddr *)&client, &len)) != -1) {
         syslog(LOG_NOTICE, "Connection accepted local, connfd: %d", connfd);
 
-        // Read rpid
+        // Read rpid (registered process ID)
         if((n = read(connfd, recv_buf, sizeof(pid_t))) == sizeof(pid_t)) {
             memcpy(&rpid, recv_buf, sizeof(pid_t));
             syslog(LOG_NOTICE, "Registered pid: %d", rpid);
@@ -170,7 +188,7 @@ void cService::acceptConnectionLocal() {
             exit(EXIT_FAILURE);
         }
 
-        // Read fid
+        // Read fid (function ID)
         if((n = read(connfd, recv_buf, sizeof(int))) == sizeof(int)) {
             memcpy(&fid, recv_buf, sizeof(int));
             syslog(LOG_NOTICE, "Function id: %d", fid);
@@ -180,12 +198,14 @@ void cService::acceptConnectionLocal() {
             exit(EXIT_FAILURE);
         }
 
+        // Create a new client thread for the function in the function-struct 
         functions[fid]->registerClientThread(connfd, vfid, rpid, dev, this, uisr);
     }
 
     std::this_thread::sleep_for(std::chrono::nanoseconds(sleepIntervalDaemon));
 }
 
+// Accept a remote connection and exchange queue pairs (QPs) with the remote side
 void cService::acceptConnectionRemote() {
     uint32_t recv_qpid;
     uint8_t ack;
@@ -198,6 +218,7 @@ void cService::acceptConnectionRemote() {
     ibvQ r_qp;
     bThread *cthread;
 
+    // Try to accept the incoming connection 
     if((connfd = ::accept(sockfd, NULL, 0)) != -1) {
         syslog(LOG_NOTICE, "Connection accepted remote, connfd: %d", connfd);
 
@@ -221,18 +242,19 @@ void cService::acceptConnectionRemote() {
             exit(EXIT_FAILURE);
         }
         
+        // Get a cThread from the function registered in the func-struct
         cthread = functions[fid]->registerClientThread(connfd, vfid, getpid(), dev, this, uisr);
-        cthread->getQpair()->remote = r_qp;
-        cthread->getMem({CoyoteAlloc::HPF, r_qp.size, true});
+        cthread->getQpair()->remote = r_qp; // store the received remote QP 
+        cthread->getMem({CoyoteAlloc::HPF, r_qp.size, true}); // Allocate memory for receiving data for RDMA 
 
-        // Send local queue pair
+        // Send local queue pair to the remote side 
         if (::write(connfd, &cthread->getQpair()->local, sizeof(ibvQ)) != sizeof(ibvQ))  {
             ::close(connfd);
             syslog(LOG_ERR, "Could not write a local queue");
             exit(EXIT_FAILURE);
         }
 
-        // Write context and connection
+        // Write context and connection to the config-space of Coyote 
         cthread->writeQpContext(port);
         // ARP lookup
         cthread->doArpLookup(cthread->getQpair()->remote.ip_addr);
@@ -245,14 +267,14 @@ void cService::acceptConnectionRemote() {
 }
 
 /**
- * @brief Main run service
+ * @brief Main run service for the daemon 
  * 
  */
 void cService::start() {
     // Init daemon
     daemonInit();
 
-    // Run scheduler
+    // Run scheduler - creates a scheduler-thread which waits for incoming requests 
     if(isReconfigurable()) runSched();
 
     // Init socket
@@ -261,11 +283,13 @@ void cService::start() {
     // Init threads
     syslog(LOG_NOTICE, "Thread initialization");
 
+    // Iterate over entries in the func-struct and start all of these functions 
+    // NOTE: start() of each function presumably launches its clean-up thread - confirm against the function's definition
     for (auto it = functions.begin(); it != functions.end(); it++) {
         it->second->start();
     }
 
-    // Main
+    // Main - exchange of QP or local connection, depending on remote-setting 
     while(1) {
         if(!remote)
             acceptConnectionLocal();
@@ -277,6 +301,8 @@ void cService::start() {
 // ======-------------------------------------------------------------------------------
 // Functions
 // ======-------------------------------------------------------------------------------
+
+// Place an additional function in the func-struct if the function ID doesn't already exist
 void cService::addFunction(int32_t fid, std::unique_ptr<bFunc> f) {
     if(functions.find(fid) == functions.end()) {
         functions.emplace(fid,std::move(f));