
Commit b384f38

Fixed issue with header-data split and CPU-only mode in ANO examples
1 parent e1453b3 commit b384f38

7 files changed: +251 / -84 lines
Lines changed: 91 additions & 0 deletions (new file: RX benchmark configuration)
%YAML 1.2
# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
multithreaded: true
num_delay_ops: 32
delay: 0.1
delay_step: 0.01

scheduler:
  check_recession_period_ms: 0
  worker_thread_number: 5
  stop_on_deadlock: true
  stop_on_deadlock_timeout: 500

advanced_network:
  cfg:
    version: 1
    manager: "dpdk"
    master_core: 3
    debug: false

    memory_regions:
    - name: "Data_RX_CPU"
      kind: "huge"
      affinity: 0
      access:
        - local
      num_bufs: 30720
      buf_size: 64
    - name: "Data_RX_GPU"
      kind: "device"
      affinity: 0
      access:
        - local
      num_bufs: 30720
      buf_size: 1064
    - name: "Default_RX_CPU"
      kind: "huge"
      affinity: 0
      access:
        - local
      num_bufs: 30720
      buf_size: 1064

    interfaces:
    - name: data2
      address: 0005:03:00.0
      rx:
        - queues:
            - name: "Default"
              id: 0
              cpu_core: 7
              batch_size: 10240
              output_port: "bench_rx_out"
              memory_regions:
                - "Default_RX_CPU"
            - name: "Data"
              id: 1
              cpu_core: 8
              batch_size: 10240
              output_port: "bench_rx_out"
              memory_regions:
                - "Data_RX_CPU"
                - "Data_RX_GPU"
          flows:
            - name: "ADC Samples"
              action:
                type: queue
                id: 1
              match:
                udp_src: 4096 #12288
                udp_dst: 4096 #12288

bench_rx:
  split_boundary: true
  gpu_direct: true
  batch_size: 10240
  max_packet_size: 1064
  header_size: 64
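
Note: the bench_rx sizing above drives the buffer math in dpdk_bench_op_rx.h. A minimal standalone sketch (not part of this commit; values copied from the config) of how the operator derives its per-packet payload and per-batch buffer sizes:

#include <cstdint>
#include <cstdio>

int main() {
  // Values from the bench_rx section above.
  const uint32_t max_packet_size = 1064;
  const uint32_t header_size = 64;
  const uint32_t batch_size = 10240;

  // Mirrors nom_payload_size_ = max_packet_size_.get() - header_size_.get()
  const uint32_t nom_payload_size = max_packet_size - header_size;  // 1000 bytes

  // Mirrors the per-slot allocations of batch_size_.get() * nom_payload_size_ bytes
  const uint64_t batch_bytes = static_cast<uint64_t>(batch_size) * nom_payload_size;

  std::printf("payload per packet: %u bytes, aggregated batch buffer: %llu bytes\n",
              nom_payload_size, static_cast<unsigned long long>(batch_bytes));
  return 0;
}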
Lines changed: 71 additions & 0 deletions (new file: TX benchmark configuration)
%YAML 1.2
# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
multithreaded: true
num_delay_ops: 32
delay: 0.1
delay_step: 0.01

scheduler:
  check_recession_period_ms: 0
  worker_thread_number: 5
  stop_on_deadlock: true
  stop_on_deadlock_timeout: 500

advanced_network:
  cfg:
    version: 1
    manager: "dpdk"
    master_core: 3
    debug: false

    memory_regions:
    - name: "Data_TX_GPU"
      kind: "huge"
      affinity: 0
      access:
        - local
      num_bufs: 51200
      buf_size: 1064

    interfaces:
    - name: data1
      address: 0005:03:00.0
      tx:
        - queues:
            - name: "ADC Samples"
              id: 0
              batch_size: 10240
              split_boundary: 0
              cpu_core: 4
              memory_regions:
                - "Data_TX_GPU"
              offloads:
                - "tx_eth_src"

bench_tx:
  eth_dst_addr: 48:b0:2d:ed:d0:20  # Destination MAC
  udp_dst_port: 4096               # UDP destination port
  udp_src_port: 4096               # UDP source port
  gpu_direct: false
  split_boundary: 0
  batch_size: 10000
  payload_size: 1000
  header_size: 64
  ip_src_addr: 192.168.100.5       # Source IP send from
  ip_dst_addr: 10.10.100.4         # Destination IP to send to
  address: 0005:03:00.0
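
Note: the TX and RX configs presumably need to agree on packet sizing. A small compile-time check (illustrative only, not part of this commit) that header_size + payload_size on the TX side matches the Data_TX_GPU buffer size and the RX max_packet_size:

#include <cstdint>

int main() {
  constexpr uint32_t header_size = 64;           // bench_tx.header_size
  constexpr uint32_t payload_size = 1000;        // bench_tx.payload_size
  constexpr uint32_t tx_buf_size = 1064;         // Data_TX_GPU buf_size
  constexpr uint32_t rx_max_packet_size = 1064;  // bench_rx.max_packet_size (RX config above)

  static_assert(header_size + payload_size == tx_buf_size,
                "each TX packet should fill exactly one TX buffer");
  static_assert(header_size + payload_size == rx_max_packet_size,
                "TX packet size should match what bench_rx expects");
  return 0;
}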

applications/adv_networking_bench/cpp/dpdk_bench_op_rx.h

Lines changed: 41 additions & 32 deletions
@@ -47,12 +47,11 @@ class AdvNetworkingBenchDefaultRxOp : public Operator {
     // For this example assume all packets are the same size, specified in the config
     nom_payload_size_ = max_packet_size_.get() - header_size_.get();
 
-    if (!gpu_direct_.get()) {
-      cudaMallocHost(&full_batch_data_h_, batch_size_.get() * nom_payload_size_);
-    }
-
     for (int n = 0; n < num_concurrent; n++) {
       cudaMalloc(&full_batch_data_d_[n], batch_size_.get() * nom_payload_size_);
+      if (!gpu_direct_.get()) {
+        cudaMallocHost(&full_batch_data_h_[n], batch_size_.get() * nom_payload_size_);
+      }
 
       if (gpu_direct_.get()) {
         cudaMallocHost((void**)&h_dev_ptrs_[n], sizeof(void*) * batch_size_.get());
@@ -154,12 +153,10 @@ class AdvNetworkingBenchDefaultRxOp : public Operator {
     auto batch_offset = aggr_pkts_recv_ * nom_payload_size_;
     for (int p = 0; p < adv_net_get_num_pkts(burst); p++) {
       auto pkt = static_cast<UDPIPV4Pkt*>(adv_net_get_seg_pkt_ptr(burst, 0, p));
-      auto len = ntohs(pkt->udp.len) - 8;
+      auto len = adv_net_get_seg_pkt_len(burst, 0, p) - header_size_.get();
 
-      // assert(len + sizeof(UDPIPV4Pkt) == max_packet_size_.get());
-
-      memcpy((char*)full_batch_data_h_ + batch_offset + p * nom_payload_size_,
-             (pkt + sizeof(*pkt)),
+      memcpy((char*)full_batch_data_h_[cur_idx] + batch_offset + p * nom_payload_size_,
+             pkt + 1,
              len);
 
       ttl_bytes_recv_ += len + sizeof(UDPIPV4Pkt);
@@ -169,45 +166,56 @@ class AdvNetworkingBenchDefaultRxOp : public Operator {
 
     aggr_pkts_recv_ += adv_net_get_num_pkts(burst);
     cur_msg_.msg[cur_msg_.num_batches++] = burst;
-
     if (aggr_pkts_recv_ >= batch_size_.get()) {
       // Do some work on full_batch_data_h_ or full_batch_data_d_
       aggr_pkts_recv_ = 0;
 
-      if (gpu_direct_.get()) {
-        free_bufs();
+      // In CPU-only mode we can free earlier, but to keep it simple we free at the same point
+      // as we do in GPU-only mode
+      free_bufs();
 
-        if (out_q.size() == num_concurrent) {
-          HOLOSCAN_LOG_ERROR("Fell behind in processing on GPU!");
-          adv_net_free_all_pkts_and_burst(burst);
-          return;
-        }
+      if (out_q.size() == num_concurrent) {
+        HOLOSCAN_LOG_ERROR("Fell behind in processing on GPU!");
+        adv_net_free_all_pkts_and_burst(burst);
+        return;
+      }
 
+      if (gpu_direct_.get()) {
         simple_packet_reorder(static_cast<uint8_t*>(full_batch_data_d_[cur_idx]),
                               h_dev_ptrs_[cur_idx],
                               nom_payload_size_,
                               batch_size_.get(),
                               streams_[cur_idx]);
 
-        cudaEventRecord(events_[cur_idx], streams_[cur_idx]);
+      } else {
+        if (out_q.size() == num_concurrent) {
+          HOLOSCAN_LOG_ERROR("Fell behind in copying to the GPU!");
+          adv_net_free_all_pkts_and_burst(burst);
+          return;
+        }
+
+        cudaMemcpyAsync(full_batch_data_d_[cur_idx],
+                        full_batch_data_h_[cur_idx],
+                        batch_size_.get() * nom_payload_size_,
+                        cudaMemcpyDefault,
+                        streams_[cur_idx]);
+      }
 
-        cur_msg_.evt = events_[cur_idx];
-        out_q.push(cur_msg_);
-        cur_msg_.num_batches = 0;
+      cudaEventRecord(events_[cur_idx], streams_[cur_idx]);
 
-        if (cudaGetLastError() != cudaSuccess) {
-          HOLOSCAN_LOG_ERROR("CUDA error with {} packets in batch and {} bytes total",
-                             batch_size_.get(),
-                             batch_size_.get() * nom_payload_size_);
-          exit(1);
-        }
+      cur_msg_.evt = events_[cur_idx];
+      out_q.push(cur_msg_);
+      cur_msg_.num_batches = 0;
 
-      } else {
-        adv_net_free_all_pkts_and_burst(burst);
-      }
+      if (cudaGetLastError() != cudaSuccess) {
+        HOLOSCAN_LOG_ERROR("CUDA error with {} packets in batch and {} bytes total",
+                           batch_size_.get(),
+                           batch_size_.get() * nom_payload_size_);
+        exit(1);
+      }
 
       cur_idx = (++cur_idx % num_concurrent);
-      }
+    }
   }
 
  private:
@@ -218,6 +226,7 @@ class AdvNetworkingBenchDefaultRxOp : public Operator {
   struct RxMsg {
     std::array<std::shared_ptr<AdvNetBurstParams>, MAX_ANO_BATCHES> msg;
     int num_batches;
+    void* full_batch_data_h_;
    cudaEvent_t evt;
   };
 
@@ -229,8 +238,8 @@ class AdvNetworkingBenchDefaultRxOp : public Operator {
   int64_t aggr_pkts_recv_ = 0;  // Aggregate packets received in processing batch
   uint16_t nom_payload_size_;   // Nominal payload size (no headers)
   std::array<void**, num_concurrent> h_dev_ptrs_;         // Host-pinned list of device pointers
-  void* full_batch_data_h_;                                // Host-pinned aggregated batch
   std::array<void*, num_concurrent> full_batch_data_d_;   // Device aggregated batch
+  std::array<void*, num_concurrent> full_batch_data_h_;   // Host aggregated batch
   Parameter<bool> hds_;         // Header-data split enabled
   Parameter<bool> gpu_direct_;  // GPUDirect enabled
   Parameter<uint32_t> batch_size_;  // Batch size for one processing block
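
For reference, the non-GPUDirect path this commit fixes works like this: each concurrent slot now owns its own pinned host buffer (full_batch_data_h_[n]), packets are packed into it with memcpy, and the whole batch is staged to the device with cudaMemcpyAsync and fenced with a CUDA event. A self-contained sketch of that pattern (illustrative sizes and a fake payload source; this is not the operator code itself):

#include <cuda_runtime.h>
#include <array>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  constexpr int num_concurrent = 4;      // per-slot double buffering, as in the operator
  constexpr size_t payload_size = 1000;  // nominal payload (1064 - 64)
  constexpr size_t batch_size = 64;      // small batch for the sketch

  std::array<void*, num_concurrent> h_bufs{}, d_bufs{};
  std::array<cudaStream_t, num_concurrent> streams{};
  std::array<cudaEvent_t, num_concurrent> events{};

  for (int n = 0; n < num_concurrent; n++) {
    cudaMalloc(&d_bufs[n], batch_size * payload_size);
    cudaMallocHost(&h_bufs[n], batch_size * payload_size);  // per-slot pinned host buffer
    cudaStreamCreate(&streams[n]);
    cudaEventCreate(&events[n]);
  }

  std::vector<unsigned char> fake_payload(payload_size, 0xAB);  // stand-in for packet data
  int cur_idx = 0;

  // Pack one batch into the current slot's host buffer, then stage it to the GPU.
  for (size_t p = 0; p < batch_size; p++) {
    std::memcpy(static_cast<char*>(h_bufs[cur_idx]) + p * payload_size,
                fake_payload.data(), payload_size);
  }
  cudaMemcpyAsync(d_bufs[cur_idx], h_bufs[cur_idx], batch_size * payload_size,
                  cudaMemcpyDefault, streams[cur_idx]);
  cudaEventRecord(events[cur_idx], streams[cur_idx]);

  // Downstream work would wait on events[cur_idx]; here we just synchronize.
  cudaEventSynchronize(events[cur_idx]);
  std::printf("staged %zu bytes to the GPU\n", batch_size * payload_size);

  for (int n = 0; n < num_concurrent; n++) {
    cudaFree(d_bufs[n]);
    cudaFreeHost(h_bufs[n]);
    cudaStreamDestroy(streams[n]);
    cudaEventDestroy(events[n]);
  }
  return 0;
}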

applications/adv_networking_bench/cpp/dpdk_bench_op_tx.h

Lines changed: 1 addition & 0 deletions
@@ -294,6 +294,7 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {
     } else {
       op_output.emit(msg, "burst_out");
     }
+
   };
 
  private:

operators/advanced_network/adv_network_types.h

Lines changed: 2 additions & 0 deletions
@@ -188,6 +188,8 @@ struct MemoryRegion {
   uint16_t affinity_;
   uint32_t access_;
   size_t buf_size_;
+  size_t adj_size_;  // Populated by driver
+  size_t ttl_size_;  // Populated by driver
   size_t num_bufs_;
   bool owned_;
 };

operators/advanced_network/managers/adv_network_mgr.cpp

Lines changed: 12 additions & 14 deletions
@@ -41,32 +41,28 @@ void set_ano_mgr(const AdvNetConfigYaml& cfg) {
 AdvNetStatus ANOMgr::allocate_memory_regions() {
   HOLOSCAN_LOG_INFO("Registering memory regions");
 
-  for (const auto& mr : cfg_.mrs_) {
+  for (auto& mr : cfg_.mrs_) {
     void* ptr;
     AllocRegion ar;
-    size_t buf_size = mr.second.buf_size_ * mr.second.num_bufs_;
+    mr.second.ttl_size_ = RTE_ALIGN_CEIL(mr.second.adj_size_ * mr.second.num_bufs_, GPU_PAGE_SIZE);
 
-    if (buf_size & 0x3) {
-      HOLOSCAN_LOG_CRITICAL("Total buffer size must be multiple of 4 for MR {}", mr.second.name_);
-      return AdvNetStatus::NULL_PTR;
-    }
     if (mr.second.owned_) {
       switch (mr.second.kind_) {
         case MemoryKind::HOST:
-          ptr = malloc(buf_size);
+          ptr = malloc(mr.second.ttl_size_);
           break;
         case MemoryKind::HOST_PINNED:
-          if (cudaHostAlloc(&ptr, buf_size, 0) != cudaSuccess) {
+          if (cudaHostAlloc(&ptr, mr.second.ttl_size_, 0) != cudaSuccess) {
            HOLOSCAN_LOG_CRITICAL("Failed to allocate CUDA pinned host memory!");
            return AdvNetStatus::NULL_PTR;
          }
          break;
        case MemoryKind::HUGE:
-          ptr = rte_malloc_socket(nullptr, buf_size, RTE_PKTMBUF_HEADROOM, mr.second.affinity_);
+          ptr = rte_malloc_socket(nullptr, mr.second.ttl_size_, 0, mr.second.affinity_);
          break;
        case MemoryKind::DEVICE: {
          unsigned int flag = 1;
-          const auto align = RTE_ALIGN_CEIL(buf_size, GPU_PAGE_SIZE);
+          const auto align = RTE_ALIGN_CEIL(mr.second.ttl_size_, GPU_PAGE_SIZE);
          CUdeviceptr cuptr;
 
          cudaSetDevice(mr.second.affinity_);
@@ -95,18 +91,20 @@ AdvNetStatus ANOMgr::allocate_memory_regions() {
 
      if (ptr == nullptr) {
        HOLOSCAN_LOG_CRITICAL(
-            "Fatal to allocate {} of type {} for MR", buf_size, static_cast<int>(mr.second.kind_));
+            "Fatal to allocate {} of type {} for MR", mr.second.ttl_size_, static_cast<int>(mr.second.kind_));
        return AdvNetStatus::NULL_PTR;
      }
    }
 
    HOLOSCAN_LOG_INFO(
-        "Successfully allocated memory region {} at {} with {} bytes ({} elements @ {} bytes)",
+        "Successfully allocated memory region {} at {} type {} with {} bytes ({} elements @ {} bytes total {})",
        mr.second.name_,
        ptr,
-        buf_size,
+        (int)mr.second.kind_,
+        mr.second.buf_size_,
        mr.second.num_bufs_,
-        mr.second.buf_size_);
+        mr.second.adj_size_,
+        mr.second.ttl_size_);
    ar_[mr.second.name_] = {mr.second.name_, ptr};
  }
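
Note on the new sizing: the manager now computes a per-region total (ttl_size_) as the driver-adjusted element size (adj_size_) times the element count, rounded up to a GPU page, instead of the old buf_size_ * num_bufs_ with a multiple-of-4 check. A standalone sketch of that arithmetic (the 64 KiB GPU_PAGE_SIZE and the local align_ceil helper are assumptions for illustration; the real code uses DPDK's RTE_ALIGN_CEIL and the driver-populated adj_size_):

#include <cstddef>
#include <cstdio>

constexpr size_t GPU_PAGE_SIZE = 65536;  // assumed value for the sketch

constexpr size_t align_ceil(size_t v, size_t align) {
  return ((v + align - 1) / align) * align;  // round v up to a multiple of align
}

int main() {
  // Example values from the RX config: Data_RX_GPU has 30720 buffers of 1064 bytes.
  // adj_size_ is the per-buffer size after driver adjustment; we reuse buf_size_ here
  // because the real adjustment is driver-specific.
  const size_t adj_size = 1064;
  const size_t num_bufs = 30720;

  const size_t ttl_size = align_ceil(adj_size * num_bufs, GPU_PAGE_SIZE);
  std::printf("raw total: %zu bytes, page-aligned total: %zu bytes\n",
              adj_size * num_bufs, ttl_size);
  return 0;
}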
