
Commit b384f38

Fixed issue with header-data split and CPU-only mode in ANO examples
1 parent e1453b3 commit b384f38

7 files changed: +251 / -84 lines
Lines changed: 91 additions & 0 deletions (new file: RX benchmark configuration)
%YAML 1.2
# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
multithreaded: true
num_delay_ops: 32
delay: 0.1
delay_step: 0.01

scheduler:
  check_recession_period_ms: 0
  worker_thread_number: 5
  stop_on_deadlock: true
  stop_on_deadlock_timeout: 500

advanced_network:
  cfg:
    version: 1
    manager: "dpdk"
    master_core: 3
    debug: false

    memory_regions:
    - name: "Data_RX_CPU"
      kind: "huge"
      affinity: 0
      access:
        - local
      num_bufs: 30720
      buf_size: 64
    - name: "Data_RX_GPU"
      kind: "device"
      affinity: 0
      access:
        - local
      num_bufs: 30720
      buf_size: 1064
    - name: "Default_RX_CPU"
      kind: "huge"
      affinity: 0
      access:
        - local
      num_bufs: 30720
      buf_size: 1064

    interfaces:
    - name: data2
      address: 0005:03:00.0
      rx:
        - queues:
            - name: "Default"
              id: 0
              cpu_core: 7
              batch_size: 10240
              output_port: "bench_rx_out"
              memory_regions:
                - "Default_RX_CPU"
            - name: "Data"
              id: 1
              cpu_core: 8
              batch_size: 10240
              output_port: "bench_rx_out"
              memory_regions:
                - "Data_RX_CPU"
                - "Data_RX_GPU"
          flows:
            - name: "ADC Samples"
              action:
                type: queue
                id: 1
              match:
                udp_src: 4096 #12288
                udp_dst: 4096 #12288

bench_rx:
  split_boundary: true
  gpu_direct: true
  batch_size: 10240
  max_packet_size: 1064
  header_size: 64
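
Note: the bench_rx sizing above drives the buffer math in dpdk_bench_op_rx.h. A minimal standalone sketch (not part of this commit; values copied from the config) of how the operator derives its per-packet payload and per-batch buffer sizes:

#include <cstdint>
#include <cstdio>

int main() {
  // Values from the bench_rx section above.
  const uint32_t max_packet_size = 1064;
  const uint32_t header_size = 64;
  const uint32_t batch_size = 10240;

  // Mirrors nom_payload_size_ = max_packet_size_.get() - header_size_.get()
  const uint32_t nom_payload_size = max_packet_size - header_size;  // 1000 bytes

  // Mirrors the per-slot allocations of batch_size_.get() * nom_payload_size_ bytes
  const uint64_t batch_bytes = static_cast<uint64_t>(batch_size) * nom_payload_size;

  std::printf("payload per packet: %u bytes, aggregated batch buffer: %llu bytes\n",
              nom_payload_size, static_cast<unsigned long long>(batch_bytes));
  return 0;
}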
Lines changed: 71 additions & 0 deletions (new file: TX benchmark configuration)
%YAML 1.2
# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
multithreaded: true
num_delay_ops: 32
delay: 0.1
delay_step: 0.01

scheduler:
  check_recession_period_ms: 0
  worker_thread_number: 5
  stop_on_deadlock: true
  stop_on_deadlock_timeout: 500

advanced_network:
  cfg:
    version: 1
    manager: "dpdk"
    master_core: 3
    debug: false

    memory_regions:
    - name: "Data_TX_GPU"
      kind: "huge"
      affinity: 0
      access:
        - local
      num_bufs: 51200
      buf_size: 1064

    interfaces:
    - name: data1
      address: 0005:03:00.0
      tx:
        - queues:
            - name: "ADC Samples"
              id: 0
              batch_size: 10240
              split_boundary: 0
              cpu_core: 4
              memory_regions:
                - "Data_TX_GPU"
              offloads:
                - "tx_eth_src"

bench_tx:
  eth_dst_addr: 48:b0:2d:ed:d0:20  # Destination MAC
  udp_dst_port: 4096               # UDP destination port
  udp_src_port: 4096               # UDP source port
  gpu_direct: false
  split_boundary: 0
  batch_size: 10000
  payload_size: 1000
  header_size: 64
  ip_src_addr: 192.168.100.5       # Source IP send from
  ip_dst_addr: 10.10.100.4         # Destination IP to send to
  address: 0005:03:00.0
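
Note: the TX and RX configs presumably need to agree on packet sizing. A small compile-time check (illustrative only, not part of this commit) that header_size + payload_size on the TX side matches the Data_TX_GPU buffer size and the RX max_packet_size:

#include <cstdint>

int main() {
  constexpr uint32_t header_size = 64;           // bench_tx.header_size
  constexpr uint32_t payload_size = 1000;        // bench_tx.payload_size
  constexpr uint32_t tx_buf_size = 1064;         // Data_TX_GPU buf_size
  constexpr uint32_t rx_max_packet_size = 1064;  // bench_rx.max_packet_size (RX config above)

  static_assert(header_size + payload_size == tx_buf_size,
                "each TX packet should fill exactly one TX buffer");
  static_assert(header_size + payload_size == rx_max_packet_size,
                "TX packet size should match what bench_rx expects");
  return 0;
}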

applications/adv_networking_bench/cpp/dpdk_bench_op_rx.h

Lines changed: 41 additions & 32 deletions
@@ -47,12 +47,11 @@ class AdvNetworkingBenchDefaultRxOp : public Operator {
     // For this example assume all packets are the same size, specified in the config
     nom_payload_size_ = max_packet_size_.get() - header_size_.get();
 
-    if (!gpu_direct_.get()) {
-      cudaMallocHost(&full_batch_data_h_, batch_size_.get() * nom_payload_size_);
-    }
-
     for (int n = 0; n < num_concurrent; n++) {
       cudaMalloc(&full_batch_data_d_[n], batch_size_.get() * nom_payload_size_);
+      if (!gpu_direct_.get()) {
+        cudaMallocHost(&full_batch_data_h_[n], batch_size_.get() * nom_payload_size_);
+      }
 
       if (gpu_direct_.get()) {
         cudaMallocHost((void**)&h_dev_ptrs_[n], sizeof(void*) * batch_size_.get());
@@ -154,12 +153,10 @@ class AdvNetworkingBenchDefaultRxOp : public Operator {
     auto batch_offset = aggr_pkts_recv_ * nom_payload_size_;
     for (int p = 0; p < adv_net_get_num_pkts(burst); p++) {
       auto pkt = static_cast<UDPIPV4Pkt*>(adv_net_get_seg_pkt_ptr(burst, 0, p));
-      auto len = ntohs(pkt->udp.len) - 8;
+      auto len = adv_net_get_seg_pkt_len(burst, 0, p) - header_size_.get();
 
-      // assert(len + sizeof(UDPIPV4Pkt) == max_packet_size_.get());
-
-      memcpy((char*)full_batch_data_h_ + batch_offset + p * nom_payload_size_,
-             (pkt + sizeof(*pkt)),
+      memcpy((char*)full_batch_data_h_[cur_idx] + batch_offset + p * nom_payload_size_,
+             pkt + 1,
              len);
 
       ttl_bytes_recv_ += len + sizeof(UDPIPV4Pkt);
@@ -169,45 +166,56 @@ class AdvNetworkingBenchDefaultRxOp : public Operator {
 
     aggr_pkts_recv_ += adv_net_get_num_pkts(burst);
     cur_msg_.msg[cur_msg_.num_batches++] = burst;
-
     if (aggr_pkts_recv_ >= batch_size_.get()) {
       // Do some work on full_batch_data_h_ or full_batch_data_d_
       aggr_pkts_recv_ = 0;
 
-      if (gpu_direct_.get()) {
-        free_bufs();
+      // In CPU-only mode we can free earlier, but to keep it simple we free at the same point
+      // as we do in GPU-only mode
+      free_bufs();
 
-        if (out_q.size() == num_concurrent) {
-          HOLOSCAN_LOG_ERROR("Fell behind in processing on GPU!");
-          adv_net_free_all_pkts_and_burst(burst);
-          return;
-        }
+      if (out_q.size() == num_concurrent) {
+        HOLOSCAN_LOG_ERROR("Fell behind in processing on GPU!");
+        adv_net_free_all_pkts_and_burst(burst);
+        return;
+      }
 
+      if (gpu_direct_.get()) {
         simple_packet_reorder(static_cast<uint8_t*>(full_batch_data_d_[cur_idx]),
                               h_dev_ptrs_[cur_idx],
                               nom_payload_size_,
                               batch_size_.get(),
                               streams_[cur_idx]);
 
-        cudaEventRecord(events_[cur_idx], streams_[cur_idx]);
+      } else {
+        if (out_q.size() == num_concurrent) {
+          HOLOSCAN_LOG_ERROR("Fell behind in copying to the GPU!");
+          adv_net_free_all_pkts_and_burst(burst);
+          return;
+        }
+
+        cudaMemcpyAsync(full_batch_data_d_[cur_idx],
+                        full_batch_data_h_[cur_idx],
+                        batch_size_.get() * nom_payload_size_,
+                        cudaMemcpyDefault,
+                        streams_[cur_idx]);
+      }
 
-        cur_msg_.evt = events_[cur_idx];
-        out_q.push(cur_msg_);
-        cur_msg_.num_batches = 0;
+      cudaEventRecord(events_[cur_idx], streams_[cur_idx]);
 
-        if (cudaGetLastError() != cudaSuccess) {
-          HOLOSCAN_LOG_ERROR("CUDA error with {} packets in batch and {} bytes total",
-                             batch_size_.get(),
-                             batch_size_.get() * nom_payload_size_);
-          exit(1);
-        }
+      cur_msg_.evt = events_[cur_idx];
+      out_q.push(cur_msg_);
+      cur_msg_.num_batches = 0;
 
-      } else {
-        adv_net_free_all_pkts_and_burst(burst);
-      }
+      if (cudaGetLastError() != cudaSuccess) {
+        HOLOSCAN_LOG_ERROR("CUDA error with {} packets in batch and {} bytes total",
+                           batch_size_.get(),
+                           batch_size_.get() * nom_payload_size_);
+        exit(1);
+      }
 
       cur_idx = (++cur_idx % num_concurrent);
-      }
+    }
   }
 
  private:
@@ -218,6 +226,7 @@ class AdvNetworkingBenchDefaultRxOp : public Operator {
   struct RxMsg {
     std::array<std::shared_ptr<AdvNetBurstParams>, MAX_ANO_BATCHES> msg;
     int num_batches;
+    void* full_batch_data_h_;
    cudaEvent_t evt;
   };
 
@@ -229,8 +238,8 @@ class AdvNetworkingBenchDefaultRxOp : public Operator {
   int64_t aggr_pkts_recv_ = 0;  // Aggregate packets received in processing batch
   uint16_t nom_payload_size_;   // Nominal payload size (no headers)
   std::array<void**, num_concurrent> h_dev_ptrs_;         // Host-pinned list of device pointers
-  void* full_batch_data_h_;                                // Host-pinned aggregated batch
   std::array<void*, num_concurrent> full_batch_data_d_;   // Device aggregated batch
+  std::array<void*, num_concurrent> full_batch_data_h_;   // Host aggregated batch
   Parameter<bool> hds_;         // Header-data split enabled
   Parameter<bool> gpu_direct_;  // GPUDirect enabled
   Parameter<uint32_t> batch_size_;  // Batch size for one processing block
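
For reference, the non-GPUDirect path this commit fixes works like this: each concurrent slot now owns its own pinned host buffer (full_batch_data_h_[n]), packets are packed into it with memcpy, and the whole batch is staged to the device with cudaMemcpyAsync and fenced with a CUDA event. A self-contained sketch of that pattern (illustrative sizes and a fake payload source; this is not the operator code itself):

#include <cuda_runtime.h>
#include <array>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  constexpr int num_concurrent = 4;      // per-slot double buffering, as in the operator
  constexpr size_t payload_size = 1000;  // nominal payload (1064 - 64)
  constexpr size_t batch_size = 64;      // small batch for the sketch

  std::array<void*, num_concurrent> h_bufs{}, d_bufs{};
  std::array<cudaStream_t, num_concurrent> streams{};
  std::array<cudaEvent_t, num_concurrent> events{};

  for (int n = 0; n < num_concurrent; n++) {
    cudaMalloc(&d_bufs[n], batch_size * payload_size);
    cudaMallocHost(&h_bufs[n], batch_size * payload_size);  // per-slot pinned host buffer
    cudaStreamCreate(&streams[n]);
    cudaEventCreate(&events[n]);
  }

  std::vector<unsigned char> fake_payload(payload_size, 0xAB);  // stand-in for packet data
  int cur_idx = 0;

  // Pack one batch into the current slot's host buffer, then stage it to the GPU.
  for (size_t p = 0; p < batch_size; p++) {
    std::memcpy(static_cast<char*>(h_bufs[cur_idx]) + p * payload_size,
                fake_payload.data(), payload_size);
  }
  cudaMemcpyAsync(d_bufs[cur_idx], h_bufs[cur_idx], batch_size * payload_size,
                  cudaMemcpyDefault, streams[cur_idx]);
  cudaEventRecord(events[cur_idx], streams[cur_idx]);

  // Downstream work would wait on events[cur_idx]; here we just synchronize.
  cudaEventSynchronize(events[cur_idx]);
  std::printf("staged %zu bytes to the GPU\n", batch_size * payload_size);

  for (int n = 0; n < num_concurrent; n++) {
    cudaFree(d_bufs[n]);
    cudaFreeHost(h_bufs[n]);
    cudaStreamDestroy(streams[n]);
    cudaEventDestroy(events[n]);
  }
  return 0;
}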

applications/adv_networking_bench/cpp/dpdk_bench_op_tx.h

Lines changed: 1 addition & 0 deletions
@@ -294,6 +294,7 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {
     } else {
       op_output.emit(msg, "burst_out");
     }
+
   };
 
  private:

operators/advanced_network/adv_network_types.h

Lines changed: 2 additions & 0 deletions
@@ -188,6 +188,8 @@ struct MemoryRegion {
   uint16_t affinity_;
   uint32_t access_;
   size_t buf_size_;
+  size_t adj_size_;  // Populated by driver
+  size_t ttl_size_;  // Populated by driver
   size_t num_bufs_;
   bool owned_;
 };

operators/advanced_network/managers/adv_network_mgr.cpp

Lines changed: 12 additions & 14 deletions
@@ -41,32 +41,28 @@ void set_ano_mgr(const AdvNetConfigYaml& cfg) {
 AdvNetStatus ANOMgr::allocate_memory_regions() {
   HOLOSCAN_LOG_INFO("Registering memory regions");
 
-  for (const auto& mr : cfg_.mrs_) {
+  for (auto& mr : cfg_.mrs_) {
     void* ptr;
     AllocRegion ar;
-    size_t buf_size = mr.second.buf_size_ * mr.second.num_bufs_;
+    mr.second.ttl_size_ = RTE_ALIGN_CEIL(mr.second.adj_size_ * mr.second.num_bufs_, GPU_PAGE_SIZE);
 
-    if (buf_size & 0x3) {
-      HOLOSCAN_LOG_CRITICAL("Total buffer size must be multiple of 4 for MR {}", mr.second.name_);
-      return AdvNetStatus::NULL_PTR;
-    }
     if (mr.second.owned_) {
       switch (mr.second.kind_) {
         case MemoryKind::HOST:
-          ptr = malloc(buf_size);
+          ptr = malloc(mr.second.ttl_size_);
           break;
         case MemoryKind::HOST_PINNED:
-          if (cudaHostAlloc(&ptr, buf_size, 0) != cudaSuccess) {
+          if (cudaHostAlloc(&ptr, mr.second.ttl_size_, 0) != cudaSuccess) {
            HOLOSCAN_LOG_CRITICAL("Failed to allocate CUDA pinned host memory!");
            return AdvNetStatus::NULL_PTR;
          }
          break;
        case MemoryKind::HUGE:
-          ptr = rte_malloc_socket(nullptr, buf_size, RTE_PKTMBUF_HEADROOM, mr.second.affinity_);
+          ptr = rte_malloc_socket(nullptr, mr.second.ttl_size_, 0, mr.second.affinity_);
          break;
        case MemoryKind::DEVICE: {
          unsigned int flag = 1;
-          const auto align = RTE_ALIGN_CEIL(buf_size, GPU_PAGE_SIZE);
+          const auto align = RTE_ALIGN_CEIL(mr.second.ttl_size_, GPU_PAGE_SIZE);
          CUdeviceptr cuptr;
 
          cudaSetDevice(mr.second.affinity_);
@@ -95,18 +91,20 @@ AdvNetStatus ANOMgr::allocate_memory_regions() {
 
      if (ptr == nullptr) {
        HOLOSCAN_LOG_CRITICAL(
-            "Fatal to allocate {} of type {} for MR", buf_size, static_cast<int>(mr.second.kind_));
+            "Fatal to allocate {} of type {} for MR", mr.second.ttl_size_, static_cast<int>(mr.second.kind_));
        return AdvNetStatus::NULL_PTR;
      }
    }
 
    HOLOSCAN_LOG_INFO(
-        "Successfully allocated memory region {} at {} with {} bytes ({} elements @ {} bytes)",
+        "Successfully allocated memory region {} at {} type {} with {} bytes ({} elements @ {} bytes total {})",
        mr.second.name_,
        ptr,
-        buf_size,
+        (int)mr.second.kind_,
+        mr.second.buf_size_,
        mr.second.num_bufs_,
-        mr.second.buf_size_);
+        mr.second.adj_size_,
+        mr.second.ttl_size_);
    ar_[mr.second.name_] = {mr.second.name_, ptr};
  }
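
Note on the new sizing: the manager now computes a per-region total (ttl_size_) as the driver-adjusted element size (adj_size_) times the element count, rounded up to a GPU page, instead of the old buf_size_ * num_bufs_ with a multiple-of-4 check. A standalone sketch of that arithmetic (the 64 KiB GPU_PAGE_SIZE and the local align_ceil helper are assumptions for illustration; the real code uses DPDK's RTE_ALIGN_CEIL and the driver-populated adj_size_):

#include <cstddef>
#include <cstdio>

constexpr size_t GPU_PAGE_SIZE = 65536;  // assumed value for the sketch

constexpr size_t align_ceil(size_t v, size_t align) {
  return ((v + align - 1) / align) * align;  // round v up to a multiple of align
}

int main() {
  // Example values from the RX config: Data_RX_GPU has 30720 buffers of 1064 bytes.
  // adj_size_ is the per-buffer size after driver adjustment; we reuse buf_size_ here
  // because the real adjustment is driver-specific.
  const size_t adj_size = 1064;
  const size_t num_bufs = 30720;

  const size_t ttl_size = align_ceil(adj_size * num_bufs, GPU_PAGE_SIZE);
  std::printf("raw total: %zu bytes, page-aligned total: %zu bytes\n",
              adj_size * num_bufs, ttl_size);
  return 0;
}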
