diff --git a/elements/ipsec/IPsecDatablocks.hh b/elements/ipsec/IPsecDatablocks.hh index 202529d..38e3a38 100644 --- a/elements/ipsec/IPsecDatablocks.hh +++ b/elements/ipsec/IPsecDatablocks.hh @@ -198,7 +198,7 @@ public: assert(!has_pending_data); #ifdef DEBUG - memset(&block_info[0], 0xcc, sizeof(struct aes_block_info) * NBA_MAX_COMPBATCH_SIZE * (NBA_MAX_PACKET_SIZE / AES_BLOCK_SIZE)); + memset(&block_info[0], 0xcc, sizeof(struct aes_block_info) * NBA_MAX_COMP_BATCH_SIZE * (NBA_MAX_PACKET_SIZE / AES_BLOCK_SIZE)); #endif for (unsigned p = 0; p < batch->count; ++p) { @@ -247,7 +247,7 @@ private: bool has_pending_data; size_t global_block_cnt; - struct aes_block_info block_info[NBA_MAX_COMPBATCH_SIZE * (NBA_MAX_PACKET_SIZE / AES_BLOCK_SIZE)]; + struct aes_block_info block_info[NBA_MAX_COMP_BATCH_SIZE * (NBA_MAX_PACKET_SIZE / AES_BLOCK_SIZE)]; }; } diff --git a/lib/computation.cc b/lib/computation.cc index 1d6a741..a08bee6 100644 --- a/lib/computation.cc +++ b/lib/computation.cc @@ -59,10 +59,7 @@ comp_thread_context::comp_thread_context() { num_combatch_size = 0; num_batchpool_size = 0; num_taskpool_size = 0; - num_comp_ppdepth = 0; num_coproc_ppdepth = 0; - rx_queue_size = 0; - rx_wakeup_threshold = 0; batch_pool = nullptr; task_pool = nullptr; diff --git a/lib/config.cc b/lib/config.cc index 06b31bf..50c3cca 100644 --- a/lib/config.cc +++ b/lib/config.cc @@ -537,24 +537,25 @@ bool load_config(const char *pyfilename) if (p_sys_params == NULL) goto exit_load_config; -#define LOAD_PARAM(name, defval) system_params.insert({{name, pymap_getlong(p_sys_params, name, defval)}}) - LOAD_PARAM("IO_BATCH_SIZE", 64); // to use vPMD - LOAD_PARAM("IO_RXDESC_PER_HWRXQ", 1024); // to use vPMD - LOAD_PARAM("IO_TXDESC_PER_HWTXQ", 1024); - - LOAD_PARAM("COMP_BATCH_SIZE", 64); - LOAD_PARAM("COMP_PPDEPTH", 32); // unused - LOAD_PARAM("COMP_RXQ_LENGTH", 2048); // unused - LOAD_PARAM("COMP_RXQ_THRES", 256); // unused - LOAD_PARAM("COMP_PREPKTQ_LENGTH", 32); - - 
LOAD_PARAM("COPROC_PPDEPTH", 64); - LOAD_PARAM("COPROC_INPUTQ_LENGTH", 64); - LOAD_PARAM("COPROC_COMPLETIONQ_LENGTH", 64); - LOAD_PARAM("COPROC_CTX_PER_COMPTHREAD", 1); - - LOAD_PARAM("TASKPOOL_SIZE", 256); - LOAD_PARAM("BATCHPOOL_SIZE", 512); +#define LOAD_PARAM(name, defval) { \ + long val = pymap_getlong(p_sys_params, #name, defval); \ + assert(val <= NBA_MAX_ ## name); \ + system_params.insert({{#name, val}}); \ +} + LOAD_PARAM(IO_BATCH_SIZE, 64); + LOAD_PARAM(IO_DESC_PER_HWRXQ, 1024); + LOAD_PARAM(IO_DESC_PER_HWTXQ, 1024); + + LOAD_PARAM(COMP_BATCH_SIZE, 64); + LOAD_PARAM(COMP_PREPKTQ_LENGTH, 32); + + LOAD_PARAM(COPROC_PPDEPTH, 64); + LOAD_PARAM(COPROC_INPUTQ_LENGTH, 64); + LOAD_PARAM(COPROC_COMPLETIONQ_LENGTH, 64); + LOAD_PARAM(COPROC_CTX_PER_COMPTHREAD, 1); + + LOAD_PARAM(TASKPOOL_SIZE, 256); + LOAD_PARAM(BATCHPOOL_SIZE, 512); #undef LOAD_PARAM /* Retrieve io thread configurations. */ diff --git a/lib/config.hh b/lib/config.hh index e324557..3f35698 100644 --- a/lib/config.hh +++ b/lib/config.hh @@ -11,22 +11,32 @@ #define NBA_MAX_QUEUES_PER_PORT (128) #define NBA_MAX_COPROCESSORS (2) // Max number of coprocessor devices #define NBA_MAX_COPROCESSOR_TYPES (1) // Max number of device types + #define NBA_MAX_PACKET_SIZE (2048) #ifdef NBA_NO_HUGE #define NBA_MAX_IOBATCH_SIZE (4u) #define NBA_MAX_COMPBATCH_SIZE (4u) #else - #define NBA_MAX_IOBATCH_SIZE (256u) - #define NBA_MAX_COMPBATCH_SIZE (256u) + #define NBA_MAX_IO_BATCH_SIZE (256u) + #define NBA_MAX_COMP_BATCH_SIZE (256u) #endif -#define NBA_MAX_SW_RXRING_LENGTH (2048u) -#define NBA_MAX_COMP_PPDEPTH (256u) -#define NBA_MAX_COPROC_PPDEPTH (32u) +#define NBA_MAX_COMP_PREPKTQ_LENGTH (256u) +#define NBA_MAX_IO_DESC_PER_HWRXQ (1024) +#define NBA_MAX_IO_DESC_PER_HWTXQ (1024) + +#define NBA_MAX_COPROC_PPDEPTH (64u) +#define NBA_MAX_COPROC_INPUTQ_LENGTH (64) +#define NBA_MAX_COPROC_COMPLETIONQ_LENGTH (64) +#define NBA_MAX_COPROC_CTX_PER_COMPTHREAD (1) + +#define NBA_MAX_TASKPOOL_SIZE (2048u) #define 
NBA_MAX_BATCHPOOL_SIZE (2048u) + #define NBA_MAX_ANNOTATION_SET_SIZE (7) #define NBA_MAX_NODELOCALSTORAGE_ENTRIES (16) #define NBA_MAX_KERNEL_OVERLAP (8) #define NBA_MAX_DATABLOCKS (12) // If too large (e.g., 64), batch_pool can not be allocated. + #define NBA_OQ (true) // Use output-queuing semantics when possible. #define NBA_CPU_MICROBENCH // Enable support for PAPI library for microbenchmarks. diff --git a/lib/datablock.hh b/lib/datablock.hh index ecce207..fba3a7a 100644 --- a/lib/datablock.hh +++ b/lib/datablock.hh @@ -76,17 +76,17 @@ struct write_roi_info { struct item_size_info { union { uint16_t size; - uint16_t sizes[NBA_MAX_COMPBATCH_SIZE * 12]; + uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 12]; }; - uint16_t offsets[NBA_MAX_COMPBATCH_SIZE * 12]; + uint16_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12]; }; #else struct item_size_info { union { uint16_t size; - uint16_t sizes[NBA_MAX_COMPBATCH_SIZE * 96]; + uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 96]; }; - uint16_t offsets[NBA_MAX_COMPBATCH_SIZE * 96]; + uint16_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96]; }; #endif diff --git a/lib/elementgraph.cc b/lib/elementgraph.cc index b5c4dc2..28066cc 100644 --- a/lib/elementgraph.cc +++ b/lib/elementgraph.cc @@ -185,7 +185,6 @@ void ElementGraph::run(PacketBatch *batch, Element *start_elem, int input_port) OffloadableElement *offloadable = dynamic_cast(current_elem); assert(offloadable != nullptr); if (lb_decision != -1) { - /* Get or initialize the task object. * This step is always executed for every input batch * passing every offloadable element. */ @@ -276,7 +275,6 @@ void ElementGraph::run(PacketBatch *batch, Element *start_elem, int input_port) } else { /* If not offloaded, run the element's CPU-version handler. 
*/ - batch_disposition = current_elem->_process_batch(input_port, batch); double _cpu_end = rte_rdtsc(); batch->compute_time += (_cpu_end - _cpu_start); diff --git a/lib/io.cc b/lib/io.cc index 55e5275..5397066 100644 --- a/lib/io.cc +++ b/lib/io.cc @@ -152,7 +152,7 @@ struct rx_state { #ifdef TEST_MINIMAL_L2FWD struct packet_batch { unsigned count; - struct rte_mbuf *pkts[NBA_MAX_COMPBATCH_SIZE]; + struct rte_mbuf *pkts[NBA_MAX_COMP_BATCH_SIZE]; }; #endif @@ -371,6 +371,7 @@ static void io_local_stat_timer_cb(struct ev_loop *loop, struct ev_timer *watche ctx->tx_pkt_thruput += ctx->port_stats[j].num_sent_pkts; memset(&ctx->port_stats[j], 0, sizeof(struct io_port_stat)); } + #ifdef NBA_CPU_MICROBENCH char buf[2048]; char *bufp = &buf[0]; for (int e = 0; e < 5; e++) { @@ -380,6 +381,7 @@ static void io_local_stat_timer_cb(struct ev_loop *loop, struct ev_timer *watche memset(ctx->papi_ctr_rx, 0, sizeof(long long) * 5); memset(ctx->papi_ctr_tx, 0, sizeof(long long) * 5); memset(ctx->papi_ctr_comp, 0, sizeof(long long) * 5); +#endif /* Inform the master to check updates. */ rte_atomic16_inc(ctx->node_master_flag); ev_async_send(ctx->node_master_ctx->loop, ctx->node_stat_watcher); @@ -693,8 +695,8 @@ int io_loop(void *arg) // the way numa index numbered for each cpu core is checked in main(). 
(see 'is_numa_idx_grouped' in main()) const unsigned num_nodes = numa_num_configured_nodes(); - struct rte_mbuf *pkts[NBA_MAX_IOBATCH_SIZE * NBA_MAX_QUEUES_PER_PORT]; - struct rte_mbuf *drop_pkts[NBA_MAX_IOBATCH_SIZE]; + struct rte_mbuf *pkts[NBA_MAX_IO_BATCH_SIZE * NBA_MAX_QUEUES_PER_PORT]; + struct rte_mbuf *drop_pkts[NBA_MAX_IO_BATCH_SIZE]; struct timespec sleep_ts; unsigned i, j; char temp[1024]; @@ -764,7 +766,7 @@ int io_loop(void *arg) snprintf(temp, RTE_MEMPOOL_NAMESIZE, "comp.batch.%u:%u@%u", ctx->loc.node_id, ctx->loc.local_thread_idx, ctx->loc.core_id); ctx->comp_ctx->batch_pool = rte_mempool_create(temp, ctx->comp_ctx->num_batchpool_size + 1, - sizeof(PacketBatch), CACHE_LINE_SIZE, + sizeof(PacketBatch), 0, //(unsigned) (ctx->comp_ctx->num_batchpool_size / 1.5), 0, nullptr, nullptr, comp_packetbatch_init, nullptr, ctx->loc.node_id, 0); @@ -776,7 +778,7 @@ int io_loop(void *arg) size_t dbstate_pool_size = NBA_MAX_COPROC_PPDEPTH; size_t dbstate_item_size = sizeof(struct datablock_tracker) * NBA_MAX_DATABLOCKS; ctx->comp_ctx->dbstate_pool = rte_mempool_create(temp, dbstate_pool_size + 1, - dbstate_item_size, CACHE_LINE_SIZE, + dbstate_item_size, 0, //(unsigned) (dbstate_pool_size / 1.5), 0, nullptr, nullptr, comp_dbstate_init, nullptr, ctx->loc.node_id, 0); @@ -788,7 +790,7 @@ int io_loop(void *arg) snprintf(temp, RTE_MEMPOOL_NAMESIZE, "comp.task.%u:%u@%u", ctx->loc.node_id, ctx->loc.local_thread_idx, ctx->loc.core_id); ctx->comp_ctx->task_pool = rte_mempool_create(temp, ctx->comp_ctx->num_taskpool_size + 1, - sizeof(OffloadTask), CACHE_LINE_SIZE, + sizeof(OffloadTask), 0, //(unsigned) (ctx->comp_ctx->num_taskpool_size / 1.5), 0, nullptr, nullptr, comp_task_init, nullptr, ctx->loc.node_id, 0); @@ -1054,7 +1056,7 @@ int io_loop(void *arg) prev_tsc = cur_tsc; } // end of rxq scanning - assert(total_recv_cnt <= NBA_MAX_IOBATCH_SIZE * NBA_MAX_COMP_PPDEPTH); + assert(total_recv_cnt <= NBA_MAX_IO_BATCH_SIZE * ctx->num_hw_rx_queues); #ifdef NBA_CPU_MICROBENCH { 
long long ctr[5]; diff --git a/lib/packetbatch.hh b/lib/packetbatch.hh index 09bd41c..ea26e02 100644 --- a/lib/packetbatch.hh +++ b/lib/packetbatch.hh @@ -28,9 +28,9 @@ public: delay_start(0), compute_time(0) { #ifdef DEBUG - memset(&results[0], 0xdd, sizeof(int) * NBA_MAX_COMPBATCH_SIZE); - memset(&excluded[0], 0xcc, sizeof(bool) * NBA_MAX_COMPBATCH_SIZE); - memset(&packets[0], 0xbb, sizeof(struct rte_mbuf*) * NBA_MAX_COMPBATCH_SIZE); + memset(&results[0], 0xdd, sizeof(int) * NBA_MAX_COMP_BATCH_SIZE); + memset(&excluded[0], 0xcc, sizeof(bool) * NBA_MAX_COMP_BATCH_SIZE); + memset(&packets[0], 0xbb, sizeof(struct rte_mbuf*) * NBA_MAX_COMP_BATCH_SIZE); #endif } @@ -50,9 +50,9 @@ public: double compute_time; struct annotation_set banno __rte_cache_aligned; /** Batch-level annotations. */ - bool excluded[NBA_MAX_COMPBATCH_SIZE] __rte_cache_aligned; - struct rte_mbuf *packets[NBA_MAX_COMPBATCH_SIZE] __rte_cache_aligned; - int results[NBA_MAX_COMPBATCH_SIZE] __rte_cache_aligned; + bool excluded[NBA_MAX_COMP_BATCH_SIZE] __rte_cache_aligned; + struct rte_mbuf *packets[NBA_MAX_COMP_BATCH_SIZE] __rte_cache_aligned; + int results[NBA_MAX_COMP_BATCH_SIZE] __rte_cache_aligned; }; } diff --git a/lib/types.hh b/lib/types.hh index e889478..b7f165a 100644 --- a/lib/types.hh +++ b/lib/types.hh @@ -161,14 +161,11 @@ public: struct core_location loc; unsigned num_tx_ports; unsigned num_nodes; - unsigned num_comp_ppdepth; unsigned num_coproc_ppdepth; unsigned num_combatch_size; unsigned num_batchpool_size; unsigned num_taskpool_size; unsigned task_completion_queue_size; - unsigned rx_queue_size; - unsigned rx_wakeup_threshold; struct rte_mempool *batch_pool; struct rte_mempool *dbstate_pool; diff --git a/main.cc b/main.cc index 3ffdba0..39d7f8d 100644 --- a/main.cc +++ b/main.cc @@ -358,7 +358,7 @@ int main(int argc, char **argv) rx_conf.rx_thresh.wthresh = 4; rx_conf.rx_free_thresh = 32; rx_conf.rx_drop_en = 0; /* when enabled, drop packets if no descriptors are available */ - const 
unsigned num_rx_desc = system_params["IO_RXDESC_PER_HWRXQ"]; + const unsigned num_rx_desc = system_params["IO_DESC_PER_HWRXQ"]; /* Per TX-queue configuration */ struct rte_eth_txconf tx_conf; @@ -370,7 +370,7 @@ tx_conf.tx_rs_thresh = 32; tx_conf.tx_free_thresh = 0; /* use PMD default value */ tx_conf.txq_flags = ETH_TXQ_FLAGS_NOMULTSEGS | ETH_TXQ_FLAGS_NOOFFLOADS; - const unsigned num_tx_desc = system_params["IO_TXDESC_PER_HWTXQ"]; + const unsigned num_tx_desc = system_params["IO_DESC_PER_HWTXQ"]; /* According to dpdk-dev mailing list, * num_mbufs for the whole system should be greater than: @@ -499,7 +499,7 @@ unsigned queue_length = 0; switch (conf.template_) { case SWRXQ: - queue_length = system_params["COMP_RXQ_LENGTH"]; + queue_length = 32; // FIXME: unused break; case TASKINQ: queue_length = system_params["COPROC_INPUTQ_LENGTH"]; @@ -536,7 +536,7 @@ /* Some sanity checks... */ if (emulate_io) { - long expected_inflight_batches = num_mbufs / num_io_threads / system_params["COMP_PPDEPTH"]; + long expected_inflight_batches = num_mbufs / num_io_threads / system_params["COMP_BATCH_SIZE"]; RTE_LOG(DEBUG, MAIN, "coproc_ppdepth = %ld, max.# in-flight batches per IO thread = %ld\n", system_params["COPROC_PPDEPTH"], expected_inflight_batches); //if (system_params["COPROC_PPDEPTH"] > expected_inflight_batches) { @@ -717,9 +717,6 @@ ctx->inspector = NULL; ctx->num_combatch_size = system_params["COMP_BATCH_SIZE"]; - ctx->rx_queue_size = system_params["COMP_RXQ_LENGTH"]; - ctx->rx_wakeup_threshold = system_params["COMP_RXQ_THRES"]; - ctx->num_comp_ppdepth = system_params["COMP_PPDEPTH"]; ctx->num_coproc_ppdepth = system_params["COPROC_PPDEPTH"]; ctx->num_batchpool_size = system_params["BATCHPOOL_SIZE"]; ctx->num_taskpool_size = system_params["TASKPOOL_SIZE"]; @@ -947,7 +944,7 @@ */ snprintf(ring_name, 
RTE_RING_NAMESIZE, "dropq.%u:%u@%u", ctx->loc.node_id, ctx->loc.local_thread_idx, ctx->loc.core_id); - ctx->drop_queue = rte_ring_create(ring_name, 8 * NBA_MAX_COMPBATCH_SIZE, + ctx->drop_queue = rte_ring_create(ring_name, 8 * NBA_MAX_COMP_BATCH_SIZE, node_id, RING_F_SC_DEQ); assert(NULL != ctx->drop_queue); @@ -955,10 +952,10 @@ int main(int argc, char **argv) for (k = 0; k < num_ports; k++) { snprintf(ring_name, RTE_RING_NAMESIZE, "txq%u.%u:%u@%u", k, ctx->loc.node_id, ctx->loc.local_thread_idx, ctx->loc.core_id); - ctx->tx_queues[k] = rte_ring_create(ring_name, 8 * NBA_MAX_COMPBATCH_SIZE, + ctx->tx_queues[k] = rte_ring_create(ring_name, 8 * NBA_MAX_COMP_BATCH_SIZE, node_id, RING_F_SC_DEQ); assert(NULL != ctx->tx_queues[k]); - assert(0 == rte_ring_set_water_mark(ctx->tx_queues[k], (8 * NBA_MAX_COMPBATCH_SIZE) - 16)); + assert(0 == rte_ring_set_water_mark(ctx->tx_queues[k], (8 * NBA_MAX_COMP_BATCH_SIZE) - 16)); } snprintf(ring_name, RTE_RING_NAMESIZE, "reqring.%u:%u@%u",