-
-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial implementation of transactions in the new hashtable mpmc (#285)
This PR introduces basic transaction support for the new mpmc hashtable: - it introduces two new data structures, both mpmc and spsc, to fetch and release slot ids using a bitmap, where each slot is mapped to one bit of the bitmap - this special bitmap uses a table lookup to quickly find which is the first bit available in a uint16_t, dramatically reducing the number of comparisons the algorithm has to carry out - the mpmc variant offers some additional functions over the spsc one to avoid cache-conflicts in the vast majority of the cases - it restructures the transaction-related structs and fields, now the id is a plain uint32_t number - it can even be reduced in size to recoup space to be used for different purposes, probably a 20 bit number would be plenty as having 1048575 transactions seems unlikely even on very high end systems - it updates the internal interfaces to provide the needed hooks and checks for the transactions needed for the Read-Modify-Write operations This PR is extremely important because it introduces a blazing fast new data structure, the `slots_bitmap_mpmc`, which performs extremely well under contention. The main idea is that each transaction is just a slot of the bitmap and each thread will search in this bitmap first in its own shard (as the bitmap allows localized searches) and then in the whole bitmap as a fallback. The result is that in a normal situation, where slots are available for the thread, to acquire and release 51200 slots per thread it takes just a few nanoseconds (more details in `benches/bench-slots-bitmap-mpmc.cpp`) ``` threads:1 15099228 ns threads:2 1027204 ns threads:4 74470 ns threads:8 5860 ns threads:16 499 ns threads:32 42 ns ``` With 4 threads and on average 800k ops/s in total (therefore potentially 800k transactions/s) the time required would be 1.16ms, an average overhead of 1.45ns per transaction to be acquired and released, while with just 1 thread and on average 200k ops/s it would be 295us. 
In both cases the time to acquire and release a transaction id is close enough to zero to be considered irrelevant!
- Loading branch information
1 parent
73325e3
commit 36fcca8
Showing
15 changed files
with
2,413 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
/** | ||
* Copyright (C) 2018-2023 Daniele Salvatore Albano | ||
* All rights reserved. | ||
* | ||
* This software may be modified and distributed under the terms | ||
* of the BSD license. See the LICENSE file for details. | ||
**/ | ||
|
||
//Run on (32 X 4200.07 MHz CPU s) | ||
//CPU Caches: | ||
//L1 Data 32 KiB (x16) | ||
//L1 Instruction 32 KiB (x16) | ||
//L2 Unified 512 KiB (x16) | ||
//L3 Unified 16384 KiB (x4) | ||
//Load Average: 0.82, 1.41, 2.10 | ||
//-------------------------------------------------------------------------------------------------------------------------- | ||
//Benchmark Time CPU Iterations | ||
//-------------------------------------------------------------------------------------------------------------------------- | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:1_mean 296 ns 296 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:1_median 285 ns 285 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:1_stddev 43.9 ns 43.9 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:1_cv 14.84 % 14.85 % 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:2_mean 75.8 ns 152 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:2_median 72.2 ns 144 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:2_stddev 12.4 ns 24.9 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:2_cv 16.39 % 16.40 % 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:4_mean 18.7 ns 74.8 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:4_median 18.6 ns 74.4 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:4_stddev 0.412 ns 1.65 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:4_cv 2.20 % 2.20 % 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:8_mean 4.97 ns 39.7 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:8_median 4.95 ns 39.6 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:8_stddev 0.105 ns 0.837 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:8_cv 2.10 % 2.11 % 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:16_mean 4.70 ns 75.2 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:16_median 4.70 ns 75.2 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:16_stddev 0.021 ns 0.337 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:16_cv 0.44 % 0.45 % 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:32_mean 3.85 ns 123 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:32_median 3.84 ns 123 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:32_stddev 0.034 ns 1.10 ns 50 | ||
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:32_cv 0.89 % 0.89 % 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:1_mean 15224414 ns 15221858 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:1_median 15099228 ns 15096381 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:1_stddev 695354 ns 694733 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:1_cv 4.57 % 4.56 % 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:2_mean 1027646 ns 2054940 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:2_median 1027204 ns 2054054 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:2_stddev 2107 ns 4188 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:2_cv 0.21 % 0.20 % 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:4_mean 74484 ns 297899 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:4_median 74470 ns 297824 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:4_stddev 151 ns 606 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:4_cv 0.20 % 0.20 % 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:8_mean 5854 ns 46821 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:8_median 5860 ns 46846 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:8_stddev 41.6 ns 331 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:8_cv 0.71 % 0.71 % 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:16_mean 497 ns 7940 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:16_median 499 ns 7978 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:16_stddev 9.04 ns 145 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:16_cv 1.82 % 1.82 % 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:32_mean 42.1 ns 1340 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:32_median 42.0 ns 1340 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:32_stddev 0.192 ns 5.79 ns 50 | ||
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:32_cv 0.46 % 0.43 % 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:1_mean 285 ns 285 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:1_median 285 ns 285 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:1_stddev 0.968 ns 0.959 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:1_cv 0.34 % 0.34 % 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:2_mean 301 ns 601 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:2_median 301 ns 601 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:2_stddev 1.09 ns 2.18 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:2_cv 0.36 % 0.36 % 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:4_mean 294 ns 1175 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:4_median 289 ns 1157 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:4_stddev 7.48 ns 29.8 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:4_cv 2.55 % 2.54 % 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:8_mean 310 ns 2476 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:8_median 309 ns 2473 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:8_stddev 2.73 ns 20.9 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:8_cv 0.88 % 0.84 % 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:16_mean 300 ns 4802 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:16_median 300 ns 4792 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:16_stddev 2.75 ns 43.2 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:16_cv 0.92 % 0.90 % 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:32_mean 560 ns 17623 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:32_median 558 ns 17662 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:32_stddev 5.24 ns 156 ns 50 | ||
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:32_cv 0.94 % 0.88 % 50 | ||
|
||
#include <cstdio> | ||
#include <cstring> | ||
#include <cstdint> | ||
#include <cstdbool> | ||
|
||
#include <benchmark/benchmark.h> | ||
|
||
#include "misc.h" | ||
#include "memory_fences.h" | ||
#include "exttypes.h" | ||
#include "utils_cpu.h" | ||
#include "thread.h" | ||
|
||
#include "data_structures/slots_bitmap_mpmc/slots_bitmap_mpmc.h" | ||
|
||
#include "benchmark-program-simple.hpp" | ||
|
||
// The number of threads used for the test can be controlled by tuning the two defines below
#define TEST_THREADS_RANGE_BEGIN (1) | ||
#define TEST_THREADS_RANGE_END (utils_cpu_count()) | ||
#define BITS_PER_THREAD (((sizeof(uint64_t) * 8) * (64 / sizeof(uint64_t))) * 100) | ||
|
||
static slots_bitmap_mpmc_t *bitmap = nullptr; | ||
|
||
/**
 * Benchmark: every thread repeatedly requests the next available slot of a bitmap shared by all the benchmark
 * threads, passing the first slot index of its own shard as the start argument.
 *
 * The shared bitmap is sized to hold BITS_PER_THREAD slots for each running thread; thread 0 allocates it before
 * the timed loop and frees it once the loop is over.
 */
static void slots_bitmap_mpmc_fill_parallel(benchmark::State& state) {
    const int thread_id = state.thread_index();
    const int thread_count = state.threads();
    const uint64_t slots_per_thread = BITS_PER_THREAD;
    const uint64_t total_slots = slots_per_thread * thread_count;

    // Index of the first slot of the shard assigned to this thread
    const uint64_t shard_first_slot = (total_slots / thread_count) * thread_id;
    const bool owns_bitmap = thread_id == 0;

    // Only one thread sets up the bitmap published through the file-level pointer
    if (owns_bitmap) {
        bitmap = slots_bitmap_mpmc_init(total_slots);
    }

    thread_current_set_affinity(thread_id);

    for (auto _ : state) {
        benchmark::DoNotOptimize(
                slots_bitmap_mpmc_get_next_available_with_step(bitmap, shard_first_slot, 1));
    }

    // The thread that created the bitmap is also the one tearing it down
    if (owns_bitmap) {
        slots_bitmap_mpmc_free(bitmap);
        bitmap = nullptr;
    }
}
|
||
/**
 * Benchmark: on every iteration each thread acquires as many slots as its shard contains (BITS_PER_THREAD) and
 * then releases the slots of that shard.
 *
 * The shared bitmap holds BITS_PER_THREAD slots for each running thread, so the shard of a thread is the range
 * [start, start + BITS_PER_THREAD). Thread 0 allocates the bitmap before the timed loop and frees it afterwards.
 *
 * The loops previously ran from `start` up to `size_per_thread / total_threads`: the upper bound ignored `start`,
 * so every thread but the first one executed zero iterations and the first one covered only a fraction of its
 * shard. Timings recorded with those bounds are therefore not comparable with the ones produced by this version.
 */
static void slots_bitmap_mpmc_fill_all_and_release_parallel(benchmark::State& state) {
    int current_thread = state.thread_index();
    int total_threads = state.threads();
    const uint64_t size_per_thread = BITS_PER_THREAD;
    const uint64_t size = size_per_thread * total_threads;

    // Boundaries of the shard owned by this thread: [start, end)
    const uint64_t start = size_per_thread * current_thread;
    const uint64_t end = start + size_per_thread;

    if (current_thread == 0) {
        bitmap = slots_bitmap_mpmc_init(size);
    }

    thread_current_set_affinity(state.thread_index());

    for (auto _ : state) {
        // Acquire one slot for every slot of the shard
        for (uint64_t i = start; i < end; i++) {
            benchmark::DoNotOptimize(slots_bitmap_mpmc_get_next_available_with_step(bitmap, start, 1));
        }

        // Release the shard; assumes the acquisitions above handed out exactly the slots in [start, end)
        // - TODO confirm against slots_bitmap_mpmc_get_next_available_with_step
        for (uint64_t i = start; i < end; i++) {
            slots_bitmap_mpmc_release(bitmap, i);
        }
    }

    if (current_thread == 0) {
        slots_bitmap_mpmc_free(bitmap);
        bitmap = nullptr;
    }
}
|
||
/**
 * Benchmark: every thread repeatedly requests the next available slot of the shared bitmap, all of them passing
 * the same start argument (1) instead of a per-thread shard offset.
 *
 * The shared bitmap is sized to hold BITS_PER_THREAD slots for each running thread; thread 0 allocates it before
 * the timed loop and frees it once the loop is over.
 */
static void slots_bitmap_mpmc_fill_sequential(benchmark::State& state) {
    const int thread_id = state.thread_index();
    const int thread_count = state.threads();
    const uint64_t slots_per_thread = BITS_PER_THREAD;
    const uint64_t total_slots = slots_per_thread * thread_count;
    const bool owns_bitmap = thread_id == 0;

    // Only one thread sets up the bitmap published through the file-level pointer
    if (owns_bitmap) {
        bitmap = slots_bitmap_mpmc_init(total_slots);
    }

    thread_current_set_affinity(thread_id);

    for (auto _ : state) {
        benchmark::DoNotOptimize(
                slots_bitmap_mpmc_get_next_available_with_step(bitmap, 1, 1));
    }

    // The thread that created the bitmap is also the one tearing it down
    if (owns_bitmap) {
        slots_bitmap_mpmc_free(bitmap);
        bitmap = nullptr;
    }
}
|
||
/**
 * Applies the settings shared by the benchmarks of this file: a thread range going from
 * TEST_THREADS_RANGE_BEGIN to TEST_THREADS_RANGE_END, BITS_PER_THREAD iterations, 50 repetitions and
 * aggregates-only reporting.
 */
static void BenchArguments(benchmark::internal::Benchmark* b) {
    // Running more than 131072 iterations requires increasing EPOCH_OPERATION_QUEUE_RING_SIZE in
    // epoch_operations_queue.h, as the test does not include any processing of the queue
    // NOTE(review): no epoch operations queue is visibly used by this benchmark - confirm the note still applies
    b->ThreadRange(TEST_THREADS_RANGE_BEGIN, TEST_THREADS_RANGE_END);
    b->Iterations(BITS_PER_THREAD);
    b->Repetitions(50);
    b->DisplayAggregatesOnly(true);
}
|
||
// Registered with the shared settings
BENCHMARK(slots_bitmap_mpmc_fill_parallel)->Apply(BenchArguments);

// Registered with the shared settings, then the iteration count is set again to 50
BENCHMARK(slots_bitmap_mpmc_fill_all_and_release_parallel)->Apply(BenchArguments)->Iterations(50);

// Registered with the shared settings
BENCHMARK(slots_bitmap_mpmc_fill_sequential)->Apply(BenchArguments);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
/** | ||
* Copyright (C) 2018-2023 Daniele Salvatore Albano | ||
* All rights reserved. | ||
* | ||
* This software may be modified and distributed under the terms | ||
* of the BSD license. See the LICENSE file for details. | ||
**/ | ||
|
||
//Run on (32 X 4199.98 MHz CPU s) | ||
//CPU Caches: | ||
//L1 Data 32 KiB (x16) | ||
//L1 Instruction 32 KiB (x16) | ||
//L2 Unified 512 KiB (x16) | ||
//L3 Unified 16384 KiB (x4) | ||
//Load Average: 14.45, 7.05, 3.46 | ||
//--------------------------------------------------------------------------------------------------------------- | ||
//Benchmark Time CPU Iterations | ||
//--------------------------------------------------------------------------------------------------------------- | ||
//slots_bitmap_spsc_fill_sequential/iterations:51200/repeats:50_mean 877 ns 877 ns 50 | ||
//slots_bitmap_spsc_fill_sequential/iterations:51200/repeats:50_median 876 ns 876 ns 50 | ||
//slots_bitmap_spsc_fill_sequential/iterations:51200/repeats:50_stddev 3.17 ns 3.16 ns 50 | ||
//slots_bitmap_spsc_fill_sequential/iterations:51200/repeats:50_cv 0.36 % 0.36 % 50 | ||
|
||
#include <cstdio> | ||
#include <cstring> | ||
#include <cstdint> | ||
#include <cstdbool> | ||
|
||
#include <benchmark/benchmark.h> | ||
|
||
#include "exttypes.h" | ||
|
||
#include "data_structures/slots_bitmap_spsc/slots_bitmap_spsc.h" | ||
|
||
#include "benchmark-program-simple.hpp" | ||
|
||
#define BITS_PER_THREAD (((sizeof(uint64_t) * 8) * (64 / sizeof(uint64_t))) * 100) | ||
|
||
/**
 * Benchmark: repeatedly requests the next available slot of an spsc bitmap holding BITS_PER_THREAD slots.
 *
 * The bitmap is created before the timed loop and freed right after it.
 */
static void slots_bitmap_spsc_fill_sequential(benchmark::State& state) {
    const uint64_t total_slots = BITS_PER_THREAD;
    slots_bitmap_spsc_t *slots = slots_bitmap_spsc_init(total_slots);

    for (auto _ : state) {
        benchmark::DoNotOptimize(
                slots_bitmap_spsc_get_next_available(slots));
    }

    slots_bitmap_spsc_free(slots);
}
|
||
/**
 * Applies the settings used by the benchmark of this file: BITS_PER_THREAD iterations, 50 repetitions and
 * aggregates-only reporting.
 */
static void BenchArguments(benchmark::internal::Benchmark* b) {
    // Running more than 131072 iterations requires increasing EPOCH_OPERATION_QUEUE_RING_SIZE in
    // epoch_operations_queue.h, as the test does not include any processing of the queue
    // NOTE(review): no epoch operations queue is visibly used by this benchmark - confirm the note still applies
    b->Iterations(BITS_PER_THREAD);
    b->Repetitions(50);
    b->DisplayAggregatesOnly(true);
}
|
||
// Registered with the settings applied by BenchArguments
BENCHMARK(slots_bitmap_spsc_fill_sequential)->Apply(BenchArguments);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
/** | ||
* Copyright (C) 2018-2022 Daniele Salvatore Albano | ||
* All rights reserved. | ||
* | ||
* This software may be modified and distributed under the terms | ||
* of the BSD license. See the LICENSE file for details. | ||
**/ | ||
|
||
#include <numa.h> | ||
|
||
#include <benchmark/benchmark.h> | ||
|
||
#include "exttypes.h" | ||
#include "spinlock.h" | ||
#include "misc.h" | ||
#include "signals_support.h" | ||
#include "thread.h" | ||
#include "log/log.h" | ||
#include "log/sink/log_sink.h" | ||
#include "log/sink/log_sink_console.h" | ||
|
||
class BenchmarkProgramSimple { | ||
private: | ||
const char* tag; | ||
|
||
void setup_initial_log_sink_console() { | ||
log_level_t level = LOG_LEVEL_ALL; | ||
log_sink_settings_t settings = { 0 }; | ||
settings.console.use_stdout_for_errors = false; | ||
|
||
log_sink_register(log_sink_console_init(level, &settings)); | ||
} | ||
|
||
public: | ||
explicit BenchmarkProgramSimple(const char *tag) { | ||
this->tag = tag; | ||
} | ||
|
||
int Main(int argc, char** argv) { | ||
signals_support_register_sigsegv_fatal_handler(); | ||
|
||
// Setup the log sink | ||
BenchmarkProgramSimple::setup_initial_log_sink_console(); | ||
|
||
// Ensure that the current thread is pinned to the core 0 otherwise some tests can fail if the kernel shift around | ||
// the main thread of the process | ||
thread_current_set_affinity(0); | ||
|
||
::benchmark::Initialize(&argc, argv); | ||
if (::benchmark::ReportUnrecognizedArguments(argc, argv)) { | ||
return 1; | ||
} | ||
::benchmark::RunSpecifiedBenchmarks(); | ||
|
||
return 0; | ||
} | ||
}; | ||
|
||
int main(int argc, char** argv) { | ||
return BenchmarkProgramSimple(__FILE__).Main(argc, argv); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.