Initial implementation of transactions in new hashtable mpmc (#285)
This PR introduces basic transaction support for the new mpmc
hashtable:
- it introduces two new data structures, one mpmc and one spsc, to fetch
and release slot ids using a bitmap, where each slot maps to one bit of
the bitmap
- this special bitmap uses a table lookup to quickly find the first
available bit in a uint16_t, dramatically reducing the number of
comparisons the algorithm has to carry out
- the mpmc variant offers some additional functions over the spsc one to
avoid cache conflicts in the vast majority of cases
- it restructures the transaction-related structs and fields; the id is
now a plain uint32_t number
- the id can even be reduced in size to recoup space for other purposes;
a 20-bit number would probably be plenty, as having 1048575 in-flight
transactions seems unlikely even on very high-end systems
- it updates the internal interfaces to provide the hooks and checks
needed for the transactions used by Read-Modify-Write operations
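The table-lookup trick mentioned above can be sketched as follows. This is a hypothetical illustration, not the actual slots bitmap code: the table size, sentinel value, and function name are assumptions. The idea is to precompute, for every possible uint16_t value, the index of its first zero bit, so scanning a word costs one array read instead of up to 16 bit tests.

```cpp
#include <cstdint>
#include <vector>

// Precompute, for every possible uint16_t value, the index of its first zero
// bit (16 when the word is completely full): scanning a word then costs a
// single array read instead of up to 16 individual bit tests.
static std::vector<uint8_t> build_first_zero_bit_table() {
    std::vector<uint8_t> table(1u << 16);
    for (uint32_t value = 0; value < (1u << 16); value++) {
        uint8_t first_zero = 16; // sentinel: no free bit in this word
        for (uint8_t bit = 0; bit < 16; bit++) {
            if ((value & (1u << bit)) == 0) {
                first_zero = bit;
                break;
            }
        }
        table[value] = first_zero;
    }
    return table;
}
```

For example, `table[0x0007]` is 3 (bits 0..2 are taken, bit 3 is free) and `table[0xFFFF]` is 16 (the word is full). The 64 KiB table fits comfortably in L2 cache and is read-only, so it can be shared across threads without contention.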

This PR is extremely important because it introduces a blazing fast new
data structure, the `slots_bitmap_mpmc`, which performs extremely well
under contention. The main idea is that each transaction is just a slot
of the bitmap: each thread searches first in its own shard (as the
bitmap allows localized searches) and then falls back to searching the
whole bitmap.

The result is that in a normal situation, where slots are available for
the thread, the time to acquire and release 51200 slots per thread drops
to just a few nanoseconds as the thread count grows (more details in
`benches/bench-slots-bitmap-mpmc.cpp`):
```
threads:1 15099228 ns
threads:2 1027204 ns
threads:4 74470 ns
threads:8 5860 ns
threads:16 499 ns
threads:32 42 ns
```

With 4 threads and on average 800k ops/s in total (therefore potentially
800k transactions/s) the time required would be 1.16ms, an average
overhead of 1.45ns per transaction acquired and released, while with
just 1 thread and on average 200k ops/s it would be 295us.
In both cases the time to acquire and release a transaction id is close
enough to zero to be considered irrelevant!
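The 4-thread figure above can be sanity-checked with compile-time arithmetic (the 800k ops/s rate is the hypothetical workload quoted in the description; the 74470 ns / 51200 slots measurement comes from the benchmark output above):

```cpp
// Back-of-the-envelope check of the overhead figures quoted above:
// the 4-thread fill-and-release run takes 74470 ns for 51200 slots.
constexpr double ns_per_slot_4t = 74470.0 / 51200.0;               // ~1.45 ns per acquire+release
constexpr double ms_for_800k_tx = 800000.0 * ns_per_slot_4t / 1e6; // ~1.16 ms for 800k transactions

static_assert(ns_per_slot_4t > 1.4 && ns_per_slot_4t < 1.5, "~1.45 ns per transaction");
static_assert(ms_for_800k_tx > 1.1 && ms_for_800k_tx < 1.2, "~1.16 ms for 800k transactions");
```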
danielealbano authored Mar 1, 2023
1 parent 73325e3 commit 36fcca8
Showing 15 changed files with 2,413 additions and 20 deletions.
207 changes: 207 additions & 0 deletions benches/bench-slots-bitmap-mpmc.cpp
@@ -0,0 +1,207 @@
/**
* Copyright (C) 2018-2023 Daniele Salvatore Albano
* All rights reserved.
*
* This software may be modified and distributed under the terms
* of the BSD license. See the LICENSE file for details.
**/

//Run on (32 X 4200.07 MHz CPU s)
//CPU Caches:
//L1 Data 32 KiB (x16)
//L1 Instruction 32 KiB (x16)
//L2 Unified 512 KiB (x16)
//L3 Unified 16384 KiB (x4)
//Load Average: 0.82, 1.41, 2.10
//--------------------------------------------------------------------------------------------------------------------------
//Benchmark Time CPU Iterations
//--------------------------------------------------------------------------------------------------------------------------
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:1_mean 296 ns 296 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:1_median 285 ns 285 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:1_stddev 43.9 ns 43.9 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:1_cv 14.84 % 14.85 % 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:2_mean 75.8 ns 152 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:2_median 72.2 ns 144 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:2_stddev 12.4 ns 24.9 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:2_cv 16.39 % 16.40 % 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:4_mean 18.7 ns 74.8 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:4_median 18.6 ns 74.4 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:4_stddev 0.412 ns 1.65 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:4_cv 2.20 % 2.20 % 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:8_mean 4.97 ns 39.7 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:8_median 4.95 ns 39.6 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:8_stddev 0.105 ns 0.837 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:8_cv 2.10 % 2.11 % 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:16_mean 4.70 ns 75.2 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:16_median 4.70 ns 75.2 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:16_stddev 0.021 ns 0.337 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:16_cv 0.44 % 0.45 % 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:32_mean 3.85 ns 123 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:32_median 3.84 ns 123 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:32_stddev 0.034 ns 1.10 ns 50
//slots_bitmap_mpmc_fill_parallel/iterations:51200/repeats:50/threads:32_cv 0.89 % 0.89 % 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:1_mean 15224414 ns 15221858 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:1_median 15099228 ns 15096381 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:1_stddev 695354 ns 694733 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:1_cv 4.57 % 4.56 % 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:2_mean 1027646 ns 2054940 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:2_median 1027204 ns 2054054 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:2_stddev 2107 ns 4188 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:2_cv 0.21 % 0.20 % 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:4_mean 74484 ns 297899 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:4_median 74470 ns 297824 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:4_stddev 151 ns 606 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:4_cv 0.20 % 0.20 % 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:8_mean 5854 ns 46821 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:8_median 5860 ns 46846 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:8_stddev 41.6 ns 331 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:8_cv 0.71 % 0.71 % 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:16_mean 497 ns 7940 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:16_median 499 ns 7978 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:16_stddev 9.04 ns 145 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:16_cv 1.82 % 1.82 % 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:32_mean 42.1 ns 1340 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:32_median 42.0 ns 1340 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:32_stddev 0.192 ns 5.79 ns 50
//slots_bitmap_mpmc_fill_all_and_release_parallel/iterations:50/repeats:50/threads:32_cv 0.46 % 0.43 % 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:1_mean 285 ns 285 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:1_median 285 ns 285 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:1_stddev 0.968 ns 0.959 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:1_cv 0.34 % 0.34 % 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:2_mean 301 ns 601 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:2_median 301 ns 601 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:2_stddev 1.09 ns 2.18 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:2_cv 0.36 % 0.36 % 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:4_mean 294 ns 1175 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:4_median 289 ns 1157 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:4_stddev 7.48 ns 29.8 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:4_cv 2.55 % 2.54 % 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:8_mean 310 ns 2476 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:8_median 309 ns 2473 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:8_stddev 2.73 ns 20.9 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:8_cv 0.88 % 0.84 % 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:16_mean 300 ns 4802 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:16_median 300 ns 4792 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:16_stddev 2.75 ns 43.2 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:16_cv 0.92 % 0.90 % 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:32_mean 560 ns 17623 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:32_median 558 ns 17662 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:32_stddev 5.24 ns 156 ns 50
//slots_bitmap_mpmc_fill_sequential/iterations:51200/repeats:50/threads:32_cv 0.94 % 0.88 % 50

#include <cstdio>
#include <cstring>
#include <cstdint>
#include <cstdbool>

#include <benchmark/benchmark.h>

#include "misc.h"
#include "memory_fences.h"
#include "exttypes.h"
#include "utils_cpu.h"
#include "thread.h"

#include "data_structures/slots_bitmap_mpmc/slots_bitmap_mpmc.h"

#include "benchmark-program-simple.hpp"

// It is possible to control the number of threads used for the test by tuning the two defines below
#define TEST_THREADS_RANGE_BEGIN (1)
#define TEST_THREADS_RANGE_END (utils_cpu_count())
#define BITS_PER_THREAD (((sizeof(uint64_t) * 8) * (64 / sizeof(uint64_t))) * 100)

static slots_bitmap_mpmc_t *bitmap = nullptr;

static void slots_bitmap_mpmc_fill_parallel(benchmark::State& state) {
    int current_thread = state.thread_index();
    int total_threads = state.threads();
    const uint64_t size_per_thread = BITS_PER_THREAD;
    const uint64_t size = size_per_thread * total_threads;
    uint64_t start = (size / total_threads) * current_thread;

    if (current_thread == 0) {
        bitmap = slots_bitmap_mpmc_init(size);
    }

    thread_current_set_affinity(state.thread_index());

    for (auto _ : state) {
        benchmark::DoNotOptimize(slots_bitmap_mpmc_get_next_available_with_step(bitmap, start, 1));
    }

    if (current_thread == 0) {
        slots_bitmap_mpmc_free(bitmap);
        bitmap = nullptr;
    }
}

static void slots_bitmap_mpmc_fill_all_and_release_parallel(benchmark::State& state) {
    int current_thread = state.thread_index();
    int total_threads = state.threads();
    const uint64_t size_per_thread = BITS_PER_THREAD;
    const uint64_t size = size_per_thread * total_threads;
    uint64_t start = (size / total_threads) * current_thread;

    if (current_thread == 0) {
        bitmap = slots_bitmap_mpmc_init(size);
    }

    thread_current_set_affinity(state.thread_index());

    for (auto _ : state) {
        // Each thread fills its own range of slots and then releases them;
        // both loops cover [start, start + size_per_thread)
        for (uint64_t i = start; i < start + size_per_thread; i++) {
            benchmark::DoNotOptimize(slots_bitmap_mpmc_get_next_available_with_step(bitmap, start, 1));
        }

        for (uint64_t i = start; i < start + size_per_thread; i++) {
            slots_bitmap_mpmc_release(bitmap, i);
        }
    }

    if (current_thread == 0) {
        slots_bitmap_mpmc_free(bitmap);
        bitmap = nullptr;
    }
}

static void slots_bitmap_mpmc_fill_sequential(benchmark::State& state) {
    int current_thread = state.thread_index();
    int total_threads = state.threads();
    const uint64_t size_per_thread = BITS_PER_THREAD;
    const uint64_t size = size_per_thread * total_threads;

    if (current_thread == 0) {
        bitmap = slots_bitmap_mpmc_init(size);
    }

    thread_current_set_affinity(state.thread_index());

    for (auto _ : state) {
        benchmark::DoNotOptimize(slots_bitmap_mpmc_get_next_available_with_step(bitmap, 1, 1));
    }

    if (current_thread == 0) {
        slots_bitmap_mpmc_free(bitmap);
        bitmap = nullptr;
    }
}

static void BenchArguments(benchmark::internal::Benchmark* b) {
    // To run more than 131072 iterations, it is necessary to increase EPOCH_OPERATION_QUEUE_RING_SIZE in
    // epoch_operations_queue.h, as there is no processing of the queue included with the test
    b
        ->ThreadRange(TEST_THREADS_RANGE_BEGIN, TEST_THREADS_RANGE_END)
        ->Iterations(BITS_PER_THREAD)
        ->Repetitions(50)
        ->DisplayAggregatesOnly(true);
}

BENCHMARK(slots_bitmap_mpmc_fill_parallel)
->Apply(BenchArguments);

BENCHMARK(slots_bitmap_mpmc_fill_all_and_release_parallel)
->Apply(BenchArguments)->Iterations(50);

BENCHMARK(slots_bitmap_mpmc_fill_sequential)
->Apply(BenchArguments);
60 changes: 60 additions & 0 deletions benches/bench-slots-bitmap-spsc.cpp
@@ -0,0 +1,60 @@
/**
* Copyright (C) 2018-2023 Daniele Salvatore Albano
* All rights reserved.
*
* This software may be modified and distributed under the terms
* of the BSD license. See the LICENSE file for details.
**/

//Run on (32 X 4199.98 MHz CPU s)
//CPU Caches:
//L1 Data 32 KiB (x16)
//L1 Instruction 32 KiB (x16)
//L2 Unified 512 KiB (x16)
//L3 Unified 16384 KiB (x4)
//Load Average: 14.45, 7.05, 3.46
//---------------------------------------------------------------------------------------------------------------
//Benchmark Time CPU Iterations
//---------------------------------------------------------------------------------------------------------------
//slots_bitmap_spsc_fill_sequential/iterations:51200/repeats:50_mean 877 ns 877 ns 50
//slots_bitmap_spsc_fill_sequential/iterations:51200/repeats:50_median 876 ns 876 ns 50
//slots_bitmap_spsc_fill_sequential/iterations:51200/repeats:50_stddev 3.17 ns 3.16 ns 50
//slots_bitmap_spsc_fill_sequential/iterations:51200/repeats:50_cv 0.36 % 0.36 % 50

#include <cstdio>
#include <cstring>
#include <cstdint>
#include <cstdbool>

#include <benchmark/benchmark.h>

#include "exttypes.h"

#include "data_structures/slots_bitmap_spsc/slots_bitmap_spsc.h"

#include "benchmark-program-simple.hpp"

#define BITS_PER_THREAD (((sizeof(uint64_t) * 8) * (64 / sizeof(uint64_t))) * 100)

static void slots_bitmap_spsc_fill_sequential(benchmark::State& state) {
    const uint64_t size = BITS_PER_THREAD;
    slots_bitmap_spsc_t *bitmap = slots_bitmap_spsc_init(size);

    for (auto _ : state) {
        benchmark::DoNotOptimize(slots_bitmap_spsc_get_next_available(bitmap));
    }

    slots_bitmap_spsc_free(bitmap);
}

static void BenchArguments(benchmark::internal::Benchmark* b) {
    // To run more than 131072 iterations, it is necessary to increase EPOCH_OPERATION_QUEUE_RING_SIZE in
    // epoch_operations_queue.h, as there is no processing of the queue included with the test
    b
        ->Iterations(BITS_PER_THREAD)
        ->Repetitions(50)
        ->DisplayAggregatesOnly(true);
}

BENCHMARK(slots_bitmap_spsc_fill_sequential)
->Apply(BenchArguments);
61 changes: 61 additions & 0 deletions benches/benchmark-program-simple.hpp
@@ -0,0 +1,61 @@
/**
* Copyright (C) 2018-2022 Daniele Salvatore Albano
* All rights reserved.
*
* This software may be modified and distributed under the terms
* of the BSD license. See the LICENSE file for details.
**/

#include <numa.h>

#include <benchmark/benchmark.h>

#include "exttypes.h"
#include "spinlock.h"
#include "misc.h"
#include "signals_support.h"
#include "thread.h"
#include "log/log.h"
#include "log/sink/log_sink.h"
#include "log/sink/log_sink_console.h"

class BenchmarkProgramSimple {
private:
    const char* tag;

    void setup_initial_log_sink_console() {
        log_level_t level = LOG_LEVEL_ALL;
        log_sink_settings_t settings = { 0 };
        settings.console.use_stdout_for_errors = false;

        log_sink_register(log_sink_console_init(level, &settings));
    }

public:
    explicit BenchmarkProgramSimple(const char *tag) {
        this->tag = tag;
    }

    int Main(int argc, char** argv) {
        signals_support_register_sigsegv_fatal_handler();

        // Setup the log sink
        BenchmarkProgramSimple::setup_initial_log_sink_console();

        // Ensure that the current thread is pinned to core 0, otherwise some tests can fail if the kernel
        // moves the main thread of the process between cores
        thread_current_set_affinity(0);

        ::benchmark::Initialize(&argc, argv);
        if (::benchmark::ReportUnrecognizedArguments(argc, argv)) {
            return 1;
        }
        ::benchmark::RunSpecifiedBenchmarks();

        return 0;
    }
};

int main(int argc, char** argv) {
    return BenchmarkProgramSimple(__FILE__).Main(argc, argv);
}
8 changes: 8 additions & 0 deletions src/data_structures/hashtable/spsc/hashtable_spsc.c
@@ -1,3 +1,11 @@
/**
* Copyright (C) 2018-2023 Daniele Salvatore Albano
* All rights reserved.
*
* This software may be modified and distributed under the terms
* of the BSD license. See the LICENSE file for details.
**/

#include <stdint.h>
#include <stdbool.h>
#include <string.h>