Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@
[submodule "external/zstd"]
path = external/zstd
url = https://github.com/facebook/zstd.git
[submodule "external/abseil-cpp"]
path = external/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
[submodule "external/oneTBB"]
path = external/oneTBB
url = https://github.com/oneapi-src/oneTBB.git
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ else()
endif()

add_subdirectory(external ${EXLUSION_SPECIFIER})

add_subdirectory(src)
set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
add_subdirectory(bench ${EXLUSION_SPECIFIER})
Expand Down
4 changes: 2 additions & 2 deletions bench/riptide_bench/riptide_bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@ add_executable(${TARGET_NAME} ${HEADERS} ${SOURCES})

get_target_property(RT_SOURCE_DIR riptide_cpp SOURCE_DIR)

target_include_directories(${TARGET_NAME} PRIVATE ${RT_SOURCE_DIR} ${Python3_INCLUDE_DIRS} ${Python3_NumPy_INCLUDE_DIRS})
target_include_directories(${TARGET_NAME} PRIVATE ${RT_SOURCE_DIR} external/oneTBB/include ${Python3_INCLUDE_DIRS} ${Python3_NumPy_INCLUDE_DIRS})

target_link_directories(${TARGET_NAME} PRIVATE ${Python3_LIBRARY_DIRS})

target_link_libraries(${TARGET_NAME} PRIVATE riptide_cpp ${Python3_Libraries} benchmark::benchmark $<$<PLATFORM_ID:Linux>:pthread> $<$<PLATFORM_ID:Linux>:rt>)
target_link_libraries(${TARGET_NAME} PRIVATE riptide_cpp TBB::tbb ${Python3_Libraries} benchmark::benchmark $<$<PLATFORM_ID:Linux>:pthread> $<$<PLATFORM_ID:Linux>:rt>)

if(WIN32)
set(_TARGET_DIR $<TARGET_FILE_DIR:${TARGET_NAME}>)
Expand Down
67 changes: 63 additions & 4 deletions bench/riptide_bench/riptide_bench/hash_linear_bench.cpp
Original file line number Diff line number Diff line change
@@ -1,19 +1,45 @@
#include "RipTide.h"
#include "HashLinear.h"
#include "flat_hash_map.h"

#include "benchmark/benchmark.h"

#include <array>
#include <vector>
#include <numeric>
#include <algorithm>
#include <random>

namespace
{
#if 0
std::vector<uint64_t> test_data(1024ULL * 1024ULL * 1024ULL);
std::vector<uint64_t> test_data(2ULL * 1024ULL * 1024ULL);
std::random_device dev{};
CHashLinear<uint64_t, int64_t> hasher{};
fhm_hasher<uint64_t, int64_t> new_hasher{};
std::vector<uint64_t> needles(1024ULL * 1024ULL);
std::array<int64_t, 1024ULL * 1024ULL> output{};
std::array<int8_t, 1024ULL * 1024ULL> bools{};

// Benchmark for IsMemberHash64 on 64-bit unsigned keys.
//
// Setup (performed once, before the `for (auto _ : state)` loop):
//   - test_data is filled with consecutive values starting at 3002954500.
//   - needles are drawn uniformly from the closed range
//     [3002950000, test_data.size() + 3002950000], so the needle range starts
//     below the first haystack value and some needles can miss.
//
// Each iteration makes one IsMemberHash64 call followed by optimizer barriers
// on every buffer the call receives.
void bench_IsMemberHash64(benchmark::State & state)
{
    // Haystack: a consecutive run of keys.
    std::iota(test_data.begin(), test_data.end(), 3002954500);

    // Needles: freshly seeded random keys, generated in order.
    std::mt19937 rng(dev());
    std::uniform_int_distribution<uint64_t> pick_needle(3002950000, test_data.size() + 3002950000);
    for (auto & needle : needles)
    {
        needle = pick_needle(rng);
    }

    for (auto _ : state)
    {
        // NOTE(review): 8 sits in the argument slot where IsMember64 passes the
        // item size and 0 in the slot where it passes hintSize - confirm
        // against the IsMemberHash64 declaration.
        IsMemberHash64(needles.size(), needles.data(), test_data.size(), test_data.data(), output.data(), bools.data(), 8,
                       HASH_MODE(1), 0);

        benchmark::DoNotOptimize(test_data.data());
        benchmark::DoNotOptimize(output.data());
        benchmark::DoNotOptimize(bools.data());
        benchmark::DoNotOptimize(needles.data());
        benchmark::ClobberMemory();
    }
}

BENCHMARK(bench_IsMemberHash64)->Unit(benchmark::kMillisecond)->UseRealTime();

void bench_MakeHashLocation(benchmark::State & state)
{
Expand All @@ -27,6 +53,39 @@ namespace
}
}

BENCHMARK(bench_MakeHashLocation);
#endif
BENCHMARK(bench_MakeHashLocation)->Unit(benchmark::kMillisecond)->UseRealTime();

// Benchmark for the is_member entry point on 64-bit unsigned keys.
//
// Uses the same data preparation as the IsMemberHash64 benchmark:
// test_data is a consecutive run starting at 3002954500, and needles are drawn
// uniformly from [3002950000, test_data.size() + 3002950000]. Preparation runs
// once, ahead of the `for (auto _ : state)` loop.
void bench_is_member(benchmark::State & state)
{
    // Haystack: a consecutive run of keys.
    std::iota(test_data.begin(), test_data.end(), 3002954500);

    // Needles: freshly seeded random keys, generated in order.
    std::mt19937 rng(dev());
    std::uniform_int_distribution<uint64_t> pick_needle(3002950000, test_data.size() + 3002950000);
    for (auto & needle : needles)
    {
        needle = pick_needle(rng);
    }

    // is_member receives both inputs as raw byte pointers.
    auto const * const needle_bytes = reinterpret_cast<char const *>(needles.data());
    auto const * const haystack_bytes = reinterpret_cast<char const *>(test_data.data());

    for (auto _ : state)
    {
        // NOTE(review): the trailing uint64_t{} presumably selects the key
        // type for is_member - confirm against its declaration.
        is_member(needles.size(), needle_bytes, test_data.size(), haystack_bytes, output.data(), bools.data(), uint64_t{});

        benchmark::DoNotOptimize(test_data.data());
        benchmark::DoNotOptimize(output.data());
        benchmark::DoNotOptimize(bools.data());
        benchmark::DoNotOptimize(needles.data());
        benchmark::ClobberMemory();
    }
}

BENCHMARK(bench_is_member)->Unit(benchmark::kMillisecond)->UseRealTime();

// Benchmark for new_hasher.make_hash over the whole of test_data.
//
// test_data is filled with consecutive values starting at 3002954500 and then
// shuffled with a freshly seeded Mersenne Twister, so the hasher sees the
// keys in random order. Preparation runs once, ahead of the
// `for (auto _ : state)` loop.
void bench_make_hash(benchmark::State & state)
{
    std::iota(test_data.begin(), test_data.end(), 3002954500);

    std::mt19937 shuffle_rng{ dev() };
    std::shuffle(test_data.begin(), test_data.end(), shuffle_rng);

    // make_hash receives its input as a raw byte pointer.
    auto const * const key_bytes = reinterpret_cast<char const *>(test_data.data());

    for (auto _ : state)
    {
        new_hasher.make_hash(test_data.size(), key_bytes, 0);

        benchmark::DoNotOptimize(test_data.data());
        benchmark::ClobberMemory();
    }
}

BENCHMARK(bench_make_hash)->Unit(benchmark::kMillisecond)->UseRealTime();
}
8 changes: 8 additions & 0 deletions external/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,11 @@ set(ZSTD_MULTITHREAD_SUPPORT OFF CACHE BOOL "" FORCE)
add_subdirectory(zstd/build/cmake)

set(EXTERNAL_ZSTD_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/zstd/lib PARENT_SCOPE)

# abseil-cpp
# Force these cache options before adding the submodule: Abseil's testing
# switch is turned off and ABSL_PROPAGATE_CXX_STD is turned on.
set(ABSL_BUILD_TESTING OFF CACHE BOOL "" FORCE)
set(ABSL_PROPAGATE_CXX_STD ON CACHE BOOL "" FORCE)
add_subdirectory(abseil-cpp)

# oneTBB
# NOTE(review): presumably stops the oneTBB build from probing for hwloc on
# its own - confirm against the oneTBB CMake build documentation.
set(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH ON CACHE BOOL "" FORCE)
add_subdirectory(oneTBB)
1 change: 1 addition & 0 deletions external/abseil-cpp
Submodule abseil-cpp added at 0c6302
1 change: 1 addition & 0 deletions external/oneTBB
Submodule oneTBB added at 9d2a34
9 changes: 8 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ set(HEADERS
DateTime.h
Ema.h
#FileReadWrite.h
flat_hash_map.h
GroupBy.h
HashFunctions.h
HashLinear.h
Expand Down Expand Up @@ -86,6 +87,7 @@ set(SOURCES
DateTime.cpp
Ema.cpp
#FileReadWrite.cpp
flat_hash_map.cpp
GroupBy.cpp
HashFunctions.cpp
HashLinear.cpp
Expand Down Expand Up @@ -117,12 +119,17 @@ get_target_property(RT_SOURCE_DIR riptide_cpp SOURCE_DIR)

target_include_directories(${TARGET_NAME} PRIVATE
${EXTERNAL_ZSTD_INCLUDE_DIR}
../external/oneTBB/include
${Python3_INCLUDE_DIRS}
${Python3_NumPy_INCLUDE_DIRS} )
${Python3_NumPy_INCLUDE_DIRS}
${ABSL_COMMON_INCLUDE_DIRS})

target_link_libraries(${TARGET_NAME}
Python3::Python
libzstd_static
absl::base
absl::flat_hash_map
TBB::tbb
$<$<PLATFORM_ID:Linux>:rt>)

# Configure the library name to identify it as a Python extension module.
Expand Down
29 changes: 23 additions & 6 deletions src/HashFunctions.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
#include "RipTide.h"
#include "flat_hash_map.h"
#include "HashFunctions.h"
#include "HashLinear.h"
#include "ndarray.h"
#include "one_input.h"

#include <cstdlib>

// struct ndbuf;
// typedef struct ndbuf {
Expand Down Expand Up @@ -107,15 +111,27 @@ PyObject * IsMember64(PyObject * self, PyObject * args)
}
else
{
if (arrayType1 == NPY_FLOAT32 || arrayType1 == NPY_FLOAT64)
if (std::getenv("RT_NEW_HASH"))
{
IsMemberHash64(arraySize1, pDataIn1, arraySize2, pDataIn2, pDataOut2, pDataOut1, sizeType1 + 100,
HASH_MODE(hashMode), hintSize);
throw std::runtime_error("We're executing the new code path");
auto [opt_op_trait, opt_type_trait] = riptable_cpp::set_traits(0, arrayType1);
riptable_cpp::data_type_t variant = *opt_type_trait;
[[maybe_unused]] int retval = is_member_for_type(
arraySize1, reinterpret_cast<char const *>(pDataIn1), arraySize2, reinterpret_cast<char const *>(pDataIn2),
pDataOut2, pDataOut1, variant, std::make_index_sequence<std::variant_size_v<riptable_cpp::data_type_t>>{});
}
else
{
IsMemberHash64(arraySize1, pDataIn1, arraySize2, pDataIn2, pDataOut2, pDataOut1, sizeType1,
HASH_MODE(hashMode), hintSize);
if (arrayType1 == NPY_FLOAT32 || arrayType1 == NPY_FLOAT64)
{
IsMemberHash64(arraySize1, pDataIn1, arraySize2, pDataIn2, pDataOut2, pDataOut1, sizeType1 + 100,
HASH_MODE(hashMode), hintSize);
}
else
{
IsMemberHash64(arraySize1, pDataIn1, arraySize2, pDataIn2, pDataOut2, pDataOut1, sizeType1,
HASH_MODE(hashMode), hintSize);
}
}

PyObject * retObject = Py_BuildValue("(OO)", boolArray, indexArray);
Expand Down Expand Up @@ -168,7 +184,8 @@ PyObject * IsMemberCategorical(PyObject * self, PyObject * args)
int sizeType1 = (int)NpyItemSize((PyObject *)inArr1);
int sizeType2 = (int)NpyItemSize((PyObject *)inArr2);

LOGGING("IsMember32 %s vs %s size: %d %d\n", NpyToString(arrayType1), NpyToString(arrayType2), sizeType1, sizeType2);
LOGGING("IsMemberCategorical %s vs %s size: %d %d\n", NpyToString(arrayType1), NpyToString(arrayType2), sizeType1,
sizeType2);

switch (arrayType1)
{
Expand Down
Loading